From 582fc32574a3b158c81e49cb00e6ae59205e66ba Mon Sep 17 00:00:00 2001
From: Antonin Kral
Date: Thu, 17 Mar 2011 00:05:43 +0100
Subject: Imported Upstream version 1.8.0

---
 .gitignore | 4 +-
 SConstruct | 720 ++-
 bson/bson-inl.h | 665 +++
 bson/bson.h | 30 +-
 bson/bson_db.h | 28 +-
 bson/bsondemo/bsondemo.cpp | 26 +-
 bson/bsonelement.h | 662 +--
 bson/bsoninlines.h | 588 ---
 bson/bsonmisc.h | 78 +-
 bson/bsonobj.h | 212 +-
 bson/bsonobjbuilder.h | 265 +-
 bson/bsonobjiterator.h | 70 +-
 bson/bsontypes.h | 112 +-
 bson/inline_decls.h | 2 +-
 bson/oid.cpp | 154 +
 bson/oid.h | 95 +-
 bson/ordering.h | 10 +-
 bson/stringdata.h | 37 +-
 bson/util/atomic_int.h | 40 +-
 bson/util/builder.h | 149 +-
 bson/util/misc.h | 4 +-
 buildscripts/distmirror.py | 2 +-
 buildscripts/errorcodes.py | 21 +-
 buildscripts/frob_version.py | 2 +-
 buildscripts/hacks_ubuntu.py | 2 +-
 buildscripts/makealldists.py | 20 +-
 buildscripts/makedist.py | 32 +-
 buildscripts/mergerepositories.py | 2 +-
 buildscripts/s3del.py | 36 +
 buildscripts/smoke.py | 425 +-
 buildscripts/utils.py | 21 +-
 client/clientOnly.cpp | 16 +-
 client/connpool.cpp | 223 +-
 client/connpool.h | 182 +-
 client/constants.h | 20 +-
 client/dbclient.cpp | 587 +--
 client/dbclient.h | 506 +-
 client/dbclient_rs.cpp | 594 +++
 client/dbclient_rs.h | 276 ++
 client/dbclientcursor.cpp | 102 +-
 client/dbclientcursor.h | 132 +-
 client/dbclientmockcursor.h | 40 +
 client/distlock.cpp | 272 +-
 client/distlock.h | 55 +-
 client/distlock_test.cpp | 58 +-
 client/examples/authTest.cpp | 11 +-
 client/examples/clientTest.cpp | 35 +-
 client/examples/first.cpp | 11 +-
 client/examples/httpClientTest.cpp | 8 +-
 client/examples/rs.cpp | 58 +
 client/examples/second.cpp | 2 +-
 client/examples/tail.cpp | 40 +-
 client/examples/tutorial.cpp | 64 +-
 client/examples/whereExample.cpp | 10 +-
 client/gridfs.cpp | 60 +-
 client/gridfs.h | 28 +-
 client/model.cpp | 44 +-
 client/model.h | 8 +-
 client/mongo_client_lib.cpp | 66 +
 client/parallel.cpp | 376 +-
 client/parallel.h | 98 +-
 client/redef_macros.h | 3 +
 client/simple_client_demo.cpp | 36 +
 client/syncclusterconnection.cpp | 195 +-
 client/syncclusterconnection.h | 41 +-
 client/undef_macros.h | 3 +
 db/background.h | 12 +-
 db/btree.cpp | 1242 +++--
 db/btree.h | 585 ++-
 db/btreecursor.cpp | 145 +-
 db/cap.cpp | 239 +-
 db/client.cpp | 396 +-
 db/client.h | 277 +-
 db/clientcursor.cpp | 310 +-
 db/clientcursor.h | 296 +-
 db/cloner.cpp | 169 +-
 db/cmdline.cpp | 191 +-
 db/cmdline.h | 114 +-
 db/commands.cpp | 26 +-
 db/commands.h | 20 +-
 db/commands/distinct.cpp | 150 +
 db/commands/group.cpp | 202 +
 db/commands/isself.cpp | 220 +
 db/commands/mr.cpp | 1074 ++++
 db/commands/mr.h | 291 ++
 db/common.cpp | 4 +
 db/compact.cpp | 199 +
 db/concurrency.h | 249 +-
 db/curop-inl.h | 42 +
 db/curop.h | 358 +-
 db/cursor.cpp | 23 +-
 db/cursor.h | 98 +-
 db/database.cpp | 211 +-
 db/database.h | 214 +-
 db/db.cpp | 747 ++-
 db/db.h | 119 +-
 db/db.sln | 86 -
 db/db.vcproj | 1885 -------
 db/db.vcxproj | 72 +-
 db/db.vcxproj.filters | 1229 ++---
 db/db_10.sln | 8 -
 db/dbcommands.cpp | 936 ++--
 db/dbcommands_admin.cpp | 233 +-
 db/dbcommands_generic.cpp | 198 +-
 db/dbeval.cpp | 23 +-
 db/dbhelpers.cpp | 166 +-
 db/dbhelpers.h | 38 +-
 db/dbmessage.h | 39 +-
 db/dbwebserver.cpp | 316 +-
 db/dbwebserver.h | 31 +-
 db/diskloc.h | 101 +-
 db/driverHelpers.cpp | 16 +-
 db/dur.cpp | 635 +++
 db/dur.h | 201 +
 db/dur_commitjob.cpp | 210 +
 db/dur_commitjob.h | 221 +
 db/dur_journal.cpp | 576 +++
 db/dur_journal.h | 68 +
 db/dur_journalformat.h | 166 +
 db/dur_journalimpl.h | 101 +
 db/dur_preplogbuffer.cpp | 192 +
db/dur_recover.cpp | 457 ++ db/dur_recover.h | 45 + db/dur_stats.h | 46 + db/dur_writetodatafiles.cpp | 99 + db/durop.cpp | 160 + db/durop.h | 111 + db/extsort.cpp | 147 +- db/extsort.h | 50 +- db/filever.h | 8 +- db/geo/2d.cpp | 949 ++-- db/geo/core.h | 153 +- db/geo/haystack.cpp | 146 +- db/helpers/dblogger.h | 4 +- db/index.cpp | 148 +- db/index.h | 46 +- db/indexkey.cpp | 161 +- db/indexkey.h | 67 +- db/instance.cpp | 557 ++- db/instance.h | 50 +- db/introspect.cpp | 3 +- db/jsobj.cpp | 409 +- db/jsobj.h | 4 +- db/jsobjmanipulator.h | 44 +- db/json.cpp | 69 +- db/lasterror.cpp | 109 +- db/lasterror.h | 40 +- db/matcher.cpp | 495 +- db/matcher.h | 71 +- db/matcher_covered.cpp | 53 +- db/minilex.h | 190 +- db/module.cpp | 16 +- db/module.h | 10 +- db/modules/mms.cpp | 88 +- db/mongommf.cpp | 391 ++ db/mongommf.h | 140 + db/mongomutex.h | 239 + db/mr.cpp | 721 --- db/namespace-inl.h | 130 + db/namespace.cpp | 398 +- db/namespace.h | 517 +- db/nonce.cpp | 54 +- db/nonce.h | 22 +- db/oplog.cpp | 228 +- db/oplog.h | 133 +- db/oplogreader.h | 46 +- db/pdfile.cpp | 800 +-- db/pdfile.h | 236 +- db/projection.cpp | 301 ++ db/projection.h | 127 + db/query.cpp | 560 ++- db/query.h | 106 +- db/queryoptimizer.cpp | 657 +-- db/queryoptimizer.h | 224 +- db/queryutil.cpp | 840 ++-- db/queryutil.h | 209 +- db/rec.h | 137 - db/reccache.cpp | 419 -- db/reccache.h | 262 - db/reci.h | 64 - db/recstore.h | 126 - db/repl.cpp | 631 +-- db/repl.h | 70 +- db/repl/connections.h | 49 +- db/repl/consensus.cpp | 124 +- db/repl/health.cpp | 161 +- db/repl/health.h | 8 +- db/repl/heartbeat.cpp | 71 +- db/repl/manager.cpp | 70 +- db/repl/multicmd.h | 29 +- db/repl/replset_commands.cpp | 106 +- db/repl/rs.cpp | 282 +- db/repl/rs.h | 115 +- db/repl/rs_config.cpp | 174 +- db/repl/rs_config.h | 20 +- db/repl/rs_exception.h | 18 +- db/repl/rs_initialsync.cpp | 205 +- db/repl/rs_initiate.cpp | 66 +- db/repl/rs_member.h | 35 +- db/repl/rs_optime.h | 114 +- db/repl/rs_rollback.cpp | 661 +-- db/repl/rs_sync.cpp | 368 +- db/repl_block.cpp | 92 +- db/repl_block.h | 10 +- db/replpair.h | 30 +- db/resource.h | 32 +- db/restapi.cpp | 60 +- db/restapi.h | 34 + db/scanandorder.h | 36 +- db/security.cpp | 20 +- db/security.h | 28 +- db/security_commands.cpp | 89 +- db/security_key.cpp | 105 + db/security_key.h | 47 + db/stats/counters.cpp | 110 +- db/stats/counters.h | 77 +- db/stats/fine_clock.h | 13 +- db/stats/service_stats.cpp | 6 +- db/stats/snapshots.cpp | 121 +- db/stats/snapshots.h | 20 +- db/stats/top.cpp | 115 +- db/stats/top.h | 57 +- db/storage.cpp | 81 - db/taskqueue.h | 106 + db/tests.cpp | 2 +- db/update.cpp | 617 ++- db/update.h | 276 +- dbtests/background_job_test.cpp | 109 + dbtests/balancer_policy_tests.cpp | 203 + dbtests/basictests.cpp | 277 +- dbtests/btreetests.cpp | 1412 +++++- dbtests/clienttests.cpp | 77 +- dbtests/commandtests.cpp | 18 +- dbtests/cursortests.cpp | 33 +- dbtests/d_chunk_manager_tests.cpp | 467 ++ dbtests/dbtests.cpp | 4 +- dbtests/directclienttests.cpp | 80 + dbtests/framework.cpp | 142 +- dbtests/framework.h | 52 +- dbtests/histogram_test.cpp | 20 +- dbtests/jsobjtests.cpp | 370 +- dbtests/jsontests.cpp | 74 +- dbtests/jstests.cpp | 363 +- dbtests/matchertests.cpp | 66 +- dbtests/mmaptests.cpp | 219 + dbtests/mockdbclient.h | 4 +- dbtests/namespacetests.cpp | 56 +- dbtests/pairingtests.cpp | 24 +- dbtests/pdfiletests.cpp | 131 +- dbtests/perf/btreeperf.cpp | 442 ++ dbtests/perf/perftest.cpp | 88 +- dbtests/perftests.cpp | 336 ++ dbtests/queryoptimizertests.cpp | 555 ++- dbtests/querytests.cpp 
| 302 +- dbtests/repltests.cpp | 411 +- dbtests/sharding.cpp | 12 +- dbtests/socktests.cpp | 13 +- dbtests/spin_lock_test.cpp | 68 +- dbtests/test.vcproj | 1453 ------ dbtests/test.vcxproj | 57 +- dbtests/test.vcxproj.filters | 141 +- dbtests/threadedtests.cpp | 154 +- dbtests/updatetests.cpp | 195 +- debian/changelog | 134 - debian/compat | 1 - debian/control | 29 - debian/copyright | 23 - debian/dirs | 3 - debian/init.d | 243 - debian/lintian-overrides | 11 - debian/mongo.1 | 62 - debian/mongod.1 | 16 - debian/mongodb.conf | 95 - debian/mongodump.1 | 36 - debian/mongoexport.1 | 51 - debian/mongofiles.1 | 52 - debian/mongoimport.1 | 63 - debian/mongorestore.1 | 36 - debian/mongos.1 | 39 - debian/mongosniff.1 | 30 - debian/mongostat.1 | 39 - debian/postinst | 55 - debian/postrm | 39 - debian/prerm | 41 - debian/rules | 107 - debian/watch | 10 - distsrc/client/SConstruct | 48 +- doxygenConfig | 4 +- jstests/_tst.js | 41 + jstests/apitest_db.js | 5 + jstests/array4.js | 30 + jstests/arrayfind3.js | 21 + jstests/auth/auth1.js | 2 +- jstests/basic3.js | 32 +- jstests/big_object1.js | 46 + jstests/capped3.js | 6 +- jstests/capped6.js | 25 + jstests/capped7.js | 19 +- jstests/capped8.js | 86 + jstests/check_shard_index.js | 45 + jstests/conc_update.js | 45 - jstests/coveredIndex1.js | 59 + jstests/coveredIndex2.js | 18 + jstests/cursora.js | 41 +- jstests/datasize3.js | 8 +- jstests/dbcase.js | 4 +- jstests/disk/directoryperdb.js | 4 +- jstests/disk/diskfull.js | 12 +- jstests/disk/killall.js | 42 + jstests/disk/preallocate.js | 8 +- jstests/disk/preallocate2.js | 6 +- jstests/disk/preallocate_directoryperdb.js | 50 + jstests/distinct1.js | 2 + jstests/distinct_array1.js | 1 + jstests/distinct_index1.js | 50 + jstests/distinct_index2.js | 35 + jstests/drop2.js | 43 + jstests/dropIndex.js | 16 - jstests/drop_index.js | 16 + jstests/dur/a_quick.js | 123 + jstests/dur/closeall.js | 80 + jstests/dur/diskfull.js | 136 + jstests/dur/dropdb.js | 163 + jstests/dur/dur1.js | 154 + jstests/dur/dur2.js | 92 + jstests/dur/lsn.js | 126 + jstests/dur/manyRestart.js | 191 + jstests/dur/md5.js | 101 + jstests/dur/oplog.js | 159 + jstests/error5.js | 2 +- jstests/eval_nolock.js | 16 + jstests/evalc.js | 14 - jstests/evald.js | 68 + jstests/evale.js | 5 + jstests/evalf.js | 26 + jstests/exists.js | 3 +- jstests/explain1.js | 2 +- jstests/explain2.js | 6 +- jstests/explain3.js | 24 + jstests/find_and_modify3.js | 4 +- jstests/geo_borders.js | 189 + jstests/geo_center_sphere1.js | 93 + jstests/geo_circle2.js | 3 + jstests/geo_circle2a.js | 36 + jstests/geo_near_random1.js | 12 + jstests/geo_near_random2.js | 21 + jstests/geo_sort1.js | 22 + jstests/geo_update1.js | 38 + jstests/geo_update2.js | 40 + jstests/geof.js | 19 + jstests/group6.js | 1 + jstests/in3.js | 2 +- jstests/in4.js | 4 +- jstests/index11.js | 13 + jstests/index_check6.js | 45 +- jstests/index_check7.js | 2 +- jstests/index_many2.js | 2 + jstests/index_sparse1.js | 46 + jstests/index_sparse2.js | 21 + jstests/indexh.js | 7 + jstests/indexi.js | 16 + jstests/indexj.js | 44 + jstests/insert2.js | 8 + jstests/jni2.js | 4 +- jstests/killop.js | 43 + jstests/libs/concurrent.js | 30 + jstests/libs/fun.js | 32 + jstests/libs/geo_near_random.js | 78 + jstests/libs/grid.js | 172 + jstests/libs/network.js | 37 + jstests/misc/biginsert.js | 18 + jstests/mr1.js | 22 +- jstests/mr2.js | 27 +- jstests/mr3.js | 10 +- jstests/mr4.js | 4 +- jstests/mr5.js | 4 +- jstests/mr_bigobject.js | 13 +- jstests/mr_comments.js | 28 + jstests/mr_errorhandling.js | 8 +- 
jstests/mr_index.js | 43 + jstests/mr_index2.js | 22 + jstests/mr_index3.js | 50 + jstests/mr_killop.js | 127 + jstests/mr_merge.js | 51 + jstests/mr_optim.js | 47 + jstests/mr_outreduce.js | 41 + jstests/mr_outreduce2.js | 27 + jstests/mr_replaceIntoDB.js | 45 + jstests/mr_sort.js | 6 +- jstests/multiClient/rsDurKillRestart1.js | 139 + jstests/ne2.js | 21 + jstests/ne3.js | 12 + jstests/not2.js | 5 +- jstests/notablescan.js | 22 + jstests/objid5.js | 9 +- jstests/or4.js | 11 +- jstests/or6.js | 14 +- jstests/orc.js | 29 + jstests/ord.js | 34 + jstests/ore.js | 13 + jstests/orf.js | 15 + jstests/parallel/del.js | 79 + jstests/parallel/repl.js | 4 +- jstests/perf/geo_near1.js | 11 + jstests/profile1.js | 7 + jstests/proj_key1.js | 28 + jstests/pull_remove1.js | 14 + jstests/push2.js | 2 + jstests/queryoptimizer2.js | 62 + jstests/regex3.js | 2 +- jstests/regex6.js | 4 +- jstests/regex9.js | 2 +- jstests/remove_undefined.js | 28 + jstests/rename4.js | 121 + jstests/repl/basic1.js | 4 +- jstests/repl/block2.js | 15 +- jstests/repl/mastermaster1.js | 8 +- jstests/repl/pair1.js | 4 +- jstests/repl/repl1.js | 2 + jstests/repl/repl11.js | 4 + jstests/repl/repl2.js | 2 + jstests/repl/snapshot3.js | 4 +- jstests/replsets/auth1.js | 184 + jstests/replsets/buildindexes.js | 86 + jstests/replsets/cloneDb.js | 52 + jstests/replsets/config1.js | 21 + jstests/replsets/fastsync.js | 117 + jstests/replsets/getlasterror_w2.js | 36 + jstests/replsets/groupAndMapReduce.js | 105 + jstests/replsets/initial_sync1.js | 129 + jstests/replsets/initial_sync2.js | 179 + jstests/replsets/initial_sync3.js | 87 + jstests/replsets/ismaster1.js | 36 + jstests/replsets/key1 | 1 + jstests/replsets/key2 | 1 + jstests/replsets/remove1.js | 132 + jstests/replsets/replset2.js | 252 +- jstests/replsets/replset3.js | 130 +- jstests/replsets/replset5.js | 42 +- jstests/replsets/replset_remove_node.js | 9 +- jstests/replsets/replsetarb2.js | 2 + jstests/replsets/replsetarb3.js | 144 + jstests/replsets/replsetfreeze.js | 105 + jstests/replsets/rollback.js | 333 +- jstests/replsets/rollback2.js | 423 +- jstests/replsets/rollback3.js | 39 +- jstests/replsets/rslib.js | 63 + jstests/replsets/slaveDelay2.js | 104 + jstests/replsets/slavedelay1.js | 127 + jstests/replsets/sync1.js | 30 +- jstests/replsets/sync_passive.js | 89 + jstests/replsets/sync_passive2.js | 120 + jstests/replsets/toostale.js | 121 + jstests/replsets/two_initsync.js | 1 + jstests/replsets/twosets.js | 19 +- jstests/rs/rs_basic.js | 88 +- jstests/set_param1.js | 9 + jstests/sharding/addshard3.js | 9 + jstests/sharding/addshard4.js | 24 + jstests/sharding/auto1.js | 5 + jstests/sharding/bigMapReduce.js | 62 +- jstests/sharding/count1.js | 10 +- jstests/sharding/cursor1.js | 2 +- jstests/sharding/features1.js | 24 +- jstests/sharding/features2.js | 29 +- jstests/sharding/features3.js | 3 +- jstests/sharding/geo_near_random1.js | 37 + jstests/sharding/geo_near_random2.js | 44 + jstests/sharding/key_many.js | 6 +- jstests/sharding/key_string.js | 13 +- jstests/sharding/limit_push.js | 47 + jstests/sharding/migrateBig.js | 45 + jstests/sharding/multi_mongos1.js | 70 + jstests/sharding/rename.js | 1 + jstests/sharding/shard1.js | 1 + jstests/sharding/shard3.js | 36 +- jstests/sharding/shard_insert_getlasterror_w2.js | 89 + jstests/sharding/sort1.js | 46 +- jstests/sharding/splitpick.js | 39 - jstests/sharding/sync1.js | 15 +- jstests/sharding/update1.js | 7 + jstests/sharding/version1.js | 40 +- jstests/sharding/version2.js | 35 +- jstests/shellkillop.js | 83 +- 
jstests/shellspawn.js | 2 + jstests/shellstartparallel.js | 17 + jstests/slowNightly/32bit.js | 125 + jstests/slowNightly/btreedel.js | 43 + jstests/slowNightly/capped4.js | 2 +- jstests/slowNightly/command_line_parsing.js | 9 + jstests/slowNightly/dur_big_atomic_update.js | 31 + jstests/slowNightly/dur_passthrough.js | 89 + jstests/slowNightly/dur_remove_old_journals.js | 53 + jstests/slowNightly/geo_near_random1.js | 13 + jstests/slowNightly/geo_near_random2.js | 21 + jstests/slowNightly/index_check9.js | 118 + jstests/slowNightly/large_chunk.js | 51 + jstests/slowNightly/moveprimary-replset.js | 67 + jstests/slowNightly/newcollection2.js | 11 + jstests/slowNightly/run_sharding_passthrough.js | 94 - jstests/slowNightly/sharding_balance1.js | 2 +- jstests/slowNightly/sharding_balance2.js | 2 +- jstests/slowNightly/sharding_balance3.js | 4 +- jstests/slowNightly/sharding_balance4.js | 43 +- .../slowNightly/sharding_balance_randomorder1.js | 54 + jstests/slowNightly/sharding_cursors1.js | 6 +- .../slowNightly/sharding_multiple_collections.js | 53 + jstests/slowNightly/sharding_passthrough.js | 94 + jstests/slowNightly/sharding_rs1.js | 13 +- jstests/slowNightly/sharding_rs2.js | 163 + jstests/slowNightly/unix_socket1.js | 26 + jstests/slowWeekly/conc_update.js | 29 +- jstests/slowWeekly/disk_reuse1.js | 41 + jstests/slowWeekly/dur_passthrough.js | 44 + jstests/slowWeekly/geo_near_random1.js | 13 + jstests/slowWeekly/geo_near_random2.js | 21 + jstests/slowWeekly/indexbg_dur.js | 67 + jstests/slowWeekly/query_yield1.js | 6 +- jstests/slowWeekly/query_yield2.js | 6 +- jstests/slowWeekly/update_yield1.js | 21 +- jstests/sort2.js | 2 +- jstests/splitvector.js | 144 +- jstests/tempCleanup.js | 16 - jstests/temp_cleanup.js | 16 + jstests/tool/dumprestore2.js | 3 + jstests/tool/dumprestore3.js | 60 + jstests/tool/dumprestore4.js | 42 + jstests/tool/tool1.js | 2 +- jstests/ts1.js | 38 + jstests/update_addToSet3.js | 18 + jstests/update_arraymatch6.js | 14 + jstests/update_multi6.js | 10 + lib/libboost_thread-gcc41-mt-d-1_34_1.a | Bin 0 -> 692920 bytes mongo.xcodeproj/project.pbxproj | 1879 ------- mongo_astyle | 16 + pch.cpp | 2 +- pch.h | 40 +- rpm/init.d-mongod | 3 +- rpm/mongo.spec | 3 +- rpm/mongod.conf | 11 - s/balance.cpp | 197 +- s/balance.h | 69 +- s/balancer_policy.cpp | 227 +- s/balancer_policy.h | 28 +- s/chunk.cpp | 1097 ++-- s/chunk.h | 279 +- s/client.cpp | 292 ++ s/client.h | 120 + s/commands_admin.cpp | 568 ++- s/commands_public.cpp | 541 +- s/config.cpp | 409 +- s/config.h | 120 +- s/config_migrate.cpp | 76 +- s/cursors.cpp | 153 +- s/cursors.h | 37 +- s/d_chunk_manager.cpp | 328 ++ s/d_chunk_manager.h | 150 + s/d_logic.cpp | 37 +- s/d_logic.h | 213 +- s/d_migrate.cpp | 1197 +++-- s/d_split.cpp | 722 ++- s/d_state.cpp | 694 +-- s/d_util.cpp | 41 - s/d_writeback.cpp | 97 +- s/d_writeback.h | 75 + s/dbgrid.vcproj | 1048 ---- s/dbgrid.vcxproj | 18 + s/dbgrid.vcxproj.filters | 36 + s/grid.cpp | 257 +- s/grid.h | 33 +- s/request.cpp | 168 +- s/request.h | 63 +- s/s_only.cpp | 63 +- s/server.cpp | 193 +- s/server.h | 4 +- s/shard.cpp | 271 +- s/shard.h | 97 +- s/shard_version.cpp | 151 + s/shard_version.h | 31 + s/shardconnection.cpp | 173 +- s/shardkey.cpp | 152 +- s/shardkey.h | 40 +- s/stats.cpp | 2 +- s/stats.h | 2 +- s/strategy.cpp | 308 +- s/strategy.h | 13 +- s/strategy_shard.cpp | 196 +- s/strategy_single.cpp | 184 +- s/util.h | 68 +- s/writeback_listener.cpp | 254 + s/writeback_listener.h | 67 + scripting/bench.cpp | 173 + scripting/engine.cpp | 253 +- scripting/engine.h | 134 +- 
scripting/engine_java.cpp | 77 +- scripting/engine_java.h | 16 +- scripting/engine_none.cpp | 2 +- scripting/engine_spidermonkey.cpp | 624 +-- scripting/engine_spidermonkey.h | 16 +- scripting/engine_v8.cpp | 323 +- scripting/engine_v8.h | 55 +- scripting/sm_db.cpp | 628 +-- scripting/utils.cpp | 23 +- scripting/v8_db.cpp | 498 +- scripting/v8_db.h | 92 +- scripting/v8_utils.cpp | 141 +- scripting/v8_utils.h | 4 +- scripting/v8_wrapper.cpp | 282 +- scripting/v8_wrapper.h | 4 +- shell/collection.js | 79 +- shell/db.js | 144 +- shell/dbshell.cpp | 351 +- shell/mongo.js | 9 +- shell/mongo_vstudio.cpp | 5223 +++++++++++++------- shell/msvc/mongo.vcxproj | 7 +- shell/msvc/mongo.vcxproj.filters | 13 +- shell/query.js | 55 +- shell/servers.js | 110 +- shell/shell_utils.cpp | 383 +- shell/utils.h | 8 +- shell/utils.js | 495 +- tools/bridge.cpp | 80 +- tools/bsondump.cpp | 57 +- tools/dump.cpp | 286 +- tools/export.cpp | 69 +- tools/files.cpp | 54 +- tools/import.cpp | 181 +- tools/restore.cpp | 148 +- tools/sniffer.cpp | 200 +- tools/stat.cpp | 653 ++- tools/tool.cpp | 218 +- tools/tool.h | 58 +- util/admin_access.h | 52 + util/alignedbuilder.cpp | 102 + util/alignedbuilder.h | 123 + util/allocator.h | 8 +- util/array.h | 52 +- util/assert_util.cpp | 60 +- util/assert_util.h | 133 +- util/background.cpp | 146 +- util/background.h | 135 +- util/base64.cpp | 40 +- util/base64.h | 25 +- util/bufreader.h | 98 + util/concurrency/README | 19 + util/concurrency/list.h | 96 +- util/concurrency/msg.h | 8 +- util/concurrency/mutex.h | 129 +- util/concurrency/mvar.h | 28 +- util/concurrency/race.h | 72 + util/concurrency/readme.txt | 15 - util/concurrency/rwlock.h | 170 +- util/concurrency/shared_mutex_win.hpp | 573 +++ util/concurrency/spin_lock.cpp | 34 +- util/concurrency/spin_lock.h | 26 +- util/concurrency/synchronization.cpp | 56 + util/concurrency/synchronization.h | 73 + util/concurrency/task.cpp | 56 +- util/concurrency/task.h | 14 +- util/concurrency/thread_pool.cpp | 45 +- util/concurrency/thread_pool.h | 110 +- util/concurrency/value.h | 24 +- util/concurrency/vars.cpp | 24 +- util/debug_util.cpp | 9 +- util/debug_util.h | 19 +- util/embedded_builder.h | 16 +- util/file.h | 225 +- util/file_allocator.cpp | 282 ++ util/file_allocator.h | 278 +- util/goodies.h | 367 +- util/hashtab.h | 58 +- util/heapcheck.h | 33 + util/hex.h | 12 +- util/histogram.cpp | 40 +- util/histogram.h | 8 +- util/hostandport.h | 67 +- util/httpclient.cpp | 52 +- util/httpclient.h | 22 +- util/log.cpp | 45 +- util/log.h | 166 +- util/logfile.cpp | 157 + util/logfile.h | 50 + util/lruishmap.h | 4 +- util/md5.h | 16 +- util/md5.hpp | 9 +- util/md5main.cpp | 104 +- util/message.cpp | 342 +- util/message.h | 200 +- util/message_server.h | 10 +- util/message_server_asio.cpp | 112 +- util/message_server_port.cpp | 55 +- util/miniwebserver.cpp | 32 +- util/miniwebserver.h | 2 +- util/mmap.cpp | 113 +- util/mmap.h | 289 +- util/mmap_mm.cpp | 4 +- util/mmap_posix.cpp | 132 +- util/mmap_win.cpp | 158 +- util/mongoutils/README | 8 +- util/mongoutils/checksum.h | 4 +- util/mongoutils/hash.h | 41 + util/mongoutils/html.h | 44 +- util/mongoutils/mongoutils.vcxproj | 2 + util/mongoutils/str.h | 126 +- util/mongoutils/test.cpp | 79 +- util/moveablebuffer.h | 51 + util/ntservice.cpp | 418 +- util/ntservice.h | 17 +- util/optime.h | 31 +- util/password.h | 4 +- util/paths.h | 79 + util/processinfo.cpp | 33 +- util/processinfo.h | 12 +- util/processinfo_darwin.cpp | 48 +- util/processinfo_linux2.cpp | 195 +- util/processinfo_none.cpp | 24 +- 
util/processinfo_win32.cpp | 28 +- util/queue.h | 54 +- util/ramlog.h | 24 +- util/ramstore.cpp | 93 - util/ramstore.h | 86 - util/signal_handlers.cpp | 122 + util/signal_handlers.h | 34 + util/sock.cpp | 70 +- util/sock.h | 124 +- util/stringutils.cpp | 8 +- util/stringutils.h | 12 +- util/text.cpp | 92 +- util/text.h | 48 +- util/time_support.h | 201 + util/timer.h | 67 + util/unittest.h | 3 + util/util.cpp | 93 +- util/version.cpp | 84 +- util/version.h | 3 +- 735 files changed, 60850 insertions(+), 37351 deletions(-) create mode 100644 bson/bson-inl.h delete mode 100644 bson/bsoninlines.h create mode 100644 bson/oid.cpp create mode 100644 buildscripts/s3del.py create mode 100644 client/dbclient_rs.cpp create mode 100644 client/dbclient_rs.h create mode 100644 client/dbclientmockcursor.h create mode 100644 client/examples/rs.cpp create mode 100644 client/mongo_client_lib.cpp create mode 100644 client/simple_client_demo.cpp create mode 100644 db/commands/distinct.cpp create mode 100644 db/commands/group.cpp create mode 100644 db/commands/isself.cpp create mode 100644 db/commands/mr.cpp create mode 100644 db/commands/mr.h create mode 100644 db/compact.cpp create mode 100644 db/curop-inl.h delete mode 100644 db/db.sln delete mode 100644 db/db.vcproj mode change 100644 => 100755 db/db_10.sln create mode 100644 db/dur.cpp create mode 100644 db/dur.h create mode 100644 db/dur_commitjob.cpp create mode 100644 db/dur_commitjob.h create mode 100644 db/dur_journal.cpp create mode 100644 db/dur_journal.h create mode 100644 db/dur_journalformat.h create mode 100644 db/dur_journalimpl.h create mode 100644 db/dur_preplogbuffer.cpp create mode 100644 db/dur_recover.cpp create mode 100644 db/dur_recover.h create mode 100644 db/dur_stats.h create mode 100644 db/dur_writetodatafiles.cpp create mode 100644 db/durop.cpp create mode 100644 db/durop.h create mode 100644 db/mongommf.cpp create mode 100644 db/mongommf.h create mode 100644 db/mongomutex.h delete mode 100644 db/mr.cpp create mode 100644 db/namespace-inl.h create mode 100644 db/projection.cpp create mode 100644 db/projection.h delete mode 100644 db/rec.h delete mode 100644 db/reccache.cpp delete mode 100644 db/reccache.h delete mode 100644 db/reci.h delete mode 100644 db/recstore.h mode change 100755 => 100644 db/repl/rs_exception.h mode change 100755 => 100644 db/resource.h create mode 100644 db/restapi.h create mode 100644 db/security_key.cpp create mode 100644 db/security_key.h delete mode 100644 db/storage.cpp create mode 100644 db/taskqueue.h create mode 100644 dbtests/background_job_test.cpp create mode 100644 dbtests/balancer_policy_tests.cpp create mode 100644 dbtests/d_chunk_manager_tests.cpp create mode 100644 dbtests/directclienttests.cpp create mode 100644 dbtests/mmaptests.cpp create mode 100644 dbtests/perf/btreeperf.cpp create mode 100644 dbtests/perftests.cpp delete mode 100644 dbtests/test.vcproj delete mode 100644 debian/changelog delete mode 100644 debian/compat delete mode 100644 debian/control delete mode 100644 debian/copyright delete mode 100644 debian/dirs delete mode 100644 debian/init.d delete mode 100644 debian/lintian-overrides delete mode 100644 debian/mongo.1 delete mode 100644 debian/mongod.1 delete mode 100644 debian/mongodb.conf delete mode 100644 debian/mongodump.1 delete mode 100644 debian/mongoexport.1 delete mode 100644 debian/mongofiles.1 delete mode 100644 debian/mongoimport.1 delete mode 100644 debian/mongorestore.1 delete mode 100644 debian/mongos.1 delete mode 100644 debian/mongosniff.1 delete mode 100644 
debian/mongostat.1 delete mode 100644 debian/postinst delete mode 100644 debian/postrm delete mode 100644 debian/prerm delete mode 100644 debian/rules delete mode 100644 debian/watch create mode 100644 jstests/_tst.js create mode 100644 jstests/array4.js create mode 100644 jstests/arrayfind3.js create mode 100644 jstests/big_object1.js create mode 100644 jstests/capped8.js create mode 100644 jstests/check_shard_index.js delete mode 100644 jstests/conc_update.js create mode 100644 jstests/coveredIndex1.js create mode 100644 jstests/coveredIndex2.js create mode 100644 jstests/disk/killall.js create mode 100644 jstests/disk/preallocate_directoryperdb.js create mode 100644 jstests/distinct_index1.js create mode 100644 jstests/distinct_index2.js create mode 100644 jstests/drop2.js delete mode 100644 jstests/dropIndex.js create mode 100644 jstests/drop_index.js create mode 100755 jstests/dur/a_quick.js create mode 100644 jstests/dur/closeall.js create mode 100644 jstests/dur/diskfull.js create mode 100644 jstests/dur/dropdb.js create mode 100755 jstests/dur/dur1.js create mode 100644 jstests/dur/dur2.js create mode 100755 jstests/dur/lsn.js create mode 100755 jstests/dur/manyRestart.js create mode 100644 jstests/dur/md5.js create mode 100755 jstests/dur/oplog.js create mode 100644 jstests/eval_nolock.js create mode 100644 jstests/evald.js create mode 100644 jstests/evale.js create mode 100644 jstests/evalf.js create mode 100644 jstests/explain3.js create mode 100644 jstests/geo_borders.js create mode 100644 jstests/geo_center_sphere1.js create mode 100644 jstests/geo_circle2a.js create mode 100644 jstests/geo_near_random1.js create mode 100644 jstests/geo_near_random2.js create mode 100644 jstests/geo_sort1.js create mode 100644 jstests/geo_update1.js create mode 100644 jstests/geo_update2.js create mode 100644 jstests/geof.js create mode 100644 jstests/index11.js create mode 100644 jstests/index_sparse1.js create mode 100644 jstests/index_sparse2.js create mode 100644 jstests/indexi.js create mode 100644 jstests/indexj.js create mode 100644 jstests/insert2.js create mode 100644 jstests/killop.js create mode 100644 jstests/libs/concurrent.js create mode 100644 jstests/libs/fun.js create mode 100644 jstests/libs/geo_near_random.js create mode 100644 jstests/libs/grid.js create mode 100644 jstests/libs/network.js create mode 100755 jstests/misc/biginsert.js create mode 100644 jstests/mr_comments.js create mode 100644 jstests/mr_index.js create mode 100644 jstests/mr_index2.js create mode 100644 jstests/mr_index3.js create mode 100644 jstests/mr_killop.js create mode 100644 jstests/mr_merge.js create mode 100644 jstests/mr_optim.js create mode 100644 jstests/mr_outreduce.js create mode 100644 jstests/mr_outreduce2.js create mode 100644 jstests/mr_replaceIntoDB.js create mode 100644 jstests/multiClient/rsDurKillRestart1.js create mode 100644 jstests/ne2.js create mode 100644 jstests/ne3.js create mode 100644 jstests/notablescan.js create mode 100644 jstests/orc.js create mode 100644 jstests/ord.js create mode 100644 jstests/ore.js create mode 100644 jstests/orf.js create mode 100644 jstests/parallel/del.js create mode 100644 jstests/perf/geo_near1.js create mode 100644 jstests/proj_key1.js create mode 100644 jstests/pull_remove1.js create mode 100644 jstests/queryoptimizer2.js create mode 100644 jstests/remove_undefined.js create mode 100644 jstests/rename4.js create mode 100644 jstests/replsets/auth1.js create mode 100644 jstests/replsets/buildindexes.js create mode 100644 
jstests/replsets/cloneDb.js create mode 100644 jstests/replsets/config1.js create mode 100644 jstests/replsets/fastsync.js create mode 100644 jstests/replsets/getlasterror_w2.js create mode 100644 jstests/replsets/groupAndMapReduce.js create mode 100644 jstests/replsets/initial_sync1.js create mode 100644 jstests/replsets/initial_sync2.js create mode 100644 jstests/replsets/initial_sync3.js create mode 100644 jstests/replsets/ismaster1.js create mode 100644 jstests/replsets/key1 create mode 100644 jstests/replsets/key2 create mode 100644 jstests/replsets/remove1.js create mode 100644 jstests/replsets/replsetarb3.js create mode 100644 jstests/replsets/replsetfreeze.js create mode 100644 jstests/replsets/rslib.js create mode 100644 jstests/replsets/slaveDelay2.js create mode 100644 jstests/replsets/slavedelay1.js create mode 100644 jstests/replsets/sync_passive.js create mode 100644 jstests/replsets/sync_passive2.js create mode 100644 jstests/replsets/toostale.js create mode 100644 jstests/set_param1.js create mode 100644 jstests/sharding/addshard3.js create mode 100644 jstests/sharding/addshard4.js create mode 100644 jstests/sharding/geo_near_random1.js create mode 100644 jstests/sharding/geo_near_random2.js create mode 100644 jstests/sharding/limit_push.js create mode 100644 jstests/sharding/migrateBig.js create mode 100644 jstests/sharding/multi_mongos1.js create mode 100644 jstests/sharding/shard_insert_getlasterror_w2.js delete mode 100644 jstests/sharding/splitpick.js create mode 100644 jstests/shellstartparallel.js create mode 100755 jstests/slowNightly/32bit.js create mode 100644 jstests/slowNightly/btreedel.js create mode 100644 jstests/slowNightly/command_line_parsing.js create mode 100644 jstests/slowNightly/dur_big_atomic_update.js create mode 100644 jstests/slowNightly/dur_passthrough.js create mode 100644 jstests/slowNightly/dur_remove_old_journals.js create mode 100644 jstests/slowNightly/geo_near_random1.js create mode 100644 jstests/slowNightly/geo_near_random2.js create mode 100644 jstests/slowNightly/index_check9.js create mode 100644 jstests/slowNightly/large_chunk.js create mode 100755 jstests/slowNightly/moveprimary-replset.js create mode 100644 jstests/slowNightly/newcollection2.js delete mode 100644 jstests/slowNightly/run_sharding_passthrough.js create mode 100644 jstests/slowNightly/sharding_balance_randomorder1.js create mode 100644 jstests/slowNightly/sharding_multiple_collections.js create mode 100644 jstests/slowNightly/sharding_passthrough.js create mode 100644 jstests/slowNightly/sharding_rs2.js create mode 100644 jstests/slowNightly/unix_socket1.js create mode 100644 jstests/slowWeekly/disk_reuse1.js create mode 100644 jstests/slowWeekly/dur_passthrough.js create mode 100644 jstests/slowWeekly/geo_near_random1.js create mode 100644 jstests/slowWeekly/geo_near_random2.js create mode 100644 jstests/slowWeekly/indexbg_dur.js delete mode 100644 jstests/tempCleanup.js create mode 100644 jstests/temp_cleanup.js create mode 100644 jstests/tool/dumprestore3.js create mode 100644 jstests/tool/dumprestore4.js create mode 100644 jstests/ts1.js create mode 100644 jstests/update_addToSet3.js create mode 100644 jstests/update_arraymatch6.js create mode 100644 jstests/update_multi6.js create mode 100644 lib/libboost_thread-gcc41-mt-d-1_34_1.a delete mode 100644 mongo.xcodeproj/project.pbxproj create mode 100644 mongo_astyle create mode 100644 s/client.cpp create mode 100644 s/client.h create mode 100644 s/d_chunk_manager.cpp create mode 100644 s/d_chunk_manager.h delete 
mode 100644 s/d_util.cpp create mode 100644 s/d_writeback.h delete mode 100644 s/dbgrid.vcproj create mode 100644 s/shard_version.cpp create mode 100644 s/shard_version.h create mode 100644 s/writeback_listener.cpp create mode 100644 s/writeback_listener.h create mode 100644 scripting/bench.cpp create mode 100644 util/admin_access.h create mode 100644 util/alignedbuilder.cpp create mode 100644 util/alignedbuilder.h create mode 100644 util/bufreader.h create mode 100644 util/concurrency/README create mode 100644 util/concurrency/race.h delete mode 100644 util/concurrency/readme.txt create mode 100755 util/concurrency/shared_mutex_win.hpp create mode 100644 util/concurrency/synchronization.cpp create mode 100644 util/concurrency/synchronization.h create mode 100644 util/file_allocator.cpp create mode 100644 util/heapcheck.h create mode 100644 util/logfile.cpp create mode 100644 util/logfile.h create mode 100644 util/mongoutils/hash.h mode change 100755 => 100644 util/mongoutils/test.cpp create mode 100644 util/moveablebuffer.h create mode 100644 util/paths.h delete mode 100644 util/ramstore.cpp delete mode 100644 util/ramstore.h create mode 100644 util/signal_handlers.cpp create mode 100644 util/signal_handlers.h create mode 100644 util/time_support.h create mode 100644 util/timer.h diff --git a/.gitignore b/.gitignore index 2c7d1bd..3847ca4 100644 --- a/.gitignore +++ b/.gitignore @@ -95,6 +95,8 @@ libmongoshellfiles.* firstExample secondExample whereExample +bsondemo +rsExample #tests test @@ -119,4 +121,4 @@ debian/mongodb *.creator.user *.files *.includes - +*.orig diff --git a/SConstruct b/SConstruct index e3046ff..41383b1 100644 --- a/SConstruct +++ b/SConstruct @@ -10,6 +10,8 @@ # scons --distname=0.8 s3dist # all s3 pushes require settings.py and simples3 +EnsureSConsVersion(0, 98, 4) # this is a common version known to work + import os import sys import imp @@ -24,241 +26,137 @@ from buildscripts import utils buildscripts.bb.checkOk() +def findSettingsSetup(): + sys.path.append( "." ) + sys.path.append( ".." 
) + sys.path.append( "../../" ) + + + # --- options ---- -AddOption('--prefix', - dest='prefix', - type='string', - nargs=1, - action='store', - metavar='DIR', - help='installation prefix') - -AddOption('--distname', - dest='distname', - type='string', - nargs=1, - action='store', - metavar='DIR', - help='dist name (0.8.0)') - -AddOption('--distmod', - dest='distmod', - type='string', - nargs=1, - action='store', - metavar='DIR', - help='additional piece for full dist name') - -AddOption( "--64", - dest="force64", - type="string", - nargs=0, - action="store", - help="whether to force 64 bit" ) - - -AddOption( "--32", - dest="force32", - type="string", - nargs=0, - action="store", - help="whether to force 32 bit" ) - - -AddOption( "--mm", - dest="mm", - type="string", - nargs=0, - action="store", - help="use main memory instead of memory mapped files" ) - - -AddOption( "--release", - dest="release", - type="string", - nargs=0, - action="store", - help="relase build") - - -AddOption( "--static", - dest="static", - type="string", - nargs=0, - action="store", - help="fully static build") - - -AddOption('--usesm', - dest='usesm', - type="string", - nargs=0, - action="store", - help="use spider monkey for javascript" ) - -AddOption('--usev8', - dest='usev8', - type="string", - nargs=0, - action="store", - help="use v8 for javascript" ) - -AddOption('--asio', - dest='asio', - type="string", - nargs=0, - action="store", - help="Use Asynchronous IO (NOT READY YET)" ) - -AddOption( "--d", - dest="debugBuild", - type="string", - nargs=0, - action="store", - help="debug build no optimization, etc..." ) - -AddOption( "--dd", - dest="debugBuildAndLogging", - type="string", - nargs=0, - action="store", - help="debug build no optimization, additional debug logging, etc..." ) - -AddOption( "--recstore", - dest="recstore", - type="string", - nargs=0, - action="store", - help="use new recstore" ) - -AddOption( "--noshell", - dest="noshell", - type="string", - nargs=0, - action="store", - help="don't build shell" ) - -AddOption( "--safeshell", - dest="safeshell", - type="string", - nargs=0, - action="store", - help="don't let shell scripts run programs (still, don't run untrusted scripts)" ) - -AddOption( "--extrapath", - dest="extrapath", - type="string", - nargs=1, - action="store", - help="comma separated list of add'l paths (--extrapath /opt/foo/,/foo) static linking" ) - -AddOption( "--extrapathdyn", - dest="extrapathdyn", - type="string", - nargs=1, - action="store", - help="comma separated list of add'l paths (--extrapath /opt/foo/,/foo) dynamic linking" ) - - -AddOption( "--extralib", - dest="extralib", - type="string", - nargs=1, - action="store", - help="comma separated list of libraries (--extralib js_static,readline" ) - -AddOption( "--staticlib", - dest="staticlib", - type="string", - nargs=1, - action="store", - help="comma separated list of libs to link statically (--staticlib js_static,boost_program_options-mt,..." 
) - -AddOption( "--staticlibpath", - dest="staticlibpath", - type="string", - nargs=1, - action="store", - help="comma separated list of dirs to search for staticlib arguments" ) - -AddOption( "--cxx", - dest="cxx", - type="string", - nargs=1, - action="store", - help="compiler to use" ) - - -AddOption( "--boost-compiler", - dest="boostCompiler", - type="string", - nargs=1, - action="store", - help="compiler used for boost (gcc41)" ) - -AddOption( "--boost-version", - dest="boostVersion", - type="string", - nargs=1, - action="store", - help="boost version for linking(1_38)" ) - -AddOption( "--cpppath", - dest="cpppath", - type="string", - nargs=1, - action="store", - help="Include path if you have headers in a nonstandard directory" ) - -AddOption( "--libpath", - dest="libpath", - type="string", - nargs=1, - action="store", - help="Library path if you have libraries in a nonstandard directory" ) - -# + +options = {} + +def add_option( name, help , nargs , contibutesToVariantDir , dest=None ): + + if dest is None: + dest = name + + AddOption( "--" + name , + dest=dest, + type="string", + nargs=nargs, + action="store", + help=help ) + + options[name] = { "help" : help , + "nargs" : nargs , + "contibutesToVariantDir" : contibutesToVariantDir , + "dest" : dest } + +def get_option( name ): + return GetOption( name ) + +def has_option( name ): + x = get_option( name ) + if x is None: + return False + + if x == False: + return False + + if x == "": + return False + + return True + +def get_variant_dir(): + + a = [] + + for name in options: + o = options[name] + if not has_option( o["dest"] ): + continue + if not o["contibutesToVariantDir"]: + continue + + if o["nargs"] == 0: + a.append( name ) + else: + a.append( name + "-" + get_option( name ) ) + + s = "build/" + + if len(a) > 0: + a.sort() + s += "/".join( a ) + "/" + + return s + + + +# installation/packaging +add_option( "prefix" , "installation prefix" , 1 , False ) +add_option( "distname" , "dist name (0.8.0)" , 1 , False ) +add_option( "distmod", "additional piece for full dist name" , 1 , False ) +add_option( "nostrip", "do not strip installed binaries" , 0 , False ) + +add_option( "sharedclient", "build a libmongoclient.so/.dll" , 0 , False ) +add_option( "full", "include client and headers when doing scons install", 0 , False ) + +# linking options +add_option( "release" , "release build" , 0 , True ) +add_option( "static" , "fully static build" , 0 , True ) + +# base compile flags +add_option( "64" , "whether to force 64 bit" , 0 , True , "force64" ) +add_option( "32" , "whether to force 32 bit" , 0 , True , "force32" ) + +add_option( "cxx", "compiler to use" , 1 , True ) + +add_option( "cpppath", "Include path if you have headers in a nonstandard directory" , 1 , True ) +add_option( "libpath", "Library path if you have libraries in a nonstandard directory" , 1 , True ) + +add_option( "extrapath", "comma separated list of add'l paths (--extrapath /opt/foo/,/foo) static linking" , 1 , True ) +add_option( "extrapathdyn", "comma separated list of add'l paths (--extrapath /opt/foo/,/foo) dynamic linking" , 1 , True ) +add_option( "extralib", "comma separated list of libraries (--extralib js_static,readline" , 1 , True ) +add_option( "staticlib", "comma separated list of libs to link statically (--staticlib js_static,boost_program_options-mt,..." 
, 1 , True ) +add_option( "staticlibpath", "comma separated list of dirs to search for staticlib arguments" , 1 , True ) + +add_option( "boost-compiler", "compiler used for boost (gcc41)" , 1 , True , "boostCompiler" ) +add_option( "boost-version", "boost version for linking(1_38)" , 1 , True , "boostVersion" ) + + +# experimental features +add_option( "mm", "use main memory instead of memory mapped files" , 0 , True ) +add_option( "asio" , "Use Asynchronous IO (NOT READY YET)" , 0 , True ) + +# library choices +add_option( "usesm" , "use spider monkey for javascript" , 0 , True ) +add_option( "usev8" , "use v8 for javascript" , 0 , True ) + +# mongo feature options +add_option( "noshell", "don't build shell" , 0 , True ) +add_option( "safeshell", "don't let shell scripts run programs (still, don't run untrusted scripts)" , 0 , True ) + +# dev tools +add_option( "d", "debug build no optimization, etc..." , 0 , True , "debugBuild" ) +add_option( "dd", "debug build no optimization, additional debug logging, etc..." , 0 , False , "debugBuildAndLogging" ) +add_option( "durableDefaultOn" , "have durable default to on" , 0 , True ) + +add_option( "pch" , "use precompiled headers to speed up the build (experimental)" , 0 , True , "usePCH" ) +add_option( "distcc" , "use distcc for distributing builds" , 0 , False ) + +# debugging/profiling help + # to use CPUPROFILE=/tmp/profile # to view pprof -gv mongod /tmp/profile -# -AddOption( "--pg", - dest="profile", - type="string", - nargs=0, - action="store" ) - -AddOption( "--gdbserver", - dest="gdbserver", - type="string", - nargs=0, - action="store" ) - -AddOption("--nostrip", - dest="nostrip", - action="store_true", - help="do not strip installed binaries") - -AddOption("--sharedclient", - dest="sharedclient", - action="store_true", - help="build a libmongoclient.so/.dll") - -AddOption("--full", - dest="full", - action="store_true", - help="include client and headers when doing scons install") - -AddOption("--smokedbprefix", - dest="smokedbprefix", - action="store", - help="prefix to dbpath et al. for smoke tests") - -AddOption( "--pch", - dest="usePCH", - type="string", - nargs=0, - action="store", - help="use precompiled headers to speed up the build (experimental)" ) +add_option( "pg", "link against profiler" , 0 , False , "profile" ) +add_option( "gdbserver" , "build in gdb server support" , 0 , True ) +add_option( "heapcheck", "link to heap-checking malloc-lib and look for memory leaks during tests" , 0 , False ) + +add_option("smokedbprefix", "prefix to dbpath et al. 
for smoke tests", 1 , False ) # --- environment setup --- @@ -284,7 +182,7 @@ windows = False freebsd = False openbsd = False solaris = False -force64 = not GetOption( "force64" ) is None +force64 = has_option( "force64" ) if not force64 and os.getcwd().endswith( "mongo-64" ): force64 = True print( "*** assuming you want a 64-bit build b/c of directory *** " ) @@ -292,44 +190,45 @@ msarch = None if force64: msarch = "amd64" -force32 = not GetOption( "force32" ) is None -release = not GetOption( "release" ) is None -static = not GetOption( "static" ) is None +force32 = has_option( "force32" ) +release = has_option( "release" ) +static = has_option( "static" ) -debugBuild = ( not GetOption( "debugBuild" ) is None ) or ( not GetOption( "debugBuildAndLogging" ) is None ) -debugLogging = not GetOption( "debugBuildAndLogging" ) is None -noshell = not GetOption( "noshell" ) is None +debugBuild = has_option( "debugBuild" ) or has_option( "debugBuildAndLogging" ) +debugLogging = has_option( "debugBuildAndLogging" ) +noshell = has_option( "noshell" ) -usesm = not GetOption( "usesm" ) is None -usev8 = not GetOption( "usev8" ) is None +usesm = has_option( "usesm" ) +usev8 = has_option( "usev8" ) -asio = not GetOption( "asio" ) is None +asio = has_option( "asio" ) -usePCH = not GetOption( "usePCH" ) is None +usePCH = has_option( "usePCH" ) justClientLib = (COMMAND_LINE_TARGETS == ['mongoclient']) env = Environment( MSVS_ARCH=msarch , tools = ["default", "gch"], toolpath = '.' ) -if GetOption( "cxx" ) is not None: - env["CC"] = GetOption( "cxx" ) - env["CXX"] = GetOption( "cxx" ) +if has_option( "cxx" ): + env["CC"] = get_option( "cxx" ) + env["CXX"] = get_option( "cxx" ) env["LIBPATH"] = [] -if GetOption( "libpath" ) is not None: - env["LIBPATH"] = [GetOption( "libpath" )] +if has_option( "libpath" ): + env["LIBPATH"] = [get_option( "libpath" )] -if GetOption( "cpppath" ) is not None: - env["CPPPATH"] = [GetOption( "cpppath" )] +if has_option( "cpppath" ): + env["CPPPATH"] = [get_option( "cpppath" )] -if GetOption( "recstore" ) != None: - env.Append( CPPDEFINES=[ "_RECSTORE" ] ) env.Append( CPPDEFINES=[ "_SCONS" , "MONGO_EXPOSE_MACROS" ] ) env.Append( CPPPATH=[ "." 
] ) -if GetOption( "safeshell" ) != None: +if has_option( "safeshell" ): env.Append( CPPDEFINES=[ "MONGO_SAFE_SHELL" ] ) +if has_option( "durableDefaultOn" ): + env.Append( CPPDEFINES=[ "_DURABLEDEFAULTON" ] ) + boostCompiler = GetOption( "boostCompiler" ) if boostCompiler is None: boostCompiler = "" @@ -356,14 +255,14 @@ def addExtraLibs( s ): env.Append( LIBPATH=[ x + "/lib64" ] ) extraLibPlaces.append( x + "/lib" ) -if GetOption( "extrapath" ) is not None: +if has_option( "extrapath" ): addExtraLibs( GetOption( "extrapath" ) ) - release = True + release = True # this is so we force using .a -if GetOption( "extrapathdyn" ) is not None: +if has_option( "extrapathdyn" ): addExtraLibs( GetOption( "extrapathdyn" ) ) -if GetOption( "extralib" ) is not None: +if has_option( "extralib" ): for x in GetOption( "extralib" ).split( "," ): env.Append( LIBS=[ x ] ) @@ -399,53 +298,61 @@ installSetup = InstallSetup() if distBuild: installSetup.bannerDir = "distsrc" -if GetOption( "full" ): +if has_option( "full" ): installSetup.headers = True installSetup.libraries = True # ------ SOURCE FILE SETUP ----------- -commonFiles = Split( "pch.cpp buildinfo.cpp db/common.cpp db/jsobj.cpp db/json.cpp db/lasterror.cpp db/nonce.cpp db/queryutil.cpp shell/mongo.cpp" ) -commonFiles += [ "util/background.cpp" , "util/mmap.cpp" , "util/ramstore.cpp", "util/sock.cpp" , "util/util.cpp" , "util/message.cpp" , +commonFiles = Split( "pch.cpp buildinfo.cpp db/common.cpp db/indexkey.cpp db/jsobj.cpp bson/oid.cpp db/json.cpp db/lasterror.cpp db/nonce.cpp db/queryutil.cpp db/projection.cpp shell/mongo.cpp db/security_key.cpp" ) +commonFiles += [ "util/background.cpp" , "util/mmap.cpp" , "util/sock.cpp" , "util/util.cpp" , "util/file_allocator.cpp" , "util/message.cpp" , "util/assert_util.cpp" , "util/log.cpp" , "util/httpclient.cpp" , "util/md5main.cpp" , "util/base64.cpp", "util/concurrency/vars.cpp", "util/concurrency/task.cpp", "util/debug_util.cpp", - "util/concurrency/thread_pool.cpp", "util/password.cpp", "util/version.cpp", - "util/histogram.cpp", "util/concurrency/spin_lock.cpp", "util/text.cpp" , "util/stringutils.cpp" , "util/processinfo.cpp" ] + "util/concurrency/thread_pool.cpp", "util/password.cpp", "util/version.cpp", "util/signal_handlers.cpp", + "util/histogram.cpp", "util/concurrency/spin_lock.cpp", "util/text.cpp" , "util/stringutils.cpp" , + "util/concurrency/synchronization.cpp" ] commonFiles += Glob( "util/*.c" ) -commonFiles += Split( "client/connpool.cpp client/dbclient.cpp client/dbclientcursor.cpp client/model.cpp client/syncclusterconnection.cpp client/distlock.cpp s/shardconnection.cpp" ) +commonFiles += Split( "client/connpool.cpp client/dbclient.cpp client/dbclient_rs.cpp client/dbclientcursor.cpp client/model.cpp client/syncclusterconnection.cpp client/distlock.cpp s/shardconnection.cpp" ) #mmap stuff -if GetOption( "mm" ) != None: +if has_option( "mm" ): commonFiles += [ "util/mmap_mm.cpp" ] elif os.sys.platform == "win32": commonFiles += [ "util/mmap_win.cpp" ] else: commonFiles += [ "util/mmap_posix.cpp" ] -if os.path.exists( "util/processinfo_" + os.sys.platform + ".cpp" ): - commonFiles += [ "util/processinfo_" + os.sys.platform + ".cpp" ] -else: - commonFiles += [ "util/processinfo_none.cpp" ] - coreDbFiles = [ "db/commands.cpp" ] coreServerFiles = [ "util/message_server_port.cpp" , "client/parallel.cpp" , "util/miniwebserver.cpp" , "db/dbwebserver.cpp" , - "db/matcher.cpp" , "db/indexkey.cpp" , "db/dbcommands_generic.cpp" ] + "db/matcher.cpp" , "db/dbcommands_generic.cpp" ] + 
+processInfoFiles = [ "util/processinfo.cpp" ] + +if os.path.exists( "util/processinfo_" + os.sys.platform + ".cpp" ): + processInfoFiles += [ "util/processinfo_" + os.sys.platform + ".cpp" ] +else: + processInfoFiles += [ "util/processinfo_none.cpp" ] + +coreServerFiles += processInfoFiles + + -if GetOption( "asio" ) != None: +if has_option( "asio" ): coreServerFiles += [ "util/message_server_asio.cpp" ] -serverOnlyFiles = Split( "db/query.cpp db/update.cpp db/introspect.cpp db/btree.cpp db/clientcursor.cpp db/tests.cpp db/repl.cpp db/repl/rs.cpp db/repl/consensus.cpp db/repl/rs_initiate.cpp db/repl/replset_commands.cpp db/repl/manager.cpp db/repl/health.cpp db/repl/heartbeat.cpp db/repl/rs_config.cpp db/repl/rs_rollback.cpp db/repl/rs_sync.cpp db/repl/rs_initialsync.cpp db/oplog.cpp db/repl_block.cpp db/btreecursor.cpp db/cloner.cpp db/namespace.cpp db/cap.cpp db/matcher_covered.cpp db/dbeval.cpp db/restapi.cpp db/dbhelpers.cpp db/instance.cpp db/client.cpp db/database.cpp db/pdfile.cpp db/cursor.cpp db/security_commands.cpp db/security.cpp db/storage.cpp db/queryoptimizer.cpp db/extsort.cpp db/mr.cpp s/d_util.cpp db/cmdline.cpp" ) +serverOnlyFiles = Split( "util/logfile.cpp util/alignedbuilder.cpp db/mongommf.cpp db/dur.cpp db/durop.cpp db/dur_writetodatafiles.cpp db/dur_preplogbuffer.cpp db/dur_commitjob.cpp db/dur_recover.cpp db/dur_journal.cpp db/query.cpp db/update.cpp db/introspect.cpp db/btree.cpp db/clientcursor.cpp db/tests.cpp db/repl.cpp db/repl/rs.cpp db/repl/consensus.cpp db/repl/rs_initiate.cpp db/repl/replset_commands.cpp db/repl/manager.cpp db/repl/health.cpp db/repl/heartbeat.cpp db/repl/rs_config.cpp db/repl/rs_rollback.cpp db/repl/rs_sync.cpp db/repl/rs_initialsync.cpp db/oplog.cpp db/repl_block.cpp db/btreecursor.cpp db/cloner.cpp db/namespace.cpp db/cap.cpp db/matcher_covered.cpp db/dbeval.cpp db/restapi.cpp db/dbhelpers.cpp db/instance.cpp db/client.cpp db/database.cpp db/pdfile.cpp db/cursor.cpp db/security_commands.cpp db/security.cpp db/queryoptimizer.cpp db/extsort.cpp db/cmdline.cpp" ) serverOnlyFiles += [ "db/index.cpp" ] + Glob( "db/geo/*.cpp" ) serverOnlyFiles += [ "db/dbcommands.cpp" , "db/dbcommands_admin.cpp" ] +serverOnlyFiles += Glob( "db/commands/*.cpp" ) coreServerFiles += Glob( "db/stats/*.cpp" ) serverOnlyFiles += [ "db/driverHelpers.cpp" ] -scriptingFiles = [ "scripting/engine.cpp" , "scripting/utils.cpp" ] +scriptingFiles = [ "scripting/engine.cpp" , "scripting/utils.cpp" , "scripting/bench.cpp" ] if usesm: scriptingFiles += [ "scripting/engine_spidermonkey.cpp" ] @@ -457,8 +364,8 @@ else: coreServerFiles += scriptingFiles coreShardFiles = [ "s/config.cpp" , "s/grid.cpp" , "s/chunk.cpp" , "s/shard.cpp" , "s/shardkey.cpp" ] -shardServerFiles = coreShardFiles + Glob( "s/strategy*.cpp" ) + [ "s/commands_admin.cpp" , "s/commands_public.cpp" , "s/request.cpp" , "s/cursors.cpp" , "s/server.cpp" , "s/config_migrate.cpp" , "s/s_only.cpp" , "s/stats.cpp" , "s/balance.cpp" , "s/balancer_policy.cpp" , "db/cmdline.cpp" ] -serverOnlyFiles += coreShardFiles + [ "s/d_logic.cpp" , "s/d_writeback.cpp" , "s/d_migrate.cpp" , "s/d_state.cpp" , "s/d_split.cpp" , "client/distlock_test.cpp" ] +shardServerFiles = coreShardFiles + Glob( "s/strategy*.cpp" ) + [ "s/commands_admin.cpp" , "s/commands_public.cpp" , "s/request.cpp" , "s/client.cpp" , "s/cursors.cpp" , "s/server.cpp" , "s/config_migrate.cpp" , "s/s_only.cpp" , "s/stats.cpp" , "s/balance.cpp" , "s/balancer_policy.cpp" , "db/cmdline.cpp" , "s/writeback_listener.cpp" , "s/shard_version.cpp" ] +serverOnlyFiles += 
coreShardFiles + [ "s/d_logic.cpp" , "s/d_writeback.cpp" , "s/d_migrate.cpp" , "s/d_state.cpp" , "s/d_split.cpp" , "client/distlock_test.cpp" , "s/d_chunk_manager.cpp" ] serverOnlyFiles += [ "db/module.cpp" ] + Glob( "db/modules/*.cpp" ) @@ -471,12 +378,20 @@ for x in os.listdir( "db/modules/" ): print( "adding module: " + x ) moduleNames.append( x ) modRoot = "db/modules/" + x + "/" - serverOnlyFiles += Glob( modRoot + "src/*.cpp" ) + modBuildFile = modRoot + "build.py" + myModule = None if os.path.exists( modBuildFile ): - modules += [ imp.load_module( "module_" + x , open( modBuildFile , "r" ) , modBuildFile , ( ".py" , "r" , imp.PY_SOURCE ) ) ] + myModule = imp.load_module( "module_" + x , open( modBuildFile , "r" ) , modBuildFile , ( ".py" , "r" , imp.PY_SOURCE ) ) + modules.append( myModule ) + + if myModule and "customIncludes" in dir(myModule) and myModule.customIncludes: + pass + else: + serverOnlyFiles += Glob( modRoot + "src/*.cpp" ) -allClientFiles = commonFiles + coreDbFiles + [ "client/clientOnly.cpp" , "client/gridfs.cpp" , "s/d_util.cpp" ]; + +allClientFiles = commonFiles + coreDbFiles + [ "client/clientOnly.cpp" , "client/gridfs.cpp" ]; # ---- other build setup ----- @@ -504,7 +419,7 @@ if distBuild: def isDriverBuild(): return GetOption( "prefix" ) and GetOption( "prefix" ).find( "mongo-cxx-driver" ) >= 0 -if GetOption( "prefix" ): +if has_option( "prefix" ): installDir = GetOption( "prefix" ) if isDriverBuild(): installSetup.justClient() @@ -533,6 +448,7 @@ if "darwin" == os.sys.platform: platform = "osx" # prettier than darwin if env["CXX"] is None: + print( "YO" ) if os.path.exists( "/usr/bin/g++-4.2" ): env["CXX"] = "g++-4.2" @@ -623,8 +539,6 @@ elif "win32" == os.sys.platform: else: print( "boost found at '" + boostDir + "'" ) - serverOnlyFiles += [ "util/ntservice.cpp" ] - boostLibs = [] env.Append(CPPPATH=[ "js/src/" ]) @@ -651,7 +565,7 @@ elif "win32" == os.sys.platform: # some warnings we don't like: env.Append( CPPFLAGS=" /wd4355 /wd4800 /wd4267 /wd4244 " ) - env.Append( CPPDEFINES=["WIN32","_CONSOLE","_CRT_SECURE_NO_WARNINGS","HAVE_CONFIG_H","PCRE_STATIC","SUPPORT_UCP","SUPPORT_UTF8,PSAPI_VERSION=1" ] ) + env.Append( CPPDEFINES=["WIN32","_CONSOLE","_CRT_SECURE_NO_WARNINGS","HAVE_CONFIG_H","PCRE_STATIC","SUPPORT_UCP","SUPPORT_UTF8","PSAPI_VERSION=1" ] ) #env.Append( CPPFLAGS=' /Yu"pch.h" ' ) # this would be for pre-compiled headers, could play with it later @@ -667,14 +581,20 @@ elif "win32" == os.sys.platform: env.Append( CPPFLAGS= " /GL " ) env.Append( LINKFLAGS=" /LTCG " ) else: - env.Append( CPPDEFINES=[ "_DEBUG" ] ) + # /Od disable optimization # /ZI debug info w/edit & continue # /TP it's a c++ file # RTC1 /GZ (Enable Stack Frame Run-Time Error Checking) - env.Append( CPPFLAGS=" /Od /RTC1 /MDd /Z7 /TP /errorReport:none " ) + env.Append( CPPFLAGS=" /RTC1 /MDd /Z7 /TP /errorReport:none " ) env.Append( CPPFLAGS=' /Fd"mongod.pdb" ' ) - env.Append( LINKFLAGS=" /debug " ) + + if debugBuild: + env.Append( LINKFLAGS=" /debug " ) + env.Append( CPPFLAGS=" /Od " ) + + if debugLogging: + env.Append( CPPDEFINES=[ "_DEBUG" ] ) if os.path.exists("../readline/lib") : env.Append( LIBPATH=["../readline/lib"] ) @@ -744,14 +664,24 @@ else: print( "No special config for [" + os.sys.platform + "] which probably means it won't work" ) if nix: + + if has_option( "distcc" ): + env["CXX"] = "distcc " + env["CXX"] + env.Append( CPPFLAGS="-fPIC -fno-strict-aliasing -ggdb -pthread -Wall -Wsign-compare -Wno-unknown-pragmas -Winvalid-pch" ) + # env.Append( " -Wconversion" ) TODO: this 
doesn't really work yet if linux: env.Append( CPPFLAGS=" -Werror " ) + env.Append( CPPFLAGS=" -fno-builtin-memcmp " ) # glibc's memcmp is faster than gcc's env.Append( CXXFLAGS=" -Wnon-virtual-dtor " ) env.Append( LINKFLAGS=" -fPIC -pthread -rdynamic" ) env.Append( LIBS=[] ) - if linux and GetOption( "sharedclient" ): + #make scons colorgcc friendly + env['ENV']['HOME'] = os.environ['HOME'] + env['ENV']['TERM'] = os.environ['TERM'] + + if linux and has_option( "sharedclient" ): env.Append( LINKFLAGS=" -Wl,--as-needed -Wl,-zdefs " ) if debugBuild: @@ -759,6 +689,11 @@ if nix: env['ENV']['GLIBCXX_FORCE_NEW'] = 1; # play nice with valgrind else: env.Append( CPPFLAGS=" -O3" ) + #env.Append( CPPFLAGS=" -fprofile-generate" ) + #env.Append( LINKFLAGS=" -fprofile-generate" ) + # then: + #env.Append( CPPFLAGS=" -fprofile-use" ) + #env.Append( LINKFLAGS=" -fprofile-use" ) if debugLogging: env.Append( CPPFLAGS=" -D_DEBUG" ); @@ -773,10 +708,10 @@ if nix: env.Append( CXXFLAGS="-m32" ) env.Append( LINKFLAGS="-m32" ) - if GetOption( "profile" ) is not None: + if has_option( "profile" ): env.Append( LIBS=[ "profiler" ] ) - if GetOption( "gdbserver" ) is not None: + if has_option( "gdbserver" ): env.Append( CPPDEFINES=["USE_GDBSERVER"] ) # pre-compiled headers @@ -940,8 +875,12 @@ def doConfigure( myenv , needPcre=True , shell=False ): removeIfInList( myenv["LIBS"] , "wpcap" ) for m in modules: - m.configure( conf , myenv ) + if "customIncludes" in dir(m) and m.customIncludes: + m.configure( conf , myenv , serverOnlyFiles ) + else: + m.configure( conf , myenv ) + # XP_* is for spidermonkey. # this is outside of usesm block so don't have to rebuild for java if windows: myenv.Append( CPPDEFINES=[ "XP_WIN" ] ) @@ -1018,14 +957,14 @@ def doConfigure( myenv , needPcre=True , shell=False ): # Handle staticlib,staticlibpath options. staticlibfiles = [] - if GetOption( "staticlib" ) is not None: + if has_option( "staticlib" ): # FIXME: probably this loop ought to do something clever # depending on whether we want to use 32bit or 64bit # libraries. For now, we sort of rely on the user supplying a # sensible staticlibpath option. (myCheckLib implements an # analogous search, but it also does other things I don't # understand, so I'm not using it.) - if GetOption ( "staticlibpath" ) is not None: + if has_option ( "staticlibpath" ): dirs = GetOption ( "staticlibpath" ).split( "," ) else: dirs = [ "/usr/lib64", "/usr/lib" ] @@ -1042,6 +981,28 @@ def doConfigure( myenv , needPcre=True , shell=False ): if not found: raise "can't find a static %s" % l + # 'tcmalloc' needs to be the last library linked. Please, add new libraries before this + # point. + if has_option( "heapcheck" ) and not shell: + if ( not debugBuild ) and ( not debugLogging ): + print( "--heapcheck needs --d or --dd" ) + Exit( 1 ) + + if not conf.CheckCXXHeader( "google/heap-checker.h" ): + print( "--heapcheck neads header 'google/heap-checker.h'" ) + Exit( 1 ) + + myCheckLib( "tcmalloc" , True ); # if successful, appedded 'tcmalloc' to myenv[ LIBS ] + myenv.Append( CPPDEFINES=[ "HEAP_CHECKING" ] ) + myenv.Append( CPPFLAGS="-fno-omit-frame-pointer" ) + + # FIXME doConfigure() is being called twice, in the case of the shell. So if it is called + # with shell==True, it'd be on its second call and it would need to rearrange the libraries' + # order. The following removes tcmalloc from the LIB's list and reinserts it at the end. 
+ if has_option( "heapcheck" ) and shell: + removeIfInList( myenv["LIBS"] , "tcmalloc" ) + myenv.Append( LIBS="tcmalloc" ) + myenv.Append(LINKCOM=" $STATICFILES") myenv.Append(STATICFILES=staticlibfiles) @@ -1049,90 +1010,60 @@ def doConfigure( myenv , needPcre=True , shell=False ): env = doConfigure( env ) -# --- js concat --- -def concatjs(target, source, env): +# --- jsh --- + +def jsToH(target, source, env): outFile = str( target[0] ) - fullSource = "" + h = ['#include "bson/stringdata.h"' + ,'namespace mongo {' + ,'struct JSFile{ const char* name; const StringData& source; };' + ,'namespace JSFiles{' + ] - first = True + def cppEscape(s): + s = s.strip() + s = s.replace( '\\' , '\\\\' ) + s = s.replace( '"' , r'\"' ) + return s for s in source: - f = open( str(s) , 'r' ) - for l in f: - - #strip comments. special case if // is potentially in a string - parts = l.split("//", 1) - if (len(parts) > 1) and ('"' not in parts[1]) and ('"' not in parts[1]): - l = parts[0] + filename = str(s) + objname = os.path.split(filename)[1].split('.')[0] + stringname = '_jscode_raw_' + objname - l = l.strip() - if len ( l ) == 0: - continue - - if l == "}": - fullSource += "}" - continue + h.append('const StringData ' + stringname + " = ") - if first: - first = False - else: - fullSource += "\n" + for l in open( filename , 'r' ): + h.append( '"' + cppEscape(l) + r'\n" ' ) - fullSource += l + h.append(";") + h.append('extern const JSFile %s;'%objname) #symbols aren't exported w/o this + h.append('const JSFile %s = { "%s" , %s };'%(objname, filename.replace('\\', '/'), stringname)) - fullSource += "\n" - - fullSource = re.compile( r'/\*\*.*?\*/' , re.M | re.S ).sub( "" , fullSource ) - - out = open( outFile , 'w' ) - out.write( fullSource ) - - return None - -jsBuilder = Builder(action = concatjs, - suffix = '.jsall', - src_suffix = '.js') + h.append("} // namespace JSFiles") + h.append("} // namespace mongo") + h.append("") -env.Append( BUILDERS={'JSConcat' : jsBuilder}) + text = '\n'.join(h); -# --- jsh --- - -def jsToH(target, source, env): - - outFile = str( target[0] ) - if len( source ) != 1: - raise Exception( "wrong" ) - - h = "const char * jsconcatcode" + outFile.split( "mongo" )[-1].replace( "-" , "_").split( ".cpp")[0] + " = \n" - - for l in open( str(source[0]) , 'r' ): - l = l.strip() - l = l.replace( '\\' , "\\\\" ) - l = l.replace( '"' , "\\\"" ) - - - h += '"' + l + "\\n\"\n " - - h += ";\n\n" - - out = open( outFile , 'w' ) - out.write( h ) + out = open( outFile , 'wb' ) + out.write( text ) out.close() # mongo_vstudio.cpp is in git as the .vcproj doesn't generate this file. 
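# Illustrative only: for an input such as shell/utils.js, the text written out
# above has roughly this shape (the escaped .js source lines themselves are
# elided here; only the structure is shown):
#
#   #include "bson/stringdata.h"
#   namespace mongo {
#   struct JSFile{ const char* name; const StringData& source; };
#   namespace JSFiles{
#   const StringData _jscode_raw_utils =
#   "... one cppEscape()d line of utils.js per string literal ...\n"
#   ;
#   extern const JSFile utils;
#   const JSFile utils = { "shell/utils.js" , _jscode_raw_utils };
#   } // namespace JSFiles
#   } // namespace mongo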
if outFile.find( "mongo.cpp" ) >= 0: - out = open( outFile.replace( "mongo" , "mongo_vstudio" ) , 'w' ) - out.write( h ) + out = open( outFile.replace( "mongo" , "mongo_vstudio" ) , 'wb' ) + out.write( text ) out.close() return None jshBuilder = Builder(action = jsToH, suffix = '.cpp', - src_suffix = '.jsall') + src_suffix = '.js') env.Append( BUILDERS={'JSHeader' : jshBuilder}) @@ -1143,7 +1074,7 @@ clientEnv = env.Clone(); clientEnv.Append( CPPPATH=["../"] ) clientEnv.Prepend( LIBS=[ "mongoclient"] ) clientEnv.Prepend( LIBPATH=["."] ) -#clientEnv["CPPDEFINES"].remove( "MONGO_EXPOSE_MACROS" ) +clientEnv["CPPDEFINES"].remove( "MONGO_EXPOSE_MACROS" ) l = clientEnv[ "LIBS" ] removeIfInList( l , "pcre" ) removeIfInList( l , "pcrecpp" ) @@ -1164,7 +1095,10 @@ def checkErrorCodes(): checkErrorCodes() # main db target -mongod = env.Program( "mongod" , commonFiles + coreDbFiles + coreServerFiles + serverOnlyFiles + [ "db/db.cpp" ] ) +mongodOnlyFiles = [ "db/db.cpp" ] +if windows: + mongodOnlyFiles.append( "util/ntservice.cpp" ) +mongod = env.Program( "mongod" , commonFiles + coreDbFiles + coreServerFiles + serverOnlyFiles + mongodOnlyFiles ) Default( mongod ) # tools @@ -1183,7 +1117,7 @@ mongos = env.Program( "mongos" , commonFiles + coreDbFiles + coreServerFiles + s # c++ library clientLibName = str( env.Library( "mongoclient" , allClientFiles )[0] ) -if GetOption( "sharedclient" ): +if has_option( "sharedclient" ): sharedClientLibName = str( env.SharedLibrary( "mongoclient" , allClientFiles )[0] ) env.Library( "mongotestfiles" , commonFiles + coreDbFiles + coreServerFiles + serverOnlyFiles + ["client/gridfs.cpp"]) env.Library( "mongoshellfiles" , allClientFiles + coreServerFiles ) @@ -1192,11 +1126,12 @@ clientTests = [] # examples clientTests += [ clientEnv.Program( "firstExample" , [ "client/examples/first.cpp" ] ) ] +clientTests += [ clientEnv.Program( "rsExample" , [ "client/examples/rs.cpp" ] ) ] clientTests += [ clientEnv.Program( "secondExample" , [ "client/examples/second.cpp" ] ) ] clientTests += [ clientEnv.Program( "whereExample" , [ "client/examples/whereExample.cpp" ] ) ] clientTests += [ clientEnv.Program( "authTest" , [ "client/examples/authTest.cpp" ] ) ] clientTests += [ clientEnv.Program( "httpClientTest" , [ "client/examples/httpClientTest.cpp" ] ) ] -# clientTests += [ clientEnv.Program( "bsondemo" , [ "bson/bsondemo/bsondemo.cpp" ] ) ] #TODO +clientTests += [ clientEnv.Program( "bsondemo" , [ "bson/bsondemo/bsondemo.cpp" ] ) ] # testing test = testEnv.Program( "test" , Glob( "dbtests/*.cpp" ) ) @@ -1210,6 +1145,7 @@ mongosniff_built = False if darwin or clientEnv["_HAVEPCAP"]: mongosniff_built = True sniffEnv = clientEnv.Clone() + sniffEnv.Append( CPPDEFINES="MONGO_EXPOSE_MACROS" ) if not windows: sniffEnv.Append( LIBS=[ "pcap" ] ) else: @@ -1218,11 +1154,9 @@ if darwin or clientEnv["_HAVEPCAP"]: # --- shell --- -env.JSConcat( "shell/mongo.jsall" , ["shell/utils.js","shell/db.js","shell/mongo.js","shell/mr.js","shell/query.js","shell/collection.js"] ) -env.JSHeader( "shell/mongo.jsall" ) +env.JSHeader( "shell/mongo.cpp" , ["shell/utils.js","shell/db.js","shell/mongo.js","shell/mr.js","shell/query.js","shell/collection.js"] ) -env.JSConcat( "shell/mongo-server.jsall" , [ "shell/servers.js"] ) -env.JSHeader( "shell/mongo-server.jsall" ) +env.JSHeader( "shell/mongo-server.cpp" , [ "shell/servers.js"] ) shellEnv = env.Clone(); @@ -1261,8 +1195,9 @@ elif not onlyServer: shell32BitFiles.append( "32bit/" + str( f ) ) for f in scriptingFiles: shell32BitFiles.append( "32bit/" + 
str( f ) ) - shellEnv.VariantDir( "32bit" , "." ) - shellEnv.Append( CPPPATH=["32bit/"] ) + for f in processInfoFiles: + shell32BitFiles.append( "32bit/" + str( f ) ) + shellEnv.VariantDir( "32bit" , "." , duplicate=1 ) else: shellEnv.Prepend( LIBPATH=[ "." ] ) @@ -1289,7 +1224,7 @@ smokeFlags = [] # Ugh. Frobbing the smokeFlags must precede using them to construct # actions, I think. -if GetOption( 'smokedbprefix') is not None: +if has_option( 'smokedbprefix'): smokeFlags += ['--smoke-db-prefix', GetOption( 'smokedbprefix')] if 'startMongodSmallOplog' in COMMAND_LINE_TARGETS: @@ -1302,7 +1237,15 @@ def addTest(name, deps, actions): smokeEnv.SideEffect( "dummySmokeSideEffect", name ) def addSmoketest( name, deps ): - addTest(name, deps, [ "python buildscripts/smoke.py " + " ".join(smokeFlags) + ' ' + name ]) + # Convert from smoke to test, smokeJs to js, and foo to foo + target = name + if name.startswith("smoke"): + if name == "smoke": + target = "test" + else: + target = name[5].lower() + name[6:] + + addTest(name, deps, [ "python buildscripts/smoke.py " + " ".join(smokeFlags) + ' ' + target ]) addSmoketest( "smoke", [ add_exe( "test" ) ] ) addSmoketest( "smokePerf", [ "perftest" ] ) @@ -1315,15 +1258,16 @@ if not onlyServer and not noshell: addSmoketest( "smokeClone", [ "mongo", "mongod" ] ) addSmoketest( "smokeRepl", [ "mongo", "mongod", "mongobridge" ] ) addSmoketest( "smokeReplSets", [ "mongo", "mongod", "mongobridge" ] ) - addSmoketest( "smokeDisk", [ add_exe( "mongo" ), add_exe( "mongod" ) ] ) + addSmoketest( "smokeDur", [ add_exe( "mongo" ) , add_exe( "mongod" ) ] ) + addSmoketest( "smokeDisk", [ add_exe( "mongo" ), add_exe( "mongod" ), add_exe( "mongodump" ), add_exe( "mongorestore" ) ] ) addSmoketest( "smokeAuth", [ add_exe( "mongo" ), add_exe( "mongod" ) ] ) addSmoketest( "smokeParallel", [ add_exe( "mongo" ), add_exe( "mongod" ) ] ) addSmoketest( "smokeSharding", [ "mongo", "mongod", "mongos" ] ) addSmoketest( "smokeJsPerf", [ "mongo" ] ) - addSmoketest("smokeJsSlowNightly", [add_exe("mongo")]) - addSmoketest("smokeJsSlowWeekly", [add_exe("mongo")]) + addSmoketest( "smokeJsSlowNightly", [add_exe("mongo")]) + addSmoketest( "smokeJsSlowWeekly", [add_exe("mongo")]) addSmoketest( "smokeQuota", [ "mongo" ] ) - addSmoketest( "smokeTool", [ add_exe( "mongo" ) ] ) + addSmoketest( "smokeTool", [ add_exe( "mongo" ), add_exe("mongod"), "tools" ] ) # Note: although the test running logic has been moved to # buildscripts/smoke.py, the interface to running the tests has been @@ -1408,13 +1352,38 @@ def build_docs(env, target, source): env.Alias("docs", [], [build_docs]) env.AlwaysBuild("docs") +# ---- astyle ---- + +def doStyling( env , target , source ): + + res = utils.execsys( "astyle --version" ) + res = " ".join(res) + if res.count( "2." ) == 0: + print( "astyle 2.x needed, found:" + res ) + Exit(-1) + + files = utils.getAllSourceFiles() + files = filter( lambda x: not x.endswith( ".c" ) , files ) + files.remove( "./shell/mongo_vstudio.cpp" ) + + cmd = "astyle --options=mongo_astyle " + " ".join( files ) + res = utils.execsys( cmd ) + print( res[0] ) + print( res[1] ) + + +env.Alias( "style" , [] , [ doStyling ] ) +env.AlwaysBuild( "style" ) + + + # ---- INSTALL ------- def getSystemInstallName(): n = platform + "-" + processor if static: n += "-static" - if GetOption("nostrip"): + if has_option("nostrip"): n += "-debugsymbols" if nix and os.uname()[2].startswith( "8." 
): n += "-tiger" @@ -1423,6 +1392,7 @@ def getSystemInstallName(): n += "-" + "-".join( moduleNames ) try: + findSettingsSetup() import settings if "distmod" in dir( settings ): n = n + "-" + str( settings.distmod ) @@ -1503,7 +1473,7 @@ def installBinary( e , name ): fullInstallName = installDir + "/bin/" + name allBinaries += [ name ] - if (solaris or linux) and (not GetOption("nostrip")): + if (solaris or linux) and (not has_option("nostrip")): e.AddPostAction( inst, e.Action( 'strip ' + fullInstallName ) ) if linux and len( COMMAND_LINE_TARGETS ) == 1 and str( COMMAND_LINE_TARGETS[0] ) == "s3dist": @@ -1542,7 +1512,7 @@ if installSetup.clientSrc: #lib if installSetup.libraries: env.Install( installDir + "/" + nixLibPrefix, clientLibName ) - if GetOption( "sharedclient" ): + if has_option( "sharedclient" ): env.Install( installDir + "/" + nixLibPrefix, sharedClientLibName ) @@ -1569,7 +1539,7 @@ if installSetup.clientTestsDir: env.Alias( "install" , installDir ) # aliases -env.Alias( "mongoclient" , GetOption( "sharedclient" ) and sharedClientLibName or clientLibName ) +env.Alias( "mongoclient" , has_option( "sharedclient" ) and sharedClientLibName or clientLibName ) # ---- CONVENIENCE ---- @@ -1605,9 +1575,7 @@ def s3push( localName , remoteName=None , remotePrefix=None , fixName=True , pla else: remotePrefix = "-" + distName - sys.path.append( "." ) - sys.path.append( ".." ) - sys.path.append( "../../" ) + findSettingsSetup() import simples3 import settings @@ -1676,7 +1644,7 @@ def build_and_test_client(env, target, source): call(scons_command + ["libmongoclient.a", "clientTests"], cwd=installDir) return bool(call(["python", "buildscripts/smoke.py", - "--test-path", installDir, "smokeClient"])) + "--test-path", installDir, "client"])) env.Alias("clientBuild", [mongod, installDir], [build_and_test_client]) env.AlwaysBuild("clientBuild") diff --git a/bson/bson-inl.h b/bson/bson-inl.h new file mode 100644 index 0000000..5b4c490 --- /dev/null +++ b/bson/bson-inl.h @@ -0,0 +1,665 @@ +// bsoninlines.h + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include "util/atomic_int.h" +#include "util/misc.h" +#include "../util/hex.h" + +namespace mongo { + + inline BSONObjIterator BSONObj::begin() { + return BSONObjIterator(*this); + } + + inline BSONObj BSONElement::embeddedObjectUserCheck() const { + if ( isABSONObj() ) + return BSONObj(value()); + stringstream ss; + ss << "invalid parameter: expected an object (" << fieldName() << ")"; + uasserted( 10065 , ss.str() ); + return BSONObj(); // never reachable + } + + inline BSONObj BSONElement::embeddedObject() const { + assert( isABSONObj() ); + return BSONObj(value()); + } + + inline BSONObj BSONElement::codeWScopeObject() const { + assert( type() == CodeWScope ); + int strSizeWNull = *(int *)( value() + 4 ); + return BSONObj( value() + 4 + 4 + strSizeWNull ); + } + + inline NOINLINE_DECL void BSONObj::_assertInvalid() const { + StringBuilder ss; + int os = objsize(); + ss << "Invalid BSONObj size: " << os << " (0x" << toHex( &os, 4 ) << ')'; + try { + BSONElement e = firstElement(); + ss << " first element: " << e.toString(); + } + catch ( ... ) { } + massert( 10334 , ss.str() , 0 ); + } + + /* the idea with NOINLINE_DECL here is to keep this from inlining in the + getOwned() method. the presumption being that is better. + */ + inline NOINLINE_DECL BSONObj BSONObj::copy() const { + char *p = (char*) malloc(objsize()); + memcpy(p, objdata(), objsize()); + return BSONObj(p, true); + } + + inline BSONObj BSONObj::getOwned() const { + if ( isOwned() ) + return *this; + return copy(); + } + + // wrap this element up as a singleton object. + inline BSONObj BSONElement::wrap() const { + BSONObjBuilder b(size()+6); + b.append(*this); + return b.obj(); + } + + inline BSONObj BSONElement::wrap( const char * newName ) const { + BSONObjBuilder b(size()+6+(int)strlen(newName)); + b.appendAs(*this,newName); + return b.obj(); + } + + inline bool BSONObj::hasElement(const char *name) const { + if ( !isEmpty() ) { + BSONObjIterator it(*this); + while ( it.moreWithEOO() ) { + BSONElement e = it.next(); + if ( strcmp(name, e.fieldName()) == 0 ) + return true; + } + } + return false; + } + + inline BSONElement BSONObj::getField(const StringData& name) const { + BSONObjIterator i(*this); + while ( i.more() ) { + BSONElement e = i.next(); + if ( strcmp(e.fieldName(), name.data()) == 0 ) + return e; + } + return BSONElement(); + } + + /* add all the fields from the object specified to this object */ + inline BSONObjBuilder& BSONObjBuilder::appendElements(BSONObj x) { + BSONObjIterator it(x); + while ( it.moreWithEOO() ) { + BSONElement e = it.next(); + if ( e.eoo() ) break; + append(e); + } + return *this; + } + + /* add all the fields from the object specified to this object if they don't exist */ + inline BSONObjBuilder& BSONObjBuilder::appendElementsUnique(BSONObj x) { + set have; + { + BSONObjIterator i = iterator(); + while ( i.more() ) + have.insert( i.next().fieldName() ); + } + + BSONObjIterator it(x); + while ( it.more() ) { + BSONElement e = it.next(); + if ( have.count( e.fieldName() ) ) + continue; + append(e); + } + return *this; + } + + + inline bool BSONObj::isValid() { + int x = objsize(); + return x > 0 && x <= BSONObjMaxInternalSize; + } + + inline bool BSONObj::getObjectID(BSONElement& e) const { + BSONElement f = getField("_id"); + if( !f.eoo() ) { + e = f; + return true; + } + return false; + } + + inline BSONObjBuilderValueStream::BSONObjBuilderValueStream( BSONObjBuilder * builder ) { + _fieldName = 0; + _builder = builder; + } + + template + inline 
BSONObjBuilder& BSONObjBuilderValueStream::operator<<( T value ) { + _builder->append(_fieldName, value); + _fieldName = 0; + return *_builder; + } + + inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const BSONElement& e ) { + _builder->appendAs( e , _fieldName ); + _fieldName = 0; + return *_builder; + } + + inline Labeler BSONObjBuilderValueStream::operator<<( const Labeler::Label &l ) { + return Labeler( l, this ); + } + + inline void BSONObjBuilderValueStream::endField( const char *nextFieldName ) { + if ( _fieldName && haveSubobj() ) { + _builder->append( _fieldName, subobj()->done() ); + } + _subobj.reset(); + _fieldName = nextFieldName; + } + + inline BSONObjBuilder *BSONObjBuilderValueStream::subobj() { + if ( !haveSubobj() ) + _subobj.reset( new BSONObjBuilder() ); + return _subobj.get(); + } + + template inline + BSONObjBuilder& Labeler::operator<<( T value ) { + s_->subobj()->append( l_.l_, value ); + return *s_->_builder; + } + + inline + BSONObjBuilder& Labeler::operator<<( const BSONElement& e ) { + s_->subobj()->appendAs( e, l_.l_ ); + return *s_->_builder; + } + + // {a: {b:1}} -> {a.b:1} + void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base=""); + inline BSONObj nested2dotted(const BSONObj& obj) { + BSONObjBuilder b; + nested2dotted(b, obj); + return b.obj(); + } + + // {a.b:1} -> {a: {b:1}} + void dotted2nested(BSONObjBuilder& b, const BSONObj& obj); + inline BSONObj dotted2nested(const BSONObj& obj) { + BSONObjBuilder b; + dotted2nested(b, obj); + return b.obj(); + } + + inline BSONObjIterator BSONObjBuilder::iterator() const { + const char * s = _b.buf() + _offset; + const char * e = _b.buf() + _b.len(); + return BSONObjIterator( s , e ); + } + + inline bool BSONObjBuilder::hasField( const StringData& name ) const { + BSONObjIterator i = iterator(); + while ( i.more() ) + if ( strcmp( name.data() , i.next().fieldName() ) == 0 ) + return true; + return false; + } + + /* WARNING: nested/dotted conversions are not 100% reversible + * nested2dotted(dotted2nested({a.b: {c:1}})) -> {a.b.c: 1} + * also, dotted2nested ignores order + */ + + typedef map BSONMap; + inline BSONMap bson2map(const BSONObj& obj) { + BSONMap m; + BSONObjIterator it(obj); + while (it.more()) { + BSONElement e = it.next(); + m[e.fieldName()] = e; + } + return m; + } + + struct BSONElementFieldNameCmp { + bool operator()( const BSONElement &l, const BSONElement &r ) const { + return strcmp( l.fieldName() , r.fieldName() ) <= 0; + } + }; + + typedef set BSONSortedElements; + inline BSONSortedElements bson2set( const BSONObj& obj ) { + BSONSortedElements s; + BSONObjIterator it(obj); + while ( it.more() ) + s.insert( it.next() ); + return s; + } + + inline string BSONObj::toString( bool isArray, bool full ) const { + if ( isEmpty() ) return "{}"; + StringBuilder s; + toString(s, isArray, full); + return s.str(); + } + inline void BSONObj::toString(StringBuilder& s, bool isArray, bool full ) const { + if ( isEmpty() ) { + s << "{}"; + return; + } + + s << ( isArray ? 
"[ " : "{ " ); + BSONObjIterator i(*this); + bool first = true; + while ( 1 ) { + massert( 10327 , "Object does not end with EOO", i.moreWithEOO() ); + BSONElement e = i.next( true ); + massert( 10328 , "Invalid element size", e.size() > 0 ); + massert( 10329 , "Element too large", e.size() < ( 1 << 30 ) ); + int offset = (int) (e.rawdata() - this->objdata()); + massert( 10330 , "Element extends past end of object", + e.size() + offset <= this->objsize() ); + e.validate(); + bool end = ( e.size() + offset == this->objsize() ); + if ( e.eoo() ) { + massert( 10331 , "EOO Before end of object", end ); + break; + } + if ( first ) + first = false; + else + s << ", "; + e.toString(s, !isArray, full ); + } + s << ( isArray ? " ]" : " }" ); + } + + extern unsigned getRandomNumber(); + + inline void BSONElement::validate() const { + const BSONType t = type(); + + switch( t ) { + case DBRef: + case Code: + case Symbol: + case mongo::String: { + unsigned x = (unsigned) valuestrsize(); + bool lenOk = x > 0 && x < (unsigned) BSONObjMaxInternalSize; + if( lenOk && valuestr()[x-1] == 0 ) + return; + StringBuilder buf; + buf << "Invalid dbref/code/string/symbol size: " << x; + if( lenOk ) + buf << " strnlen:" << mongo::strnlen( valuestr() , x ); + msgasserted( 10321 , buf.str() ); + break; + } + case CodeWScope: { + int totalSize = *( int * )( value() ); + massert( 10322 , "Invalid CodeWScope size", totalSize >= 8 ); + int strSizeWNull = *( int * )( value() + 4 ); + massert( 10323 , "Invalid CodeWScope string size", totalSize >= strSizeWNull + 4 + 4 ); + massert( 10324 , "Invalid CodeWScope string size", + strSizeWNull > 0 && + (strSizeWNull - 1) == mongo::strnlen( codeWScopeCode(), strSizeWNull ) ); + massert( 10325 , "Invalid CodeWScope size", totalSize >= strSizeWNull + 4 + 4 + 4 ); + int objSize = *( int * )( value() + 4 + 4 + strSizeWNull ); + massert( 10326 , "Invalid CodeWScope object size", totalSize == 4 + 4 + strSizeWNull + objSize ); + // Subobject validation handled elsewhere. + } + case Object: + // We expect Object size validation to be handled elsewhere. + default: + break; + } + } + + inline int BSONElement::size( int maxLen ) const { + if ( totalSize >= 0 ) + return totalSize; + + int remain = maxLen - fieldNameSize() - 1; + + int x = 0; + switch ( type() ) { + case EOO: + case Undefined: + case jstNULL: + case MaxKey: + case MinKey: + break; + case mongo::Bool: + x = 1; + break; + case NumberInt: + x = 4; + break; + case Timestamp: + case mongo::Date: + case NumberDouble: + case NumberLong: + x = 8; + break; + case jstOID: + x = 12; + break; + case Symbol: + case Code: + case mongo::String: + massert( 10313 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); + x = valuestrsize() + 4; + break; + case CodeWScope: + massert( 10314 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); + x = objsize(); + break; + + case DBRef: + massert( 10315 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); + x = valuestrsize() + 4 + 12; + break; + case Object: + case mongo::Array: + massert( 10316 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); + x = objsize(); + break; + case BinData: + massert( 10317 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); + x = valuestrsize() + 4 + 1/*subtype*/; + break; + case RegEx: { + const char *p = value(); + size_t len1 = ( maxLen == -1 ) ? 
strlen( p ) : mongo::strnlen( p, remain ); + //massert( 10318 , "Invalid regex string", len1 != -1 ); // ERH - 4/28/10 - don't think this does anything + p = p + len1 + 1; + size_t len2; + if( maxLen == -1 ) + len2 = strlen( p ); + else { + size_t x = remain - len1 - 1; + assert( x <= 0x7fffffff ); + len2 = mongo::strnlen( p, (int) x ); + } + //massert( 10319 , "Invalid regex options string", len2 != -1 ); // ERH - 4/28/10 - don't think this does anything + x = (int) (len1 + 1 + len2 + 1); + } + break; + default: { + StringBuilder ss; + ss << "BSONElement: bad type " << (int) type(); + string msg = ss.str(); + massert( 10320 , msg.c_str(),false); + } + } + totalSize = x + fieldNameSize() + 1; // BSONType + + return totalSize; + } + + inline string BSONElement::toString( bool includeFieldName, bool full ) const { + StringBuilder s; + toString(s, includeFieldName, full); + return s.str(); + } + inline void BSONElement::toString(StringBuilder& s, bool includeFieldName, bool full ) const { + if ( includeFieldName && type() != EOO ) + s << fieldName() << ": "; + switch ( type() ) { + case EOO: + s << "EOO"; + break; + case mongo::Date: + s << "new Date(" << date() << ')'; + break; + case RegEx: { + s << "/" << regex() << '/'; + const char *p = regexFlags(); + if ( p ) s << p; + } + break; + case NumberDouble: + s.appendDoubleNice( number() ); + break; + case NumberLong: + s << _numberLong(); + break; + case NumberInt: + s << _numberInt(); + break; + case mongo::Bool: + s << ( boolean() ? "true" : "false" ); + break; + case Object: + embeddedObject().toString(s, false, full); + break; + case mongo::Array: + embeddedObject().toString(s, true, full); + break; + case Undefined: + s << "undefined"; + break; + case jstNULL: + s << "null"; + break; + case MaxKey: + s << "MaxKey"; + break; + case MinKey: + s << "MinKey"; + break; + case CodeWScope: + s << "CodeWScope( " + << codeWScopeCode() << ", " << codeWScopeObject().toString(false, full) << ")"; + break; + case Code: + if ( !full && valuestrsize() > 80 ) { + s.write(valuestr(), 70); + s << "..."; + } + else { + s.write(valuestr(), valuestrsize()-1); + } + break; + case Symbol: + case mongo::String: + s << '"'; + if ( !full && valuestrsize() > 80 ) { + s.write(valuestr(), 70); + s << "...\""; + } + else { + s.write(valuestr(), valuestrsize()-1); + s << '"'; + } + break; + case DBRef: + s << "DBRef('" << valuestr() << "',"; + { + mongo::OID *x = (mongo::OID *) (valuestr() + valuestrsize()); + s << *x << ')'; + } + break; + case jstOID: + s << "ObjectId('"; + s << __oid() << "')"; + break; + case BinData: + s << "BinData"; + if (full) { + int len; + const char* data = binDataClean(len); + s << '(' << binDataType() << ", " << toHex(data, len) << ')'; + } + break; + case Timestamp: + s << "Timestamp " << timestampTime() << "|" << timestampInc(); + break; + default: + s << "?type=" << type(); + break; + } + } + + /* return has eoo() true if no match + supports "." notation to reach into embedded objects + */ + inline BSONElement BSONObj::getFieldDotted(const char *name) const { + BSONElement e = getField( name ); + if ( e.eoo() ) { + const char *p = strchr(name, '.'); + if ( p ) { + string left(name, p-name); + BSONObj sub = getObjectField(left.c_str()); + return sub.isEmpty() ? BSONElement() : sub.getFieldDotted(p+1); + } + } + + return e; + } + + inline BSONObj BSONObj::getObjectField(const char *name) const { + BSONElement e = getField(name); + BSONType t = e.type(); + return t == Object || t == Array ? 
e.embeddedObject() : BSONObj(); + } + + inline int BSONObj::nFields() const { + int n = 0; + BSONObjIterator i(*this); + while ( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + n++; + } + return n; + } + + inline BSONObj::BSONObj() { + /* little endian ordering here, but perhaps that is ok regardless as BSON is spec'd + to be little endian external to the system. (i.e. the rest of the implementation of bson, + not this part, fails to support big endian) + */ + static char p[] = { /*size*/5, 0, 0, 0, /*eoo*/0 }; + _objdata = p; + } + + inline BSONObj BSONElement::Obj() const { return embeddedObjectUserCheck(); } + + inline BSONElement BSONElement::operator[] (const string& field) const { + BSONObj o = Obj(); + return o[field]; + } + + inline void BSONObj::elems(vector &v) const { + BSONObjIterator i(*this); + while( i.more() ) + v.push_back(i.next()); + } + + inline void BSONObj::elems(list &v) const { + BSONObjIterator i(*this); + while( i.more() ) + v.push_back(i.next()); + } + + template + void BSONObj::Vals(vector& v) const { + BSONObjIterator i(*this); + while( i.more() ) { + T t; + i.next().Val(t); + v.push_back(t); + } + } + template + void BSONObj::Vals(list& v) const { + BSONObjIterator i(*this); + while( i.more() ) { + T t; + i.next().Val(t); + v.push_back(t); + } + } + + template + void BSONObj::vals(vector& v) const { + BSONObjIterator i(*this); + while( i.more() ) { + try { + T t; + i.next().Val(t); + v.push_back(t); + } + catch(...) { } + } + } + template + void BSONObj::vals(list& v) const { + BSONObjIterator i(*this); + while( i.more() ) { + try { + T t; + i.next().Val(t); + v.push_back(t); + } + catch(...) { } + } + } + + inline ostream& operator<<( ostream &s, const BSONObj &o ) { + return s << o.toString(); + } + + inline ostream& operator<<( ostream &s, const BSONElement &e ) { + return s << e.toString(); + } + + inline StringBuilder& operator<<( StringBuilder &s, const BSONObj &o ) { + o.toString( s ); + return s; + } + inline StringBuilder& operator<<( StringBuilder &s, const BSONElement &e ) { + e.toString( s ); + return s; + } + + + inline void BSONElement::Val(BSONObj& v) const { v = Obj(); } + + template + inline BSONFieldValue BSONField::query( const char * q , const T& t ) const { + BSONObjBuilder b; + b.append( q , t ); + return BSONFieldValue( _name , b.obj() ); + } +} diff --git a/bson/bson.h b/bson/bson.h index 3d92831..ba1b751 100644 --- a/bson/bson.h +++ b/bson/bson.h @@ -1,10 +1,10 @@ -/* NOTE: Standalone bson header for when not using MongoDB. +/* NOTE: Standalone bson header for when not using MongoDB. See also: bsondemo. MongoDB includes ../db/jsobj.h instead. This file, however, pulls in much less code / dependencies. */ -/** @file bson.h +/** @file bson.h BSON classes */ @@ -25,7 +25,7 @@ */ /** - BSONObj and its helpers + bo and its helpers "BSON" stands for "binary JSON" -- ie a binary way to represent objects that would be represented in JSON (plus a few extensions useful for databases & other languages). 
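A minimal usage sketch of this standalone header, modeled on bsondemo.cpp further below; the include path and the bo/bob shorthand for BSONObj/BSONObjBuilder are taken as given here, not introduced by this patch:

    #include <iostream>
    #include "bson/bson.h"            // assumed location of the standalone header
    using namespace bson;             // bo = BSONObj, bob = BSONObjBuilder

    int main() {
        bob b;                        // build { name: "joe", age: 33.7 }
        b.append("name", "joe");
        b.append("age", 33.7);
        bo o = b.obj();

        std::cout << o["name"].String() << ' ' << o["age"].Number() << '\n';

        for ( bo::iterator i(o); i.more(); )      // element iteration
            std::cout << ' ' << i.next().toString() << '\n';
        return 0;
    }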
@@ -47,15 +47,15 @@ #include #include "util/builder.h" -namespace bson { +namespace bson { using std::string; using std::stringstream; - class assertion : public std::exception { + class assertion : public std::exception { public: assertion( unsigned u , const string& s ) - : id( u ) , msg( s ){ + : id( u ) , msg( s ) { mongo::StringBuilder ss; ss << "BsonAssertion id: " << u << " " << s; full = ss.str(); @@ -64,7 +64,7 @@ namespace bson { virtual ~assertion() throw() {} virtual const char* what() const throw() { return full.c_str(); } - + unsigned id; string msg; string full; @@ -72,9 +72,9 @@ namespace bson { } namespace mongo { -#if !defined(assert) +#if !defined(assert) inline void assert(bool expr) { - if(!expr) { + if(!expr) { throw bson::assertion( 0 , "assertion failure in bson library" ); } } @@ -88,12 +88,12 @@ namespace mongo { if( !expr ) uasserted( msgid , msg ); } - inline void msgasserted(int msgid, const char *msg) { + inline void msgasserted(int msgid, const char *msg) { throw bson::assertion( msgid , msg ); } inline void msgasserted(int msgid, const std::string &msg) { msgasserted(msgid, msg.c_str()); } - inline void massert(unsigned msgid, std::string msg, bool expr) { - if(!expr) { + inline void massert(unsigned msgid, std::string msg, bool expr) { + if(!expr) { std::cout << "assertion failure in bson library: " << msgid << ' ' << msg << std::endl; throw bson::assertion( msgid , msg ); } @@ -108,15 +108,15 @@ namespace mongo { #include "../bson/bsonmisc.h" #include "../bson/bsonobjbuilder.h" #include "../bson/bsonobjiterator.h" -#include "../bson/bsoninlines.h" +#include "../bson/bson-inl.h" -namespace mongo { +namespace mongo { inline unsigned getRandomNumber() { #if defined(_WIN32) return rand(); #else - return random(); + return random(); #endif } diff --git a/bson/bson_db.h b/bson/bson_db.h index 18cd59f..71f92aa 100644 --- a/bson/bson_db.h +++ b/bson/bson_db.h @@ -1,10 +1,10 @@ -/** @file bson_db.h +/** @file bson_db.h - This file contains the implementation of BSON-related methods that are required + This file contains the implementation of BSON-related methods that are required by the MongoDB database server. - Normally, for standalone BSON usage, you do not want this file - it will tend to - pull in some other files from the MongoDB project. Thus, bson.h (the main file + Normally, for standalone BSON usage, you do not want this file - it will tend to + pull in some other files from the MongoDB project. Thus, bson.h (the main file one would use) does not include this file. */ @@ -26,6 +26,7 @@ #pragma once #include "../util/optime.h" +#include "../util/time_support.h" namespace mongo { @@ -34,10 +35,10 @@ namespace mongo { Append a timestamp element to the object being ebuilt. 
@param time - in millis (but stored in seconds) */ - inline BSONObjBuilder& BSONObjBuilder::appendTimestamp( const StringData& fieldName , unsigned long long time , unsigned int inc ){ + inline BSONObjBuilder& BSONObjBuilder::appendTimestamp( const StringData& fieldName , unsigned long long time , unsigned int inc ) { OpTime t( (unsigned) (time / 1000) , inc ); appendTimestamp( fieldName , t.asDate() ); - return *this; + return *this; } inline OpTime BSONElement::_opTime() const { @@ -47,7 +48,7 @@ namespace mongo { } inline string BSONElement::_asCode() const { - switch( type() ){ + switch( type() ) { case mongo::String: case Code: return string(valuestr(), valuestrsize()-1); @@ -60,11 +61,22 @@ namespace mongo { return ""; } - inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(DateNowLabeler& id){ + inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(DateNowLabeler& id) { _builder->appendDate(_fieldName, jsTime()); _fieldName = 0; return *_builder; } + inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(MinKeyLabeler& id) { + _builder->appendMinKey(_fieldName); + _fieldName = 0; + return *_builder; + } + + inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(MaxKeyLabeler& id) { + _builder->appendMaxKey(_fieldName); + _fieldName = 0; + return *_builder; + } } diff --git a/bson/bsondemo/bsondemo.cpp b/bson/bsondemo/bsondemo.cpp index b0da1b8..ec83f5e 100644 --- a/bson/bsondemo/bsondemo.cpp +++ b/bson/bsondemo/bsondemo.cpp @@ -1,4 +1,4 @@ -/** @file bsondemo.cpp +/** @file bsondemo.cpp Example of use of BSON from C++. @@ -29,17 +29,16 @@ using namespace std; using namespace bson; -void iter(bo o) { +void iter(bo o) { /* iterator example */ cout << "\niter()\n"; - for( bo::iterator i(o); i.more(); ) { + for( bo::iterator i(o); i.more(); ) { cout << ' ' << i.next().toString() << '\n'; } } -int main() -{ - cout << "build bits: " << 8 * sizeof(char *) << '\n' << endl; +int main() { + cout << "build bits: " << 8 * sizeof(char *) << '\n' << endl; /* a bson object defaults on construction to { } */ bo empty; @@ -47,7 +46,7 @@ int main() /* make a simple { name : 'joe', age : 33.7 } object */ { - bob b; + bob b; b.append("name", "joe"); b.append("age", 33.7); b.obj(); @@ -73,7 +72,7 @@ int main() /* reach in and get subobj.z */ cout << "subobj.z: " << y.getFieldDotted("subobj.z").Number() << endl; - + /* alternate syntax: */ cout << "subobj.z: " << y["subobj"]["z"].Number() << endl; @@ -83,19 +82,19 @@ int main() cout << v[0] << endl; /* into an array */ - list L; + list L; y.elems(L); bo sub = y["subobj"].Obj(); - /* grab all the int's that were in subobj. if it had elements that were not ints, we throw an exception - (capital V on Vals() means exception if wrong type found + /* grab all the int's that were in subobj. if it had elements that were not ints, we throw an exception + (capital V on Vals() means exception if wrong type found */ vector myints; sub.Vals(myints); cout << "my ints: " << myints[0] << ' ' << myints[1] << endl; - /* grab all the string values from x. if the field isn't of string type, just skip it -- + /* grab all the string values from x. if the field isn't of string type, just skip it -- lowercase v on vals() indicates skip don't throw. 
*/ vector strs; @@ -103,5 +102,6 @@ int main() cout << strs.size() << " strings, first one: " << strs[0] << endl; iter(y); - return 0; + return 0; } + diff --git a/bson/bsonelement.h b/bson/bsonelement.h index 534c773..23d59fa 100644 --- a/bson/bsonelement.h +++ b/bson/bsonelement.h @@ -36,378 +36,384 @@ namespace mongo { int compareElementValues(const BSONElement& l, const BSONElement& r); -/** BSONElement represents an "element" in a BSONObj. So for the object { a : 3, b : "abc" }, - 'a : 3' is the first element (key+value). - - The BSONElement object points into the BSONObj's data. Thus the BSONObj must stay in scope - for the life of the BSONElement. - - internals: - - -------- size() ------------ - -fieldNameSize- - value() - type() -*/ -class BSONElement { -public: - /** These functions, which start with a capital letter, throw a UserException if the - element is not of the required type. Example: - - string foo = obj["foo"].String(); // exception if not a string type or DNE + /** BSONElement represents an "element" in a BSONObj. So for the object { a : 3, b : "abc" }, + 'a : 3' is the first element (key+value). + + The BSONElement object points into the BSONObj's data. Thus the BSONObj must stay in scope + for the life of the BSONElement. + + internals: + + -------- size() ------------ + -fieldNameSize- + value() + type() */ - string String() const { return chk(mongo::String).valuestr(); } - Date_t Date() const { return chk(mongo::Date).date(); } - double Number() const { return chk(isNumber()).number(); } - double Double() const { return chk(NumberDouble)._numberDouble(); } - long long Long() const { return chk(NumberLong)._numberLong(); } - int Int() const { return chk(NumberInt)._numberInt(); } - bool Bool() const { return chk(mongo::Bool).boolean(); } - BSONObj Obj() const; - vector Array() const; // see implementation for detailed comments - mongo::OID OID() const { return chk(jstOID).__oid(); } - void Null() const { chk(isNull()); } - void OK() const { chk(ok()); } - - /** populate v with the value of the element. If type does not match, throw exception. - useful in templates -- see also BSONObj::Vals(). + class BSONElement { + public: + /** These functions, which start with a capital letter, throw a UserException if the + element is not of the required type. Example: + + string foo = obj["foo"].String(); // exception if not a string type or DNE */ - void Val(Date_t& v) const { v = Date(); } - void Val(long long& v) const { v = Long(); } - void Val(bool& v) const { v = Bool(); } - void Val(BSONObj& v) const; - void Val(mongo::OID& v) const { v = OID(); } - void Val(int& v) const { v = Int(); } - void Val(double& v) const { v = Double(); } - void Val(string& v) const { v = String(); } - - /** Use ok() to check if a value is assigned: - if( myObj["foo"].ok() ) ... 
- */ - bool ok() const { return !eoo(); } + string String() const { return chk(mongo::String).valuestr(); } + Date_t Date() const { return chk(mongo::Date).date(); } + double Number() const { return chk(isNumber()).number(); } + double Double() const { return chk(NumberDouble)._numberDouble(); } + long long Long() const { return chk(NumberLong)._numberLong(); } + int Int() const { return chk(NumberInt)._numberInt(); } + bool Bool() const { return chk(mongo::Bool).boolean(); } + vector Array() const; // see implementation for detailed comments + mongo::OID OID() const { return chk(jstOID).__oid(); } + void Null() const { chk(isNull()); } // throw UserException if not null + void OK() const { chk(ok()); } // throw UserException if element DNE + + /** @return the embedded object associated with this field. + Note the returned object is a reference to within the parent bson object. If that + object is out of scope, this pointer will no longer be valid. Call getOwned() on the + returned BSONObj if you need your own copy. + throws UserException if the element is not of type object. + */ + BSONObj Obj() const; + + /** populate v with the value of the element. If type does not match, throw exception. + useful in templates -- see also BSONObj::Vals(). + */ + void Val(Date_t& v) const { v = Date(); } + void Val(long long& v) const { v = Long(); } + void Val(bool& v) const { v = Bool(); } + void Val(BSONObj& v) const; + void Val(mongo::OID& v) const { v = OID(); } + void Val(int& v) const { v = Int(); } + void Val(double& v) const { v = Double(); } + void Val(string& v) const { v = String(); } + + /** Use ok() to check if a value is assigned: + if( myObj["foo"].ok() ) ... + */ + bool ok() const { return !eoo(); } - string toString( bool includeFieldName = true, bool full=false) const; - void toString(StringBuilder& s, bool includeFieldName = true, bool full=false) const; - string jsonString( JsonStringFormat format, bool includeFieldNames = true, int pretty = 0 ) const; - operator string() const { return toString(); } + string toString( bool includeFieldName = true, bool full=false) const; + void toString(StringBuilder& s, bool includeFieldName = true, bool full=false) const; + string jsonString( JsonStringFormat format, bool includeFieldNames = true, int pretty = 0 ) const; + operator string() const { return toString(); } - /** Returns the type of the element */ - BSONType type() const { return (BSONType) *data; } + /** Returns the type of the element */ + BSONType type() const { return (BSONType) *data; } - /** retrieve a field within this element - throws exception if *this is not an embedded object - */ - BSONElement operator[] (const string& field) const; - - /** returns the tyoe of the element fixed for the main type - the main purpose is numbers. any numeric type will return NumberDouble - Note: if the order changes, indexes have to be re-built or than can be corruption - */ - int canonicalType() const; + /** retrieve a field within this element + throws exception if *this is not an embedded object + */ + BSONElement operator[] (const string& field) const; - /** Indicates if it is the end-of-object element, which is present at the end of - every BSON object. - */ - bool eoo() const { return type() == EOO; } + /** returns the tyoe of the element fixed for the main type + the main purpose is numbers. any numeric type will return NumberDouble + Note: if the order changes, indexes have to be re-built or than can be corruption + */ + int canonicalType() const; - /** Size of the element. 
- @param maxLen If maxLen is specified, don't scan more than maxLen bytes to calculate size. - */ - int size( int maxLen = -1 ) const; + /** Indicates if it is the end-of-object element, which is present at the end of + every BSON object. + */ + bool eoo() const { return type() == EOO; } - /** Wrap this element up as a singleton object. */ - BSONObj wrap() const; + /** Size of the element. + @param maxLen If maxLen is specified, don't scan more than maxLen bytes to calculate size. + */ + int size( int maxLen = -1 ) const; - /** Wrap this element up as a singleton object with a new name. */ - BSONObj wrap( const char* newName) const; + /** Wrap this element up as a singleton object. */ + BSONObj wrap() const; - /** field name of the element. e.g., for - name : "Joe" - "name" is the fieldname - */ - const char * fieldName() const { - if ( eoo() ) return ""; // no fieldname for it. - return data + 1; - } + /** Wrap this element up as a singleton object with a new name. */ + BSONObj wrap( const char* newName) const; - /** raw data of the element's value (so be careful). */ - const char * value() const { - return (data + fieldNameSize() + 1); - } - /** size in bytes of the element's value (when applicable). */ - int valuesize() const { - return size() - fieldNameSize() - 1; - } + /** field name of the element. e.g., for + name : "Joe" + "name" is the fieldname + */ + const char * fieldName() const { + if ( eoo() ) return ""; // no fieldname for it. + return data + 1; + } - bool isBoolean() const { return type() == mongo::Bool; } + /** raw data of the element's value (so be careful). */ + const char * value() const { + return (data + fieldNameSize() + 1); + } + /** size in bytes of the element's value (when applicable). */ + int valuesize() const { + return size() - fieldNameSize() - 1; + } - /** @return value of a boolean element. - You must assure element is a boolean before - calling. */ - bool boolean() const { - return *value() ? true : false; - } + bool isBoolean() const { return type() == mongo::Bool; } - /** Retrieve a java style date value from the element. - Ensure element is of type Date before calling. - */ - Date_t date() const { - return *reinterpret_cast< const Date_t* >( value() ); - } + /** @return value of a boolean element. + You must assure element is a boolean before + calling. */ + bool boolean() const { + return *value() ? true : false; + } - /** Convert the value to boolean, regardless of its type, in a javascript-like fashion - (i.e., treat zero and null as false). - */ - bool trueValue() const; + /** Retrieve a java style date value from the element. + Ensure element is of type Date before calling. + */ + Date_t date() const { + return *reinterpret_cast< const Date_t* >( value() ); + } - /** True if number, string, bool, date, OID */ - bool isSimpleType() const; + /** Convert the value to boolean, regardless of its type, in a javascript-like fashion + (i.e., treat zero and null as false). + */ + bool trueValue() const; + + /** True if number, string, bool, date, OID */ + bool isSimpleType() const; + + /** True if element is of a numeric type. */ + bool isNumber() const; + + /** Return double value for this field. MUST be NumberDouble type. */ + double _numberDouble() const {return *reinterpret_cast< const double* >( value() ); } + /** Return double value for this field. MUST be NumberInt type. */ + int _numberInt() const {return *reinterpret_cast< const int* >( value() ); } + /** Return double value for this field. MUST be NumberLong type. 
*/ + long long _numberLong() const {return *reinterpret_cast< const long long* >( value() ); } + + /** Retrieve int value for the element safely. Zero returned if not a number. */ + int numberInt() const; + /** Retrieve long value for the element safely. Zero returned if not a number. */ + long long numberLong() const; + /** Retrieve the numeric value of the element. If not of a numeric type, returns 0. + Note: casts to double, data loss may occur with large (>52 bit) NumberLong values. + */ + double numberDouble() const; + /** Retrieve the numeric value of the element. If not of a numeric type, returns 0. + Note: casts to double, data loss may occur with large (>52 bit) NumberLong values. + */ + double number() const { return numberDouble(); } - /** True if element is of a numeric type. */ - bool isNumber() const; + /** Retrieve the object ID stored in the object. + You must ensure the element is of type jstOID first. */ + const mongo::OID &__oid() const { return *reinterpret_cast< const mongo::OID* >( value() ); } - /** Return double value for this field. MUST be NumberDouble type. */ - double _numberDouble() const {return *reinterpret_cast< const double* >( value() ); } - /** Return double value for this field. MUST be NumberInt type. */ - int _numberInt() const {return *reinterpret_cast< const int* >( value() ); } - /** Return double value for this field. MUST be NumberLong type. */ - long long _numberLong() const {return *reinterpret_cast< const long long* >( value() ); } + /** True if element is null. */ + bool isNull() const { + return type() == jstNULL; + } - /** Retrieve int value for the element safely. Zero returned if not a number. */ - int numberInt() const; - /** Retrieve long value for the element safely. Zero returned if not a number. */ - long long numberLong() const; - /** Retrieve the numeric value of the element. If not of a numeric type, returns 0. - Note: casts to double, data loss may occur with large (>52 bit) NumberLong values. - */ - double numberDouble() const; - /** Retrieve the numeric value of the element. If not of a numeric type, returns 0. - Note: casts to double, data loss may occur with large (>52 bit) NumberLong values. - */ - double number() const { return numberDouble(); } + /** Size (length) of a string element. + You must assure of type String first. */ + int valuestrsize() const { + return *reinterpret_cast< const int* >( value() ); + } - /** Retrieve the object ID stored in the object. - You must ensure the element is of type jstOID first. */ - const mongo::OID &__oid() const { return *reinterpret_cast< const mongo::OID* >( value() ); } + // for objects the size *includes* the size of the size field + int objsize() const { + return *reinterpret_cast< const int* >( value() ); + } - /** True if element is null. */ - bool isNull() const { - return type() == jstNULL; - } - - /** Size (length) of a string element. - You must assure of type String first. */ - int valuestrsize() const { - return *reinterpret_cast< const int* >( value() ); - } + /** Get a string's value. Also gives you start of the real data for an embedded object. + You must assure data is of an appropriate type first -- see also valuestrsafe(). + */ + const char * valuestr() const { + return value() + 4; + } - // for objects the size *includes* the size of the size field - int objsize() const { - return *reinterpret_cast< const int* >( value() ); - } + /** Get the string value of the element. If not a string returns "". 
*/ + const char *valuestrsafe() const { + return type() == mongo::String ? valuestr() : ""; + } + /** Get the string value of the element. If not a string returns "". */ + string str() const { + return type() == mongo::String ? string(valuestr(), valuestrsize()-1) : string(); + } - /** Get a string's value. Also gives you start of the real data for an embedded object. - You must assure data is of an appropriate type first -- see also valuestrsafe(). - */ - const char * valuestr() const { - return value() + 4; - } + /** Get javascript code of a CodeWScope data element. */ + const char * codeWScopeCode() const { + return value() + 8; + } + /** Get the scope SavedContext of a CodeWScope data element. */ + const char * codeWScopeScopeData() const { + // TODO fix + return codeWScopeCode() + strlen( codeWScopeCode() ) + 1; + } - /** Get the string value of the element. If not a string returns "". */ - const char *valuestrsafe() const { - return type() == mongo::String ? valuestr() : ""; - } - /** Get the string value of the element. If not a string returns "". */ - string str() const { - return type() == mongo::String ? string(valuestr(), valuestrsize()-1) : string(); - } + /** Get the embedded object this element holds. */ + BSONObj embeddedObject() const; - /** Get javascript code of a CodeWScope data element. */ - const char * codeWScopeCode() const { - return value() + 8; - } - /** Get the scope SavedContext of a CodeWScope data element. */ - const char * codeWScopeScopeData() const { - // TODO fix - return codeWScopeCode() + strlen( codeWScopeCode() ) + 1; - } + /* uasserts if not an object */ + BSONObj embeddedObjectUserCheck() const; - /** Get the embedded object this element holds. */ - BSONObj embeddedObject() const; + BSONObj codeWScopeObject() const; - /* uasserts if not an object */ - BSONObj embeddedObjectUserCheck() const; + /** Get raw binary data. Element must be of type BinData. Doesn't handle type 2 specially */ + const char *binData(int& len) const { + // BinData: + assert( type() == BinData ); + len = valuestrsize(); + return value() + 5; + } + /** Get binary data. Element must be of type BinData. Handles type 2 */ + const char *binDataClean(int& len) const { + // BinData: + if (binDataType() != ByteArrayDeprecated) { + return binData(len); + } + else { + // Skip extra size + len = valuestrsize() - 4; + return value() + 5 + 4; + } + } - BSONObj codeWScopeObject() const; + BinDataType binDataType() const { + // BinData: + assert( type() == BinData ); + unsigned char c = (value() + 4)[0]; + return (BinDataType)c; + } - /** Get raw binary data. Element must be of type BinData. Doesn't handle type 2 specially */ - const char *binData(int& len) const { - // BinData: - assert( type() == BinData ); - len = valuestrsize(); - return value() + 5; - } - /** Get binary data. Element must be of type BinData. 
Handles type 2 */ - const char *binDataClean(int& len) const { - // BinData: - if (binDataType() != ByteArrayDeprecated){ - return binData(len); - } else { - // Skip extra size - len = valuestrsize() - 4; - return value() + 5 + 4; + /** Retrieve the regex string for a Regex element */ + const char *regex() const { + assert(type() == RegEx); + return value(); } - } - - BinDataType binDataType() const { - // BinData: - assert( type() == BinData ); - unsigned char c = (value() + 4)[0]; - return (BinDataType)c; - } - /** Retrieve the regex string for a Regex element */ - const char *regex() const { - assert(type() == RegEx); - return value(); - } + /** Retrieve the regex flags (options) for a Regex element */ + const char *regexFlags() const { + const char *p = regex(); + return p + strlen(p) + 1; + } - /** Retrieve the regex flags (options) for a Regex element */ - const char *regexFlags() const { - const char *p = regex(); - return p + strlen(p) + 1; - } + /** like operator== but doesn't check the fieldname, + just the value. + */ + bool valuesEqual(const BSONElement& r) const { + return woCompare( r , false ) == 0; + } - /** like operator== but doesn't check the fieldname, - just the value. - */ - bool valuesEqual(const BSONElement& r) const { - return woCompare( r , false ) == 0; - } + /** Returns true if elements are equal. */ + bool operator==(const BSONElement& r) const { + return woCompare( r , true ) == 0; + } - /** Returns true if elements are equal. */ - bool operator==(const BSONElement& r) const { - return woCompare( r , true ) == 0; - } + /** Well ordered comparison. + @return <0: l0:l>r + order by type, field name, and field value. + If considerFieldName is true, pay attention to the field name. + */ + int woCompare( const BSONElement &e, bool considerFieldName = true ) const; - /** Well ordered comparison. - @return <0: l0:l>r - order by type, field name, and field value. - If considerFieldName is true, pay attention to the field name. - */ - int woCompare( const BSONElement &e, bool considerFieldName = true ) const; + const char * rawdata() const { return data; } - const char * rawdata() const { - return data; - } - - /** 0 == Equality, just not defined yet */ - int getGtLtOp( int def = 0 ) const; - - /** Constructs an empty element */ - BSONElement(); - - /** Check that data is internally consistent. */ - void validate() const; - - /** True if this element may contain subobjects. */ - bool mayEncapsulate() const { - switch ( type() ){ - case Object: - case mongo::Array: - case CodeWScope: - return true; - default: - return false; + /** 0 == Equality, just not defined yet */ + int getGtLtOp( int def = 0 ) const; + + /** Constructs an empty element */ + BSONElement(); + + /** Check that data is internally consistent. */ + void validate() const; + + /** True if this element may contain subobjects. 
*/ + bool mayEncapsulate() const { + switch ( type() ) { + case Object: + case mongo::Array: + case CodeWScope: + return true; + default: + return false; + } } - } - /** True if this element can be a BSONObj */ - bool isABSONObj() const { - switch( type() ){ - case Object: - case mongo::Array: - return true; - default: - return false; + /** True if this element can be a BSONObj */ + bool isABSONObj() const { + switch( type() ) { + case Object: + case mongo::Array: + return true; + default: + return false; + } } - } - Date_t timestampTime() const{ - unsigned long long t = ((unsigned int*)(value() + 4 ))[0]; - return t * 1000; - } - unsigned int timestampInc() const{ - return ((unsigned int*)(value() ))[0]; - } + Date_t timestampTime() const { + unsigned long long t = ((unsigned int*)(value() + 4 ))[0]; + return t * 1000; + } + unsigned int timestampInc() const { + return ((unsigned int*)(value() ))[0]; + } - const char * dbrefNS() const { - uassert( 10063 , "not a dbref" , type() == DBRef ); - return value() + 4; - } + const char * dbrefNS() const { + uassert( 10063 , "not a dbref" , type() == DBRef ); + return value() + 4; + } - const mongo::OID& dbrefOID() const { - uassert( 10064 , "not a dbref" , type() == DBRef ); - const char * start = value(); - start += 4 + *reinterpret_cast< const int* >( start ); - return *reinterpret_cast< const mongo::OID* >( start ); - } + const mongo::OID& dbrefOID() const { + uassert( 10064 , "not a dbref" , type() == DBRef ); + const char * start = value(); + start += 4 + *reinterpret_cast< const int* >( start ); + return *reinterpret_cast< const mongo::OID* >( start ); + } - bool operator<( const BSONElement& other ) const { - int x = (int)canonicalType() - (int)other.canonicalType(); - if ( x < 0 ) return true; - else if ( x > 0 ) return false; - return compareElementValues(*this,other) < 0; - } - - // If maxLen is specified, don't scan more than maxLen bytes. - explicit BSONElement(const char *d, int maxLen = -1) : data(d) { - fieldNameSize_ = -1; - if ( eoo() ) - fieldNameSize_ = 0; - else { - if ( maxLen != -1 ) { - int size = (int) strnlen( fieldName(), maxLen - 1 ); - massert( 10333 , "Invalid field name", size != -1 ); - fieldNameSize_ = size + 1; - } + bool operator<( const BSONElement& other ) const { + int x = (int)canonicalType() - (int)other.canonicalType(); + if ( x < 0 ) return true; + else if ( x > 0 ) return false; + return compareElementValues(*this,other) < 0; } - totalSize = -1; - } - string _asCode() const; - OpTime _opTime() const; + // If maxLen is specified, don't scan more than maxLen bytes. 
+ explicit BSONElement(const char *d, int maxLen = -1) : data(d) { + fieldNameSize_ = -1; + if ( eoo() ) + fieldNameSize_ = 0; + else { + if ( maxLen != -1 ) { + int size = (int) strnlen( fieldName(), maxLen - 1 ); + massert( 10333 , "Invalid field name", size != -1 ); + fieldNameSize_ = size + 1; + } + } + totalSize = -1; + } -private: - const char *data; - mutable int fieldNameSize_; // cached value - int fieldNameSize() const { - if ( fieldNameSize_ == -1 ) - fieldNameSize_ = (int)strlen( fieldName() ) + 1; - return fieldNameSize_; - } - mutable int totalSize; /* caches the computed size */ + string _asCode() const; + OpTime _opTime() const; - friend class BSONObjIterator; - friend class BSONObj; - const BSONElement& chk(int t) const { - if ( t != type() ){ - StringBuilder ss; - ss << "wrong type for BSONElement (" << fieldName() << ") " << type() << " != " << t; - uasserted(13111, ss.str() ); + private: + const char *data; + mutable int fieldNameSize_; // cached value + int fieldNameSize() const { + if ( fieldNameSize_ == -1 ) + fieldNameSize_ = (int)strlen( fieldName() ) + 1; + return fieldNameSize_; } - return *this; - } - const BSONElement& chk(bool expr) const { - uassert(13118, "unexpected or missing type value in BSON object", expr); - return *this; - } -}; + mutable int totalSize; /* caches the computed size */ + + friend class BSONObjIterator; + friend class BSONObj; + const BSONElement& chk(int t) const { + if ( t != type() ) { + StringBuilder ss; + ss << "wrong type for BSONElement (" << fieldName() << ") " << type() << " != " << t; + uasserted(13111, ss.str() ); + } + return *this; + } + const BSONElement& chk(bool expr) const { + uassert(13118, "unexpected or missing type value in BSON object", expr); + return *this; + } + }; inline int BSONElement::canonicalType() const { BSONType t = type(); - switch ( t ){ + switch ( t ) { case MinKey: case MaxKey: return t; @@ -448,7 +454,7 @@ private: assert(0); return -1; } - } + } inline bool BSONElement::trueValue() const { switch( type() ) { @@ -464,7 +470,7 @@ private: case jstNULL: case Undefined: return false; - + default: ; } @@ -478,13 +484,13 @@ private: case NumberDouble: case NumberInt: return true; - default: + default: return false; } } inline bool BSONElement::isSimpleType() const { - switch( type() ){ + switch( type() ) { case NumberLong: case NumberDouble: case NumberInt: @@ -493,7 +499,7 @@ private: case mongo::Date: case jstOID: return true; - default: + default: return false; } } @@ -512,7 +518,7 @@ private: } /** Retrieve int value for the element safely. Zero returned if not a number. Converted to int if another numeric type. */ - inline int BSONElement::numberInt() const { + inline int BSONElement::numberInt() const { switch( type() ) { case NumberDouble: return (int) _numberDouble(); @@ -526,7 +532,7 @@ private: } /** Retrieve long value for the element safely. Zero returned if not a number. */ - inline long long BSONElement::numberLong() const { + inline long long BSONElement::numberLong() const { switch( type() ) { case NumberDouble: return (long long) _numberDouble(); @@ -537,7 +543,7 @@ private: default: return 0; } - } + } inline BSONElement::BSONElement() { static char z = 0; diff --git a/bson/bsoninlines.h b/bson/bsoninlines.h deleted file mode 100644 index 0a2e59b..0000000 --- a/bson/bsoninlines.h +++ /dev/null @@ -1,588 +0,0 @@ -// bsoninlines.h - -/* Copyright 2009 10gen Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include "util/atomic_int.h" -#include "util/misc.h" -#include "../util/hex.h" - -namespace mongo { - - inline BSONObjIterator BSONObj::begin() { - return BSONObjIterator(*this); - } - - inline BSONObj BSONElement::embeddedObjectUserCheck() const { - uassert( 10065 , "invalid parameter: expected an object", isABSONObj() ); - return BSONObj(value()); - } - - inline BSONObj BSONElement::embeddedObject() const { - assert( isABSONObj() ); - return BSONObj(value()); - } - - inline BSONObj BSONElement::codeWScopeObject() const { - assert( type() == CodeWScope ); - int strSizeWNull = *(int *)( value() + 4 ); - return BSONObj( value() + 4 + 4 + strSizeWNull ); - } - - inline BSONObj BSONObj::copy() const { - char *p = (char*) malloc(objsize()); - memcpy(p, objdata(), objsize()); - return BSONObj(p, true); - } - - // wrap this element up as a singleton object. - inline BSONObj BSONElement::wrap() const { - BSONObjBuilder b(size()+6); - b.append(*this); - return b.obj(); - } - - inline BSONObj BSONElement::wrap( const char * newName ) const { - BSONObjBuilder b(size()+6+(int)strlen(newName)); - b.appendAs(*this,newName); - return b.obj(); - } - - - inline bool BSONObj::hasElement(const char *name) const { - if ( !isEmpty() ) { - BSONObjIterator it(*this); - while ( it.moreWithEOO() ) { - BSONElement e = it.next(); - if ( strcmp(name, e.fieldName()) == 0 ) - return true; - } - } - return false; - } - - inline BSONElement BSONObj::getField(const StringData& name) const { - BSONObjIterator i(*this); - while ( i.more() ) { - BSONElement e = i.next(); - if ( strcmp(e.fieldName(), name.data()) == 0 ) - return e; - } - return BSONElement(); - } - - /* add all the fields from the object specified to this object */ - inline BSONObjBuilder& BSONObjBuilder::appendElements(BSONObj x) { - BSONObjIterator it(x); - while ( it.moreWithEOO() ) { - BSONElement e = it.next(); - if ( e.eoo() ) break; - append(e); - } - return *this; - } - - inline bool BSONObj::isValid(){ - int x = objsize(); - return x > 0 && x <= 1024 * 1024 * 8; - } - - inline bool BSONObj::getObjectID(BSONElement& e) const { - BSONElement f = getField("_id"); - if( !f.eoo() ) { - e = f; - return true; - } - return false; - } - - inline BSONObjBuilderValueStream::BSONObjBuilderValueStream( BSONObjBuilder * builder ) { - _fieldName = 0; - _builder = builder; - } - - template - inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<( T value ) { - _builder->append(_fieldName, value); - _fieldName = 0; - return *_builder; - } - - inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const BSONElement& e ) { - _builder->appendAs( e , _fieldName ); - _fieldName = 0; - return *_builder; - } - - inline Labeler BSONObjBuilderValueStream::operator<<( const Labeler::Label &l ) { - return Labeler( l, this ); - } - - inline void BSONObjBuilderValueStream::endField( const char *nextFieldName ) { - if ( _fieldName && haveSubobj() ) { - _builder->append( 
_fieldName, subobj()->done() ); - } - _subobj.reset(); - _fieldName = nextFieldName; - } - - inline BSONObjBuilder *BSONObjBuilderValueStream::subobj() { - if ( !haveSubobj() ) - _subobj.reset( new BSONObjBuilder() ); - return _subobj.get(); - } - - template inline - BSONObjBuilder& Labeler::operator<<( T value ) { - s_->subobj()->append( l_.l_, value ); - return *s_->_builder; - } - - inline - BSONObjBuilder& Labeler::operator<<( const BSONElement& e ) { - s_->subobj()->appendAs( e, l_.l_ ); - return *s_->_builder; - } - - // {a: {b:1}} -> {a.b:1} - void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base=""); - inline BSONObj nested2dotted(const BSONObj& obj){ - BSONObjBuilder b; - nested2dotted(b, obj); - return b.obj(); - } - - // {a.b:1} -> {a: {b:1}} - void dotted2nested(BSONObjBuilder& b, const BSONObj& obj); - inline BSONObj dotted2nested(const BSONObj& obj){ - BSONObjBuilder b; - dotted2nested(b, obj); - return b.obj(); - } - - inline BSONObjIterator BSONObjBuilder::iterator() const { - const char * s = _b.buf() + _offset; - const char * e = _b.buf() + _b.len(); - return BSONObjIterator( s , e ); - } - - /* WARNING: nested/dotted conversions are not 100% reversible - * nested2dotted(dotted2nested({a.b: {c:1}})) -> {a.b.c: 1} - * also, dotted2nested ignores order - */ - - typedef map BSONMap; - inline BSONMap bson2map(const BSONObj& obj){ - BSONMap m; - BSONObjIterator it(obj); - while (it.more()){ - BSONElement e = it.next(); - m[e.fieldName()] = e; - } - return m; - } - - struct BSONElementFieldNameCmp { - bool operator()( const BSONElement &l, const BSONElement &r ) const { - return strcmp( l.fieldName() , r.fieldName() ) <= 0; - } - }; - - typedef set BSONSortedElements; - inline BSONSortedElements bson2set( const BSONObj& obj ){ - BSONSortedElements s; - BSONObjIterator it(obj); - while ( it.more() ) - s.insert( it.next() ); - return s; - } - - inline string BSONObj::toString( bool isArray, bool full ) const { - if ( isEmpty() ) return "{}"; - StringBuilder s; - toString(s, isArray, full); - return s.str(); - } - inline void BSONObj::toString(StringBuilder& s, bool isArray, bool full ) const { - if ( isEmpty() ){ - s << "{}"; - return; - } - - s << ( isArray ? "[ " : "{ " ); - BSONObjIterator i(*this); - bool first = true; - while ( 1 ) { - massert( 10327 , "Object does not end with EOO", i.moreWithEOO() ); - BSONElement e = i.next( true ); - massert( 10328 , "Invalid element size", e.size() > 0 ); - massert( 10329 , "Element too large", e.size() < ( 1 << 30 ) ); - int offset = (int) (e.rawdata() - this->objdata()); - massert( 10330 , "Element extends past end of object", - e.size() + offset <= this->objsize() ); - e.validate(); - bool end = ( e.size() + offset == this->objsize() ); - if ( e.eoo() ) { - massert( 10331 , "EOO Before end of object", end ); - break; - } - if ( first ) - first = false; - else - s << ", "; - e.toString(s, !isArray, full ); - } - s << ( isArray ? 
" ]" : " }" ); - } - - extern unsigned getRandomNumber(); - - inline void BSONElement::validate() const { - const BSONType t = type(); - - switch( t ) { - case DBRef: - case Code: - case Symbol: - case mongo::String: { - int x = valuestrsize(); - if ( x > 0 && valuestr()[x-1] == 0 ) - return; - StringBuilder buf; - buf << "Invalid dbref/code/string/symbol size: " << x << " strnlen:" << mongo::strnlen( valuestr() , x ); - msgasserted( 10321 , buf.str() ); - break; - } - case CodeWScope: { - int totalSize = *( int * )( value() ); - massert( 10322 , "Invalid CodeWScope size", totalSize >= 8 ); - int strSizeWNull = *( int * )( value() + 4 ); - massert( 10323 , "Invalid CodeWScope string size", totalSize >= strSizeWNull + 4 + 4 ); - massert( 10324 , "Invalid CodeWScope string size", - strSizeWNull > 0 && - (strSizeWNull - 1) == mongo::strnlen( codeWScopeCode(), strSizeWNull ) ); - massert( 10325 , "Invalid CodeWScope size", totalSize >= strSizeWNull + 4 + 4 + 4 ); - int objSize = *( int * )( value() + 4 + 4 + strSizeWNull ); - massert( 10326 , "Invalid CodeWScope object size", totalSize == 4 + 4 + strSizeWNull + objSize ); - // Subobject validation handled elsewhere. - } - case Object: - // We expect Object size validation to be handled elsewhere. - default: - break; - } - } - - inline int BSONElement::size( int maxLen ) const { - if ( totalSize >= 0 ) - return totalSize; - - int remain = maxLen - fieldNameSize() - 1; - - int x = 0; - switch ( type() ) { - case EOO: - case Undefined: - case jstNULL: - case MaxKey: - case MinKey: - break; - case mongo::Bool: - x = 1; - break; - case NumberInt: - x = 4; - break; - case Timestamp: - case mongo::Date: - case NumberDouble: - case NumberLong: - x = 8; - break; - case jstOID: - x = 12; - break; - case Symbol: - case Code: - case mongo::String: - massert( 10313 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); - x = valuestrsize() + 4; - break; - case CodeWScope: - massert( 10314 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); - x = objsize(); - break; - - case DBRef: - massert( 10315 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); - x = valuestrsize() + 4 + 12; - break; - case Object: - case mongo::Array: - massert( 10316 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); - x = objsize(); - break; - case BinData: - massert( 10317 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 ); - x = valuestrsize() + 4 + 1/*subtype*/; - break; - case RegEx: - { - const char *p = value(); - size_t len1 = ( maxLen == -1 ) ? strlen( p ) : mongo::strnlen( p, remain ); - //massert( 10318 , "Invalid regex string", len1 != -1 ); // ERH - 4/28/10 - don't think this does anything - p = p + len1 + 1; - size_t len2 = ( maxLen == -1 ) ? 
strlen( p ) : mongo::strnlen( p, remain - len1 - 1 ); - //massert( 10319 , "Invalid regex options string", len2 != -1 ); // ERH - 4/28/10 - don't think this does anything - x = (int) (len1 + 1 + len2 + 1); - } - break; - default: { - StringBuilder ss; - ss << "BSONElement: bad type " << (int) type(); - string msg = ss.str(); - massert( 10320 , msg.c_str(),false); - } - } - totalSize = x + fieldNameSize() + 1; // BSONType - - return totalSize; - } - - inline string BSONElement::toString( bool includeFieldName, bool full ) const { - StringBuilder s; - toString(s, includeFieldName, full); - return s.str(); - } - inline void BSONElement::toString(StringBuilder& s, bool includeFieldName, bool full ) const { - if ( includeFieldName && type() != EOO ) - s << fieldName() << ": "; - switch ( type() ) { - case EOO: - s << "EOO"; - break; - case mongo::Date: - s << "new Date(" << date() << ')'; - break; - case RegEx: - { - s << "/" << regex() << '/'; - const char *p = regexFlags(); - if ( p ) s << p; - } - break; - case NumberDouble: - s.appendDoubleNice( number() ); - break; - case NumberLong: - s << _numberLong(); - break; - case NumberInt: - s << _numberInt(); - break; - case mongo::Bool: - s << ( boolean() ? "true" : "false" ); - break; - case Object: - embeddedObject().toString(s, false, full); - break; - case mongo::Array: - embeddedObject().toString(s, true, full); - break; - case Undefined: - s << "undefined"; - break; - case jstNULL: - s << "null"; - break; - case MaxKey: - s << "MaxKey"; - break; - case MinKey: - s << "MinKey"; - break; - case CodeWScope: - s << "CodeWScope( " - << codeWScopeCode() << ", " << codeWScopeObject().toString(false, full) << ")"; - break; - case Code: - if ( !full && valuestrsize() > 80 ) { - s.write(valuestr(), 70); - s << "..."; - } else { - s.write(valuestr(), valuestrsize()-1); - } - break; - case Symbol: - case mongo::String: - s << '"'; - if ( !full && valuestrsize() > 80 ) { - s.write(valuestr(), 70); - s << "...\""; - } else { - s.write(valuestr(), valuestrsize()-1); - s << '"'; - } - break; - case DBRef: - s << "DBRef('" << valuestr() << "',"; - { - mongo::OID *x = (mongo::OID *) (valuestr() + valuestrsize()); - s << *x << ')'; - } - break; - case jstOID: - s << "ObjectId('"; - s << __oid() << "')"; - break; - case BinData: - s << "BinData"; - if (full){ - int len; - const char* data = binDataClean(len); - s << '(' << binDataType() << ", " << toHex(data, len) << ')'; - } - break; - case Timestamp: - s << "Timestamp " << timestampTime() << "|" << timestampInc(); - break; - default: - s << "?type=" << type(); - break; - } - } - - /* return has eoo() true if no match - supports "." notation to reach into embedded objects - */ - inline BSONElement BSONObj::getFieldDotted(const char *name) const { - BSONElement e = getField( name ); - if ( e.eoo() ) { - const char *p = strchr(name, '.'); - if ( p ) { - string left(name, p-name); - BSONObj sub = getObjectField(left.c_str()); - return sub.isEmpty() ? BSONElement() : sub.getFieldDotted(p+1); - } - } - - return e; - } - - inline BSONObj BSONObj::getObjectField(const char *name) const { - BSONElement e = getField(name); - BSONType t = e.type(); - return t == Object || t == Array ? 
e.embeddedObject() : BSONObj(); - } - - inline int BSONObj::nFields() const { - int n = 0; - BSONObjIterator i(*this); - while ( i.moreWithEOO() ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - n++; - } - return n; - } - - inline BSONObj::BSONObj() { - /* LITTLE ENDIAN */ - static char p[] = { 5, 0, 0, 0, 0 }; - _objdata = p; - } - - inline BSONObj BSONElement::Obj() const { return embeddedObjectUserCheck(); } - - inline BSONElement BSONElement::operator[] (const string& field) const { - BSONObj o = Obj(); - return o[field]; - } - - inline void BSONObj::elems(vector &v) const { - BSONObjIterator i(*this); - while( i.more() ) - v.push_back(i.next()); - } - - inline void BSONObj::elems(list &v) const { - BSONObjIterator i(*this); - while( i.more() ) - v.push_back(i.next()); - } - - template - void BSONObj::Vals(vector& v) const { - BSONObjIterator i(*this); - while( i.more() ) { - T t; - i.next().Val(t); - v.push_back(t); - } - } - template - void BSONObj::Vals(list& v) const { - BSONObjIterator i(*this); - while( i.more() ) { - T t; - i.next().Val(t); - v.push_back(t); - } - } - - template - void BSONObj::vals(vector& v) const { - BSONObjIterator i(*this); - while( i.more() ) { - try { - T t; - i.next().Val(t); - v.push_back(t); - } catch(...) { } - } - } - template - void BSONObj::vals(list& v) const { - BSONObjIterator i(*this); - while( i.more() ) { - try { - T t; - i.next().Val(t); - v.push_back(t); - } catch(...) { } - } - } - - inline ostream& operator<<( ostream &s, const BSONObj &o ) { - return s << o.toString(); - } - - inline ostream& operator<<( ostream &s, const BSONElement &e ) { - return s << e.toString(); - } - - inline void BSONElement::Val(BSONObj& v) const { v = Obj(); } - - template - inline BSONFieldValue BSONField::query( const char * q , const T& t ) const { - BSONObjBuilder b; - b.append( q , t ); - return BSONFieldValue( _name , b.obj() ); - } -} diff --git a/bson/bsonmisc.h b/bson/bsonmisc.h index 40ec6d3..96be12a 100644 --- a/bson/bsonmisc.h +++ b/bson/bsonmisc.h @@ -26,7 +26,7 @@ namespace mongo { return l.woCompare( r, false ) < 0; } }; - + class BSONObjCmp { public: BSONObjCmp( const BSONObj &_order = BSONObj() ) : order( _order ) {} @@ -54,26 +54,26 @@ namespace mongo { FieldCompareResult compareDottedFieldNames( const string& l , const string& r ); -/** Use BSON macro to build a BSONObj from a stream + /** Use BSON macro to build a BSONObj from a stream + + e.g., + BSON( "name" << "joe" << "age" << 33 ) - e.g., - BSON( "name" << "joe" << "age" << 33 ) + with auto-generated object id: + BSON( GENOID << "name" << "joe" << "age" << 33 ) - with auto-generated object id: - BSON( GENOID << "name" << "joe" << "age" << 33 ) - - The labels GT, GTE, LT, LTE, NE can be helpful for stream-oriented construction - of a BSONObj, particularly when assembling a Query. For example, - BSON( "a" << GT << 23.4 << NE << 30 << "b" << 2 ) produces the object - { a: { \$gt: 23.4, \$ne: 30 }, b: 2 }. -*/ + The labels GT, GTE, LT, LTE, NE can be helpful for stream-oriented construction + of a BSONObj, particularly when assembling a Query. For example, + BSON( "a" << GT << 23.4 << NE << 30 << "b" << 2 ) produces the object + { a: { \$gt: 23.4, \$ne: 30 }, b: 2 }. 
+ */ #define BSON(x) (( mongo::BSONObjBuilder(64) << x ).obj()) -/** Use BSON_ARRAY macro like BSON macro, but without keys + /** Use BSON_ARRAY macro like BSON macro, but without keys - BSONArray arr = BSON_ARRAY( "hello" << 1 << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) ); + BSONArray arr = BSON_ARRAY( "hello" << 1 << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) ); - */ + */ #define BSON_ARRAY(x) (( mongo::BSONArrayBuilder() << x ).arr()) /* Utility class to auto assign object IDs. @@ -83,11 +83,18 @@ namespace mongo { extern struct GENOIDLabeler { } GENOID; /* Utility class to add a Date element with the current time - Example: + Example: cout << BSON( "created" << DATENOW ); // { created : "2009-10-09 11:41:42" } */ extern struct DateNowLabeler { } DATENOW; + /* Utility class to add the minKey (minus infinity) to a given attribute + Example: + cout << BSON( "a" << MINKEY ); // { "a" : { "$minKey" : 1 } } + */ + extern struct MinKeyLabeler { } MINKEY; + extern struct MaxKeyLabeler { } MAXKEY; + // Utility class to implement GT, GTE, etc as described above. class Labeler { public: @@ -99,17 +106,17 @@ namespace mongo { template BSONObjBuilder& operator<<( T value ); - /* the value of the element e is appended i.e. for + /* the value of the element e is appended i.e. for "age" << GT << someElement - one gets - { age : { $gt : someElement's value } } + one gets + { age : { $gt : someElement's value } } */ BSONObjBuilder& operator<<( const BSONElement& e ); private: const Label &l_; BSONObjBuilderValueStream *s_; }; - + extern Labeler::Label GT; extern Labeler::Label GTE; extern Labeler::Label LT; @@ -126,7 +133,7 @@ namespace mongo { inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d, const BSONObj& e); inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d, const BSONObj& e, const BSONObj& f); // definitions in bsonobjbuilder.h b/c of incomplete types - + // Utility class to implement BSON( key << val ) as described above. 
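To make the stream-style construction documented above concrete, a minimal usage sketch follows (assuming the standalone bson headers are reachable as "bson/bson.h" and using only the names documented above -- BSON, BSON_ARRAY, GT, NE, MINKEY, OR):

    #include <iostream>
    #include "bson/bson.h"   // assumed include path for the standalone bson headers

    using namespace mongo;

    int main() {
        // { name: "joe", age: 33 }
        BSONObj person = BSON( "name" << "joe" << "age" << 33 );

        // { a: { $gt: 23.4, $ne: 30 }, b: 2 } -- GT/NE labelers expand into a subobject
        BSONObj query = BSON( "a" << GT << 23.4 << NE << 30 << "b" << 2 );

        // { $or: [ { x: 1 }, { y: 2 } ] }
        BSONObj either = OR( BSON( "x" << 1 ), BSON( "y" << 2 ) );

        // arrays get "0", "1", ... style field names automatically
        BSONArray arr = BSON_ARRAY( "hello" << 1 << BSON( "foo" << "bar" ) );

        // { a: { $minKey: 1 } } -- MINKEY/MAXKEY labelers are new in this version
        BSONObj low = BSON( "a" << MINKEY );

        std::cout << person << '\n' << query << '\n' << either << '\n'
                  << arr << '\n' << low << std::endl;
        return 0;
    }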
class BSONObjBuilderValueStream : public boost::noncopyable { public: @@ -134,17 +141,20 @@ namespace mongo { BSONObjBuilderValueStream( BSONObjBuilder * builder ); BSONObjBuilder& operator<<( const BSONElement& e ); - - template + + template BSONObjBuilder& operator<<( T value ); BSONObjBuilder& operator<<(DateNowLabeler& id); - + + BSONObjBuilder& operator<<(MinKeyLabeler& id); + BSONObjBuilder& operator<<(MaxKeyLabeler& id); + Labeler operator<<( const Labeler::Label &l ); void endField( const char *nextFieldName = 0 ); bool subobjStarted() const { return _fieldName != 0; } - + private: const char * _fieldName; BSONObjBuilder * _builder; @@ -153,39 +163,39 @@ namespace mongo { BSONObjBuilder *subobj(); auto_ptr< BSONObjBuilder > _subobj; }; - + /** used in conjuction with BSONObjBuilder, allows for proper buffer size to prevent crazy memory usage */ class BSONSizeTracker { public: - BSONSizeTracker(){ + BSONSizeTracker() { _pos = 0; for ( int i=0; i= SIZE ) _pos = 0; } - + /** * right now choosing largest size */ int getSize() const { int x = 16; // sane min - for ( int i=0; i x ) x = _sizes[i]; } return x; } - + private: enum { SIZE = 10 }; int _pos; diff --git a/bson/bsonobj.h b/bson/bsonobj.h index a802526..3ca6b8c 100644 --- a/bson/bsonobj.h +++ b/bson/bsonobj.h @@ -28,23 +28,23 @@ namespace mongo { typedef set< BSONElement, BSONElementCmpWithoutField > BSONElementSet; /** - C++ representation of a "BSON" object -- that is, an extended JSON-style + C++ representation of a "BSON" object -- that is, an extended JSON-style object in a binary representation. See bsonspec.org. - Note that BSONObj's have a smart pointer capability built in -- so you can + Note that BSONObj's have a smart pointer capability built in -- so you can pass them around by value. The reference counts used to implement this do not use locking, so copying and destroying BSONObj's are not thread-safe operations. BSON object format: - + code {}* EOO - + totalSize includes itself. - + Data: Bool: EOO: nothing follows @@ -67,31 +67,65 @@ namespace mongo { */ class BSONObj { public: - /** Construct a BSONObj from data in the proper format. - @param ifree true if the BSONObj should free() the msgdata when - it destructs. - */ + + /** Construct a BSONObj from data in the proper format. + @param ifree true if the BSONObj should free() the msgdata when + it destructs. + */ explicit BSONObj(const char *msgdata, bool ifree = false) { init(msgdata, ifree); } - BSONObj(const Record *r); + + explicit BSONObj(const Record *r); + /** Construct an empty BSONObj -- that is, {}. */ BSONObj(); - // defensive - ~BSONObj() { _objdata = 0; } - void appendSelfToBufBuilder(BufBuilder& b) const { - assert( objsize() ); - b.appendBuf(reinterpret_cast( objdata() ), objsize()); - } + ~BSONObj() { /*defensive:*/ _objdata = 0; } + + /** + A BSONObj can use a buffer it "owns" or one it does not. + + OWNED CASE + If the BSONObj owns the buffer, the buffer can be shared among several BSONObj's (by assignment). + In this case the buffer is basically implemented as a shared_ptr. + Since BSONObj's are typically immutable, this works well. + + UNOWNED CASE + A BSONObj can also point to BSON data in some other data structure it does not "own" or free later. + For example, in a memory mapped file. In this case, it is important the original data stays in + scope for as long as the BSONObj is in use. If you think the original data may go out of scope, + call BSONObj::getOwned() to promote your BSONObj to having its own copy. 
+ + On a BSONObj assignment, if the source is unowned, both the source and dest will have unowned + pointers to the original buffer after the assignment. - /** Readable representation of a BSON object in an extended JSON-style notation. + If you are not sure about ownership but need the buffer to last as long as the BSONObj, call + getOwned(). getOwned() is a no-op if the buffer is already owned. If not already owned, a malloc + and memcpy will result. + + Most ways to create BSONObj's create 'owned' variants. Unowned versions can be created with: + (1) specifying true for the ifree parameter in the constructor + (2) calling BSONObjBuilder::done(). Use BSONObjBuilder::obj() to get an owned copy + (3) retrieving a subobject retrieves an unowned pointer into the parent BSON object + + @return true if this is in owned mode + */ + bool isOwned() const { return _holder.get() != 0; } + + /* make sure the data buffer is under the control of this BSONObj and not a remote buffer */ + BSONObj getOwned() const; + + /** @return a new full (and owned) copy of the object. */ + BSONObj copy() const; + + /** Readable representation of a BSON object in an extended JSON-style notation. This is an abbreviated representation which might be used for logging. */ string toString( bool isArray = false, bool full=false ) const; void toString(StringBuilder& s, bool isArray = false, bool full=false ) const; - - /** Properly formatted JSON string. + + /** Properly formatted JSON string. @param pretty if true we try to add some lf's and indentation */ string jsonString( JsonStringFormat format = Strict, int pretty = 0 ) const; @@ -126,38 +160,36 @@ namespace mongo { names with respect to the returned element. */ BSONElement getFieldDottedOrArray(const char *&name) const; - /** Get the field of the specified name. eoo() is true on the returned - element if not found. + /** Get the field of the specified name. eoo() is true on the returned + element if not found. */ BSONElement getField(const StringData& name) const; - /** Get the field of the specified name. eoo() is true on the returned - element if not found. + /** Get the field of the specified name. eoo() is true on the returned + element if not found. */ - BSONElement operator[] (const char *field) const { + BSONElement operator[] (const char *field) const { return getField(field); } - BSONElement operator[] (const string& field) const { + BSONElement operator[] (const string& field) const { return getField(field); } - BSONElement operator[] (int field) const { + BSONElement operator[] (int field) const { StringBuilder ss; ss << field; string s = ss.str(); return getField(s.c_str()); } - /** @return true if field exists */ - bool hasField( const char * name )const { - return ! getField( name ).eoo(); - } + /** @return true if field exists */ + bool hasField( const char * name ) const { return ! getField( name ).eoo(); } /** @return "" if DNE or wrong type */ const char * getStringField(const char *name) const; - /** @return subobject of the given name */ + /** @return subobject of the given name */ BSONObj getObjectField(const char *name) const; /** @return INT_MIN if not present - does some type conversions */ @@ -172,26 +204,24 @@ namespace mongo { object. */ BSONObj extractFieldsUnDotted(BSONObj pattern) const; - + /** extract items from object which match a pattern object. - e.g., if pattern is { x : 1, y : 1 }, builds an object with - x and y elements of this object, if they are present. 
+ e.g., if pattern is { x : 1, y : 1 }, builds an object with + x and y elements of this object, if they are present. returns elements with original field names */ BSONObj extractFields(const BSONObj &pattern , bool fillWithNull=false) const; - + BSONObj filterFieldsUndotted(const BSONObj &filter, bool inFilter) const; BSONElement getFieldUsingIndexNames(const char *fieldName, const BSONObj &indexKey) const; - + /** @return the raw data of the object */ const char *objdata() const { return _objdata; } /** @return total size of the BSON object in bytes */ - int objsize() const { - return *(reinterpret_cast(objdata())); - } + int objsize() const { return *(reinterpret_cast(objdata())); } /** performs a cursory check on the object's size only. */ bool isValid(); @@ -201,32 +231,30 @@ namespace mongo { */ bool okForStorage() const; - /** @return true if object is empty -- i.e., {} */ - bool isEmpty() const { - return objsize() <= 5; - } + /** @return true if object is empty -- i.e., {} */ + bool isEmpty() const { return objsize() <= 5; } void dump() const; /** Alternative output format */ string hexDump() const; - + /**wo='well ordered'. fields must be in same order in each object. - Ordering is with respect to the signs of the elements + Ordering is with respect to the signs of the elements and allows ascending / descending key mixing. - @return <0 if l0 if l>r + @return <0 if l0 if l>r */ int woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName=true) const; /**wo='well ordered'. fields must be in same order in each object. - Ordering is with respect to the signs of the elements + Ordering is with respect to the signs of the elements and allows ascending / descending key mixing. - @return <0 if l0 if l>r + @return <0 if l0 if l>r */ int woCompare(const BSONObj& r, const BSONObj &ordering = BSONObj(), bool considerFieldName=true) const; - + bool operator<( const BSONObj& other ) const { return woCompare( other ) < 0; } bool operator<=( const BSONObj& other ) const { return woCompare( other ) <= 0; } @@ -249,31 +277,18 @@ namespace mongo { return false; } - /** @return first field of the object */ - BSONElement firstElement() const { - return BSONElement(objdata() + 4); - } + /** @return first field of the object */ + BSONElement firstElement() const { return BSONElement(objdata() + 4); } - /** @return true if field exists in the object */ + /** @return true if field exists in the object */ bool hasElement(const char *name) const; - /** Get the _id field from the object. For good performance drivers should - assure that _id is the first element of the object; however, correct operation + /** Get the _id field from the object. For good performance drivers should + assure that _id is the first element of the object; however, correct operation is assured regardless. @return true if found - */ - bool getObjectID(BSONElement& e) const; - - /** makes a copy of the object. */ - BSONObj copy() const; - - /* make sure the data buffer is under the control of this BSONObj and not a remote buffer */ - BSONObj getOwned() const{ - if ( !isOwned() ) - return copy(); - return *this; - } - bool isOwned() const { return _holder.get() != 0; } + */ + bool getObjectID(BSONElement& e) const; /** @return A hash code for the object */ int hash() const { @@ -289,18 +304,18 @@ namespace mongo { // string identifier equivalents. // TODO Support conversion of element types other than min and max. 
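The owned/unowned buffer rules described above for isOwned()/getOwned() are easy to get wrong; here is a minimal sketch, assuming only the members documented above (the helper name safeView is illustrative):

    #include <cassert>
    #include "bson/bson.h"   // assumed include path

    using namespace mongo;

    // The caller's buffer may go out of scope; promote the view to an owned copy first.
    BSONObj safeView( const char *raw ) {
        BSONObj view( raw );        // ifree = false: unowned, points into the caller's buffer
        assert( !view.isOwned() );
        return view.getOwned();     // malloc + memcpy; stays valid after 'raw' disappears
    }

    int main() {
        BSONObjBuilder b;
        b.append( "x", 1 );
        BSONObj owned = b.obj();    // obj() hands ownership of the buffer to 'owned'
        assert( owned.isOwned() );

        BSONObj copy = safeView( owned.objdata() );
        assert( copy.isOwned() && copy == owned );
        return 0;
    }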
BSONObj clientReadable() const; - + /** Return new object with the field names replaced by those in the passed object. */ BSONObj replaceFieldNames( const BSONObj &obj ) const; - + /** true unless corrupt */ bool valid() const; - + /** @return an md5 value for this object. */ string md5() const; - - bool operator==( const BSONObj& other ) const{ + + bool operator==( const BSONObj& other ) const { return woCompare( other ) == 0; } @@ -324,14 +339,21 @@ namespace mongo { opNEAR = 0x13, opWITHIN = 0x14, opMAX_DISTANCE=0x15 - }; + }; /** add all elements of the object to the specified vector */ void elems(vector &) const; /** add all elements of the object to the specified list */ void elems(list &) const; - /** add all values of the object to the specified vector. If type mismatches, exception. */ + /** add all values of the object to the specified vector. If type mismatches, exception. + this is most useful when the BSONObj is an array, but can be used with non-arrays too in theory. + + example: + bo sub = y["subobj"].Obj(); + vector myints; + sub.Vals(myints); + */ template void Vals(vector &) const; /** add all values of the object to the specified list. If type mismatches, exception. */ @@ -347,13 +369,25 @@ namespace mongo { friend class BSONObjIterator; typedef BSONObjIterator iterator; + + /** use something like this: + for( BSONObj::iterator i = myObj.begin(); i.more(); ) { + BSONElement e = i.next(); + ... + } + */ BSONObjIterator begin(); -private: + void appendSelfToBufBuilder(BufBuilder& b) const { + assert( objsize() ); + b.appendBuf(reinterpret_cast( objdata() ), objsize()); + } + + private: class Holder { public: Holder( const char *objdata ) : - _objdata( objdata ) { + _objdata( objdata ) { } ~Holder() { free((void *)_objdata); @@ -362,29 +396,27 @@ private: private: const char *_objdata; }; + const char *_objdata; boost::shared_ptr< Holder > _holder; + + void _assertInvalid() const; void init(const char *data, bool ifree) { if ( ifree ) _holder.reset( new Holder( data ) ); _objdata = data; - if ( ! isValid() ){ - StringBuilder ss; - int os = objsize(); - ss << "Invalid BSONObj spec size: " << os << " (" << toHex( &os, 4 ) << ")"; - try { - BSONElement e = firstElement(); - ss << " first element:" << e.toString() << " "; - } - catch ( ... ){} - string s = ss.str(); - massert( 10334 , s , 0 ); - } + if ( !isValid() ) + _assertInvalid(); } }; + ostream& operator<<( ostream &s, const BSONObj &o ); ostream& operator<<( ostream &s, const BSONElement &e ); + StringBuilder& operator<<( StringBuilder &s, const BSONObj &o ); + StringBuilder& operator<<( StringBuilder &s, const BSONElement &e ); + + struct BSONArray : BSONObj { // Don't add anything other than forwarding constructors!!! 
BSONArray(): BSONObj() {} diff --git a/bson/bsonobjbuilder.h b/bson/bsonobjbuilder.h index fdfe4de..a39b529 100644 --- a/bson/bsonobjbuilder.h +++ b/bson/bsonobjbuilder.h @@ -36,7 +36,7 @@ namespace mongo { template class BSONFieldValue { public: - BSONFieldValue( const string& name , const T& t ){ + BSONFieldValue( const string& name , const T& t ) { _name = name; _t = t; } @@ -52,8 +52,8 @@ namespace mongo { template class BSONField { public: - BSONField( const string& name , const string& longName="" ) - : _name(name), _longName(longName){} + BSONField( const string& name , const string& longName="" ) + : _name(name), _longName(longName) {} const string& name() const { return _name; } operator string() const { return _name; } @@ -65,11 +65,11 @@ namespace mongo { BSONFieldValue lt( const T& t ) const { return query( "$lt" , t ); } BSONFieldValue query( const char * q , const T& t ) const; - + BSONFieldValue operator()( const T& t ) const { return BSONFieldValue( _name , t ); } - + private: string _name; string _longName; @@ -85,17 +85,18 @@ namespace mongo { _b.skip(4); /*leave room for size field*/ } + /* dm why do we have this/need this? not clear to me, comment please tx. */ /** @param baseBuilder construct a BSONObjBuilder using an existing BufBuilder */ BSONObjBuilder( BufBuilder &baseBuilder ) : _b( baseBuilder ), _buf( 0 ), _offset( baseBuilder.len() ), _s( this ) , _tracker(0) , _doneCalled(false) { _b.skip( 4 ); } - + BSONObjBuilder( const BSONSizeTracker & tracker ) : _b(_buf) , _buf(tracker.getSize() ), _offset(0), _s( this ) , _tracker( (BSONSizeTracker*)(&tracker) ) , _doneCalled(false) { _b.skip( 4 ); } - ~BSONObjBuilder(){ - if ( !_doneCalled && _b.buf() && _buf.getSize() == 0 ){ + ~BSONObjBuilder() { + if ( !_doneCalled && _b.buf() && _buf.getSize() == 0 ) { _done(); } } @@ -103,6 +104,9 @@ namespace mongo { /** add all the fields from the object specified to this object */ BSONObjBuilder& appendElements(BSONObj x); + /** add all the fields from the object specified to this object if they don't exist already */ + BSONObjBuilder& appendElementsUnique( BSONObj x ); + /** append element to the object we are building */ BSONObjBuilder& append( const BSONElement& e) { assert( !e.eoo() ); // do not append eoo, that would corrupt us. the builder auto appends when done() is called. @@ -111,7 +115,7 @@ namespace mongo { } /** append an element but with a new name */ - BSONObjBuilder& appendAs(const BSONElement& e, const StringData& fieldName) { + BSONObjBuilder& appendAs(const BSONElement& e, const StringData& fieldName) { assert( !e.eoo() ); // do not append eoo, that would corrupt us. the builder auto appends when done() is called. _b.appendNum((char) e.type()); _b.appendStr(fieldName); @@ -128,14 +132,14 @@ namespace mongo { } /** add a subobject as a member */ - BSONObjBuilder& appendObject(const StringData& fieldName, const char * objdata , int size = 0 ){ + BSONObjBuilder& appendObject(const StringData& fieldName, const char * objdata , int size = 0 ) { assert( objdata ); - if ( size == 0 ){ + if ( size == 0 ) { size = *((int*)objdata); } - + assert( size > 4 && size < 100000000 ); - + _b.appendNum((char) Object); _b.appendStr(fieldName); _b.appendBuf((void*)objdata, size ); @@ -150,7 +154,7 @@ namespace mongo { _b.appendStr(fieldName); return _b; } - + /** add a subobject as a member with type Array. Thus arr object should have "0", "1", ... style fields in it. 
*/ @@ -160,9 +164,9 @@ namespace mongo { _b.appendBuf((void *) subObj.objdata(), subObj.objsize()); return *this; } - BSONObjBuilder& append(const StringData& fieldName, BSONArray arr) { - return appendArray(fieldName, arr); - } + BSONObjBuilder& append(const StringData& fieldName, BSONArray arr) { + return appendArray(fieldName, arr); + } /** add header for a new subarray and return bufbuilder for writing to the subarray's body */ @@ -171,7 +175,7 @@ namespace mongo { _b.appendStr(fieldName); return _b; } - + /** Append a boolean element */ BSONObjBuilder& appendBool(const StringData& fieldName, int val) { _b.appendNum((char) Bool); @@ -184,10 +188,10 @@ namespace mongo { BSONObjBuilder& append(const StringData& fieldName, bool val) { _b.appendNum((char) Bool); _b.appendStr(fieldName); - _b.appendNum((char) (val?1:0)); + _b.appendNum((char) (val?1:0)); return *this; } - + /** Append a 32 bit integer element */ BSONObjBuilder& append(const StringData& fieldName, int n) { _b.appendNum((char) NumberInt); @@ -197,20 +201,20 @@ namespace mongo { } /** Append a 32 bit unsigned element - cast to a signed int. */ - BSONObjBuilder& append(const StringData& fieldName, unsigned n) { - return append(fieldName, (int) n); + BSONObjBuilder& append(const StringData& fieldName, unsigned n) { + return append(fieldName, (int) n); } /** Append a NumberLong */ - BSONObjBuilder& append(const StringData& fieldName, long long n) { + BSONObjBuilder& append(const StringData& fieldName, long long n) { _b.appendNum((char) NumberLong); _b.appendStr(fieldName); _b.appendNum(n); - return *this; + return *this; } /** appends a number. if n < max(int)/2 then uses int, otherwise long long */ - BSONObjBuilder& appendIntOrLL( const StringData& fieldName , long long n ){ + BSONObjBuilder& appendIntOrLL( const StringData& fieldName , long long n ) { long long x = n; if ( x < 0 ) x = x * -1; @@ -225,15 +229,26 @@ namespace mongo { * appendNumber is a series of method for appending the smallest sensible type * mostly for JS */ - BSONObjBuilder& appendNumber( const StringData& fieldName , int n ){ + BSONObjBuilder& appendNumber( const StringData& fieldName , int n ) { return append( fieldName , n ); } - BSONObjBuilder& appendNumber( const StringData& fieldName , double d ){ + BSONObjBuilder& appendNumber( const StringData& fieldName , double d ) { return append( fieldName , d ); } - BSONObjBuilder& appendNumber( const StringData& fieldName , long long l ){ + BSONObjBuilder& appendNumber( const StringData& fieldName , size_t n ) { + static size_t maxInt = (size_t)pow( 2.0 , 30.0 ); + + if ( n < maxInt ) + append( fieldName , (int)n ); + else + append( fieldName , (long long)n ); + return *this; + } + + + BSONObjBuilder& appendNumber( const StringData& fieldName , long long l ) { static long long maxInt = (int)pow( 2.0 , 30.0 ); static long long maxDouble = (long long)pow( 2.0 , 40.0 ); @@ -245,7 +260,7 @@ namespace mongo { append( fieldName , l ); return *this; } - + /** Append a double element */ BSONObjBuilder& append(const StringData& fieldName, double n) { _b.appendNum((char) NumberDouble); @@ -259,8 +274,8 @@ namespace mongo { */ bool appendAsNumber( const StringData& fieldName , const string& data ); - /** Append a BSON Object ID (OID type). - @deprecated Generally, it is preferred to use the append append(name, oid) + /** Append a BSON Object ID (OID type). + @deprecated Generally, it is preferred to use the append append(name, oid) method for this. 
*/ BSONObjBuilder& appendOID(const StringData& fieldName, OID *oid = 0 , bool generateIfBlank = false ) { @@ -279,8 +294,8 @@ namespace mongo { return *this; } - /** - Append a BSON Object ID. + /** + Append a BSON Object ID. @param fieldName Field name, e.g., "_id". @returns the builder object */ @@ -309,14 +324,14 @@ namespace mongo { _b.appendNum(static_cast(dt) * 1000); return *this; } - /** Append a date. - @param dt a Java-style 64 bit date value, that is + /** Append a date. + @param dt a Java-style 64 bit date value, that is the number of milliseconds since January 1, 1970, 00:00:00 GMT */ BSONObjBuilder& appendDate(const StringData& fieldName, Date_t dt) { /* easy to pass a time_t to this and get a bad result. thus this warning. */ #if defined(_DEBUG) && defined(MONGO_EXPOSE_MACROS) - if( dt > 0 && dt <= 0xffffffff ) { + if( dt > 0 && dt <= 0xffffffff ) { static int n; if( n++ == 0 ) log() << "DEV WARNING appendDate() called with a tiny (but nonzero) date" << endl; @@ -335,27 +350,22 @@ namespace mongo { @param regex the regular expression pattern @param regex options such as "i" or "g" */ - BSONObjBuilder& appendRegex(const StringData& fieldName, const char *regex, const char *options = "") { + BSONObjBuilder& appendRegex(const StringData& fieldName, const StringData& regex, const StringData& options = "") { _b.appendNum((char) RegEx); _b.appendStr(fieldName); _b.appendStr(regex); _b.appendStr(options); return *this; } - /** Append a regular expression value - @param regex the regular expression pattern - @param regex options such as "i" or "g" - */ - BSONObjBuilder& appendRegex(const StringData& fieldName, string regex, string options = "") { - return appendRegex(fieldName, regex.c_str(), options.c_str()); - } - BSONObjBuilder& appendCode(const StringData& fieldName, const char *code) { + + BSONObjBuilder& appendCode(const StringData& fieldName, const StringData& code) { _b.appendNum((char) Code); _b.appendStr(fieldName); - _b.appendNum((int) strlen(code)+1); + _b.appendNum((int) code.size()+1); _b.appendStr(code); return *this; } + /** Append a string element. len DOES include terminating nul */ BSONObjBuilder& append(const StringData& fieldName, const char *str, int len) { _b.appendNum((char) String); @@ -369,48 +379,51 @@ namespace mongo { return append(fieldName, str, (int) strlen(str)+1); } /** Append a string element */ - BSONObjBuilder& append(const StringData& fieldName, string str) { + BSONObjBuilder& append(const StringData& fieldName, const string& str) { return append(fieldName, str.c_str(), (int) str.size()+1); } - BSONObjBuilder& appendSymbol(const StringData& fieldName, const char *symbol) { + + BSONObjBuilder& appendSymbol(const StringData& fieldName, const StringData& symbol) { _b.appendNum((char) Symbol); _b.appendStr(fieldName); - _b.appendNum((int) strlen(symbol)+1); + _b.appendNum((int) symbol.size()+1); _b.appendStr(symbol); - return *this; } + return *this; + } /** Append a Null element to the object */ BSONObjBuilder& appendNull( const StringData& fieldName ) { _b.appendNum( (char) jstNULL ); _b.appendStr( fieldName ); - return *this; } + return *this; + } // Append an element that is less than all other keys. BSONObjBuilder& appendMinKey( const StringData& fieldName ) { _b.appendNum( (char) MinKey ); _b.appendStr( fieldName ); - return *this; + return *this; } // Append an element that is greater than all other keys. 
BSONObjBuilder& appendMaxKey( const StringData& fieldName ) { _b.appendNum( (char) MaxKey ); _b.appendStr( fieldName ); - return *this; + return *this; } - + // Append a Timestamp field -- will be updated to next OpTime on db insert. BSONObjBuilder& appendTimestamp( const StringData& fieldName ) { _b.appendNum( (char) Timestamp ); _b.appendStr( fieldName ); _b.appendNum( (unsigned long long) 0 ); - return *this; + return *this; } BSONObjBuilder& appendTimestamp( const StringData& fieldName , unsigned long long val ) { _b.appendNum( (char) Timestamp ); _b.appendStr( fieldName ); _b.appendNum( val ); - return *this; + return *this; } /** @@ -419,24 +432,24 @@ namespace mongo { @param time - in millis (but stored in seconds) */ BSONObjBuilder& appendTimestamp( const StringData& fieldName , unsigned long long time , unsigned int inc ); - + /* Append an element of the deprecated DBRef type. - @deprecated + @deprecated */ - BSONObjBuilder& appendDBRef( const StringData& fieldName, const char *ns, const OID &oid ) { + BSONObjBuilder& appendDBRef( const StringData& fieldName, const StringData& ns, const OID &oid ) { _b.appendNum( (char) DBRef ); _b.appendStr( fieldName ); - _b.appendNum( (int) strlen( ns ) + 1 ); + _b.appendNum( (int) ns.size() + 1 ); _b.appendStr( ns ); _b.appendBuf( (void *) &oid, 12 ); - return *this; + return *this; } - /** Append a binary data element + /** Append a binary data element @param fieldName name of the field @param len length of the binary data in bytes - @param subtype subtype information for the data. @see enum BinDataType in bsontypes.h. + @param subtype subtype information for the data. @see enum BinDataType in bsontypes.h. Use BinDataGeneral if you don't care about the type. @param data the byte array */ @@ -446,36 +459,36 @@ namespace mongo { _b.appendNum( len ); _b.appendNum( (char) type ); _b.appendBuf( (void *) data, len ); - return *this; + return *this; } BSONObjBuilder& appendBinData( const StringData& fieldName, int len, BinDataType type, const unsigned char *data ) { return appendBinData(fieldName, len, type, (const char *) data); } - + /** Subtype 2 is deprecated. Append a BSON bindata bytearray element. @param data a byte array @param len the length of data */ - BSONObjBuilder& appendBinDataArrayDeprecated( const char * fieldName , const char * data , int len ){ + BSONObjBuilder& appendBinDataArrayDeprecated( const char * fieldName , const char * data , int len ) { _b.appendNum( (char) BinData ); _b.appendStr( fieldName ); _b.appendNum( len + 4 ); _b.appendNum( (char)0x2 ); _b.appendNum( len ); - _b.appendBuf( (void *) data, len ); - return *this; + _b.appendBuf( (void *) data, len ); + return *this; } - /** Append to the BSON object a field of type CodeWScope. This is a javascript code + /** Append to the BSON object a field of type CodeWScope. This is a javascript code fragment accompanied by some scope that goes with it. 
*/ - BSONObjBuilder& appendCodeWScope( const StringData& fieldName, const char *code, const BSONObj &scope ) { + BSONObjBuilder& appendCodeWScope( const StringData& fieldName, const StringData& code, const BSONObj &scope ) { _b.appendNum( (char) CodeWScope ); _b.appendStr( fieldName ); - _b.appendNum( ( int )( 4 + 4 + strlen( code ) + 1 + scope.objsize() ) ); - _b.appendNum( ( int ) strlen( code ) + 1 ); + _b.appendNum( ( int )( 4 + 4 + code.size() + 1 + scope.objsize() ) ); + _b.appendNum( ( int ) code.size() + 1 ); _b.appendStr( code ); _b.appendBuf( ( void * )scope.objdata(), scope.objsize() ); return *this; @@ -485,15 +498,12 @@ namespace mongo { _b.appendNum( (char) Undefined ); _b.appendStr( fieldName ); } - + /* helper function -- see Query::where() for primary way to do this. */ - void appendWhere( const char *code, const BSONObj &scope ){ + void appendWhere( const StringData& code, const BSONObj &scope ) { appendCodeWScope( "$where" , code , scope ); } - void appendWhere( const string &code, const BSONObj &scope ){ - appendWhere( code.c_str(), scope ); - } - + /** these are the min/max when comparing, not strict min/max elements for a given type */ @@ -507,7 +517,11 @@ namespace mongo { template < class T > BSONObjBuilder& append( const StringData& fieldName, const list< T >& vals ); - /** The returned BSONObj will free the buffer when it is finished. */ + /** + * destructive + * The returned BSONObj will free the buffer when it is finished. + * @return owned BSONObj + */ BSONObj obj() { bool own = owned(); massert( 10335 , "builder does not own memory", own ); @@ -516,12 +530,12 @@ namespace mongo { } /** Fetch the object we have built. - BSONObjBuilder still frees the object when the builder goes out of - scope -- very important to keep in mind. Use obj() if you - would like the BSONObj to last longer than the builder. + BSONObjBuilder still frees the object when the builder goes out of + scope -- very important to keep in mind. Use obj() if you + would like the BSONObj to last longer than the builder. */ BSONObj done() { - return BSONObj(_done()); + return BSONObj(_done(), /*ifree*/false); } // Like 'done' above, but does not construct a BSONObj to return to the caller. 
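Since obj() is destructive and done() leaves the buffer owned by the builder, a minimal sketch of the lifetime rule those comments describe (illustrative only, assuming the standalone bson headers):

    #include <iostream>
    #include "bson/bson.h"   // assumed include path

    using namespace mongo;

    int main() {
        BSONObj longLived;
        {
            BSONObjBuilder b;
            b.append( "n", 42 );

            BSONObj peek = b.done();         // NOT owned: the buffer still belongs to 'b'
            std::cout << peek << std::endl;  // fine while 'b' is alive

            longLived = peek.getOwned();     // take a private copy before 'b' is destroyed
            // (alternatively, call b.obj() instead of done() to move ownership out)
        }
        std::cout << longLived << std::endl; // still valid: owns its own buffer
        return 0;
    }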
@@ -591,25 +605,29 @@ namespace mongo { BSONObjBuilderValueStream& operator<<( const BSONField& f ) { _s.endField( f.name().c_str() ); return _s; - } + } template BSONObjBuilder& operator<<( const BSONFieldValue& v ) { append( v.name().c_str() , v.value() ); return *this; - } - + } + /** @return true if we are using our own bufbuilder, and not an alternate that was given to us in our constructor */ bool owned() const { return &_b == &_buf; } BSONObjIterator iterator() const ; - + + bool hasField( const StringData& name ) const ; + + int len() const { return _b.len(); } + private: char* _done() { if ( _doneCalled ) return _b.buf() + _offset; - + _doneCalled = true; _s.endField(); _b.appendNum((char) EOO); @@ -635,82 +653,89 @@ namespace mongo { public: BSONArrayBuilder() : _i(0), _b() {} BSONArrayBuilder( BufBuilder &_b ) : _i(0), _b(_b) {} + BSONArrayBuilder( int initialSize ) : _i(0), _b(initialSize) {} template - BSONArrayBuilder& append(const T& x){ - _b.append(num().c_str(), x); + BSONArrayBuilder& append(const T& x) { + _b.append(num(), x); return *this; } - BSONArrayBuilder& append(const BSONElement& e){ + BSONArrayBuilder& append(const BSONElement& e) { _b.appendAs(e, num()); return *this; } - + template - BSONArrayBuilder& operator<<(const T& x){ + BSONArrayBuilder& operator<<(const T& x) { return append(x); } - + void appendNull() { - _b.appendNull(num().c_str()); + _b.appendNull(num()); } - BSONArray arr(){ return BSONArray(_b.obj()); } - + /** + * destructive - ownership moves to returned BSONArray + * @return owned BSONArray + */ + BSONArray arr() { return BSONArray(_b.obj()); } + BSONObj done() { return _b.done(); } - + void doneFast() { _b.doneFast(); } - + template - BSONArrayBuilder& append(const StringData& name, const T& x){ + BSONArrayBuilder& append(const StringData& name, const T& x) { fill( name ); append( x ); return *this; } - - BufBuilder &subobjStart( const char *name = "0" ) { + + BufBuilder &subobjStart( const StringData& name = "0" ) { fill( name ); - return _b.subobjStart( num().c_str() ); + return _b.subobjStart( num() ); } BufBuilder &subarrayStart( const char *name ) { fill( name ); - return _b.subarrayStart( num().c_str() ); + return _b.subarrayStart( num() ); } - + void appendArray( const StringData& name, BSONObj subObj ) { fill( name ); - _b.appendArray( num().c_str(), subObj ); + _b.appendArray( num(), subObj ); } - - void appendAs( const BSONElement &e, const char *name ) { + + void appendAs( const BSONElement &e, const char *name) { fill( name ); append( e ); } - + + int len() const { return _b.len(); } + private: void fill( const StringData& name ) { char *r; - int n = strtol( name.data(), &r, 10 ); + long int n = strtol( name.data(), &r, 10 ); if ( *r ) uasserted( 13048, (string)"can't append to array using string field name [" + name.data() + "]" ); while( _i < n ) append( nullElt() ); } - + static BSONElement nullElt() { static BSONObj n = nullObj(); return n.firstElement(); } - + static BSONObj nullObj() { BSONObjBuilder _b; _b.appendNull( "" ); return _b.obj(); } - - string num(){ return _b.numStr(_i++); } + + string num() { return _b.numStr(_i++); } int _i; BSONObjBuilder _b; }; @@ -736,14 +761,14 @@ namespace mongo { // $or helper: OR(BSON("x" << GT << 7), BSON("y" << LT 6)); inline BSONObj OR(const BSONObj& a, const BSONObj& b) - { return BSON( "$or" << BSON_ARRAY(a << b) ); } + { return BSON( "$or" << BSON_ARRAY(a << b) ); } inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c) - { return BSON( "$or" << BSON_ARRAY(a << b 
<< c) ); } + { return BSON( "$or" << BSON_ARRAY(a << b << c) ); } inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d) - { return BSON( "$or" << BSON_ARRAY(a << b << c << d) ); } + { return BSON( "$or" << BSON_ARRAY(a << b << c << d) ); } inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d, const BSONObj& e) - { return BSON( "$or" << BSON_ARRAY(a << b << c << d << e) ); } + { return BSON( "$or" << BSON_ARRAY(a << b << c << d << e) ); } inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d, const BSONObj& e, const BSONObj& f) - { return BSON( "$or" << BSON_ARRAY(a << b << c << d << e << f) ); } - + { return BSON( "$or" << BSON_ARRAY(a << b << c << d << e << f) ); } + } diff --git a/bson/bsonobjiterator.h b/bson/bsonobjiterator.h index c8224d2..6e6a69e 100644 --- a/bson/bsonobjiterator.h +++ b/bson/bsonobjiterator.h @@ -20,6 +20,7 @@ #include // like the ## operator but works with __LINE__ namespace mongo { + /** iterator for a BSONObj Note each BSONObj ends with an EOO element: so you will get more() on an empty @@ -30,7 +31,7 @@ namespace mongo { */ class BSONObjIterator { public: - /** Create an iterator for a BSON object. + /** Create an iterator for a BSON object. */ BSONObjIterator(const BSONObj& jso) { int sz = jso.objsize(); @@ -42,18 +43,17 @@ namespace mongo { _theend = jso.objdata() + sz; } - BSONObjIterator( const char * start , const char * end ){ + BSONObjIterator( const char * start , const char * end ) { _pos = start + 4; _theend = end; } - + /** @return true if more elements exist to be enumerated. */ - bool moreWithEOO() { - return _pos < _theend; - } - bool more(){ - return _pos < _theend && _pos[0]; - } + bool more() { return _pos < _theend && _pos[0]; } + + /** @return true if more elements exist to be enumerated INCLUDING the EOO element which is always at the end. */ + bool moreWithEOO() { return _pos < _theend; } + /** @return the next element in the object. For the final element, element.eoo() will be true. */ BSONElement next( bool checkEnd = false ) { assert( _pos < _theend ); @@ -78,18 +78,18 @@ namespace mongo { class BSONObjIteratorSorted { public: BSONObjIteratorSorted( const BSONObj& o ); - - ~BSONObjIteratorSorted(){ + + ~BSONObjIteratorSorted() { assert( _fields ); delete[] _fields; _fields = 0; } - bool more(){ + bool more() { return _cur < _nfields; } - - BSONElement next(){ + + BSONElement next() { assert( _fields ); if ( _cur < _nfields ) return BSONElement( _fields[_cur++] ); @@ -102,30 +102,30 @@ namespace mongo { int _cur; }; -/** Similar to BOOST_FOREACH - * - * because the iterator is defined outside of the for, you must use {} around - * the surrounding scope. Don't do this: - * - * if (foo) - * BSONForEach(e, obj) - * doSomething(e); - * - * but this is OK: - * - * if (foo) { - * BSONForEach(e, obj) - * doSomething(e); - * } - * - */ + /** Similar to BOOST_FOREACH + * + * because the iterator is defined outside of the for, you must use {} around + * the surrounding scope. Don't do this: + * + * if (foo) + * BSONForEach(e, obj) + * doSomething(e); + * + * but this is OK: + * + * if (foo) { + * BSONForEach(e, obj) + * doSomething(e); + * } + * + */ #define BSONForEach(e, obj) \ BSONObjIterator BOOST_PP_CAT(it_,__LINE__)(obj); \ for ( BSONElement e; \ - (BOOST_PP_CAT(it_,__LINE__).more() ? \ - (e = BOOST_PP_CAT(it_,__LINE__).next(), true) : \ - false) ; \ - /*nothing*/ ) + (BOOST_PP_CAT(it_,__LINE__).more() ? 
\ + (e = BOOST_PP_CAT(it_,__LINE__).next(), true) : \ + false) ; \ + /*nothing*/ ) } diff --git a/bson/bsontypes.h b/bson/bsontypes.h index 27f2aaf..9d95e8e 100644 --- a/bson/bsontypes.h +++ b/bson/bsontypes.h @@ -39,69 +39,69 @@ namespace mongo { extern BSONObj maxKey; extern BSONObj minKey; -/** - the complete list of valid BSON types - see also bsonspec.org -*/ -enum BSONType { - /** smaller than all other types */ - MinKey=-1, - /** end of object */ - EOO=0, - /** double precision floating point value */ - NumberDouble=1, - /** character string, stored in utf8 */ - String=2, - /** an embedded object */ - Object=3, - /** an embedded array */ - Array=4, - /** binary data */ - BinData=5, - /** Undefined type */ - Undefined=6, - /** ObjectId */ - jstOID=7, - /** boolean type */ - Bool=8, - /** date type */ - Date=9, - /** null type */ - jstNULL=10, - /** regular expression, a pattern with options */ - RegEx=11, - /** deprecated / will be redesigned */ - DBRef=12, - /** deprecated / use CodeWScope */ - Code=13, - /** a programming language (e.g., Python) symbol */ - Symbol=14, - /** javascript code that can execute on the database server, with SavedContext */ - CodeWScope=15, - /** 32 bit signed integer */ - NumberInt = 16, - /** Updated to a Date with value next OpTime on insert */ - Timestamp = 17, - /** 64 bit integer */ - NumberLong = 18, - /** max type that is not MaxKey */ - JSTypeMax=18, - /** larger than all other types */ - MaxKey=127 -}; + /** + the complete list of valid BSON types + see also bsonspec.org + */ + enum BSONType { + /** smaller than all other types */ + MinKey=-1, + /** end of object */ + EOO=0, + /** double precision floating point value */ + NumberDouble=1, + /** character string, stored in utf8 */ + String=2, + /** an embedded object */ + Object=3, + /** an embedded array */ + Array=4, + /** binary data */ + BinData=5, + /** Undefined type */ + Undefined=6, + /** ObjectId */ + jstOID=7, + /** boolean type */ + Bool=8, + /** date type */ + Date=9, + /** null type */ + jstNULL=10, + /** regular expression, a pattern with options */ + RegEx=11, + /** deprecated / will be redesigned */ + DBRef=12, + /** deprecated / use CodeWScope */ + Code=13, + /** a programming language (e.g., Python) symbol */ + Symbol=14, + /** javascript code that can execute on the database server, with SavedContext */ + CodeWScope=15, + /** 32 bit signed integer */ + NumberInt = 16, + /** Updated to a Date with value next OpTime on insert */ + Timestamp = 17, + /** 64 bit integer */ + NumberLong = 18, + /** max type that is not MaxKey */ + JSTypeMax=18, + /** larger than all other types */ + MaxKey=127 + }; /* subtypes of BinData. bdtCustom and above are ones that the JS compiler understands, but are opaque to the database. */ - enum BinDataType { + enum BinDataType { BinDataGeneral=0, - Function=1, + Function=1, ByteArrayDeprecated=2, /* use BinGeneral instead */ - bdtUUID = 3, - MD5Type=5, - bdtCustom=128 + bdtUUID = 3, + MD5Type=5, + bdtCustom=128 }; - + } diff --git a/bson/inline_decls.h b/bson/inline_decls.h index aab9810..1605611 100644 --- a/bson/inline_decls.h +++ b/bson/inline_decls.h @@ -26,7 +26,7 @@ #define NOINLINE_DECL __declspec(noinline) -#else +#else #define NOINLINE_DECL diff --git a/bson/oid.cpp b/bson/oid.cpp new file mode 100644 index 0000000..6aa0730 --- /dev/null +++ b/bson/oid.cpp @@ -0,0 +1,154 @@ +// @file oid.cpp + +/* Copyright 2009 10gen Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pch.h" +#include "oid.h" +#include "util/atomic_int.h" +#include "../db/nonce.h" + +BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 ); + +namespace mongo { + + // machine # before folding in the process id + OID::MachineAndPid OID::ourMachine; + + unsigned OID::ourPid() { + unsigned pid; +#if defined(_WIN32) + pid = (unsigned short) GetCurrentProcessId(); +#elif defined(__linux__) || defined(__APPLE__) || defined(__sunos__) + pid = (unsigned short) getpid(); +#else + pid = (unsigned short) security.getNonce(); +#endif + return pid; + } + + void OID::foldInPid(OID::MachineAndPid& x) { + unsigned p = ourPid(); + x._pid ^= (unsigned short) p; + // when the pid is greater than 16 bits, let the high bits modulate the machine id field. + unsigned short& rest = (unsigned short &) x._machineNumber[1]; + rest ^= p >> 16; + } + + OID::MachineAndPid OID::genMachineAndPid() { + BOOST_STATIC_ASSERT( sizeof(mongo::OID::MachineAndPid) == 5 ); + + // this is not called often, so the following is not expensive, and gives us some + // testing that nonce generation is working right and that our OIDs are (perhaps) ok. + { + nonce a = security.getNonce(); + nonce b = security.getNonce(); + nonce c = security.getNonce(); + assert( !(a==b && b==c) ); + } + + unsigned long long n = security.getNonce(); + OID::MachineAndPid x = ourMachine = (OID::MachineAndPid&) n; + foldInPid(x); + return x; + } + + // after folding in the process id + OID::MachineAndPid OID::ourMachineAndPid = OID::genMachineAndPid(); + + void OID::regenMachineId() { + ourMachineAndPid = genMachineAndPid(); + } + + inline bool OID::MachineAndPid::operator!=(const OID::MachineAndPid& rhs) const { + return _pid != rhs._pid || _machineNumber != rhs._machineNumber; + } + + unsigned OID::getMachineId() { + unsigned char x[4]; + x[0] = ourMachineAndPid._machineNumber[0]; + x[1] = ourMachineAndPid._machineNumber[1]; + x[2] = ourMachineAndPid._machineNumber[2]; + x[3] = 0; + return (unsigned&) x[0]; + } + + void OID::justForked() { + MachineAndPid x = ourMachine; + // we let the random # for machine go into all 5 bytes of MachineAndPid, and then + // xor in the pid into _pid. this reduces the probability of collisions. 
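The pid enters the id only through this xor fold: the low 16 bits of the process id are xor-ed into the _pid field, and any bits above 16 are xor-ed into the trailing machine-number bytes. A minimal standalone sketch of that step, with the struct, function name and sample values chosen purely for illustration (the real MachineAndPid is packed to 5 bytes):

    #include <cstdio>

    struct MachineAndPidSketch {        // mirrors OID::MachineAndPid (3 machine bytes + 16-bit pid)
        unsigned char _machineNumber[3];
        unsigned short _pid;
    };

    // same operation as OID::foldInPid() above
    static void foldInPidSketch(MachineAndPidSketch& x, unsigned pid) {
        x._pid ^= (unsigned short) pid;                          // low 16 bits of the pid
        unsigned short& rest = (unsigned short&) x._machineNumber[1];
        rest ^= pid >> 16;                                       // high bits perturb the machine id
    }

    int main() {
        MachineAndPidSketch x = { { 0x12, 0x34, 0x56 }, 0 };
        foldInPidSketch(x, 0x1ABCDu);                            // a pid wider than 16 bits
        std::printf("pid field: %04x\n", (unsigned) x._pid);     // prints abcd
        return 0;
    }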
+ foldInPid(x); + ourMachineAndPid = genMachineAndPid(); + assert( x != ourMachineAndPid ); + ourMachineAndPid = x; + } + + void OID::init() { + static AtomicUInt inc = (unsigned) security.getNonce(); + + { + unsigned t = (unsigned) time(0); + unsigned char *T = (unsigned char *) &t; + _time[0] = T[3]; // big endian order because we use memcmp() to compare OID's + _time[1] = T[2]; + _time[2] = T[1]; + _time[3] = T[0]; + } + + _machineAndPid = ourMachineAndPid; + + { + int new_inc = inc++; + unsigned char *T = (unsigned char *) &new_inc; + _inc[0] = T[2]; + _inc[1] = T[1]; + _inc[2] = T[0]; + } + } + + void OID::init( string s ) { + assert( s.size() == 24 ); + const char *p = s.c_str(); + for( int i = 0; i < 12; i++ ) { + data[i] = fromHex(p); + p += 2; + } + } + + void OID::init(Date_t date, bool max) { + int time = (int) (date / 1000); + char* T = (char *) &time; + data[0] = T[3]; + data[1] = T[2]; + data[2] = T[1]; + data[3] = T[0]; + + if (max) + *(long long*)(data + 4) = 0xFFFFFFFFFFFFFFFFll; + else + *(long long*)(data + 4) = 0x0000000000000000ll; + } + + time_t OID::asTimeT() { + int time; + char* T = (char *) &time; + T[0] = data[3]; + T[1] = data[2]; + T[2] = data[1]; + T[3] = data[0]; + return time; + } + +} diff --git a/bson/oid.h b/bson/oid.h index c1bf34d..bf06ee1 100644 --- a/bson/oid.h +++ b/bson/oid.h @@ -22,56 +22,48 @@ namespace mongo { #pragma pack(1) - /** Object ID type. - BSON objects typically have an _id field for the object id. This field should be the first - member of the object when present. class OID is a special type that is a 12 byte id which + /** Object ID type. + BSON objects typically have an _id field for the object id. This field should be the first + member of the object when present. class OID is a special type that is a 12 byte id which is likely to be unique to the system. You may also use other types for _id's. - When _id field is missing from a BSON object, on an insert the database may insert one + When _id field is missing from a BSON object, on an insert the database may insert one automatically in certain circumstances. Warning: You must call OID::newState() after a fork(). + + Typical contents of the BSON ObjectID is a 12-byte value consisting of a 4-byte timestamp (seconds since epoch), + a 3-byte machine id, a 2-byte process id, and a 3-byte counter. Note that the timestamp and counter fields must + be stored big endian unlike the rest of BSON. This is because they are compared byte-by-byte and we want to ensure + a mostly increasing order. */ class OID { - union { - struct{ - long long a; - unsigned b; - }; - unsigned char data[12]; - }; - static unsigned _machine; public: - /** call this after a fork */ - static void newState(); + OID() : a(0), b(0) { } - /** initialize to 'null' */ - void clear() { a = 0; b = 0; } + /** init from a 24 char hex string */ + explicit OID(const string &s) { init(s); } - const unsigned char *getData() const { return data; } + /** initialize to 'null' */ + void clear() { a = 0; b = 0; } - bool operator==(const OID& r) { - return a==r.a&&b==r.b; - } - bool operator!=(const OID& r) { - return a!=r.a||b!=r.b; - } + const unsigned char *getData() const { return data; } - /** The object ID output as 24 hex digits. 
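Because the four leading bytes hold the creation time most-significant-byte first, the timestamp can be read back from any OID by reversing those bytes, exactly as asTimeT() does above. A small sketch of that decode on a raw 12-byte id (the helper name and sample bytes are illustrative; like the code above, it assumes a little-endian host):

    #include <cstdio>
    #include <ctime>

    // Recover the creation time from the first four (big-endian) bytes of an OID.
    static time_t oidTimeSketch(const unsigned char data[12]) {
        int t;
        char* T = (char*) &t;
        T[0] = data[3];
        T[1] = data[2];
        T[2] = data[1];
        T[3] = data[0];
        return t;
    }

    int main() {
        unsigned char oid[12] = { 0x4d, 0x81, 0xb2, 0x00,        // timestamp (big endian)
                                  0xaa, 0xbb, 0xcc,              // machine
                                  0x01, 0x02,                    // pid
                                  0x00, 0x00, 0x01 };            // counter (big endian)
        std::printf("created at %u\n", (unsigned) oidTimeSketch(oid));  // 1300345344 = 2011-03-17 07:02:24 UTC
        return 0;
    }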
*/ - string str() const { - return toHexLower(data, 12); - } + bool operator==(const OID& r) const { return a==r.a && b==r.b; } + bool operator!=(const OID& r) const { return a!=r.a || b!=r.b; } + int compare( const OID& other ) const { return memcmp( data , other.data , 12 ); } + bool operator<( const OID& other ) const { return compare( other ) < 0; } + bool operator<=( const OID& other ) const { return compare( other ) <= 0; } + /** @return the object ID output as 24 hex digits */ + string str() const { return toHexLower(data, 12); } string toString() const { return str(); } static OID gen() { OID o; o.init(); return o; } - - static unsigned staticMachine(){ return _machine; } - /** - sets the contents to a new oid / randomized value - */ + + /** sets the contents to a new oid / randomized value */ void init(); - /** Set to the hex string value specified. */ + /** init from a 24 char hex string */ void init( string s ); /** Set to the min/max OID that could be generated at given timestamp. */ @@ -79,12 +71,39 @@ namespace mongo { time_t asTimeT(); Date_t asDateT() { return asTimeT() * (long long)1000; } - + bool isSet() const { return a || b; } - - int compare( const OID& other ) const { return memcmp( data , other.data , 12 ); } - - bool operator<( const OID& other ) const { return compare( other ) < 0; } + + /** call this after a fork to update the process id */ + static void justForked(); + + static unsigned getMachineId(); // features command uses + static void regenMachineId(); // used by unit tests + + private: + struct MachineAndPid { + unsigned char _machineNumber[3]; + unsigned short _pid; + bool operator!=(const OID::MachineAndPid& rhs) const; + }; + static MachineAndPid ourMachine, ourMachineAndPid; + union { + struct { + // 12 bytes total + unsigned char _time[4]; + MachineAndPid _machineAndPid; + unsigned char _inc[3]; + }; + struct { + long long a; + unsigned b; + }; + unsigned char data[12]; + }; + + static unsigned ourPid(); + static void foldInPid(MachineAndPid& x); + static MachineAndPid genMachineAndPid(); }; #pragma pack() diff --git a/bson/ordering.h b/bson/ordering.h index fbbfbec..749e20d 100644 --- a/bson/ordering.h +++ b/bson/ordering.h @@ -23,7 +23,7 @@ namespace mongo { The constructor is private to make conversion more explicit so we notice where we call make(). Over time we should push this up higher and higher. */ - class Ordering { + class Ordering { const unsigned bits; const unsigned nkeys; Ordering(unsigned b,unsigned n) : bits(b),nkeys(n) { } @@ -32,13 +32,13 @@ namespace mongo { get(0) == 1 get(1) == -1 */ - int get(int i) const { + int get(int i) const { return ((1 << i) & bits) ? -1 : 1; } // for woCompare... unsigned descending(unsigned mask) const { return bits & mask; } - + operator string() const { StringBuilder buf(32); for ( unsigned i=0; i StringData( const char (&val)[N], LiteralTag ) : _data(&val[0]), _size(N-1) {} - // Construct a StringData explicitly, for the case where the - // length of the string is already known. 'c' must be a - // pointer to a null-terminated string, and strlenOfc must be - // the length that std::strlen(c) would return, a.k.a the - // index of the terminator in c. - StringData( const char* c, size_t strlenOfc ) - : _data(c), _size((unsigned) strlenOfc) {} + // accessors const char* const data() const { return _data; } const unsigned size() const { return _size; } private: - // TODO - Hook this class up in the BSON machinery - // There are two assumptions here that we may want to review then. 
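Both assumptions are easy to state as code: the wrapped pointer stays null-terminated, and the stored size reports the length without that terminator (so the literal-tag constructor above records N-1 for an N-byte string literal). A self-contained mimic of those invariants, with the struct name chosen for illustration only:

    #include <cassert>
    #include <cstring>

    struct StringDataSketch {                     // stand-in for mongo::StringData
        const char* data;
        unsigned    size;                         // excludes the null terminator
        explicit StringDataSketch(const char* c)
            : data(c), size((unsigned) std::strlen(c)) {}
    };

    int main() {
        StringDataSketch s("mongo");
        assert(s.size == 5);                      // terminator not counted ...
        assert(s.data[s.size] == '\0');           // ... but still present in the buffer
        return 0;
    }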
+ // There are two assumptions we use bellow. // '_data' *always* finishes with a null terminator // 'size' does *not* account for the null terminator - // These assumptions may make it easier to minimize changes to existing code + // These assumptions may make it easier to minimize changes to existing code. const char* const _data; const unsigned _size; }; diff --git a/bson/util/atomic_int.h b/bson/util/atomic_int.h index f4d2749..1573552 100644 --- a/bson/util/atomic_int.h +++ b/bson/util/atomic_int.h @@ -24,51 +24,55 @@ namespace mongo { - struct AtomicUInt{ + struct AtomicUInt { AtomicUInt() : x(0) {} AtomicUInt(unsigned z) : x(z) { } - volatile unsigned x; - operator unsigned() const { - return x; - } + + operator unsigned() const { return x; } + unsigned get() const { return x; } + inline AtomicUInt operator++(); // ++prefix inline AtomicUInt operator++(int);// postfix++ inline AtomicUInt operator--(); // --prefix inline AtomicUInt operator--(int); // postfix-- + + inline void zero() { x = 0; } // TODO: this isn't thread safe + + volatile unsigned x; }; #if defined(_WIN32) - AtomicUInt AtomicUInt::operator++(){ + AtomicUInt AtomicUInt::operator++() { // InterlockedIncrement returns the new value return InterlockedIncrement((volatile long*)&x); //long is 32bits in Win64 } - AtomicUInt AtomicUInt::operator++(int){ + AtomicUInt AtomicUInt::operator++(int) { return InterlockedIncrement((volatile long*)&x)-1; } - AtomicUInt AtomicUInt::operator--(){ + AtomicUInt AtomicUInt::operator--() { return InterlockedDecrement((volatile long*)&x); } - AtomicUInt AtomicUInt::operator--(int){ + AtomicUInt AtomicUInt::operator--(int) { return InterlockedDecrement((volatile long*)&x)+1; } #elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) // this is in GCC >= 4.1 - AtomicUInt AtomicUInt::operator++(){ + AtomicUInt AtomicUInt::operator++() { return __sync_add_and_fetch(&x, 1); } - AtomicUInt AtomicUInt::operator++(int){ + AtomicUInt AtomicUInt::operator++(int) { return __sync_fetch_and_add(&x, 1); } - AtomicUInt AtomicUInt::operator--(){ + AtomicUInt AtomicUInt::operator--() { return __sync_add_and_fetch(&x, -1); } - AtomicUInt AtomicUInt::operator--(int){ + AtomicUInt AtomicUInt::operator--(int) { return __sync_fetch_and_add(&x, -1); } #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) // from boost 1.39 interprocess/detail/atomic.hpp - inline unsigned atomic_int_helper(volatile unsigned *x, int val){ + inline unsigned atomic_int_helper(volatile unsigned *x, int val) { int r; asm volatile ( @@ -80,16 +84,16 @@ namespace mongo { ); return r; } - AtomicUInt AtomicUInt::operator++(){ + AtomicUInt AtomicUInt::operator++() { return atomic_int_helper(&x, 1)+1; } - AtomicUInt AtomicUInt::operator++(int){ + AtomicUInt AtomicUInt::operator++(int) { return atomic_int_helper(&x, 1); } - AtomicUInt AtomicUInt::operator--(){ + AtomicUInt AtomicUInt::operator--() { return atomic_int_helper(&x, -1)-1; } - AtomicUInt AtomicUInt::operator--(int){ + AtomicUInt AtomicUInt::operator--(int) { return atomic_int_helper(&x, -1); } #else diff --git a/bson/util/builder.h b/bson/util/builder.h index 9d9eda2..6f4ff9e 100644 --- a/bson/util/builder.h +++ b/bson/util/builder.h @@ -27,6 +27,24 @@ namespace mongo { + /* Note the limit here is rather arbitrary and is simply a standard. generally the code works + with any object that fits in ram. 
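In all four platform branches above, the increment and decrement operators follow the usual convention: the prefix forms return the value after the change, the postfix forms return the value before it. A short single-threaded sketch of that observable behaviour (the include path is an assumption, and the header's platform intrinsics must be available when it is compiled):

    #include <cassert>
    #include "bson/util/atomic_int.h"     // assumed include path for the header above

    int main() {
        mongo::AtomicUInt c;              // starts at 0
        unsigned a = ++c;                 // prefix: returns the new value  -> 1
        unsigned b = c++;                 // postfix: returns the old value -> 1
        assert(a == 1 && b == 1);
        assert(c.get() == 2);
        --c;                              // back down to 1
        assert((unsigned) c == 1);
        return 0;
    }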
+ + Also note that the server has some basic checks to enforce this limit but those checks are not exhaustive + for example need to check for size too big after + update $push (append) operation + various db.eval() type operations + */ + const int BSONObjMaxUserSize = 16 * 1024 * 1024; + + /* + Sometimeswe we need objects slightly larger - an object in the replication local.oplog + is slightly larger than a user object for example. + */ + const int BSONObjMaxInternalSize = BSONObjMaxUserSize + ( 16 * 1024 ); + + const int BufferMaxSize = 64 * 1024 * 1024; + class StringBuilder; void msgasserted(int msgid, const char *msg); @@ -38,7 +56,8 @@ namespace mongo { data = (char *) malloc(size); if( data == 0 ) msgasserted(10000, "out of memory BufBuilder"); - } else { + } + else { data = 0; } l = 0; @@ -54,16 +73,18 @@ namespace mongo { } } - void reset( int maxSize = 0 ){ + void reset( int maxSize = 0 ) { l = 0; - if ( maxSize && size > maxSize ){ + if ( maxSize && size > maxSize ) { free(data); data = (char*)malloc(maxSize); size = maxSize; - } + } } - /* leave room for some stuff later */ + /** leave room for some stuff later + @return point to region that was skipped. pointer may change later (on realloc), so for immediate use only + */ char* skip(int n) { return grow(n); } /* note this may be deallocated (realloced) if you keep writing. */ @@ -73,10 +94,10 @@ namespace mongo { /* assume ownership of the buffer - you must then free() it */ void decouple() { data = 0; } - void appendChar(char j){ + void appendChar(char j) { *((char*)grow(sizeof(char))) = j; } - void appendNum(char j){ + void appendNum(char j) { *((char*)grow(sizeof(char))) = j; } void appendNum(short j) { @@ -105,18 +126,19 @@ namespace mongo { memcpy(grow((int) len), src, len); } + template + void appendStruct(const T& s) { + appendBuf(&s, sizeof(T)); + } + void appendStr(const StringData &str , bool includeEOO = true ) { const int len = str.size() + ( includeEOO ? 1 : 0 ); memcpy(grow(len), str.data(), len); } - int len() const { - return l; - } - - void setlen( int newLen ){ - l = newLen; - } + int len() const { return l; } + void setlen( int newLen ) { l = newLen; } + int getSize() const { return size; } /* returns the pre-grow write position */ inline char* grow(int by) { @@ -128,18 +150,16 @@ namespace mongo { return data + oldlen; } - int getSize() const { return size; } - private: /* "slow" portion of 'grow()' */ - void NOINLINE_DECL grow_reallocate(){ + void NOINLINE_DECL grow_reallocate() { int a = size * 2; if ( a == 0 ) a = 512; if ( l > a ) a = l + 16 * 1024; - if( a > 64 * 1024 * 1024 ) - msgasserted(10000, "BufBuilder grow() > 64MB"); + if ( a > BufferMaxSize ) + msgasserted(13548, "BufBuilder grow() > 64MB"); data = (char *) realloc(data, a); size= a; } @@ -152,87 +172,90 @@ namespace mongo { }; #if defined(_WIN32) +#pragma warning( push ) +// warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. 
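BufBuilder is the growable byte buffer the BSON builders write into: grow(n) makes room for n more bytes (reallocating as needed and asserting once the 64MB BufferMaxSize cap is exceeded), the append* helpers copy values at the current end, and skip(n) reserves space to be filled in later. A small usage sketch, assuming the constructor's optional initial-size default and StringData's implicit construction from a C string:

    #include <cassert>
    #include "bson/util/builder.h"        // assumed include path for the header above

    int main() {
        mongo::BufBuilder b;              // assumed default initial size
        b.appendNum((char) 7);            // 1 byte
        b.appendNum((short) 300);         // 2 bytes
        b.appendStr("abc");               // 3 chars + trailing NUL = 4 bytes
        assert(b.len() == 7);

        b.skip(5);                        // reserve 5 bytes to patch in later
        assert(b.len() == 12);
        return 0;
    }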
#pragma warning( disable : 4996 ) #endif + /** stringstream deals with locale so this is a lot faster than std::stringstream for UTF8 */ class StringBuilder { public: StringBuilder( int initsize=256 ) - : _buf( initsize ){ + : _buf( initsize ) { } -#define SBNUM(val,maxSize,macro) \ - int prev = _buf.l; \ - int z = sprintf( _buf.grow(maxSize) , macro , (val) ); \ - assert( z >= 0 ); \ - _buf.l = prev + z; \ - return *this; - - StringBuilder& operator<<( double x ){ - SBNUM( x , 25 , "%g" ); + StringBuilder& operator<<( double x ) { + return SBNUM( x , 25 , "%g" ); } - StringBuilder& operator<<( int x ){ - SBNUM( x , 11 , "%d" ); + StringBuilder& operator<<( int x ) { + return SBNUM( x , 11 , "%d" ); } - StringBuilder& operator<<( unsigned x ){ - SBNUM( x , 11 , "%u" ); + StringBuilder& operator<<( unsigned x ) { + return SBNUM( x , 11 , "%u" ); } - StringBuilder& operator<<( long x ){ - SBNUM( x , 22 , "%ld" ); + StringBuilder& operator<<( long x ) { + return SBNUM( x , 22 , "%ld" ); } - StringBuilder& operator<<( unsigned long x ){ - SBNUM( x , 22 , "%lu" ); + StringBuilder& operator<<( unsigned long x ) { + return SBNUM( x , 22 , "%lu" ); } - StringBuilder& operator<<( long long x ){ - SBNUM( x , 22 , "%lld" ); + StringBuilder& operator<<( long long x ) { + return SBNUM( x , 22 , "%lld" ); } - StringBuilder& operator<<( unsigned long long x ){ - SBNUM( x , 22 , "%llu" ); + StringBuilder& operator<<( unsigned long long x ) { + return SBNUM( x , 22 , "%llu" ); } - StringBuilder& operator<<( short x ){ - SBNUM( x , 8 , "%hd" ); + StringBuilder& operator<<( short x ) { + return SBNUM( x , 8 , "%hd" ); } - StringBuilder& operator<<( char c ){ + StringBuilder& operator<<( char c ) { _buf.grow( 1 )[0] = c; return *this; } -#undef SBNUM - void appendDoubleNice( double x ){ + void appendDoubleNice( double x ) { int prev = _buf.l; char * start = _buf.grow( 32 ); int z = sprintf( start , "%.16g" , x ); assert( z >= 0 ); _buf.l = prev + z; - if( strchr(start, '.') == 0 && strchr(start, 'E') == 0 && strchr(start, 'N') == 0 ){ + if( strchr(start, '.') == 0 && strchr(start, 'E') == 0 && strchr(start, 'N') == 0 ) { write( ".0" , 2 ); } } - void write( const char* buf, int len){ - memcpy( _buf.grow( len ) , buf , len ); - } + void write( const char* buf, int len) { memcpy( _buf.grow( len ) , buf , len ); } - void append( const StringData& str ){ - memcpy( _buf.grow( str.size() ) , str.data() , str.size() ); - } - StringBuilder& operator<<( const StringData& str ){ + void append( const StringData& str ) { memcpy( _buf.grow( str.size() ) , str.data() , str.size() ); } + + StringBuilder& operator<<( const StringData& str ) { append( str ); return *this; } - - // access - void reset( int maxSize = 0 ){ - _buf.reset( maxSize ); - } - - std::string str(){ - return std::string(_buf.data, _buf.l); - } + void reset( int maxSize = 0 ) { _buf.reset( maxSize ); } + + std::string str() const { return std::string(_buf.data, _buf.l); } private: BufBuilder _buf; + + // non-copyable, non-assignable + StringBuilder( const StringBuilder& ); + StringBuilder& operator=( const StringBuilder& ); + + template + StringBuilder& SBNUM(T val,int maxSize,const char *macro) { + int prev = _buf.l; + int z = sprintf( _buf.grow(maxSize) , macro , (val) ); + assert( z >= 0 ); + _buf.l = prev + z; + return *this; + } }; +#if defined(_WIN32) +#pragma warning( pop ) +#endif + } // namespace mongo diff --git a/bson/util/misc.h b/bson/util/misc.h index cad9a28..b31f36f 100644 --- a/bson/util/misc.h +++ b/bson/util/misc.h @@ -34,7 +34,7 @@ 
namespace mongo { buf[24] = 0; // don't want the \n } - inline string time_t_to_String(time_t t = time(0) ){ + inline string time_t_to_String(time_t t = time(0) ) { char buf[64]; #if defined(_WIN32) ctime_s(buf, sizeof(buf), &t); @@ -76,7 +76,7 @@ namespace mongo { Date_t(unsigned long long m): millis(m) {} operator unsigned long long&() { return millis; } operator const unsigned long long&() const { return millis; } - string toString() const { + string toString() const { char buf[64]; time_t_to_String(millis/1000, buf); return buf; diff --git a/buildscripts/distmirror.py b/buildscripts/distmirror.py index 1902e2a..7af1a89 100644 --- a/buildscripts/distmirror.py +++ b/buildscripts/distmirror.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Download mongodb stuff (at present builds, sources, docs, but not # drivers). diff --git a/buildscripts/errorcodes.py b/buildscripts/errorcodes.py index d87b7ad..a105647 100755 --- a/buildscripts/errorcodes.py +++ b/buildscripts/errorcodes.py @@ -1,32 +1,17 @@ -#!/usr/bin/python +#!/usr/bin/env python import os import sys import re import utils -def getAllSourceFiles( arr=None , prefix="." ): - if arr is None: - arr = [] - - for x in os.listdir( prefix ): - if x.startswith( "." ) or x.startswith( "pcre-" ) or x.startswith( "32bit" ) or x.startswith( "mongodb-" ) or x.startswith("debian") or x.startswith( "mongo-cxx-driver" ): - continue - full = prefix + "/" + x - if os.path.isdir( full ) and not os.path.islink( full ): - getAllSourceFiles( arr , full ) - else: - if full.endswith( ".cpp" ) or full.endswith( ".h" ) or full.endswith( ".c" ): - arr.append( full ) - - return arr assertNames = [ "uassert" , "massert" ] def assignErrorCodes(): cur = 10000 for root in assertNames: - for x in getAllSourceFiles(): + for x in utils.getAllSourceFiles(): print( x ) didAnything = False fixed = "" @@ -50,7 +35,7 @@ def readErrorCodes( callback ): ps = [ re.compile( "([um]asser(t|ted)) *\( *(\d+)" ) , re.compile( "(User|Msg)Exceptio(n)\( *(\d+)" ) ] - for x in getAllSourceFiles(): + for x in utils.getAllSourceFiles(): lineNum = 1 for line in open( x ): for p in ps: diff --git a/buildscripts/frob_version.py b/buildscripts/frob_version.py index 7b89e0b..560a8ed 100644 --- a/buildscripts/frob_version.py +++ b/buildscripts/frob_version.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python from __future__ import with_statement import tempfile diff --git a/buildscripts/hacks_ubuntu.py b/buildscripts/hacks_ubuntu.py index 81deddd..977d2df 100644 --- a/buildscripts/hacks_ubuntu.py +++ b/buildscripts/hacks_ubuntu.py @@ -21,7 +21,7 @@ def foundxulrunner( env , options ): if best is None: - print( "warning: using ubuntu without xulrunner-dev. we reccomend installing it" ) + print( "warning: using ubuntu without xulrunner-dev. we recommend installing it" ) return False incroot = "/usr/include/" + best + "/" diff --git a/buildscripts/makealldists.py b/buildscripts/makealldists.py index 762700e..6b6f365 100644 --- a/buildscripts/makealldists.py +++ b/buildscripts/makealldists.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python from __future__ import with_statement import subprocess @@ -184,14 +184,16 @@ def __main__(): print "makedist output under: %s\ncombined repo: %s\n" % (outputroot, repodir) sys.stdout.flush() # Add more dist/version/architecture tuples as they're supported. 
- dists = (("ubuntu", "10.4"), + dists = (("ubuntu", "10.10"), + ("ubuntu", "10.4"), ("ubuntu", "9.10"), ("ubuntu", "9.4"), - ("ubuntu", "8.10"), + #("ubuntu", "8.10"), ("debian", "5.0"), ("centos", "5.4"), - ("fedora", "11"), - ("fedora", "12")) + #("fedora", "12"), + ("fedora", "13"), + ("fedora", "14")) arches = ("x86", "x86_64") # mongos = branches.split(',') # Run a makedist for each distro/version/architecture tuple above. @@ -202,7 +204,7 @@ def __main__(): procs = [] count = 0 for ((distro, distro_version), arch, spec) in gen([dists, arches, [branches]]): - # FIXME: now x86 fedoras on RackSpace circa 04/10. + # FIXME: no x86 fedoras on RackSpace circa 04/10. if distro == "fedora" and arch == "x86": continue count+=1 @@ -264,9 +266,9 @@ def __main__(): if r != 0: raise Exception("mergerepositories.py exited %d" % r) print repodir - pushrepo(repodir) - shutil.rmtree(outputroot) - shutil.rmtree(repodir) + #pushrepo(repodir) + #shutil.rmtree(outputroot) + #shutil.rmtree(repodir) return 0 diff --git a/buildscripts/makedist.py b/buildscripts/makedist.py index 1928b76..b5387c2 100644 --- a/buildscripts/makedist.py +++ b/buildscripts/makedist.py @@ -123,7 +123,9 @@ class EC2InstanceConfigurator(BaseConfigurator): def __init__(self, **kwargs): super(EC2InstanceConfigurator, self).__init__(**kwargs) self.configuration += [("ec2_ami", - ((("ubuntu", "10.4", "x86_64"), "ami-bf07ead6"), + ((("ubuntu", "10.10", "x86_64"), "ami-688c7801"), + (("ubuntu", "10.10", "x86"), "ami-1a837773"), + (("ubuntu", "10.4", "x86_64"), "ami-bf07ead6"), (("ubuntu", "10.4", "x86"), "ami-f707ea9e"), (("ubuntu", "9.10", "x86_64"), "ami-55739e3c"), (("ubuntu", "9.10", "x86"), "ami-bb709dd2"), @@ -140,9 +142,9 @@ class EC2InstanceConfigurator(BaseConfigurator): (("fedora", "8", "x86_64"), "ami-2547a34c"), (("fedora", "8", "x86"), "ami-5647a33f"))), ("rackspace_imgname", - ((("fedora", "11", "x86_64"), "Fedora 11"), - (("fedora", "12", "x86_64"), "Fedora 12"), - (("fedora", "13", "x86_64"), "Fedora 13"))), + ((("fedora", "12", "x86_64"), "Fedora 12"), + (("fedora", "13", "x86_64"), "Fedora 13"), + (("fedora", "14", "x86_64"), "Fedora 14"))), ("ec2_mtype", ((("*", "*", "x86"), "m1.small"), (("*", "*", "x86_64"), "m1.large"))), @@ -266,6 +268,7 @@ class SshConnectionConfigurator (BaseConfigurator): # FLAW: this actually depends more on the AMI # than the triple. 
((("debian", "*", "*"), "root"), + (("ubuntu", "10.10", "*"), "ubuntu"), (("ubuntu", "10.4", "*"), "ubuntu"), (("ubuntu", "9.10", "*"), "ubuntu"), (("ubuntu", "9.4", "*"), "root"), @@ -420,8 +423,12 @@ cp {pkg_name}{pkg_name_suffix}*.tar.gz "{pkg_product_dir}/{distro_version}/10gen dpkg-scanpackages "{pkg_product_dir}/{distro_version}/10gen/binary-{distro_arch}" /dev/null | gzip -9c > "{pkg_product_dir}/{distro_version}/10gen/binary-{distro_arch}/Packages.gz" dpkg-scansources "{pkg_product_dir}/{distro_version}/10gen/source" /dev/null | gzip -9c > "{pkg_product_dir}/{distro_version}/10gen/source/Sources.gz" """ - rpm_prereq_commands = """ -rpm -Uvh http://download.fedora.redhat.com/pub/epel/5/{distro_arch}/epel-release-5-3.noarch.rpm + centos_prereq_commands = """ +rpm -Uvh http://download.fedora.redhat.com/pub/epel/5/{distro_arch}/epel-release-5-4.noarch.rpm +yum -y install {pkg_prereq_str} +""" + fedora_prereq_commands = """ +#rpm -Uvh http://download.fedora.redhat.com/pub/epel/5/{distro_arch}/epel-release-5-4.noarch.rpm yum -y install {pkg_prereq_str} """ rpm_build_commands=""" @@ -462,6 +469,7 @@ rpm -ivh /usr/src/redhat/RPMS/{distro_arch}/boost-devel-1.38.0-1.{distro_arch}.r # 1.34, but 1.35 packages are available, so we want those. versioned_deb_boost_prereqs = ["libboost-thread1.35-dev", "libboost-filesystem1.35-dev", "libboost-program-options1.35-dev", "libboost-date-time1.35-dev", "libboost1.35-dev"] + new_versioned_deb_boost_prereqs = ["libboost-thread1.42-dev", "libboost-filesystem1.42-dev", "libboost-program-options1.42-dev", "libboost-date-time1.42-dev", "libboost1.42-dev"] unversioned_deb_xulrunner_prereqs = ["xulrunner-dev"] old_versioned_deb_xulrunner_prereqs = ["xulrunner-1.9-dev"] @@ -511,6 +519,8 @@ git clone git://github.com/mongodb/mongo.git self.versioned_deb_boost_prereqs + self.unversioned_deb_xulrunner_prereqs + self.common_deb_prereqs), (("ubuntu", "9.10", "*"), self.unversioned_deb_boost_prereqs + self.unversioned_deb_xulrunner_prereqs + self.common_deb_prereqs), + (("ubuntu", "10.10", "*"), + self.new_versioned_deb_boost_prereqs + self.new_versioned_deb_xulrunner_prereqs + self.common_deb_prereqs), (("ubuntu", "10.4", "*"), self.unversioned_deb_boost_prereqs + self.new_versioned_deb_xulrunner_prereqs + self.common_deb_prereqs), (("ubuntu", "8.10", "*"), @@ -532,22 +542,24 @@ git clone git://github.com/mongodb/mongo.git (("ubuntu", "*", "*"), self.preamble_commands + self.deb_prereq_commands + self.get_mongo_commands + self.mangle_files_commands + self.deb_build_commands), (("centos", "*", "*"), - self.preamble_commands + self.old_rpm_precommands + self.rpm_prereq_commands + self.get_mongo_commands + self.mangle_files_commands + self.mangle_files_for_ancient_redhat_commands + self.rpm_build_commands), + self.preamble_commands + self.old_rpm_precommands + self.centos_prereq_commands + self.get_mongo_commands + self.mangle_files_commands + self.mangle_files_for_ancient_redhat_commands + self.rpm_build_commands), (("fedora", "*", "*"), - self.preamble_commands + self.old_rpm_precommands + self.rpm_prereq_commands + self.get_mongo_commands + self.mangle_files_commands + self.rpm_build_commands))), + self.preamble_commands + self.old_rpm_precommands + self.fedora_prereq_commands + self.get_mongo_commands + self.mangle_files_commands + self.rpm_build_commands))), ("preamble_commands", ((("*", "*", "*"), self.preamble_commands), )), ("install_prereqs", ((("debian", "*", "*"), self.deb_prereq_commands), (("ubuntu", "*", "*"), self.deb_prereq_commands), - (("centos", "*", 
"*"), self.rpm_prereq_commands), - (("fedora", "*", "*"), self.rpm_prereq_commands))), + (("centos", "*", "*"), self.centos_prereq_commands), + (("fedora", "*", "*"), self.fedora_prereq_commands))), ("get_mongo", ((("*", "*", "*"), self.get_mongo_commands), )), ("mangle_mongo", ((("debian", "*", "*"), self.mangle_files_commands), + (("ubuntu", "10.10", "*"), + self.mangle_files_commands + self.mangle_files_for_new_deb_xulrunner_commands), (("ubuntu", "10.4", "*"), self.mangle_files_commands + self.mangle_files_for_new_deb_xulrunner_commands), (("ubuntu", "*", "*"), self.mangle_files_commands), diff --git a/buildscripts/mergerepositories.py b/buildscripts/mergerepositories.py index bc50d08..028b6e2 100644 --- a/buildscripts/mergerepositories.py +++ b/buildscripts/mergerepositories.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python from __future__ import with_statement from libcloud.types import Provider diff --git a/buildscripts/s3del.py b/buildscripts/s3del.py new file mode 100644 index 0000000..7967de6 --- /dev/null +++ b/buildscripts/s3del.py @@ -0,0 +1,36 @@ + +import os +import sys +import time + +sys.path.append( "." ) +sys.path.append( ".." ) +sys.path.append( "../../" ) +sys.path.append( "../../../" ) + +import simples3 +import settings +import subprocess + +# check s3 for md5 hashes + +def check_dir( bucket , prefix , todel ): + + for ( key , modify , etag , size ) in bucket.listdir( prefix=prefix ): + if key.find( todel ) < 0: + continue + print( key ) + time.sleep( 2 ) + bucket.delete( key ) + +def clean( todel ): + + + bucket = simples3.S3Bucket( settings.bucket , settings.id , settings.key ) + + for x in [ "osx" , "linux" , "win32" , "sunos5" , "src" ]: + check_dir( bucket , x , todel ) + + +if __name__ == "__main__": + clean( sys.argv[1] ) diff --git a/buildscripts/smoke.py b/buildscripts/smoke.py index 0023226..5fdd26f 100755 --- a/buildscripts/smoke.py +++ b/buildscripts/smoke.py @@ -1,8 +1,8 @@ -#!/usr/bin/python +#!/usr/bin/env python # smoke.py: run some mongo tests. -# Bugs, TODOs: +# Bugs, TODOs: # 0 Some tests hard-code pathnames relative to the mongo repository, # so the smoke.py process and all its children must be run with the @@ -34,49 +34,48 @@ # jobs on the same host at once. So something's gotta change. from __future__ import with_statement -from subprocess import Popen, PIPE, call + +import glob +from optparse import OptionParser import os +import parser +import re +import shutil +import socket +from subprocess import (Popen, + PIPE, + call) import sys -import utils import time -import socket -from optparse import OptionParser -import atexit -import glob -import shutil -import re -import parser -mongoRepo = os.getcwd() #'./' -testPath = None +from pymongo import Connection + +import utils -mongodExecutable = "./mongod" -mongodPort = "32000" -shellExecutable = "./mongo" -continueOnFailure = False -oneMongodPerTest = False +# TODO clean this up so we don't need globals... +mongo_repo = os.getcwd() #'./' +test_path = None +mongod_executable = None +mongod_port = None +shell_executable = None +continue_on_failure = None tests = [] winners = [] losers = {} -# Finally, atexit functions seem to be a little oblivious to whether -# Python is exiting because of an error, so we'll use this to -# communicate with the report() function. 
-exit_bad = True - # For replication hash checking -replicated_dbs = [] +replicated_collections = [] lost_in_slave = [] lost_in_master = [] screwy_in_slave = {} -smokeDbPrefix = '' -smallOplog = False +smoke_db_prefix = '' +small_oplog = False # This class just implements the with statement API, for a sneaky # purpose below. -class nothing(object): +class Nothing(object): def __enter__(self): return self def __exit__(self, type, value, traceback): @@ -99,23 +98,23 @@ class mongod(object): print >> sys.stderr, e return not isinstance(value, Exception) - def ensureTestDirs(self): - utils.ensureDir( smokeDbPrefix + "/tmp/unittest/" ) - utils.ensureDir( smokeDbPrefix + "/data/" ) - utils.ensureDir( smokeDbPrefix + "/data/db/" ) + def ensure_test_dirs(self): + utils.ensureDir(smoke_db_prefix + "/tmp/unittest/") + utils.ensureDir(smoke_db_prefix + "/data/") + utils.ensureDir(smoke_db_prefix + "/data/db/") - def checkMongoPort( self, port=27017 ): + def check_mongo_port(self, port=27017): sock = socket.socket() sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) sock.settimeout(1) sock.connect(("localhost", int(port))) sock.close() - - def didMongodStart( self, port=mongodPort, timeout=20 ): + + def did_mongod_start(self, port=mongod_port, timeout=20): while timeout > 0: - time.sleep( 1 ) + time.sleep(1) try: - self.checkMongoPort( int(port) ) + self.check_mongo_port(int(port)) return True except Exception,e: print >> sys.stderr, e @@ -123,47 +122,45 @@ class mongod(object): return False def start(self): - global mongodPort + global mongod_port global mongod if self.proc: print >> sys.stderr, "probable bug: self.proc already set in start()" return - self.ensureTestDirs() - dirName = smokeDbPrefix + "/data/db/sconsTests/" - self.port = int(mongodPort) + self.ensure_test_dirs() + dir_name = smoke_db_prefix + "/data/db/sconsTests/" + self.port = int(mongod_port) self.slave = False if 'slave' in self.kwargs: - dirName = smokeDbPrefix + '/data/db/sconsTestsSlave/' - srcport = mongodPort + dir_name = smoke_db_prefix + '/data/db/sconsTestsSlave/' + srcport = mongod_port self.port += 1 self.slave = True - if os.path.exists ( dirName ): + if os.path.exists(dir_name): if 'slave' in self.kwargs: - argv = ["python", "buildscripts/cleanbb.py", '--nokill', dirName] - + argv = ["python", "buildscripts/cleanbb.py", '--nokill', dir_name] else: - argv = ["python", "buildscripts/cleanbb.py", dirName] - call( argv ) - utils.ensureDir( dirName ) - argv = [mongodExecutable, "--port", str(self.port), "--dbpath", dirName] - if self.kwargs.get('smallOplog'): - argv += ["--master", "--oplogSize", "10"] + argv = ["python", "buildscripts/cleanbb.py", dir_name] + call(argv) + utils.ensureDir(dir_name) + argv = [mongod_executable, "--port", str(self.port), "--dbpath", dir_name] + if self.kwargs.get('small_oplog'): + argv += ["--master", "--oplogSize", "128"] if self.slave: - argv += ['--slave', '--source', 'localhost:'+str(srcport)] + argv += ['--slave', '--source', 'localhost:' + str(srcport)] print "running " + " ".join(argv) self.proc = Popen(argv) - if not self.didMongodStart( self.port ): - raise Exception( "Failed to start mongod" ) - + if not self.did_mongod_start(self.port): + raise Exception("Failed to start mongod") + if self.slave: - while True: - argv = [shellExecutable, "--port", str(self.port), "--quiet", "--eval", 'db.printSlaveReplicationInfo()'] - res = Popen(argv, stdout=PIPE).communicate()[0] - if res.find('initial sync') < 0: - break - - - + local = Connection(port=self.port, slave_okay=True).local 
+ synced = False + while not synced: + synced = True + for source in local.sources.find(fields=["syncedTo"]): + synced = synced and "syncedTo" in source and source["syncedTo"] + def stop(self): if not self.proc: print >> sys.stderr, "probable bug: self.proc unset in stop()" @@ -177,11 +174,14 @@ class mongod(object): win32process.TerminateProcess(self.proc._handle, -1) else: from os import kill - kill( self.proc.pid, 15 ) + kill(self.proc.pid, 15) self.proc.wait() sys.stderr.flush() sys.stdout.flush() - + + def wait_for_repl(self): + Connection(port=self.port).test.smokeWait.insert({}, w=2, wtimeout=5*60*1000) + class Bug(Exception): def __str__(self): return 'bug in smoke.py: ' + super(Bug, self).__str__() @@ -192,6 +192,7 @@ class TestFailure(Exception): class TestExitFailure(TestFailure): def __init__(self, *args): self.path = args[0] + self.status=args[1] def __str__(self): return "test %s exited with status %d" % (self.path, self.status) @@ -204,48 +205,41 @@ class TestServerFailure(TestFailure): def __str__(self): return 'mongod not running after executing test %s' % self.path -def checkDbHashes(master, slave): +def check_db_hashes(master, slave): # Need to pause a bit so a slave might catch up... if not slave.slave: raise(Bug("slave instance doesn't have slave attribute set")) - print "waiting for slave to catch up..." - ARB=10 # ARBITRARY - time.sleep(ARB) - while True: - # FIXME: it's probably better to do an empty insert and a - # getLastError() to force a sync. - argv = [shellExecutable, "--port", str(slave.port), "--quiet", "--eval", 'db.printSlaveReplicationInfo()'] - res = Popen(argv, stdout=PIPE).communicate()[0] - m = re.search('(\d+)secs ', res) - if int(m.group(1)) > ARB: #res.find('initial sync') < 0: - break - time.sleep(3) + print "waiting for slave to catch up" + master.wait_for_repl() + print "caught up!" # FIXME: maybe make this run dbhash on all databases? for mongod in [master, slave]: - argv = [shellExecutable, "--port", str(mongod.port), "--quiet", "--eval", "x=db.runCommand('dbhash'); printjson(x.collections)"] - hashstr = Popen(argv, stdout=PIPE).communicate()[0] - # WARNING FIXME KLUDGE et al.: this is sleazy and unsafe. - mongod.dict = eval(hashstr) + mongod.dbhash = Connection(port=mongod.port, slave_okay=True).test.command("dbhash") + mongod.dict = mongod.dbhash["collections"] + + global lost_in_slave, lost_in_master, screwy_in_slave, replicated_collections - global lost_in_slave, lost_in_master, screwy_in_slave, replicated_dbs + replicated_collections += master.dict.keys() - for db in replicated_dbs: + for db in replicated_collections: if db not in slave.dict: lost_in_slave.append(db) mhash = master.dict[db] shash = slave.dict[db] if mhash != shash: screwy_in_slave[db] = mhash + "/" + shash + for db in slave.dict.keys(): if db not in master.dict: lost_in_master.append(db) - replicated_dbs += master.dict.keys() + + # Blech. def skipTest(path): - if smallOplog: + if small_oplog: if os.path.basename(path) in ["cursor8.js", "indexh.js"]: return True return False @@ -254,78 +248,79 @@ def runTest(test): (path, usedb) = test (ignore, ext) = os.path.splitext(path) if skipTest(path): - print "skippping " + path + print "skipping " + path return if ext == ".js": - argv=[shellExecutable, "--port", mongodPort] + argv = [shell_executable, "--port", mongod_port] if not usedb: - argv += ["--nodb"] - if smallOplog: + argv += ["--nodb"] + if small_oplog: argv += ["--eval", 'testingReplication = true;'] argv += [path] elif ext in ["", ".exe"]: # Blech. 
if os.path.basename(path) in ["test", "test.exe", "perftest", "perftest.exe"]: - argv=[path] + argv = [path] # more blech elif os.path.basename(path) == 'mongos': - argv=[path, "--test"] + argv = [path, "--test"] else: - argv=[testPath and os.path.abspath(os.path.join(testPath, path)) or path, - "--port", mongodPort] + argv = [test_path and os.path.abspath(os.path.join(test_path, path)) or path, + "--port", mongod_port] else: raise Bug("fell off in extenstion case: %s" % path) print " *******************************************" print " Test : " + os.path.basename(path) + " ..." - t1=time.time() + t1 = time.time() # FIXME: we don't handle the case where the subprocess # hangs... that's bad. - r = call(argv, cwd=testPath) - t2=time.time() - print " " + str((t2-t1)*1000) + "ms" + r = call(argv, cwd=test_path) + t2 = time.time() + print " " + str((t2 - t1) * 1000) + "ms" if r != 0: raise TestExitFailure(path, r) - if Popen( [ mongodExecutable, "msg", "ping", mongodPort ], stdout=PIPE ).communicate()[0].count( "****ok" ) == 0: - raise TestServerFailure(path) - if call( [ mongodExecutable, "msg", "ping", mongodPort ] ) != 0: + + try: + c = Connection( "127.0.0.1" , int(mongod_port) ) + except Exception,e: raise TestServerFailure(path) - print "" -def runTests(tests): - # If we're in one-mongo-per-test mode, we instantiate a nothing - # around the loop, and a mongod inside the loop. + print "" +def run_tests(tests): # FIXME: some suites of tests start their own mongod, so don't # need this. (So long as there are no conflicts with port, # dbpath, etc., and so long as we shut ours down properly, # starting this mongod shouldn't break anything, though.) - with nothing() if oneMongodPerTest else mongod(smallOplog=smallOplog) as master1: - with nothing() if oneMongodPerTest else (mongod(slave=True) if smallOplog else nothing()) as slave1: + + # The reason we use with is so that we get __exit__ semantics + + with mongod(small_oplog=small_oplog) as master: + with mongod(slave=True) if small_oplog else Nothing() as slave: + if small_oplog: + master.wait_for_repl() + for test in tests: try: - with mongod(smallOplog=smallOplog) if oneMongodPerTest else nothing() as master2: - with mongod(slave=True) if oneMongodPerTest and smallOplog else nothing() as slave2: - runTest(test) + runTest(test) winners.append(test) - if isinstance(slave2, mongod): - checkDbHashes(master2, slave2) except TestFailure, f: try: print f # Record the failing test and re-raise. 
losers[f.path] = f.status raise f - except TestServerFailure, f: - if not oneMongodPerTest: - return 2 + except TestServerFailure, f: + return 2 except TestFailure, f: - if not continueOnFailure: + if not continue_on_failure: return 1 - if isinstance(slave1, mongod): - checkDbHashes(master1, slave1) + if isinstance(slave, mongod): + check_db_hashes(master, slave) return 0 + def report(): print "%d test%s succeeded" % (len(winners), '' if len(winners) == 1 else 's') num_missed = len(tests) - (len(winners) + len(losers.keys())) @@ -335,7 +330,7 @@ def report(): print "The following tests failed (with exit code):" for loser in losers: print "%s\t%d" % (loser, losers[loser]) - + def missing(lst, src, dst): if lst: print """The following collections were present in the %s but not the %s @@ -349,149 +344,124 @@ at the end of testing:""" % (src, dst) at the end of testing:""" for db in screwy_in_slave.keys(): print "%s\t %s" % (db, screwy_in_slave[db]) - if smallOplog and not (lost_in_master or lost_in_slave or screwy_in_slave): - print "replication ok for %d collections" % (len(replicated_dbs)) - if (exit_bad or losers or lost_in_slave or lost_in_master or screwy_in_slave): - status = 1 - else: - status = 0 - exit (status) + if small_oplog and not (lost_in_master or lost_in_slave or screwy_in_slave): + print "replication ok for %d collections" % (len(replicated_collections)) + if losers or lost_in_slave or lost_in_master or screwy_in_slave: + raise Exception("Test failures") + -def expandSuites(suites): +def expand_suites(suites): globstr = None - global mongoRepo, tests + tests = [] for suite in suites: - if suite == 'smokeAll': - tests = [] - expandSuites(['smoke', 'smokePerf', 'smokeClient', 'smokeJs', 'smokeJsPerf', 'smokeJsSlowNightly', 'smokeJsSlowWeekly', 'smokeParallel', 'smokeClone', 'smokeParallel', 'smokeRepl', 'smokeAuth', 'smokeSharding', 'smokeTool']) - break - if suite == 'smoke': + if suite == 'all': + return expand_suites(['test', 'perf', 'client', 'js', 'jsPerf', 'jsSlowNightly', 'jsSlowWeekly', 'parallel', 'clone', 'parallel', 'repl', 'auth', 'sharding', 'tool']) + if suite == 'test': if os.sys.platform == "win32": program = 'test.exe' else: program = 'test' (globstr, usedb) = (program, False) - elif suite == 'smokePerf': + elif suite == 'perf': if os.sys.platform == "win32": program = 'perftest.exe' else: program = 'perftest' (globstr, usedb) = (program, False) - elif suite == 'smokeJs': - # FIXME: _runner.js seems equivalent to "[!_]*.js". - #(globstr, usedb) = ('_runner.js', True) - (globstr, usedb) = ('[!_]*.js', True) - elif suite == 'smokeQuota': - (globstr, usedb) = ('quota/*.js', True) - elif suite == 'smokeJsPerf': - (globstr, usedb) = ('perf/*.js', True) - elif suite == 'smokeDisk': - (globstr, usedb) = ('disk/*.js', True) - elif suite == 'smokeJsSlowNightly': - (globstr, usedb) = ('slowNightly/*.js', True) - elif suite == 'smokeJsSlowWeekly': - (globstr, usedb) = ('slowWeekly/*.js', True) - elif suite == 'smokeParallel': - (globstr, usedb) = ('parallel/*.js', True) - elif suite == 'smokeClone': - (globstr, usedb) = ('clone/*.js', False) - elif suite == 'smokeRepl': - (globstr, usedb) = ('repl/*.js', False) - elif suite == 'smokeReplSets': - (globstr, usedb) = ('replsets/*.js', False) - elif suite == 'smokeAuth': - (globstr, usedb) = ('auth/*.js', False) - elif suite == 'smokeSharding': - (globstr, usedb) = ('sharding/*.js', False) - elif suite == 'smokeTool': - (globstr, usedb) = ('tool/*.js', False) - # well, the above almost works for everything... 
- elif suite == 'smokeClient': + elif suite == 'client': paths = ["firstExample", "secondExample", "whereExample", "authTest", "clientTest", "httpClientTest"] if os.sys.platform == "win32": - paths = [path+'.exe' for path in paths] + paths = [path + '.exe' for path in paths] # hack - tests += [(testPath and path or os.path.join(mongoRepo, path), False) for path in paths] + tests += [(test_path and path or os.path.join(mongo_repo, path), False) for path in paths] elif suite == 'mongosTest': if os.sys.platform == "win32": program = 'mongos.exe' else: program = 'mongos' - tests += [(os.path.join(mongoRepo, program), False)] + tests += [(os.path.join(mongo_repo, program), False)] + elif os.path.exists( suite ): + tests += [ ( os.path.join( mongo_repo , suite ) , True ) ] else: - raise Exception('unknown test suite %s' % suite) + try: + globstr, usedb = {"js": ("[!_]*.js", True), + "quota": ("quota/*.js", True), + "jsPerf": ("perf/*.js", True), + "disk": ("disk/*.js", True), + "jsSlowNightly": ("slowNightly/*.js", True), + "jsSlowWeekly": ("slowWeekly/*.js", True), + "parallel": ("parallel/*.js", True), + "clone": ("clone/*.js", False), + "repl": ("repl/*.js", False), + "replSets": ("replsets/*.js", False), + "dur": ("dur/*.js", False), + "auth": ("auth/*.js", False), + "sharding": ("sharding/*.js", False), + "tool": ("tool/*.js", False)}[suite] + except KeyError: + raise Exception('unknown test suite %s' % suite) if globstr: - globstr = os.path.join(mongoRepo, (os.path.join(('jstests/' if globstr.endswith('.js') else ''), globstr))) + globstr = os.path.join(mongo_repo, (os.path.join(('jstests/' if globstr.endswith('.js') else ''), globstr))) paths = glob.glob(globstr) paths.sort() tests += [(path, usedb) for path in paths] - if not tests: - raise Exception( "no tests found" ) + return tests +def add_exe(e): + if os.sys.platform.startswith( "win" ) and not e.endswith( ".exe" ): + e += ".exe" + return e + def main(): + global mongod_executable, mongod_port, shell_executable, continue_on_failure, small_oplog, smoke_db_prefix, test_path parser = OptionParser(usage="usage: smoke.py [OPTIONS] ARGS*") parser.add_option('--mode', dest='mode', default='suite', - help='If "files", ARGS are filenames; if "suite", ARGS are sets of tests. (default "suite")') + help='If "files", ARGS are filenames; if "suite", ARGS are sets of tests (%default)') # Some of our tests hard-code pathnames e.g., to execute, so until - # th we don't have the freedom to run from anyplace. -# parser.add_option('--mongo-repo', dest='mongoRepo', default=None, -# help='Top-level directory of mongo checkout to use. (default: script will make a guess)') - parser.add_option('--test-path', dest='testPath', default=None, - help="Path to the test executables to run " - "(currently only used for smokeClient)") - parser.add_option('--mongod', dest='mongodExecutable', #default='./mongod', - help='Path to mongod to run (default "./mongod")') - parser.add_option('--port', dest='mongodPort', default="32000", - help='Port the mongod will bind to (default 32000)') - parser.add_option('--mongo', dest='shellExecutable', #default="./mongo", - help='Path to mongo, for .js test files (default "./mongo")') - parser.add_option('--continue-on-failure', dest='continueOnFailure', + # that changes we don't have the freedom to run from anyplace. 
+ # parser.add_option('--mongo-repo', dest='mongo_repo', default=None, + parser.add_option('--test-path', dest='test_path', default=None, + help="Path to the test executables to run, " + "currently only used for 'client' (%default)") + parser.add_option('--mongod', dest='mongod_executable', default=os.path.join(mongo_repo, 'mongod'), + help='Path to mongod to run (%default)') + parser.add_option('--port', dest='mongod_port', default="32000", + help='Port the mongod will bind to (%default)') + parser.add_option('--mongo', dest='shell_executable', default=os.path.join(mongo_repo, 'mongo'), + help='Path to mongo, for .js test files (%default)') + parser.add_option('--continue-on-failure', dest='continue_on_failure', action="store_true", default=False, help='If supplied, continue testing even after a test fails') - parser.add_option('--one-mongod-per-test', dest='oneMongodPerTest', - action="store_true", default=False, - help='If supplied, run each test in a fresh mongod') parser.add_option('--from-file', dest='File', help="Run tests/suites named in FILE, one test per line, '-' means stdin") - parser.add_option('--smoke-db-prefix', dest='smokeDbPrefix', default='', - help="Prefix to use for the mongods' dbpaths.") - parser.add_option('--small-oplog', dest='smallOplog', default=False, + parser.add_option('--smoke-db-prefix', dest='smoke_db_prefix', default=smoke_db_prefix, + help="Prefix to use for the mongods' dbpaths ('%default')") + parser.add_option('--small-oplog', dest='small_oplog', default=False, action="store_true", help='Run tests with master/slave replication & use a small oplog') global tests (options, tests) = parser.parse_args() -# global mongoRepo -# if options.mongoRepo: -# pass -# mongoRepo = options.mongoRepo -# else: -# prefix = '' -# while True: -# if os.path.exists(prefix+'buildscripts'): -# mongoRepo = os.path.normpath(prefix) -# break -# else: -# prefix += '../' -# # FIXME: will this be a device's root directory on -# # Windows? 
-# if os.path.samefile('/', prefix): -# raise Exception("couldn't guess the mongo repository path") - print tests - global mongoRepo, mongodExecutable, mongodPort, shellExecutable, continueOnFailure, oneMongodPerTest, smallOplog, smokeDbPrefix, testPath - testPath = options.testPath - mongodExecutable = options.mongodExecutable if options.mongodExecutable else os.path.join(mongoRepo, 'mongod') - mongodPort = options.mongodPort if options.mongodPort else mongodPort - shellExecutable = options.shellExecutable if options.shellExecutable else os.path.join(mongoRepo, 'mongo') - continueOnFailure = options.continueOnFailure if options.continueOnFailure else continueOnFailure - oneMongodPerTest = options.oneMongodPerTest if options.oneMongodPerTest else oneMongodPerTest - smokeDbPrefix = options.smokeDbPrefix - smallOplog = options.smallOplog - + test_path = options.test_path + + mongod_executable = add_exe(options.mongod_executable) + if not os.path.exists(mongod_executable): + raise Exception("no mongod found in this directory.") + + mongod_port = options.mongod_port + + shell_executable = add_exe( options.shell_executable ) + if not os.path.exists(shell_executable): + raise Exception("no mongo shell found in this directory.") + + continue_on_failure = options.continue_on_failure + smoke_db_prefix = options.smoke_db_prefix + small_oplog = options.small_oplog + if options.File: if options.File == '-': tests = sys.stdin.readlines() @@ -500,23 +470,20 @@ def main(): tests = f.readlines() tests = [t.rstrip('\n') for t in tests] - if not tests: - raise Exception( "no tests specified" ) # If we're in suite mode, tests is a list of names of sets of tests. if options.mode == 'suite': - # Suites: smoke, smokePerf, smokeJs, smokeQuota, smokeJsPerf, - # smokeJsSlow, smokeParalell, smokeClone, smokeRepl, smokeDisk - suites = tests - tests = [] - expandSuites(suites) + tests = expand_suites(tests) elif options.mode == 'files': tests = [(os.path.abspath(test), True) for test in tests] - runTests(tests) - global exit_bad - exit_bad = False + if not tests: + raise Exception( "no tests specified" ) + + try: + run_tests(tests) + finally: + report() -atexit.register(report) if __name__ == "__main__": main() diff --git a/buildscripts/utils.py b/buildscripts/utils.py index 1ca2fdd..8021d87 100644 --- a/buildscripts/utils.py +++ b/buildscripts/utils.py @@ -5,10 +5,27 @@ import time import os # various utilities that are handy +def getAllSourceFiles( arr=None , prefix="." ): + if arr is None: + arr = [] + + for x in os.listdir( prefix ): + if x.startswith( "." 
) or x.startswith( "pcre-" ) or x.startswith( "32bit" ) or x.startswith( "mongodb-" ) or x.startswith("debian") or x.startswith( "mongo-cxx-driver" ): + continue + full = prefix + "/" + x + if os.path.isdir( full ) and not os.path.islink( full ): + getAllSourceFiles( arr , full ) + else: + if full.endswith( ".cpp" ) or full.endswith( ".h" ) or full.endswith( ".c" ): + arr.append( full ) + + return arr + + def getGitBranch(): if not os.path.exists( ".git" ): return None - + version = open( ".git/HEAD" ,'r' ).read().strip() if not version.startswith( "ref: " ): return version @@ -45,7 +62,6 @@ def getGitVersion(): return version return open( f , 'r' ).read().strip() - def execsys( args ): import subprocess if isinstance( args , str ): @@ -65,7 +81,6 @@ def getprocesslist(): r = re.compile( "[\r\n]+" ) return r.split( raw ) - def removeIfInList( lst , thing ): if thing in lst: lst.remove( thing ) diff --git a/client/clientOnly.cpp b/client/clientOnly.cpp index 6178257..726c3a9 100644 --- a/client/clientOnly.cpp +++ b/client/clientOnly.cpp @@ -29,7 +29,7 @@ namespace mongo { bool dbexitCalled = false; - void dbexit( ExitCode returnCode, const char *whyMsg ) { + void dbexit( ExitCode returnCode, const char *whyMsg , bool tryToGetLock ) { dbexitCalled = true; out() << "dbexit called" << endl; if ( whyMsg ) @@ -37,12 +37,12 @@ namespace mongo { out() << "exiting" << endl; ::exit( returnCode ); } - - bool inShutdown(){ + + bool inShutdown() { return dbexitCalled; } - void setupSignals(){ + void setupSignals() { // maybe should do SIGPIPE here, not sure } @@ -50,20 +50,20 @@ namespace mongo { return "in client only mode"; } - bool haveLocalShardingInfo( const string& ns ){ + bool haveLocalShardingInfo( const string& ns ) { return false; } - DBClientBase * createDirectClient(){ + DBClientBase * createDirectClient() { uassert( 10256 , "no createDirectClient in clientOnly" , 0 ); return 0; } - void Shard::getAllShards( vector& all ){ + void Shard::getAllShards( vector& all ) { assert(0); } - bool Shard::isAShard( const string& ident ){ + bool Shard::isAShardNode( const string& ident ) { assert(0); return false; } diff --git a/client/connpool.cpp b/client/connpool.cpp index dae13f6..a521699 100644 --- a/client/connpool.cpp +++ b/client/connpool.cpp @@ -26,162 +26,240 @@ namespace mongo { + // ------ PoolForHost ------ + + PoolForHost::~PoolForHost() { + while ( ! _pool.empty() ) { + StoredConnection sc = _pool.top(); + delete sc.conn; + _pool.pop(); + } + } + + void PoolForHost::done( DBClientBase * c ) { + if ( _pool.size() >= _maxPerHost ) { + delete c; + } + else { + _pool.push(c); + } + } + + DBClientBase * PoolForHost::get() { + + time_t now = time(0); + + while ( ! _pool.empty() ) { + StoredConnection sc = _pool.top(); + _pool.pop(); + if ( sc.ok( now ) ) + return sc.conn; + delete sc.conn; + } + + return NULL; + } + + void PoolForHost::flush() { + vector all; + while ( ! 
_pool.empty() ) { + StoredConnection c = _pool.top(); + _pool.pop(); + all.push_back( c ); + bool res; + c.conn->isMaster( res ); + } + + for ( vector::iterator i=all.begin(); i != all.end(); ++i ) { + _pool.push( *i ); + } + } + + PoolForHost::StoredConnection::StoredConnection( DBClientBase * c ) { + conn = c; + when = time(0); + } + + bool PoolForHost::StoredConnection::ok( time_t now ) { + // if connection has been idle for an hour, kill it + return ( now - when ) < 3600; + } + + void PoolForHost::createdOne( DBClientBase * base) { + if ( _created == 0 ) + _type = base->type(); + _created++; + } + + unsigned PoolForHost::_maxPerHost = 50; + + // ------ DBConnectionPool ------ + DBConnectionPool pool; - + DBClientBase* DBConnectionPool::_get(const string& ident) { scoped_lock L(_mutex); - PoolForHost& p = _pools[ident]; - if ( p.pool.empty() ) - return 0; - - DBClientBase *c = p.pool.top(); - p.pool.pop(); - return c; + return p.get(); } - DBClientBase* DBConnectionPool::_finishCreate( const string& host , DBClientBase* conn ){ + DBClientBase* DBConnectionPool::_finishCreate( const string& host , DBClientBase* conn ) { { scoped_lock L(_mutex); PoolForHost& p = _pools[host]; - p.created++; + p.createdOne( conn ); } onCreate( conn ); onHandedOut( conn ); - + return conn; } DBClientBase* DBConnectionPool::get(const ConnectionString& url) { DBClientBase * c = _get( url.toString() ); - if ( c ){ + if ( c ) { onHandedOut( c ); return c; } - + string errmsg; c = url.connect( errmsg ); - uassert( 13328 , (string)"dbconnectionpool: connect failed " + url.toString() + " : " + errmsg , c ); - + uassert( 13328 , _name + ": connect failed " + url.toString() + " : " + errmsg , c ); + return _finishCreate( url.toString() , c ); } - + DBClientBase* DBConnectionPool::get(const string& host) { DBClientBase * c = _get( host ); - if ( c ){ + if ( c ) { onHandedOut( c ); return c; } - + string errmsg; ConnectionString cs = ConnectionString::parse( host , errmsg ); uassert( 13071 , (string)"invalid hostname [" + host + "]" + errmsg , cs.isValid() ); - + c = cs.connect( errmsg ); - uassert( 11002 , (string)"dbconnectionpool: connect failed " + host + " : " + errmsg , c ); + if ( ! c ) + throw SocketException( SocketException::CONNECT_ERROR , host , 11002 , str::stream() << _name << " error: " << errmsg ); return _finishCreate( host , c ); } - DBConnectionPool::~DBConnectionPool(){ - for ( map::iterator i = _pools.begin(); i != _pools.end(); i++ ){ - PoolForHost& p = i->second; - - while ( ! p.pool.empty() ){ - DBClientBase * c = p.pool.top(); - delete c; - p.pool.pop(); - } - } + DBConnectionPool::~DBConnectionPool() { + // connection closing is handled by ~PoolForHost } - void DBConnectionPool::flush(){ + void DBConnectionPool::flush() { scoped_lock L(_mutex); - for ( map::iterator i = _pools.begin(); i != _pools.end(); i++ ){ + for ( PoolMap::iterator i = _pools.begin(); i != _pools.end(); i++ ) { PoolForHost& p = i->second; - - vector all; - while ( ! 
p.pool.empty() ){ - DBClientBase * c = p.pool.top(); - p.pool.pop(); - all.push_back( c ); - bool res; - c->isMaster( res ); - } - - for ( vector::iterator i=all.begin(); i != all.end(); i++ ){ - p.pool.push( *i ); - } + p.flush(); } } - void DBConnectionPool::addHook( DBConnectionHook * hook ){ + void DBConnectionPool::addHook( DBConnectionHook * hook ) { _hooks.push_back( hook ); } - void DBConnectionPool::onCreate( DBClientBase * conn ){ + void DBConnectionPool::onCreate( DBClientBase * conn ) { if ( _hooks.size() == 0 ) return; - - for ( list::iterator i = _hooks.begin(); i != _hooks.end(); i++ ){ + + for ( list::iterator i = _hooks.begin(); i != _hooks.end(); i++ ) { (*i)->onCreate( conn ); } } - void DBConnectionPool::onHandedOut( DBClientBase * conn ){ + void DBConnectionPool::onHandedOut( DBClientBase * conn ) { if ( _hooks.size() == 0 ) return; - - for ( list::iterator i = _hooks.begin(); i != _hooks.end(); i++ ){ + + for ( list::iterator i = _hooks.begin(); i != _hooks.end(); i++ ) { (*i)->onHandedOut( conn ); } } - void DBConnectionPool::appendInfo( BSONObjBuilder& b ){ - scoped_lock lk( _mutex ); + void DBConnectionPool::appendInfo( BSONObjBuilder& b ) { BSONObjBuilder bb( b.subobjStart( "hosts" ) ); - for ( map::iterator i=_pools.begin(); i!=_pools.end(); ++i ){ - string s = i->first; - BSONObjBuilder temp( bb.subobjStart( s.c_str() ) ); - temp.append( "available" , (int)(i->second.pool.size()) ); - temp.appendNumber( "created" , i->second.created ); - temp.done(); + int avail = 0; + long long created = 0; + + + map createdByType; + + { + scoped_lock lk( _mutex ); + for ( PoolMap::iterator i=_pools.begin(); i!=_pools.end(); ++i ) { + string s = i->first; + BSONObjBuilder temp( bb.subobjStart( s ) ); + temp.append( "available" , i->second.numAvailable() ); + temp.appendNumber( "created" , i->second.numCreated() ); + temp.done(); + + avail += i->second.numAvailable(); + created += i->second.numCreated(); + + long long& x = createdByType[i->second.type()]; + x += i->second.numCreated(); + } } bb.done(); + + { + BSONObjBuilder temp( bb.subobjStart( "createdByType" ) ); + for ( map::iterator i=createdByType.begin(); i!=createdByType.end(); ++i ) { + temp.appendNumber( ConnectionString::typeToString( i->first ) , i->second ); + } + temp.done(); + } + + b.append( "totalAvailable" , avail ); + b.appendNumber( "totalCreated" , created ); } - ScopedDbConnection * ScopedDbConnection::steal(){ + bool DBConnectionPool::serverNameCompare::operator()( const string& a , const string& b ) const{ + string ap = str::before( a , "/" ); + string bp = str::before( b , "/" ); + + return ap < bp; + } + + // ------ ScopedDbConnection ------ + + ScopedDbConnection * ScopedDbConnection::steal() { assert( _conn ); ScopedDbConnection * n = new ScopedDbConnection( _host , _conn ); _conn = 0; return n; } - + ScopedDbConnection::~ScopedDbConnection() { - if ( _conn ){ + if ( _conn ) { if ( ! 
_conn->isFailed() ) { /* see done() comments above for why we log this line */ - log() << "~ScopedDBConnection: _conn != null" << endl; + log() << "~ScopedDbConnection: _conn != null" << endl; } kill(); } } ScopedDbConnection::ScopedDbConnection(const Shard& shard ) - : _host( shard.getConnString() ) , _conn( pool.get(_host) ){ + : _host( shard.getConnString() ) , _conn( pool.get(_host) ) { } - + ScopedDbConnection::ScopedDbConnection(const Shard* shard ) - : _host( shard->getConnString() ) , _conn( pool.get(_host) ){ + : _host( shard->getConnString() ) , _conn( pool.get(_host) ) { } class PoolFlushCmd : public Command { public: - PoolFlushCmd() : Command( "connPoolSync" , false , "connpoolsync" ){} + PoolFlushCmd() : Command( "connPoolSync" , false , "connpoolsync" ) {} virtual void help( stringstream &help ) const { help<<"internal"; } virtual LockType locktype() const { return NONE; } - virtual bool run(const string&, mongo::BSONObj&, std::string&, mongo::BSONObjBuilder& result, bool){ + virtual bool run(const string&, mongo::BSONObj&, std::string&, mongo::BSONObjBuilder& result, bool) { pool.flush(); return true; } @@ -193,11 +271,13 @@ namespace mongo { class PoolStats : public Command { public: - PoolStats() : Command( "connPoolStats" ){} + PoolStats() : Command( "connPoolStats" ) {} virtual void help( stringstream &help ) const { help<<"stats about connection pool"; } virtual LockType locktype() const { return NONE; } - virtual bool run(const string&, mongo::BSONObj&, std::string&, mongo::BSONObjBuilder& result, bool){ + virtual bool run(const string&, mongo::BSONObj&, std::string&, mongo::BSONObjBuilder& result, bool) { pool.appendInfo( result ); + result.append( "numDBClientConnection" , DBClientConnection::getNumConnections() ); + result.append( "numAScopedConnection" , AScopedConnection::getNumConnections() ); return true; } virtual bool slaveOk() const { @@ -206,5 +286,6 @@ namespace mongo { } poolStatsCmd; + AtomicUInt AScopedConnection::_numConnections; } // namespace mongo diff --git a/client/connpool.h b/client/connpool.h index 00570c5..e7f59d6 100644 --- a/client/connpool.h +++ b/client/connpool.h @@ -24,55 +24,109 @@ namespace mongo { class Shard; - - struct PoolForHost { + + /** + * not thread safe + * thread safety is handled by DBConnectionPool + */ + class PoolForHost { + public: PoolForHost() - : created(0){} - PoolForHost( const PoolForHost& other ){ - assert(other.pool.size() == 0); - created = other.created; - assert( created == 0 ); + : _created(0) {} + + PoolForHost( const PoolForHost& other ) { + assert(other._pool.size() == 0); + _created = other._created; + assert( _created == 0 ); } - - std::stack pool; - long long created; + + ~PoolForHost(); + + int numAvailable() const { return (int)_pool.size(); } + + void createdOne( DBClientBase * base); + long long numCreated() const { return _created; } + + ConnectionString::ConnectionType type() const { assert(_created); return _type; } + + /** + * gets a connection or return NULL + */ + DBClientBase * get(); + + void done( DBClientBase * c ); + + void flush(); + + static void setMaxPerHost( unsigned max ) { _maxPerHost = max; } + static unsigned getMaxPerHost() { return _maxPerHost; } + private: + + struct StoredConnection { + StoredConnection( DBClientBase * c ); + + bool ok( time_t now ); + + DBClientBase* conn; + time_t when; + }; + + std::stack _pool; + long long _created; + ConnectionString::ConnectionType _type; + + static unsigned _maxPerHost; }; - + class DBConnectionHook { public: - virtual 
~DBConnectionHook(){} - virtual void onCreate( DBClientBase * conn ){} - virtual void onHandedOut( DBClientBase * conn ){} + virtual ~DBConnectionHook() {} + virtual void onCreate( DBClientBase * conn ) {} + virtual void onHandedOut( DBClientBase * conn ) {} }; /** Database connection pool. Generally, use ScopedDbConnection and do not call these directly. - This class, so far, is suitable for use with unauthenticated connections. - Support for authenticated connections requires some adjustements: please + This class, so far, is suitable for use with unauthenticated connections. + Support for authenticated connections requires some adjustements: please request... Usage: - + { ScopedDbConnection c("myserver"); c.conn()... } */ class DBConnectionPool { + + public: + + /** compares server namees, but is smart about replica set names */ + struct serverNameCompare { + bool operator()( const string& a , const string& b ) const; + }; + + private: + mongo::mutex _mutex; - map _pools; // servername -> pool + typedef map PoolMap; // servername -> pool + PoolMap _pools; list _hooks; + string _name; DBClientBase* _get( const string& ident ); - + DBClientBase* _finishCreate( const string& ident , DBClientBase* conn ); - public: - DBConnectionPool() : _mutex("DBConnectionPool") { } + public: + DBConnectionPool() : _mutex("DBConnectionPool") , _name( "dbconnectionpool" ) { } ~DBConnectionPool(); + /** right now just controls some asserts. defaults to "dbconnectionpool" */ + void setName( const string& name ) { _name = name; } void onCreate( DBClientBase * conn ); void onHandedOut( DBClientBase * conn ); @@ -83,72 +137,78 @@ namespace mongo { DBClientBase *get(const ConnectionString& host); void release(const string& host, DBClientBase *c) { - if ( c->isFailed() ){ + if ( c->isFailed() ) { delete c; return; } scoped_lock L(_mutex); - _pools[host].pool.push(c); + _pools[host].done(c); } void addHook( DBConnectionHook * hook ); void appendInfo( BSONObjBuilder& b ); }; - + extern DBConnectionPool pool; class AScopedConnection : boost::noncopyable { public: - virtual ~AScopedConnection(){} + AScopedConnection() { _numConnections++; } + virtual ~AScopedConnection() { _numConnections--; } virtual DBClientBase* get() = 0; virtual void done() = 0; virtual string getHost() const = 0; + + /** + * @return total number of current instances of AScopedConnection + */ + static int getNumConnections() { return _numConnections; } + + private: + static AtomicUInt _numConnections; }; /** Use to get a connection from the pool. On exceptions things - clean up nicely. + clean up nicely (i.e. the socket gets closed automatically when the + scopeddbconnection goes out of scope). 
*/ class ScopedDbConnection : public AScopedConnection { - const string _host; - DBClientBase *_conn; public: + /** the main constructor you want to use + throws UserException if can't connect + */ + explicit ScopedDbConnection(const string& host) : _host(host), _conn( pool.get(host) ) {} + + ScopedDbConnection() : _host( "" ) , _conn(0) {} + + /* @param conn - bind to an existing connection */ + ScopedDbConnection(const string& host, DBClientBase* conn ) : _host( host ) , _conn( conn ) {} + + /** throws UserException if can't connect */ + explicit ScopedDbConnection(const ConnectionString& url ) : _host(url.toString()), _conn( pool.get(url) ) {} + + /** throws UserException if can't connect */ + explicit ScopedDbConnection(const Shard& shard ); + explicit ScopedDbConnection(const Shard* shard ); + + ~ScopedDbConnection(); + /** get the associated connection object */ - DBClientBase* operator->(){ - uassert( 11004 , "did you call done already" , _conn ); - return _conn; + DBClientBase* operator->() { + uassert( 11004 , "connection was returned to the pool already" , _conn ); + return _conn; } - + /** get the associated connection object */ DBClientBase& conn() { - uassert( 11005 , "did you call done already" , _conn ); + uassert( 11005 , "connection was returned to the pool already" , _conn ); return *_conn; } /** get the associated connection object */ DBClientBase* get() { - uassert( 13102 , "did you call done already" , _conn ); + uassert( 13102 , "connection was returned to the pool already" , _conn ); return _conn; } - - ScopedDbConnection() - : _host( "" ) , _conn(0) { - } - - /** throws UserException if can't connect */ - ScopedDbConnection(const string& host) - : _host(host), _conn( pool.get(host) ) { - } - - ScopedDbConnection(const string& host, DBClientBase* conn ) - : _host( host ) , _conn( conn ){ - } - - ScopedDbConnection(const Shard& shard ); - ScopedDbConnection(const Shard* shard ); - - ScopedDbConnection(const ConnectionString& url ) - : _host(url.toString()), _conn( pool.get(url) ) { - } - string getHost() const { return _host; } @@ -161,8 +221,8 @@ namespace mongo { } /** Call this when you are done with the connection. - - If you do not call done() before this object goes out of scope, + + If you do not call done() before this object goes out of scope, we can't be sure we fully read all expected data of a reply on the socket. so we don't try to reuse the connection in that situation. */ @@ -170,7 +230,7 @@ namespace mongo { if ( ! _conn ) return; - /* we could do this, but instead of assume one is using autoreconnect mode on the connection + /* we could do this, but instead of assume one is using autoreconnect mode on the connection if ( _conn->isFailed() ) kill(); else @@ -178,10 +238,12 @@ namespace mongo { pool.release(_host, _conn); _conn = 0; } - + ScopedDbConnection * steal(); - ~ScopedDbConnection(); + private: + const string _host; + DBClientBase *_conn; }; diff --git a/client/constants.h b/client/constants.h index 66aa9b1..54f3fd2 100644 --- a/client/constants.h +++ b/client/constants.h @@ -2,22 +2,22 @@ #pragma once -namespace mongo { +namespace mongo { /* query results include a 32 result flag word consisting of these bits */ enum ResultFlagType { - /* returned, with zero results, when getMore is called but the cursor id + /* returned, with zero results, when getMore is called but the cursor id is not valid at the server. */ - ResultFlag_CursorNotFound = 1, - + ResultFlag_CursorNotFound = 1, + /* { $err : ... 
} is being returned */ - ResultFlag_ErrSet = 2, - + ResultFlag_ErrSet = 2, + /* Have to update config from the server, usually $err is also set */ - ResultFlag_ShardConfigStale = 4, - - /* for backward compatability: this let's us know the server supports - the QueryOption_AwaitData option. if it doesn't, a repl slave client should sleep + ResultFlag_ShardConfigStale = 4, + + /* for backward compatability: this let's us know the server supports + the QueryOption_AwaitData option. if it doesn't, a repl slave client should sleep a little between getMore's. */ ResultFlag_AwaitCapable = 8 diff --git a/client/dbclient.cpp b/client/dbclient.cpp index aa9b7ae..b4214ab 100644 --- a/client/dbclient.cpp +++ b/client/dbclient.cpp @@ -31,8 +31,41 @@ namespace mongo { + void ConnectionString::_fillServers( string s ) { + + { + string::size_type idx = s.find( '/' ); + if ( idx != string::npos ) { + _setName = s.substr( 0 , idx ); + s = s.substr( idx + 1 ); + _type = SET; + } + } + + string::size_type idx; + while ( ( idx = s.find( ',' ) ) != string::npos ) { + _servers.push_back( s.substr( 0 , idx ) ); + s = s.substr( idx + 1 ); + } + _servers.push_back( s ); + + } + + void ConnectionString::_finishInit() { + stringstream ss; + if ( _type == SET ) + ss << _setName << "/"; + for ( unsigned i=0; i<_servers.size(); i++ ) { + if ( i > 0 ) + ss << ","; + ss << _servers[i].toString(); + } + _string = ss.str(); + } + + DBClientBase* ConnectionString::connect( string& errmsg ) const { - switch ( _type ){ + switch ( _type ) { case MASTER: { DBClientConnection * c = new DBClientConnection(true); log(1) << "creating new connection to:" << _servers[0] << endl; @@ -42,11 +75,11 @@ namespace mongo { } return c; } - - case PAIR: + + case PAIR: case SET: { DBClientReplicaSet * set = new DBClientReplicaSet( _setName , _servers ); - if( ! set->connect() ){ + if( ! 
set->connect() ) { delete set; errmsg = "connect failed to set "; errmsg += toString(); @@ -54,7 +87,7 @@ namespace mongo { } return set; } - + case SYNC: { // TODO , don't copy list l; @@ -62,40 +95,58 @@ namespace mongo { l.push_back( _servers[i] ); return new SyncClusterConnection( l ); } - + case INVALID: throw UserException( 13421 , "trying to connect to invalid ConnectionString" ); break; } - + assert( 0 ); return 0; } - ConnectionString ConnectionString::parse( const string& host , string& errmsg ){ - + ConnectionString ConnectionString::parse( const string& host , string& errmsg ) { + string::size_type i = host.find( '/' ); - if ( i != string::npos ){ + if ( i != string::npos && i != 0) { // replica set return ConnectionString( SET , host.substr( i + 1 ) , host.substr( 0 , i ) ); } - int numCommas = DBClientBase::countCommas( host ); - - if( numCommas == 0 ) + int numCommas = str::count( host , ',' ); + + if( numCommas == 0 ) return ConnectionString( HostAndPort( host ) ); - - if ( numCommas == 1 ) + + if ( numCommas == 1 ) return ConnectionString( PAIR , host ); if ( numCommas == 2 ) return ConnectionString( SYNC , host ); - + errmsg = (string)"invalid hostname [" + host + "]"; return ConnectionString(); // INVALID } - Query& Query::where(const string &jscode, BSONObj scope) { + string ConnectionString::typeToString( ConnectionType type ) { + switch ( type ) { + case INVALID: + return "invalid"; + case MASTER: + return "master"; + case PAIR: + return "pair"; + case SET: + return "set"; + case SYNC: + return "sync"; + } + assert(0); + return ""; + } + + + Query& Query::where(const string &jscode, BSONObj scope) { /* use where() before sort() and hint() and explain(), else this will assert. */ assert( ! isComplex() ); BSONObjBuilder b; @@ -113,44 +164,44 @@ namespace mongo { obj = b.obj(); } - Query& Query::sort(const BSONObj& s) { + Query& Query::sort(const BSONObj& s) { appendComplex( "orderby", s ); - return *this; + return *this; } Query& Query::hint(BSONObj keyPattern) { appendComplex( "$hint", keyPattern ); - return *this; + return *this; } Query& Query::explain() { appendComplex( "$explain", true ); - return *this; + return *this; } - + Query& Query::snapshot() { appendComplex( "$snapshot", true ); - return *this; + return *this; } - + Query& Query::minKey( const BSONObj &val ) { appendComplex( "$min", val ); - return *this; + return *this; } Query& Query::maxKey( const BSONObj &val ) { appendComplex( "$max", val ); - return *this; + return *this; } - bool Query::isComplex( bool * hasDollar ) const{ - if ( obj.hasElement( "query" ) ){ + bool Query::isComplex( bool * hasDollar ) const { + if ( obj.hasElement( "query" ) ) { if ( hasDollar ) hasDollar[0] = false; return true; } - if ( obj.hasElement( "$query" ) ){ + if ( obj.hasElement( "$query" ) ) { if ( hasDollar ) hasDollar[0] = true; return true; @@ -158,12 +209,12 @@ namespace mongo { return false; } - + BSONObj Query::getFilter() const { bool hasDollar; if ( ! isComplex( &hasDollar ) ) return obj; - + return obj.getObjectField( hasDollar ? 
"$query" : "query" ); } BSONObj Query::getSort() const { @@ -182,8 +233,8 @@ namespace mongo { bool Query::isExplain() const { return isComplex() && obj.getBoolField( "$explain" ); } - - string Query::toString() const{ + + string Query::toString() const { return obj.toString(); } @@ -203,7 +254,7 @@ namespace mongo { } return _cachedAvailableOptions; } - + inline bool DBClientWithCommands::runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options) { string ns = dbname + ".$cmd"; info = findOne(ns, cmd, 0 , options); @@ -222,38 +273,50 @@ namespace mongo { return runCommand(dbname, b.done(), *info); } - unsigned long long DBClientWithCommands::count(const string &_ns, const BSONObj& query, int options) { - NamespaceString ns(_ns); - BSONObj cmd = BSON( "count" << ns.coll << "query" << query ); + unsigned long long DBClientWithCommands::count(const string &myns, const BSONObj& query, int options, int limit, int skip ) { + NamespaceString ns(myns); + BSONObj cmd = _countCmd( myns , query , options , limit , skip ); BSONObj res; if( !runCommand(ns.db.c_str(), cmd, res, options) ) uasserted(11010,string("count fails:") + res.toString()); return res["n"].numberLong(); } + BSONObj DBClientWithCommands::_countCmd(const string &myns, const BSONObj& query, int options, int limit, int skip ) { + NamespaceString ns(myns); + BSONObjBuilder b; + b.append( "count" , ns.coll ); + b.append( "query" , query ); + if ( limit ) + b.append( "limit" , limit ); + if ( skip ) + b.append( "skip" , skip ); + return b.obj(); + } + BSONObj getlasterrorcmdobj = fromjson("{getlasterror:1}"); - BSONObj DBClientWithCommands::getLastErrorDetailed() { + BSONObj DBClientWithCommands::getLastErrorDetailed() { BSONObj info; runCommand("admin", getlasterrorcmdobj, info); - return info; + return info; } - string DBClientWithCommands::getLastError() { + string DBClientWithCommands::getLastError() { BSONObj info = getLastErrorDetailed(); return getLastErrorString( info ); } - - string DBClientWithCommands::getLastErrorString( const BSONObj& info ){ + + string DBClientWithCommands::getLastErrorString( const BSONObj& info ) { BSONElement e = info["err"]; if( e.eoo() ) return ""; if( e.type() == Object ) return e.toString(); - return e.str(); + return e.str(); } BSONObj getpreverrorcmdobj = fromjson("{getpreverror:1}"); - BSONObj DBClientWithCommands::getPrevError() { + BSONObj DBClientWithCommands::getPrevError() { BSONObj info; runCommand("admin", getpreverrorcmdobj, info); return info; @@ -261,7 +324,7 @@ namespace mongo { BSONObj getnoncecmdobj = fromjson("{getnonce:1}"); - string DBClientWithCommands::createPasswordDigest( const string & username , const string & clearTextPassword ){ + string DBClientWithCommands::createPasswordDigest( const string & username , const string & clearTextPassword ) { md5digest d; { md5_state_t st; @@ -275,11 +338,9 @@ namespace mongo { } bool DBClientWithCommands::auth(const string &dbname, const string &username, const string &password_text, string& errmsg, bool digestPassword) { - //cout << "TEMP AUTH " << toString() << dbname << ' ' << username << ' ' << password_text << ' ' << digestPassword << endl; - - string password = password_text; - if( digestPassword ) - password = createPasswordDigest( username , password_text ); + string password = password_text; + if( digestPassword ) + password = createPasswordDigest( username , password_text ); BSONObj info; string nonce; @@ -310,8 +371,8 @@ namespace mongo { b << "key" << digestToString( d ); authCmd = b.done(); } - - if( 
runCommand(dbname, authCmd, info) ) + + if( runCommand(dbname, authCmd, info) ) return true; errmsg = info.toString(); @@ -322,7 +383,7 @@ namespace mongo { bool DBClientWithCommands::isMaster(bool& isMaster, BSONObj *info) { BSONObj o; - if ( info == 0 ) + if ( info == 0 ) info = &o; bool ok = runCommand("admin", ismastercmdobj, *info); isMaster = info->getField("ismaster").trueValue(); @@ -331,7 +392,7 @@ namespace mongo { bool DBClientWithCommands::createCollection(const string &ns, long long size, bool capped, int max, BSONObj *info) { BSONObj o; - if ( info == 0 ) info = &o; + if ( info == 0 ) info = &o; BSONObjBuilder b; string db = nsToDatabase(ns.c_str()); b.append("create", ns.c_str() + db.length() + 1); @@ -381,11 +442,11 @@ namespace mongo { return false; } - BSONObj DBClientWithCommands::mapreduce(const string &ns, const string &jsmapf, const string &jsreducef, BSONObj query, const string& outputcolname) { + BSONObj DBClientWithCommands::mapreduce(const string &ns, const string &jsmapf, const string &jsreducef, BSONObj query, const string& outputcolname) { BSONObjBuilder b; b.append("mapreduce", nsGetCollection(ns)); - b.appendCode("map", jsmapf.c_str()); - b.appendCode("reduce", jsreducef.c_str()); + b.appendCode("map", jsmapf); + b.appendCode("reduce", jsreducef); if( !query.isEmpty() ) b.append("query", query); if( !outputcolname.empty() ) @@ -397,7 +458,7 @@ namespace mongo { bool DBClientWithCommands::eval(const string &dbname, const string &jscode, BSONObj& info, BSONElement& retValue, BSONObj *args) { BSONObjBuilder b; - b.appendCode("$eval", jscode.c_str()); + b.appendCode("$eval", jscode); if ( args ) b.appendArray("args", *args); bool ok = runCommand(dbname, b.done(), info); @@ -412,27 +473,27 @@ namespace mongo { return eval(dbname, jscode, info, retValue); } - list DBClientWithCommands::getDatabaseNames(){ + list DBClientWithCommands::getDatabaseNames() { BSONObj info; uassert( 10005 , "listdatabases failed" , runCommand( "admin" , BSON( "listDatabases" << 1 ) , info ) ); uassert( 10006 , "listDatabases.databases not array" , info["databases"].type() == Array ); - + list names; - + BSONObjIterator i( info["databases"].embeddedObjectUserCheck() ); - while ( i.more() ){ + while ( i.more() ) { names.push_back( i.next().embeddedObjectUserCheck()["name"].valuestr() ); } return names; } - list DBClientWithCommands::getCollectionNames( const string& db ){ + list DBClientWithCommands::getCollectionNames( const string& db ) { list names; - + string ns = db + ".system.namespaces"; auto_ptr c = query( ns.c_str() , BSONObj() ); - while ( c->more() ){ + while ( c->more() ) { string name = c->next()["name"].valuestr(); if ( name.find( "$" ) != string::npos ) continue; @@ -441,37 +502,37 @@ namespace mongo { return names; } - bool DBClientWithCommands::exists( const string& ns ){ + bool DBClientWithCommands::exists( const string& ns ) { list names; - + string db = nsGetDB( ns ) + ".system.namespaces"; BSONObj q = BSON( "name" << ns ); - return count( db.c_str() , q ) != 0; + return count( db.c_str() , q, QueryOption_SlaveOk ) != 0; } /* --- dbclientconnection --- */ - bool DBClientConnection::auth(const string &dbname, const string &username, const string &password_text, string& errmsg, bool digestPassword) { - string password = password_text; - if( digestPassword ) - password = createPasswordDigest( username , password_text ); + bool DBClientConnection::auth(const string &dbname, const string &username, const string &password_text, string& errmsg, bool digestPassword) { + string 
password = password_text; + if( digestPassword ) + password = createPasswordDigest( username , password_text ); - if( autoReconnect ) { - /* note we remember the auth info before we attempt to auth -- if the connection is broken, we will - then have it for the next autoreconnect attempt. - */ - pair p = pair(username, password); - authCache[dbname] = p; - } + if( autoReconnect ) { + /* note we remember the auth info before we attempt to auth -- if the connection is broken, we will + then have it for the next autoreconnect attempt. + */ + pair p = pair(username, password); + authCache[dbname] = p; + } - return DBClientBase::auth(dbname, username, password.c_str(), errmsg, false); - } + return DBClientBase::auth(dbname, username, password.c_str(), errmsg, false); + } BSONObj DBClientInterface::findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn, int queryOptions) { auto_ptr c = this->query(ns, query, 1, 0, fieldsToReturn, queryOptions); - uassert( 10276 , "DBClientBase::findOne: transport error", c.get() ); + uassert( 10276 , str::stream() << "DBClientBase::findOne: transport error: " << getServerAddress() << " query: " << query.toString(), c.get() ); if ( c->hasResultFlag( ResultFlag_ShardConfigStale ) ) throw StaleConfigException( ns , "findOne has stale config" ); @@ -482,20 +543,20 @@ namespace mongo { return c->nextSafe().copy(); } - bool DBClientConnection::connect(const HostAndPort& server, string& errmsg){ + bool DBClientConnection::connect(const HostAndPort& server, string& errmsg) { _server = server; _serverString = _server.toString(); return _connect( errmsg ); } - bool DBClientConnection::_connect( string& errmsg ){ + bool DBClientConnection::_connect( string& errmsg ) { _serverString = _server.toString(); // we keep around SockAddr for connection life -- maybe MessagingPort // requires that? server.reset(new SockAddr(_server.host().c_str(), _server.port())); - p.reset(new MessagingPort( _timeout, _logLevel )); + p.reset(new MessagingPort( _so_timeout, _logLevel )); - if (server->getAddr() == "0.0.0.0"){ + if (server->getAddr() == "0.0.0.0") { failed = true; return false; } @@ -513,35 +574,39 @@ namespace mongo { void DBClientConnection::_checkConnection() { if ( !failed ) return; - if ( lastReconnectTry && time(0)-lastReconnectTry < 2 ) - return; + if ( lastReconnectTry && time(0)-lastReconnectTry < 2 ) { + // we wait a little before reconnect attempt to avoid constant hammering. + // but we throw we don't want to try to use a connection in a bad state + throw SocketException(SocketException::FAILED_STATE); + } if ( !autoReconnect ) - return; + throw SocketException(SocketException::FAILED_STATE); lastReconnectTry = time(0); log(_logLevel) << "trying reconnect to " << _serverString << endl; string errmsg; failed = false; - if ( ! _connect(errmsg) ) { + if ( ! 
_connect(errmsg) ) { + failed = true; log(_logLevel) << "reconnect " << _serverString << " failed " << errmsg << endl; - return; - } + throw SocketException(SocketException::CONNECT_ERROR); + } - log(_logLevel) << "reconnect " << _serverString << " ok" << endl; - for( map< string, pair >::iterator i = authCache.begin(); i != authCache.end(); i++ ) { - const char *dbname = i->first.c_str(); - const char *username = i->second.first.c_str(); - const char *password = i->second.second.c_str(); - if( !DBClientBase::auth(dbname, username, password, errmsg, false) ) - log(_logLevel) << "reconnect: auth failed db:" << dbname << " user:" << username << ' ' << errmsg << '\n'; - } + log(_logLevel) << "reconnect " << _serverString << " ok" << endl; + for( map< string, pair >::iterator i = authCache.begin(); i != authCache.end(); i++ ) { + const char *dbname = i->first.c_str(); + const char *username = i->second.first.c_str(); + const char *password = i->second.second.c_str(); + if( !DBClientBase::auth(dbname, username, password, errmsg, false) ) + log(_logLevel) << "reconnect: auth failed db:" << dbname << " user:" << username << ' ' << errmsg << '\n'; + } } auto_ptr DBClientBase::query(const string &ns, Query query, int nToReturn, - int nToSkip, const BSONObj *fieldsToReturn, int queryOptions , int batchSize ) { + int nToSkip, const BSONObj *fieldsToReturn, int queryOptions , int batchSize ) { auto_ptr c( new DBClientCursor( this, - ns, query.obj, nToReturn, nToSkip, - fieldsToReturn, queryOptions , batchSize ) ); + ns, query.obj, nToReturn, nToSkip, + fieldsToReturn, queryOptions , batchSize ) ); if ( c->init() ) return c; return auto_ptr< DBClientCursor >( 0 ); @@ -562,14 +627,14 @@ namespace mongo { } boost::function _f; }; - + unsigned long long DBClientConnection::query( boost::function f, const string& ns, Query query, const BSONObj *fieldsToReturn, int queryOptions ) { DBClientFunConvertor fun; fun._f = f; boost::function ptr( fun ); return DBClientConnection::query( ptr, ns, query, fieldsToReturn, queryOptions ); } - + unsigned long long DBClientConnection::query( boost::function f, const string& ns, Query query, const BSONObj *fieldsToReturn, int queryOptions ) { // mask options queryOptions &= (int)( QueryOption_NoCursorTimeout | QueryOption_SlaveOk ); @@ -577,11 +642,11 @@ namespace mongo { bool doExhaust = ( availableOptions() & QueryOption_Exhaust ); if ( doExhaust ) { - queryOptions |= (int)QueryOption_Exhaust; + queryOptions |= (int)QueryOption_Exhaust; } auto_ptr c( this->query(ns, query, 0, 0, fieldsToReturn, queryOptions) ); - massert( 13386, "socket error for mapping query", c.get() ); - + uassert( 13386, "socket error for mapping query", c.get() ); + if ( !doExhaust ) { while( c->more() ) { DBClientCursorBatchIterator i( *c ); @@ -591,21 +656,21 @@ namespace mongo { return n; } - try { - while( 1 ) { - while( c->moreInCurrentBatch() ) { + try { + while( 1 ) { + while( c->moreInCurrentBatch() ) { DBClientCursorBatchIterator i( *c ); f( i ); n += i.n(); } - if( c->getCursorId() == 0 ) + if( c->getCursorId() == 0 ) break; c->exhaustReceiveMore(); } } - catch(std::exception&) { + catch(std::exception&) { /* connection CANNOT be used anymore as more data may be on the way from the server. we have to reconnect. 
*/ @@ -633,16 +698,16 @@ namespace mongo { void DBClientBase::insert( const string & ns , const vector< BSONObj > &v ) { Message toSend; - + BufBuilder b; int opts = 0; b.appendNum( opts ); b.appendStr( ns ); for( vector< BSONObj >::const_iterator i = v.begin(); i != v.end(); ++i ) i->appendSelfToBufBuilder( b ); - + toSend.setData( dbInsert, b.buf(), b.len() ); - + say( toSend ); } @@ -686,63 +751,63 @@ namespace mongo { say( toSend ); } - auto_ptr DBClientWithCommands::getIndexes( const string &ns ){ + auto_ptr DBClientWithCommands::getIndexes( const string &ns ) { return query( Namespace( ns.c_str() ).getSisterNS( "system.indexes" ).c_str() , BSON( "ns" << ns ) ); } - - void DBClientWithCommands::dropIndex( const string& ns , BSONObj keys ){ + + void DBClientWithCommands::dropIndex( const string& ns , BSONObj keys ) { dropIndex( ns , genIndexName( keys ) ); } - void DBClientWithCommands::dropIndex( const string& ns , const string& indexName ){ + void DBClientWithCommands::dropIndex( const string& ns , const string& indexName ) { BSONObj info; - if ( ! runCommand( nsToDatabase( ns.c_str() ) , - BSON( "deleteIndexes" << NamespaceString( ns ).coll << "index" << indexName ) , - info ) ){ + if ( ! runCommand( nsToDatabase( ns.c_str() ) , + BSON( "deleteIndexes" << NamespaceString( ns ).coll << "index" << indexName ) , + info ) ) { log(_logLevel) << "dropIndex failed: " << info << endl; uassert( 10007 , "dropIndex failed" , 0 ); } resetIndexCache(); } - - void DBClientWithCommands::dropIndexes( const string& ns ){ + + void DBClientWithCommands::dropIndexes( const string& ns ) { BSONObj info; - uassert( 10008 , "dropIndexes failed" , runCommand( nsToDatabase( ns.c_str() ) , - BSON( "deleteIndexes" << NamespaceString( ns ).coll << "index" << "*") , - info ) ); + uassert( 10008 , "dropIndexes failed" , runCommand( nsToDatabase( ns.c_str() ) , + BSON( "deleteIndexes" << NamespaceString( ns ).coll << "index" << "*") , + info ) ); resetIndexCache(); } - void DBClientWithCommands::reIndex( const string& ns ){ + void DBClientWithCommands::reIndex( const string& ns ) { list all; auto_ptr i = getIndexes( ns ); - while ( i->more() ){ + while ( i->more() ) { all.push_back( i->next().getOwned() ); } - + dropIndexes( ns ); - - for ( list::iterator i=all.begin(); i!=all.end(); i++ ){ + + for ( list::iterator i=all.begin(); i!=all.end(); i++ ) { BSONObj o = *i; insert( Namespace( ns.c_str() ).getSisterNS( "system.indexes" ).c_str() , o ); } - + } - - string DBClientWithCommands::genIndexName( const BSONObj& keys ){ + + string DBClientWithCommands::genIndexName( const BSONObj& keys ) { stringstream ss; - + bool first = 1; for ( BSONObjIterator i(keys); i.more(); ) { BSONElement f = i.next(); - + if ( first ) first = 0; else ss << "_"; - + ss << f.fieldName() << "_"; if( f.isNumber() ) ss << f.numberInt(); @@ -750,7 +815,7 @@ namespace mongo { return ss.str(); } - bool DBClientWithCommands::ensureIndex( const string &ns , BSONObj keys , bool unique, const string & name ) { + bool DBClientWithCommands::ensureIndex( const string &ns , BSONObj keys , bool unique, const string & name , bool cache ) { BSONObjBuilder toSave; toSave.append( "ns" , ns ); toSave.append( "key" , keys ); @@ -767,13 +832,15 @@ namespace mongo { toSave.append( "name" , nn ); cacheKey += nn; } - + if ( unique ) toSave.appendBool( "unique", unique ); if ( _seenIndexes.count( cacheKey ) ) return 0; - _seenIndexes.insert( cacheKey ); + + if ( cache ) + _seenIndexes.insert( cacheKey ); insert( Namespace( ns.c_str() ).getSisterNS( 
"system.indexes" ).c_str() , toSave.obj() ); return 1; @@ -808,9 +875,10 @@ namespace mongo { void DBClientConnection::say( Message &toSend ) { checkConnection(); - try { + try { port().say( toSend ); - } catch( SocketException & ) { + } + catch( SocketException & ) { failed = true; throw; } @@ -820,24 +888,25 @@ namespace mongo { port().piggyBack( toSend ); } - void DBClientConnection::recv( Message &m ) { + void DBClientConnection::recv( Message &m ) { port().recv(m); } - bool DBClientConnection::call( Message &toSend, Message &response, bool assertOk ) { - /* todo: this is very ugly messagingport::call returns an error code AND can throw - an exception. we should make it return void and just throw an exception anytime + bool DBClientConnection::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) { + /* todo: this is very ugly messagingport::call returns an error code AND can throw + an exception. we should make it return void and just throw an exception anytime it fails */ - try { + try { if ( !port().call(toSend, response) ) { failed = true; if ( assertOk ) - uassert( 10278 , "dbclient error communicating with server", false); + uasserted( 10278 , str::stream() << "dbclient error communicating with server: " << getServerAddress() ); + return false; } } - catch( SocketException & ) { + catch( SocketException & ) { failed = true; throw; } @@ -858,222 +927,24 @@ namespace mongo { } } - void DBClientConnection::killCursor( long long cursorId ){ + void DBClientConnection::killCursor( long long cursorId ) { BufBuilder b; b.appendNum( (int)0 ); // reserved b.appendNum( (int)1 ); // number b.appendNum( cursorId ); - + Message m; m.setData( dbKillCursors , b.buf() , b.len() ); - sayPiggyBack( m ); + if ( _lazyKillCursor ) + sayPiggyBack( m ); + else + say(m); } - /* --- class dbclientpaired --- */ + AtomicUInt DBClientConnection::_numConnections; + bool DBClientConnection::_lazyKillCursor = true; - string DBClientReplicaSet::toString() { - return getServerAddress(); - } - - DBClientReplicaSet::DBClientReplicaSet( const string& name , const vector& servers ) - : _name( name ) , _currentMaster( 0 ), _servers( servers ){ - - for ( unsigned i=0; i<_servers.size(); i++ ) - _conns.push_back( new DBClientConnection( true , this ) ); - } - - DBClientReplicaSet::~DBClientReplicaSet(){ - for ( unsigned i=0; i<_conns.size(); i++ ) - delete _conns[i]; - _conns.clear(); - } - - string DBClientReplicaSet::getServerAddress() const { - StringBuilder ss; - if ( _name.size() ) - ss << _name << "/"; - - for ( unsigned i=0; i<_servers.size(); i++ ){ - if ( i > 0 ) - ss << ","; - ss << _servers[i].toString(); - } - return ss.str(); - } - - /* find which server, the left or right, is currently master mode */ - void DBClientReplicaSet::_checkMaster() { - - bool triedQuickCheck = false; - - log( _logLevel + 1) << "_checkMaster on: " << toString() << endl; - for ( int retry = 0; retry < 2; retry++ ) { - for ( unsigned i=0; i<_conns.size(); i++ ){ - DBClientConnection * c = _conns[i]; - try { - bool im; - BSONObj o; - c->isMaster(im, &o); - - if ( retry ) - log(_logLevel) << "checkmaster: " << c->toString() << ' ' << o << '\n'; - - string maybePrimary; - if ( o["hosts"].type() == Array ){ - if ( o["primary"].type() == String ) - maybePrimary = o["primary"].String(); - - BSONObjIterator hi(o["hosts"].Obj()); - while ( hi.more() ){ - string toCheck = hi.next().String(); - int found = -1; - for ( unsigned x=0; x<_servers.size(); x++ ){ - if ( toCheck == _servers[x].toString() ){ - found = x; - 
break; - } - } - - if ( found == -1 ){ - HostAndPort h( toCheck ); - _servers.push_back( h ); - _conns.push_back( new DBClientConnection( true, this ) ); - string temp; - _conns[ _conns.size() - 1 ]->connect( h , temp ); - log( _logLevel ) << "updated set to: " << toString() << endl; - } - - } - } - - if ( im ) { - _currentMaster = c; - return; - } - - if ( maybePrimary.size() && ! triedQuickCheck ){ - for ( unsigned x=0; x<_servers.size(); x++ ){ - if ( _servers[i].toString() != maybePrimary ) - continue; - triedQuickCheck = true; - _conns[x]->isMaster( im , &o ); - if ( im ){ - _currentMaster = _conns[x]; - return; - } - } - } - } - catch ( std::exception& e ) { - if ( retry ) - log(_logLevel) << "checkmaster: caught exception " << c->toString() << ' ' << e.what() << endl; - } - } - sleepsecs(1); - } - - uassert( 10009 , "checkmaster: no master found", false); - } - - DBClientConnection * DBClientReplicaSet::checkMaster() { - if ( _currentMaster ){ - // a master is selected. let's just make sure connection didn't die - if ( ! _currentMaster->isFailed() ) - return _currentMaster; - _currentMaster = 0; - } - - _checkMaster(); - assert( _currentMaster ); - return _currentMaster; - } - - DBClientConnection& DBClientReplicaSet::masterConn(){ - return *checkMaster(); - } - - DBClientConnection& DBClientReplicaSet::slaveConn(){ - DBClientConnection * m = checkMaster(); - assert( ! m->isFailed() ); - - DBClientConnection * failedSlave = 0; - - for ( unsigned i=0; i<_conns.size(); i++ ){ - if ( m == _conns[i] ) - continue; - failedSlave = _conns[i]; - if ( _conns[i]->isFailed() ) - continue; - return *_conns[i]; - } - - assert(failedSlave); - return *failedSlave; - } - - bool DBClientReplicaSet::connect(){ - string errmsg; - - bool anyGood = false; - for ( unsigned i=0; i<_conns.size(); i++ ){ - if ( _conns[i]->connect( _servers[i] , errmsg ) ) - anyGood = true; - } - - if ( ! anyGood ) - return false; - - try { - checkMaster(); - } - catch (AssertionException&) { - return false; - } - return true; - } - - bool DBClientReplicaSet::auth(const string &dbname, const string &username, const string &pwd, string& errmsg, bool digestPassword ) { - DBClientConnection * m = checkMaster(); - if( !m->auth(dbname, username, pwd, errmsg, digestPassword ) ) - return false; - - /* we try to authentiate with the other half of the pair -- even if down, that way the authInfo is cached. 
*/ - for ( unsigned i=0; i<_conns.size(); i++ ){ - if ( _conns[i] == m ) - continue; - try { - string e; - _conns[i]->auth( dbname , username , pwd , e , digestPassword ); - } - catch ( AssertionException& ){ - } - } - - return true; - } - - auto_ptr DBClientReplicaSet::query(const string &a, Query b, int c, int d, - const BSONObj *e, int f, int g){ - // TODO: if slave ok is set go to a slave - return checkMaster()->query(a,b,c,d,e,f,g); - } - - BSONObj DBClientReplicaSet::findOne(const string &a, const Query& b, const BSONObj *c, int d) { - return checkMaster()->findOne(a,b,c,d); - } - - bool DBClientReplicaSet::isMember( const DBConnector * conn ) const { - if ( conn == this ) - return true; - - for ( unsigned i=0; i<_conns.size(); i++ ) - if ( _conns[i]->isMember( conn ) ) - return true; - - return false; - } - bool serverAlive( const string &uri ) { DBClientConnection c( false, 0, 20 ); // potentially the connection to server could fail while we're checking if it's alive - so use timeouts @@ -1084,5 +955,5 @@ namespace mongo { return false; return true; } - + } // namespace mongo diff --git a/client/dbclient.h b/client/dbclient.h index 9448055..9cb6571 100644 --- a/client/dbclient.h +++ b/client/dbclient.h @@ -40,7 +40,7 @@ namespace mongo { /** allow query of replica slave. normally these return an error except for namespace "local". */ QueryOption_SlaveOk = 1 << 2, - + // findingStart mode is used to find the first operation of interest when // we are scanning through a repl log. For efficiency in the common case, // where the first operation of interest is closer to the tail than the head, @@ -52,25 +52,31 @@ namespace mongo { QueryOption_OplogReplay = 1 << 3, /** The server normally times out idle cursors after an inactivy period to prevent excess memory uses - Set this option to prevent that. + Set this option to prevent that. */ QueryOption_NoCursorTimeout = 1 << 4, - /** Use with QueryOption_CursorTailable. If we are at the end of the data, block for a while rather + /** Use with QueryOption_CursorTailable. If we are at the end of the data, block for a while rather than returning no data. After a timeout period, we do return as normal. */ QueryOption_AwaitData = 1 << 5, - /** Stream the data down full blast in multiple "more" packages, on the assumption that the client - will fully read all data queried. Faster when you are pulling a lot of data and know you want to + /** Stream the data down full blast in multiple "more" packages, on the assumption that the client + will fully read all data queried. Faster when you are pulling a lot of data and know you want to pull it all down. Note: it is not allowed to not read all the data unless you close the connection. - Use the query( boost::function f, ... ) version of the connection's query() + Use the query( boost::function f, ... ) version of the connection's query() method, and it will take care of all the details for you. 
*/ QueryOption_Exhaust = 1 << 6, - - QueryOption_AllSupported = QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay | QueryOption_NoCursorTimeout | QueryOption_AwaitData | QueryOption_Exhaust + + /** When sharded, this means its ok to return partial results + Usually we will fail a query if all required shards aren't up + If this is set, it'll be a partial result set + */ + QueryOption_PartialResults = 1 << 7 , + + QueryOption_AllSupported = QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay | QueryOption_NoCursorTimeout | QueryOption_AwaitData | QueryOption_Exhaust | QueryOption_PartialResults }; @@ -78,7 +84,7 @@ namespace mongo { /** Upsert - that is, insert the item if no matching item is found. */ UpdateOption_Upsert = 1 << 0, - /** Update multiple documents (if multiple documents match query expression). + /** Update multiple documents (if multiple documents match query expression). (Default is update a single document and stop.) */ UpdateOption_Multi = 1 << 1, @@ -96,28 +102,40 @@ namespace mongo { class DBClientBase; + /** + * ConnectionString handles parsing different ways to connect to mongo and determining method + * samples: + * server + * server:port + * foo/server:port,server:port SET + * server,server,server SYNC + * + * tyipcal use + * string errmsg, + * ConnectionString cs = ConnectionString::parse( url , errmsg ); + * if ( ! cs.isValid() ) throw "bad: " + errmsg; + * DBClientBase * conn = cs.connect( errmsg ); + */ class ConnectionString { public: enum ConnectionType { INVALID , MASTER , PAIR , SET , SYNC }; - - ConnectionString( const HostAndPort& server ){ + + ConnectionString() { + _type = INVALID; + } + + ConnectionString( const HostAndPort& server ) { _type = MASTER; _servers.push_back( server ); _finishInit(); } - // TODO Delete if nobody is using - //ConnectionString( ConnectionType type , const vector& servers ) - // : _type( type ) , _servers( servers ){ - // _finishInit(); - //} - - ConnectionString( ConnectionType type , const string& s , const string& setName = "" ){ + ConnectionString( ConnectionType type , const string& s , const string& setName = "" ) { _type = type; _setName = setName; _fillServers( s ); - - switch ( _type ){ + + switch ( _type ) { case MASTER: assert( _servers.size() == 1 ); break; @@ -131,73 +149,54 @@ namespace mongo { default: assert( _servers.size() > 0 ); } - + _finishInit(); } - ConnectionString( const string& s , ConnectionType favoredMultipleType ){ + ConnectionString( const string& s , ConnectionType favoredMultipleType ) { + _type = INVALID; + _fillServers( s ); - if ( _servers.size() == 1 ){ + if ( _type != INVALID ) { + // set already + } + else if ( _servers.size() == 1 ) { _type = MASTER; } else { _type = favoredMultipleType; - assert( _type != MASTER ); + assert( _type == SET || _type == SYNC ); } _finishInit(); } bool isValid() const { return _type != INVALID; } - - string toString() const { - return _string; - } + + string toString() const { return _string; } DBClientBase* connect( string& errmsg ) const; - static ConnectionString parse( const string& url , string& errmsg ); - - string getSetName() const{ - return _setName; - } + string getSetName() const { return _setName; } - vector getServers() const { - return _servers; - } + vector getServers() const { return _servers; } + ConnectionType type() const { return _type; } + + static ConnectionString parse( const string& url , string& errmsg ); + + static string typeToString( ConnectionType type ); + private: - 
ConnectionString(){ - _type = INVALID; - } - - void _fillServers( string s ){ - string::size_type idx; - while ( ( idx = s.find( ',' ) ) != string::npos ){ - _servers.push_back( s.substr( 0 , idx ) ); - s = s.substr( idx + 1 ); - } - _servers.push_back( s ); - } - - void _finishInit(){ - stringstream ss; - if ( _type == SET ) - ss << _setName << "/"; - for ( unsigned i=0; i<_servers.size(); i++ ){ - if ( i > 0 ) - ss << ","; - ss << _servers[i].toString(); - } - _string = ss.str(); - } + void _fillServers( string s ); + void _finishInit(); ConnectionType _type; vector _servers; string _string; string _setName; }; - + /** * controls how much a clients cares about writes * default is NORMAL @@ -213,7 +212,7 @@ namespace mongo { class DBClientCursor; class DBClientCursorBatchIterator; - /** Represents a Mongo query expression. Typically one uses the QUERY(...) macro to construct a Query object. + /** Represents a Mongo query expression. Typically one uses the QUERY(...) macro to construct a Query object. Examples: QUERY( "age" << 33 << "school" << "UCLA" ).sort("name") QUERY( "age" << GT << 30 << LT << 50 ) @@ -223,22 +222,22 @@ namespace mongo { BSONObj obj; Query() : obj(BSONObj()) { } Query(const BSONObj& b) : obj(b) { } - Query(const string &json) : + Query(const string &json) : obj(fromjson(json)) { } - Query(const char * json) : + Query(const char * json) : obj(fromjson(json)) { } - /** Add a sort (ORDER BY) criteria to the query expression. + /** Add a sort (ORDER BY) criteria to the query expression. @param sortPattern the sort order template. For example to order by name ascending, time descending: { name : 1, ts : -1 } i.e. BSON( "name" << 1 << "ts" << -1 ) - or + or fromjson(" name : 1, ts : -1 ") */ Query& sort(const BSONObj& sortPattern); - /** Add a sort (ORDER BY) criteria to the query expression. + /** Add a sort (ORDER BY) criteria to the query expression. This version of sort() assumes you want to sort on a single field. @param asc = 1 for ascending order asc = -1 for descending order @@ -267,8 +266,8 @@ namespace mongo { */ Query& explain(); - /** Use snapshot mode for the query. Snapshot mode assures no duplicates are returned, or objects missed, which were - present at both the start and end of the query's execution (if an object is new during the query, or deleted during + /** Use snapshot mode for the query. Snapshot mode assures no duplicates are returned, or objects missed, which were + present at both the start and end of the query's execution (if an object is new during the query, or deleted during the query, it may or may not be returned, even with snapshot mode). Note that short query responses (less than 1MB) are always effectively snapshotted. @@ -277,16 +276,16 @@ namespace mongo { */ Query& snapshot(); - /** Queries to the Mongo database support a $where parameter option which contains - a javascript function that is evaluated to see whether objects being queried match - its criteria. Use this helper to append such a function to a query object. + /** Queries to the Mongo database support a $where parameter option which contains + a javascript function that is evaluated to see whether objects being queried match + its criteria. Use this helper to append such a function to a query object. Your query may also contain other traditional Mongo query terms. - @param jscode The javascript function to evaluate against each potential object - match. The function must return true for matched objects. 
Use the this + @param jscode The javascript function to evaluate against each potential object + match. The function must return true for matched objects. Use the this variable to inspect the current object. - @param scope SavedContext for the javascript object. List in a BSON object any - variables you would like defined when the jscode executes. One can think + @param scope SavedContext for the javascript object. List in a BSON object any + variables you would like defined when the jscode executes. One can think of these as "bind variables". Examples: @@ -300,12 +299,12 @@ namespace mongo { * if this query has an orderby, hint, or some other field */ bool isComplex( bool * hasDollar = 0 ) const; - + BSONObj getFilter() const; BSONObj getSort() const; BSONObj getHint() const; bool isExplain() const; - + string toString() const; operator string() const { return toString(); } private: @@ -316,13 +315,13 @@ namespace mongo { BSONObjBuilder b; b.appendElements(obj); b.append(fieldName, val); - obj = b.obj(); + obj = b.obj(); } }; - -/** Typically one uses the QUERY(...) macro to construct a Query object. - Example: QUERY( "age" << 33 << "school" << "UCLA" ) -*/ + + /** Typically one uses the QUERY(...) macro to construct a Query object. + Example: QUERY( "age" << 33 << "school" << "UCLA" ) + */ #define QUERY(x) mongo::Query( BSON(x) ) /** @@ -331,15 +330,14 @@ namespace mongo { class DBConnector { public: virtual ~DBConnector() {} - virtual bool call( Message &toSend, Message &response, bool assertOk=true ) = 0; + /** actualServer is set to the actual server where they call went if there was a choice (SlaveOk) */ + virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 ) = 0; virtual void say( Message &toSend ) = 0; virtual void sayPiggyBack( Message &toSend ) = 0; virtual void checkResponse( const char* data, int nReturned ) {} /* used by QueryOption_Exhaust. To use that your subclass must implement this. */ virtual void recv( Message& m ) { assert(false); } - - virtual string getServerAddress() const = 0; }; /** @@ -352,9 +350,9 @@ namespace mongo { /** don't use this - called automatically by DBClientCursor for you */ virtual auto_ptr getMore( const string &ns, long long cursorId, int nToReturn = 0, int options = 0 ) = 0; - + virtual void insert( const string &ns, BSONObj obj ) = 0; - + virtual void insert( const string &ns, const vector< BSONObj >& v ) = 0; virtual void remove( const string &ns , Query query, bool justOne = 0 ) = 0; @@ -369,6 +367,7 @@ namespace mongo { */ virtual BSONObj findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0); + virtual string getServerAddress() const = 0; }; @@ -397,18 +396,18 @@ namespace mongo { directly call runCommand. @param dbname database name. Use "admin" for global administrative commands. - @param cmd the command object to execute. For example, { ismaster : 1 } - @param info the result object the database returns. Typically has { ok : ..., errmsg : ... } fields - set. + @param cmd the command object to execute. For example, { ismaster : 1 } + @param info the result object the database returns. Typically has { ok : ..., errmsg : ... } fields + set. @param options see enum QueryOptions - normally not needed to run a command @return true if the command returned "ok". */ virtual bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0); /** Authorize access to a particular database. 
- Authentication is separate for each database on the server -- you may authenticate for any + Authentication is separate for each database on the server -- you may authenticate for any number of databases on a single connection. - The "admin" database is special and once authenticated provides access to all databases on the + The "admin" database is special and once authenticated provides access to all databases on the server. @param digestPassword if password is plain text, set this to true. otherwise assumed to be pre-digested @return true if successful @@ -418,7 +417,7 @@ namespace mongo { /** count number of objects in collection ns that match the query criteria specified throws UserAssertion if database returns an error */ - unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0 ); + virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 ); string createPasswordDigest( const string &username , const string &clearTextPassword ); @@ -450,14 +449,14 @@ namespace mongo { */ bool createCollection(const string &ns, long long size = 0, bool capped = false, int max = 0, BSONObj *info = 0); - /** Get error result from the last operation on this connection. + /** Get error result from the last operation on this connection. @return error message text, or empty string if no error. */ string getLastError(); - /** Get error result from the last operation on this connection. - @return full error object. - */ - virtual BSONObj getLastErrorDetailed(); + /** Get error result from the last operation on this connection. + @return full error object. + */ + virtual BSONObj getLastErrorDetailed(); static string getLastErrorString( const BSONObj& res ); @@ -466,23 +465,23 @@ namespace mongo { @return { err : , nPrev : , ok : 1 } result.err will be null if no error has occurred. - */ + */ BSONObj getPrevError(); - /** Reset the previous error state for this connection (accessed via getLastError and - getPrevError). Useful when performing several operations at once and then checking + /** Reset the previous error state for this connection (accessed via getLastError and + getPrevError). Useful when performing several operations at once and then checking for an error after attempting all operations. */ bool resetError() { return simpleCommand("admin", 0, "reseterror"); } - /** Delete the specified collection. */ - virtual bool dropCollection( const string &ns ){ + /** Delete the specified collection. */ + virtual bool dropCollection( const string &ns ) { string db = nsGetDB( ns ); string coll = nsGetCollection( ns ); uassert( 10011 , "no collection name", coll.size() ); BSONObj info; - + bool res = runCommand( db.c_str() , BSON( "drop" << coll ) , info ); resetIndexCache(); return res; @@ -494,7 +493,7 @@ namespace mongo { bool repairDatabase(const string &dbname, BSONObj *info = 0) { return simpleCommand(dbname, info, "repairDatabase"); } - + /** Copy database from one server or name to another server or name. Generally, you should dropDatabase() first as otherwise the copied information will MERGE @@ -524,23 +523,23 @@ namespace mongo { ProfileOff = 0, ProfileSlow = 1, // log very slow (>100ms) operations ProfileAll = 2 - + }; bool setDbProfilingLevel(const string &dbname, ProfilingLevel level, BSONObj *info = 0); bool getDbProfilingLevel(const string &dbname, ProfilingLevel& level, BSONObj *info = 0); - /** Run a map/reduce job on the server. + /** Run a map/reduce job on the server. 
See http://www.mongodb.org/display/DOCS/MapReduce ns namespace (db+collection name) of input data - jsmapf javascript map function code - jsreducef javascript reduce function code. + jsmapf javascript map function code + jsreducef javascript reduce function code. query optional query filter for the input - output optional permanent output collection name. if not specified server will + output optional permanent output collection name. if not specified server will generate a temporary collection and return its name. - returns a result object which contains: + returns a result object which contains: { result : , numObjects : , timeMillis : , @@ -548,8 +547,8 @@ namespace mongo { [, err : ] } - For example one might call: - result.getField("ok").trueValue() + For example one might call: + result.getField("ok").trueValue() on the result to check if ok. */ BSONObj mapreduce(const string &ns, const string &jsmapf, const string &jsreducef, BSONObj query = BSONObj(), const string& output = ""); @@ -560,7 +559,7 @@ namespace mongo { jscode source code for a javascript function. info the command object which contains any information on the invocation result including the return value and other information. If an error occurs running the jscode, error - information will be in info. (try "out() << info.toString()") + information will be in info. (try "out() << info.toString()") retValue return value from the jscode function. args args to pass to the jscode function. when invoked, the 'args' variable will be defined for use by the jscode. @@ -571,10 +570,10 @@ namespace mongo { */ bool eval(const string &dbname, const string &jscode, BSONObj& info, BSONElement& retValue, BSONObj *args = 0); - /** - + /** validate a collection, checking for errors and reporting back statistics. + this operation is slow and blocking. */ - bool validate( const string &ns , bool scandata=true ){ + bool validate( const string &ns , bool scandata=true ) { BSONObj cmd = BSON( "validate" << nsGetCollection( ns ) << "scandata" << scandata ); BSONObj info; return runCommand( nsGetDB( ns ).c_str() , cmd , info ); @@ -607,7 +606,7 @@ namespace mongo { ret = (NumType) retValue.number(); return true; } - + /** get a list of all the current databases uses the { listDatabases : 1 } command. @@ -623,16 +622,18 @@ namespace mongo { bool exists( const string& ns ); /** Create an index if it does not already exist. - ensureIndex calls are remembered so it is safe/fast to call this function many + ensureIndex calls are remembered so it is safe/fast to call this function many times in your code. @param ns collection to be indexed @param keys the "key pattern" for the index. e.g., { name : 1 } @param unique if true, indicates that key uniqueness should be enforced for this index @param name if not isn't specified, it will be created from the keys (recommended) + @param cache if set to false, the index cache for the connection won't remember this call @return whether or not sent message to db. 
should be true on first call, false on subsequent unless resetIndexCache was called */ - virtual bool ensureIndex( const string &ns , BSONObj keys , bool unique = false, const string &name = "" ); + virtual bool ensureIndex( const string &ns , BSONObj keys , bool unique = false, const string &name = "", + bool cache = true ); /** clears the index cache, so the subsequent call to ensureIndex for any index will go to the server @@ -640,17 +641,17 @@ namespace mongo { virtual void resetIndexCache(); virtual auto_ptr getIndexes( const string &ns ); - + virtual void dropIndex( const string& ns , BSONObj keys ); virtual void dropIndex( const string& ns , const string& indexName ); - + /** drops all indexes for the collection */ virtual void dropIndexes( const string& ns ); virtual void reIndex( const string& ns ); - + string genIndexName( const BSONObj& keys ); /** Erase / drop an entire database */ @@ -663,33 +664,35 @@ namespace mongo { virtual string toString() = 0; /** @return the database name portion of an ns string */ - string nsGetDB( const string &ns ){ + string nsGetDB( const string &ns ) { string::size_type pos = ns.find( "." ); if ( pos == string::npos ) return ns; - + return ns.substr( 0 , pos ); } - + /** @return the collection name portion of an ns string */ - string nsGetCollection( const string &ns ){ + string nsGetCollection( const string &ns ) { string::size_type pos = ns.find( "." ); if ( pos == string::npos ) return ""; - return ns.substr( pos + 1 ); + return ns.substr( pos + 1 ); } protected: bool isOk(const BSONObj&); - + + BSONObj _countCmd(const string &ns, const BSONObj& query, int options, int limit, int skip ); + enum QueryOptions availableOptions(); - + private: enum QueryOptions _cachedAvailableOptions; bool _haveCachedAvailableOptions; }; - + /** abstract class that implements the core db operations */ @@ -698,20 +701,20 @@ namespace mongo { WriteConcern _writeConcern; public: - DBClientBase(){ + DBClientBase() { _writeConcern = W_NORMAL; } - + WriteConcern getWriteConcern() const { return _writeConcern; } - void setWriteConcern( WriteConcern w ){ _writeConcern = w; } - + void setWriteConcern( WriteConcern w ) { _writeConcern = w; } + /** send a query to the database. @param ns namespace to query, format is .[.]* @param query query to perform on the collection. this is a BSONObj (binary JSON) You may format as { query: { ... }, orderby: { ... } } to specify a sort order. - @param nToReturn n to return. 0 = unlimited + @param nToReturn n to return (i.e., limit). 0 = unlimited @param nToSkip start with the nth item @param fieldsToReturn optional template of which fields to select. 
if unspecified, returns all fields @param queryOptions see options enum at top of this file @@ -744,23 +747,15 @@ namespace mongo { @param justOne if this true, then once a single match is found will stop */ virtual void remove( const string &ns , Query q , bool justOne = 0 ); - + /** updates objects matching query */ virtual void update( const string &ns , Query query , BSONObj obj , bool upsert = false , bool multi = false ); - + virtual bool isFailed() const = 0; - - virtual void killCursor( long long cursorID ) = 0; - static int countCommas( const string& s ){ - int n = 0; - for ( unsigned i=0; i p; - boost::scoped_ptr server; - bool failed; // true if some sort of fatal error has ever happened - bool autoReconnect; - time_t lastReconnectTry; - HostAndPort _server; // remember for reconnects - string _serverString; - int _port; - void _checkConnection(); - void checkConnection() { if( failed ) _checkConnection(); } - map< string, pair > authCache; - double _timeout; - - bool _connect( string& errmsg ); public: - /** @param _autoReconnect if true, automatically reconnect on a connection failure @param cp used by DBClientReplicaSet. You do not need to specify this parameter - @param timeout tcp timeout in seconds - this is for read/write, not connect. + @param timeout tcp timeout in seconds - this is for read/write, not connect. Connect timeout is fixed, but short, at 5 seconds. */ - DBClientConnection(bool _autoReconnect=false, DBClientReplicaSet* cp=0, double timeout=0) : - clientSet(cp), failed(false), autoReconnect(_autoReconnect), lastReconnectTry(0), _timeout(timeout) { } + DBClientConnection(bool _autoReconnect=false, DBClientReplicaSet* cp=0, double so_timeout=0) : + clientSet(cp), failed(false), autoReconnect(_autoReconnect), lastReconnectTry(0), _so_timeout(so_timeout) { + _numConnections++; + } + + virtual ~DBClientConnection() { + _numConnections--; + } /** Connect to a Mongo database server. @@ -821,14 +804,14 @@ namespace mongo { @deprecated please use HostAndPort @return false if fails to connect. */ - virtual bool connect(const char * hostname, string& errmsg){ + virtual bool connect(const char * hostname, string& errmsg) { // TODO: remove this method HostAndPort t( hostname ); return connect( t , errmsg ); } /** Connect to a Mongo database server. - + If autoReconnect is true, you can try to use the DBClientConnection even when false was returned -- it will try to connect again. @@ -846,9 +829,9 @@ namespace mongo { @param serverHostname host to connect to. can include port number ( 127.0.0.1 , 127.0.0.1:5555 ) */ - void connect(const string& serverHostname) { + void connect(const string& serverHostname) { string errmsg; - if( !connect(HostAndPort(serverHostname), errmsg) ) + if( !connect(HostAndPort(serverHostname), errmsg) ) throw ConnectException(string("can't connect ") + errmsg); } @@ -860,23 +843,22 @@ namespace mongo { return DBClientBase::query( ns, query, nToReturn, nToSkip, fieldsToReturn, queryOptions , batchSize ); } - /** uses QueryOption_Exhaust - use DBClientCursorBatchIterator if you want to do items in large blocks, perhpas to avoid granular locking and such. + /** Uses QueryOption_Exhaust + Exhaust mode sends back all data queries as fast as possible, with no back-and-for for OP_GETMORE. If you are certain + you will exhaust the query, it could be useful. + + Use DBClientCursorBatchIterator version if you want to do items in large blocks, perhaps to avoid granular locking and such. 
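A rough sketch of the callback form, assuming c is a connected DBClientConnection, test.coll is a placeholder namespace, and printDoc is a free function supplied by the caller:

    void printDoc( const BSONObj& doc ) {
        cout << doc.toString() << endl;
    }

    // ... then, to stream every matching document through the callback:
    boost::function< void(const BSONObj&) > f = printDoc;
    unsigned long long nDocs = c.query( f , "test.coll" , Query() );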
*/ unsigned long long query( boost::function f, const string& ns, Query query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0); unsigned long long query( boost::function f, const string& ns, Query query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0); /** - @return true if this connection is currently in a failed state. When autoreconnect is on, + @return true if this connection is currently in a failed state. When autoreconnect is on, a connection will transition back to an ok state after reconnecting. */ - bool isFailed() const { - return failed; - } + bool isFailed() const { return failed; } - MessagingPort& port() { - return *p; - } + MessagingPort& port() { return *p; } string toStringLong() const { stringstream ss; @@ -886,143 +868,59 @@ namespace mongo { } /** Returns the address of the server */ - string toString() { - return _serverString; - } - - string getServerAddress() const { - return _serverString; - } - - virtual void killCursor( long long cursorID ); + string toString() { return _serverString; } - virtual bool callRead( Message& toSend , Message& response ){ - return call( toSend , response ); - } + string getServerAddress() const { return _serverString; } + virtual void killCursor( long long cursorID ); + virtual bool callRead( Message& toSend , Message& response ) { return call( toSend , response ); } virtual void say( Message &toSend ); - virtual bool call( Message &toSend, Message &response, bool assertOk = true ); - - virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; } - - virtual bool isMember( const DBConnector * conn ) const { return this == conn; }; - + virtual bool call( Message &toSend, Message &response, bool assertOk = true , string * actualServer = 0 ); + virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; } virtual void checkResponse( const char *data, int nReturned ); + void setSoTimeout(double to) { _so_timeout = to; } + + static int getNumConnections() { + return _numConnections; + } + + static void setLazyKillCursor( bool lazy ) { _lazyKillCursor = lazy; } + static bool getLazyKillCursor() { return _lazyKillCursor; } protected: friend class SyncClusterConnection; virtual void recv( Message& m ); virtual void sayPiggyBack( Message &toSend ); - }; - - /** Use this class to connect to a replica set of servers. The class will manage - checking for which server in a replica set is master, and do failover automatically. - - This can also be used to connect to replica pairs since pairs are a subset of sets - - On a failover situation, expect at least one operation to return an error (throw - an exception) before the failover is complete. Operations are not retried. - */ - class DBClientReplicaSet : public DBClientBase { - string _name; - DBClientConnection * _currentMaster; - vector _servers; - vector _conns; - - - void _checkMaster(); - DBClientConnection * checkMaster(); - - public: - /** Call connect() after constructing. autoReconnect is always on for DBClientReplicaSet connections. */ - DBClientReplicaSet( const string& name , const vector& servers ); - virtual ~DBClientReplicaSet(); - - /** Returns false if nomember of the set were reachable, or neither is - master, although, - when false returned, you can still try to use this connection object, it will - try reconnects. - */ - bool connect(); - - /** Authorize. 
Authorizes all nodes as needed - */ - virtual bool auth(const string &dbname, const string &username, const string &pwd, string& errmsg, bool digestPassword = true ); - - /** throws userassertion "no master found" */ - virtual - auto_ptr query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0, - const BSONObj *fieldsToReturn = 0, int queryOptions = 0 , int batchSize = 0 ); - - /** throws userassertion "no master found" */ - virtual - BSONObj findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0); - - /** insert */ - virtual void insert( const string &ns , BSONObj obj ) { - checkMaster()->insert(ns, obj); - } - - /** insert multiple objects. Note that single object insert is asynchronous, so this version - is only nominally faster and not worth a special effort to try to use. */ - virtual void insert( const string &ns, const vector< BSONObj >& v ) { - checkMaster()->insert(ns, v); - } - - /** remove */ - virtual void remove( const string &ns , Query obj , bool justOne = 0 ) { - checkMaster()->remove(ns, obj, justOne); - } - - /** update */ - virtual void update( const string &ns , Query query , BSONObj obj , bool upsert = 0 , bool multi = 0 ) { - return checkMaster()->update(ns, query, obj, upsert,multi); - } - - virtual void killCursor( long long cursorID ){ - checkMaster()->killCursor( cursorID ); - } - - string toString(); - - /* this is the callback from our underlying connections to notify us that we got a "not master" error. - */ - void isntMaster() { - _currentMaster = 0; - } - - string getServerAddress() const; - - DBClientConnection& masterConn(); - DBClientConnection& slaveConn(); - - - virtual bool call( Message &toSend, Message &response, bool assertOk=true ) { return checkMaster()->call( toSend , response , assertOk ); } - virtual void say( Message &toSend ) { checkMaster()->say( toSend ); } - virtual bool callRead( Message& toSend , Message& response ){ return checkMaster()->callRead( toSend , response ); } - - virtual ConnectionString::ConnectionType type() const { return ConnectionString::SET; } + DBClientReplicaSet *clientSet; + boost::scoped_ptr p; + boost::scoped_ptr server; + bool failed; + const bool autoReconnect; + time_t lastReconnectTry; + HostAndPort _server; // remember for reconnects + string _serverString; + void _checkConnection(); - virtual bool isMember( const DBConnector * conn ) const; + // throws SocketException if in failed state and not reconnecting or if waiting to reconnect + void checkConnection() { if( failed ) _checkConnection(); } - virtual void checkResponse( const char *data, int nReturned ) { checkMaster()->checkResponse( data , nReturned ); } + map< string, pair > authCache; + double _so_timeout; + bool _connect( string& errmsg ); - protected: - virtual void sayPiggyBack( Message &toSend ) { checkMaster()->say( toSend ); } - - bool isFailed() const { - return _currentMaster == 0 || _currentMaster->isFailed(); - } + static AtomicUInt _numConnections; + static bool _lazyKillCursor; // lazy means we piggy back kill cursors on next op }; - + /** pings server to check if it's up */ bool serverAlive( const string &uri ); DBClientBase * createDirectClient(); - + } // namespace mongo #include "dbclientcursor.h" +#include "dbclient_rs.h" #include "undef_macros.h" diff --git a/client/dbclient_rs.cpp b/client/dbclient_rs.cpp new file mode 100644 index 0000000..fd8ecec --- /dev/null +++ b/client/dbclient_rs.cpp @@ -0,0 +1,594 @@ +// dbclient.cpp - connect to a Mongo database as a database, 
from C++ + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pch.h" +#include "dbclient.h" +#include "../bson/util/builder.h" +#include "../db/jsobj.h" +#include "../db/json.h" +#include "../db/dbmessage.h" +#include "connpool.h" +#include "dbclient_rs.h" +#include "../util/background.h" + +namespace mongo { + + // -------------------------------- + // ----- ReplicaSetMonitor --------- + // -------------------------------- + + // global background job responsible for checking every X amount of time + class ReplicaSetMonitorWatcher : public BackgroundJob { + public: + ReplicaSetMonitorWatcher() : _safego("ReplicaSetMonitorWatcher::_safego") , _started(false) {} + + virtual string name() const { return "ReplicaSetMonitorWatcher"; } + + void safeGo() { + // check outside of lock for speed + if ( _started ) + return; + + scoped_lock lk( _safego ); + if ( _started ) + return; + _started = true; + + go(); + } + protected: + void run() { + while ( ! inShutdown() ) { + sleepsecs( 20 ); + try { + ReplicaSetMonitor::checkAll(); + } + catch ( std::exception& e ) { + error() << "ReplicaSetMonitorWatcher: check failed: " << e.what() << endl; + } + } + } + + mongo::mutex _safego; + bool _started; + + } replicaSetMonitorWatcher; + + + ReplicaSetMonitor::ReplicaSetMonitor( const string& name , const vector& servers ) + : _lock( "ReplicaSetMonitor instance" ) , _checkConnectionLock( "ReplicaSetMonitor check connection lock" ), _name( name ) , _master(-1) { + + uassert( 13642 , "need at least 1 node for a replica set" , servers.size() > 0 ); + + if ( _name.size() == 0 ) { + warning() << "replica set name empty, first node: " << servers[0] << endl; + } + + string errmsg; + + for ( unsigned i=0; i conn( new DBClientConnection( true , 0, 5.0 ) ); + if (!conn->connect( servers[i] , errmsg ) ) { + log(1) << "error connecting to seed " << servers[i] << ": " << errmsg << endl; + // skip seeds that don't work + continue; + } + + _nodes.push_back( Node( servers[i] , conn.release() ) ); + + string maybePrimary; + if (_checkConnection( _nodes[_nodes.size()-1].conn , maybePrimary, false)) { + break; + } + } + } + + ReplicaSetMonitor::~ReplicaSetMonitor() { + for ( unsigned i=0; i<_nodes.size(); i++ ) + delete _nodes[i].conn; + _nodes.clear(); + _master = -1; + } + + ReplicaSetMonitorPtr ReplicaSetMonitor::get( const string& name , const vector& servers ) { + scoped_lock lk( _setsLock ); + ReplicaSetMonitorPtr& m = _sets[name]; + if ( ! m ) + m.reset( new ReplicaSetMonitor( name , servers ) ); + + replicaSetMonitorWatcher.safeGo(); + + return m; + } + + void ReplicaSetMonitor::checkAll() { + set seen; + + while ( true ) { + ReplicaSetMonitorPtr m; + { + for ( map::iterator i=_sets.begin(); i!=_sets.end(); ++i ) { + string name = i->first; + if ( seen.count( name ) ) + continue; + LOG(1) << "checking replica set: " << name << endl; + seen.insert( name ); + m = i->second; + break; + } + } + + if ( ! 
m ) + break; + + m->check(); + } + + + } + + void ReplicaSetMonitor::setConfigChangeHook( ConfigChangeHook hook ) { + massert( 13610 , "ConfigChangeHook already specified" , _hook == 0 ); + _hook = hook; + } + + string ReplicaSetMonitor::getServerAddress() const { + StringBuilder ss; + if ( _name.size() ) + ss << _name << "/"; + + { + scoped_lock lk( _lock ); + for ( unsigned i=0; i<_nodes.size(); i++ ) { + if ( i > 0 ) + ss << ","; + ss << _nodes[i].addr.toString(); + } + } + return ss.str(); + } + + bool ReplicaSetMonitor::contains( const string& server ) const { + scoped_lock lk( _lock ); + for ( unsigned i=0; i<_nodes.size(); i++ ) { + if ( _nodes[i].addr == server ) + return true; + } + return false; + } + + + void ReplicaSetMonitor::notifyFailure( const HostAndPort& server ) { + scoped_lock lk( _lock ); + if ( _master >= 0 && _master < (int)_nodes.size() ) { + if ( server == _nodes[_master].addr ) + _master = -1; + } + } + + + + HostAndPort ReplicaSetMonitor::getMaster() { + { + scoped_lock lk( _lock ); + if ( _master >= 0 && _nodes[_master].ok ) + return _nodes[_master].addr; + } + + _check(); + + scoped_lock lk( _lock ); + uassert( 10009 , str::stream() << "ReplicaSetMonitor no master found for set: " << _name , _master >= 0 ); + return _nodes[_master].addr; + } + + HostAndPort ReplicaSetMonitor::getSlave( const HostAndPort& prev ) { + // make sure its valid + if ( prev.port() > 0 ) { + scoped_lock lk( _lock ); + for ( unsigned i=0; i<_nodes.size(); i++ ) { + if ( prev != _nodes[i].addr ) + continue; + + if ( _nodes[i].ok ) + return prev; + break; + } + } + + return getSlave(); + } + + HostAndPort ReplicaSetMonitor::getSlave() { + int x = rand() % _nodes.size(); + { + scoped_lock lk( _lock ); + for ( unsigned i=0; i<_nodes.size(); i++ ) { + int p = ( i + x ) % _nodes.size(); + if ( p == _master ) + continue; + if ( _nodes[p].ok ) + return _nodes[p].addr; + } + } + + return _nodes[0].addr; + } + + /** + * notify the monitor that server has faild + */ + void ReplicaSetMonitor::notifySlaveFailure( const HostAndPort& server ) { + int x = _find( server ); + if ( x >= 0 ) { + scoped_lock lk( _lock ); + _nodes[x].ok = false; + } + } + + void ReplicaSetMonitor::_checkStatus(DBClientConnection *conn) { + BSONObj status; + + if (!conn->runCommand("admin", BSON("replSetGetStatus" << 1), status) || + !status.hasField("members") || + status["members"].type() != Array) { + return; + } + + BSONObjIterator hi(status["members"].Obj()); + while (hi.more()) { + BSONObj member = hi.next().Obj(); + string host = member["name"].String(); + + int m = -1; + if ((m = _find(host)) <= 0) { + continue; + } + + double state = member["state"].Number(); + if (member["health"].Number() == 1 && (state == 1 || state == 2)) { + scoped_lock lk( _lock ); + _nodes[m].ok = true; + } + else { + scoped_lock lk( _lock ); + _nodes[m].ok = false; + } + } + } + + void ReplicaSetMonitor::_checkHosts( const BSONObj& hostList, bool& changed ) { + BSONObjIterator hi(hostList); + while ( hi.more() ) { + string toCheck = hi.next().String(); + + if ( _find( toCheck ) >= 0 ) + continue; + + HostAndPort h( toCheck ); + DBClientConnection * newConn = new DBClientConnection( true, 0, 5.0 ); + string temp; + newConn->connect( h , temp ); + { + scoped_lock lk( _lock ); + _nodes.push_back( Node( h , newConn ) ); + } + log() << "updated set (" << _name << ") to: " << getServerAddress() << endl; + changed = true; + } + } + + + + bool ReplicaSetMonitor::_checkConnection( DBClientConnection * c , string& maybePrimary , bool verbose ) { + 
scoped_lock lk( _checkConnectionLock ); + bool isMaster = false; + bool changed = false; + try { + BSONObj o; + c->isMaster(isMaster, &o); + + log( ! verbose ) << "ReplicaSetMonitor::_checkConnection: " << c->toString() << ' ' << o << '\n'; + + // add other nodes + string maybePrimary; + if ( o["hosts"].type() == Array ) { + if ( o["primary"].type() == String ) + maybePrimary = o["primary"].String(); + + _checkHosts(o["hosts"].Obj(), changed); + } + if (o.hasField("passives") && o["passives"].type() == Array) { + _checkHosts(o["passives"].Obj(), changed); + } + + _checkStatus(c); + } + catch ( std::exception& e ) { + log( ! verbose ) << "ReplicaSetMonitor::_checkConnection: caught exception " << c->toString() << ' ' << e.what() << endl; + } + + if ( changed && _hook ) + _hook( this ); + + return isMaster; + } + + void ReplicaSetMonitor::_check() { + + bool triedQuickCheck = false; + + LOG(1) << "_check : " << getServerAddress() << endl; + + for ( int retry = 0; retry < 2; retry++ ) { + for ( unsigned i=0; i<_nodes.size(); i++ ) { + DBClientConnection * c; + { + scoped_lock lk( _lock ); + c = _nodes[i].conn; + } + + string maybePrimary; + if ( _checkConnection( c , maybePrimary , retry ) ) { + _master = i; + return; + } + + if ( ! triedQuickCheck && maybePrimary.size() ) { + int x = _find( maybePrimary ); + if ( x >= 0 ) { + triedQuickCheck = true; + string dummy; + DBClientConnection * testConn; + { + scoped_lock lk( _lock ); + testConn = _nodes[x].conn; + } + if ( _checkConnection( testConn , dummy , false ) ) { + _master = x; + return; + } + } + } + + } + sleepsecs(1); + } + + } + + void ReplicaSetMonitor::check() { + // first see if the current master is fine + if ( _master >= 0 ) { + string temp; + if ( _checkConnection( _nodes[_master].conn , temp , false ) ) { + // current master is fine, so we're done + return; + } + } + + // we either have no master, or the current is dead + _check(); + } + + int ReplicaSetMonitor::_find( const string& server ) const { + scoped_lock lk( _lock ); + for ( unsigned i=0; i<_nodes.size(); i++ ) + if ( _nodes[i].addr == server ) + return i; + return -1; + } + + int ReplicaSetMonitor::_find( const HostAndPort& server ) const { + scoped_lock lk( _lock ); + for ( unsigned i=0; i<_nodes.size(); i++ ) + if ( _nodes[i].addr == server ) + return i; + return -1; + } + + + mongo::mutex ReplicaSetMonitor::_setsLock( "ReplicaSetMonitor" ); + map ReplicaSetMonitor::_sets; + ReplicaSetMonitor::ConfigChangeHook ReplicaSetMonitor::_hook; + // -------------------------------- + // ----- DBClientReplicaSet --------- + // -------------------------------- + + DBClientReplicaSet::DBClientReplicaSet( const string& name , const vector& servers ) + : _monitor( ReplicaSetMonitor::get( name , servers ) ) { + } + + DBClientReplicaSet::~DBClientReplicaSet() { + } + + DBClientConnection * DBClientReplicaSet::checkMaster() { + HostAndPort h = _monitor->getMaster(); + + if ( h == _masterHost ) { + // a master is selected. let's just make sure connection didn't die + if ( ! _master->isFailed() ) + return _master.get(); + _monitor->notifyFailure( _masterHost ); + } + + _masterHost = _monitor->getMaster(); + _master.reset( new DBClientConnection( true ) ); + string errmsg; + if ( ! 
_master->connect( _masterHost , errmsg ) ) { + _monitor->notifyFailure( _masterHost ); + uasserted( 13639 , str::stream() << "can't connect to new replica set master [" << _masterHost.toString() << "] err: " << errmsg ); + } + _auth( _master.get() ); + return _master.get(); + } + + DBClientConnection * DBClientReplicaSet::checkSlave() { + HostAndPort h = _monitor->getSlave( _slaveHost ); + + if ( h == _slaveHost ) { + if ( ! _slave->isFailed() ) + return _slave.get(); + _monitor->notifySlaveFailure( _slaveHost ); + } + + _slaveHost = _monitor->getSlave(); + _slave.reset( new DBClientConnection( true ) ); + _slave->connect( _slaveHost ); + _auth( _slave.get() ); + return _slave.get(); + } + + + void DBClientReplicaSet::_auth( DBClientConnection * conn ) { + for ( list::iterator i=_auths.begin(); i!=_auths.end(); ++i ) { + const AuthInfo& a = *i; + string errmsg; + if ( ! conn->auth( a.dbname , a.username , a.pwd , errmsg, a.digestPassword ) ) + warning() << "cached auth failed for set: " << _monitor->getName() << " db: " << a.dbname << " user: " << a.username << endl; + + } + + } + + DBClientConnection& DBClientReplicaSet::masterConn() { + return *checkMaster(); + } + + DBClientConnection& DBClientReplicaSet::slaveConn() { + return *checkSlave(); + } + + bool DBClientReplicaSet::connect() { + try { + checkMaster(); + } + catch (AssertionException&) { + if (_master && _monitor) { + _monitor->notifyFailure(_masterHost); + } + return false; + } + return true; + } + + bool DBClientReplicaSet::auth(const string &dbname, const string &username, const string &pwd, string& errmsg, bool digestPassword ) { + DBClientConnection * m = checkMaster(); + + // first make sure it actually works + if( ! m->auth(dbname, username, pwd, errmsg, digestPassword ) ) + return false; + + // now that it does, we should save so that for a new node we can auth + _auths.push_back( AuthInfo( dbname , username , pwd , digestPassword ) ); + return true; + } + + // ------------- simple functions ----------------- + + void DBClientReplicaSet::insert( const string &ns , BSONObj obj ) { + checkMaster()->insert(ns, obj); + } + + void DBClientReplicaSet::insert( const string &ns, const vector< BSONObj >& v ) { + checkMaster()->insert(ns, v); + } + + void DBClientReplicaSet::remove( const string &ns , Query obj , bool justOne ) { + checkMaster()->remove(ns, obj, justOne); + } + + void DBClientReplicaSet::update( const string &ns , Query query , BSONObj obj , bool upsert , bool multi ) { + return checkMaster()->update(ns, query, obj, upsert,multi); + } + + auto_ptr DBClientReplicaSet::query(const string &ns, Query query, int nToReturn, int nToSkip, + const BSONObj *fieldsToReturn, int queryOptions, int batchSize) { + + if ( queryOptions & QueryOption_SlaveOk ) { + // we're ok sending to a slave + // we'll try 2 slaves before just using master + // checkSlave will try a different slave automatically after a failure + for ( int i=0; i<2; i++ ) { + try { + return checkSlave()->query(ns,query,nToReturn,nToSkip,fieldsToReturn,queryOptions,batchSize); + } + catch ( DBException & ) { + LOG(1) << "can't query replica set slave: " << _slaveHost << endl; + } + } + } + + return checkMaster()->query(ns,query,nToReturn,nToSkip,fieldsToReturn,queryOptions,batchSize); + } + + BSONObj DBClientReplicaSet::findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn, int queryOptions) { + if ( queryOptions & QueryOption_SlaveOk ) { + // we're ok sending to a slave + // we'll try 2 slaves before just using master + // checkSlave will 
try a different slave automatically after a failure + for ( int i=0; i<2; i++ ) { + try { + return checkSlave()->findOne(ns,query,fieldsToReturn,queryOptions); + } + catch ( DBException & ) { + LOG(1) << "can't query replica set slave: " << _slaveHost << endl; + } + } + } + + return checkMaster()->findOne(ns,query,fieldsToReturn,queryOptions); + } + + void DBClientReplicaSet::killCursor( long long cursorID ) { + // we should neve call killCursor on a replica set conncetion + // since we don't know which server it belongs to + // can't assume master because of slave ok + // and can have a cursor survive a master change + assert(0); + } + + + bool DBClientReplicaSet::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) { + if ( toSend.operation() == dbQuery ) { + // TODO: might be possible to do this faster by changing api + DbMessage dm( toSend ); + QueryMessage qm( dm ); + if ( qm.queryOptions & QueryOption_SlaveOk ) { + for ( int i=0; i<2; i++ ) { + try { + DBClientConnection* s = checkSlave(); + if ( actualServer ) + *actualServer = s->getServerAddress(); + return s->call( toSend , response , assertOk ); + } + catch ( DBException & ) { + log(1) << "can't query replica set slave: " << _slaveHost << endl; + if ( actualServer ) + *actualServer = ""; + } + } + } + } + + DBClientConnection* m = checkMaster(); + if ( actualServer ) + *actualServer = m->getServerAddress(); + return m->call( toSend , response , assertOk ); + } + +} diff --git a/client/dbclient_rs.h b/client/dbclient_rs.h new file mode 100644 index 0000000..43bf561 --- /dev/null +++ b/client/dbclient_rs.h @@ -0,0 +1,276 @@ +/** @file dbclient_rs.h - connect to a Replica Set, from C++ */ + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../pch.h" +#include "dbclient.h" + +namespace mongo { + + class ReplicaSetMonitor; + typedef shared_ptr ReplicaSetMonitorPtr; + + /** + * manages state about a replica set for client + * keeps tabs on whose master and what slaves are up + * can hand a slave to someone for SLAVE_OK + * one instace per process per replica set + * TODO: we might be able to use a regular Node * to avoid _lock + */ + class ReplicaSetMonitor { + public: + + typedef boost::function1 ConfigChangeHook; + + /** + * gets a cached Monitor per name or will create if doesn't exist + */ + static ReplicaSetMonitorPtr get( const string& name , const vector& servers ); + + /** + * checks all sets for current master and new secondaries + * usually only called from a BackgroundJob + */ + static void checkAll(); + + /** + * this is called whenever the config of any repclia set changes + * currently only 1 globally + * asserts if one already exists + * ownership passes to ReplicaSetMonitor and the hook will actually never be deleted + */ + static void setConfigChangeHook( ConfigChangeHook hook ); + + ~ReplicaSetMonitor(); + + /** @return HostAndPort or throws an exception */ + HostAndPort getMaster(); + + /** + * notify the monitor that server has faild + */ + void notifyFailure( const HostAndPort& server ); + + /** @return prev if its still ok, and if not returns a random slave that is ok for reads */ + HostAndPort getSlave( const HostAndPort& prev ); + + /** @return a random slave that is ok for reads */ + HostAndPort getSlave(); + + + /** + * notify the monitor that server has faild + */ + void notifySlaveFailure( const HostAndPort& server ); + + /** + * checks for current master and new secondaries + */ + void check(); + + string getName() const { return _name; } + + string getServerAddress() const; + + bool contains( const string& server ) const; + + private: + /** + * This populates a list of hosts from the list of seeds (discarding the + * seed list). + * @param name set name + * @param servers seeds + */ + ReplicaSetMonitor( const string& name , const vector& servers ); + + void _check(); + + /** + * Use replSetGetStatus command to make sure hosts in host list are up + * and readable. Sets Node::ok appropriately. + */ + void _checkStatus(DBClientConnection *conn); + + /** + * Add array of hosts to host list. Doesn't do anything if hosts are + * already in host list. + * @param hostList the list of hosts to add + * @param changed if new hosts were added + */ + void _checkHosts(const BSONObj& hostList, bool& changed); + + /** + * Updates host list. + * @param c the connection to check + * @param maybePrimary OUT + * @param verbose + * @return if the connection is good + */ + bool _checkConnection( DBClientConnection * c , string& maybePrimary , bool verbose ); + + int _find( const string& server ) const ; + int _find( const HostAndPort& server ) const ; + + mutable mongo::mutex _lock; // protects _nodes + mutable mongo::mutex _checkConnectionLock; + + string _name; + struct Node { + Node( const HostAndPort& a , DBClientConnection* c ) : addr( a ) , conn(c) , ok(true) {} + HostAndPort addr; + DBClientConnection* conn; + + // if this node is in a failure state + // used for slave routing + // this is too simple, should make it better + bool ok; + }; + + /** + * Host list. + */ + vector _nodes; + + int _master; // which node is the current master. 
-1 means no master is known + + + static mongo::mutex _setsLock; // protects _sets + static map _sets; // set name to Monitor + + static ConfigChangeHook _hook; + }; + + /** Use this class to connect to a replica set of servers. The class will manage + checking for which server in a replica set is master, and do failover automatically. + + This can also be used to connect to replica pairs since pairs are a subset of sets + + On a failover situation, expect at least one operation to return an error (throw + an exception) before the failover is complete. Operations are not retried. + */ + class DBClientReplicaSet : public DBClientBase { + + public: + /** Call connect() after constructing. autoReconnect is always on for DBClientReplicaSet connections. */ + DBClientReplicaSet( const string& name , const vector& servers ); + virtual ~DBClientReplicaSet(); + + /** Returns false if nomember of the set were reachable, or neither is + * master, although, + * when false returned, you can still try to use this connection object, it will + * try reconnects. + */ + bool connect(); + + /** Authorize. Authorizes all nodes as needed + */ + virtual bool auth(const string &dbname, const string &username, const string &pwd, string& errmsg, bool digestPassword = true ); + + // ----------- simple functions -------------- + + /** throws userassertion "no master found" */ + virtual auto_ptr query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0, + const BSONObj *fieldsToReturn = 0, int queryOptions = 0 , int batchSize = 0 ); + + /** throws userassertion "no master found" */ + virtual BSONObj findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0); + + virtual void insert( const string &ns , BSONObj obj ); + + /** insert multiple objects. Note that single object insert is asynchronous, so this version + is only nominally faster and not worth a special effort to try to use. */ + virtual void insert( const string &ns, const vector< BSONObj >& v ); + + virtual void remove( const string &ns , Query obj , bool justOne = 0 ); + + virtual void update( const string &ns , Query query , BSONObj obj , bool upsert = 0 , bool multi = 0 ); + + virtual void killCursor( long long cursorID ); + + // ---- access raw connections ---- + + DBClientConnection& masterConn(); + DBClientConnection& slaveConn(); + + // ---- callback pieces ------- + + virtual void checkResponse( const char *data, int nReturned ) { checkMaster()->checkResponse( data , nReturned ); } + + /* this is the callback from our underlying connections to notify us that we got a "not master" error. + */ + void isntMaster() { _master.reset(); } + + // ----- status ------ + + virtual bool isFailed() const { return ! 
_master || _master->isFailed(); } + + // ----- informational ---- + + string toString() { return getServerAddress(); } + + string getServerAddress() const { return _monitor->getServerAddress(); } + + virtual ConnectionString::ConnectionType type() const { return ConnectionString::SET; } + + // ---- low level ------ + + virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 ); + virtual void say( Message &toSend ) { checkMaster()->say( toSend ); } + virtual bool callRead( Message& toSend , Message& response ) { return checkMaster()->callRead( toSend , response ); } + + + protected: + virtual void sayPiggyBack( Message &toSend ) { checkMaster()->say( toSend ); } + + private: + + DBClientConnection * checkMaster(); + DBClientConnection * checkSlave(); + + void _auth( DBClientConnection * conn ); + + ReplicaSetMonitorPtr _monitor; + + HostAndPort _masterHost; + scoped_ptr _master; + + HostAndPort _slaveHost; + scoped_ptr _slave; + + /** + * for storing authentication info + * fields are exactly for DBClientConnection::auth + */ + struct AuthInfo { + AuthInfo( string d , string u , string p , bool di ) + : dbname( d ) , username( u ) , pwd( p ) , digestPassword( di ) {} + string dbname; + string username; + string pwd; + bool digestPassword; + }; + + // we need to store so that when we connect to a new node on failure + // we can re-auth + // this could be a security issue, as the password is stored in memory + // not sure if/how we should handle + list _auths; + }; + + +} diff --git a/client/dbclientcursor.cpp b/client/dbclientcursor.cpp index 5f9db43..6c6afc0 100644 --- a/client/dbclientcursor.cpp +++ b/client/dbclientcursor.cpp @@ -26,14 +26,14 @@ namespace mongo { void assembleRequest( const string &ns, BSONObj query, int nToReturn, int nToSkip, const BSONObj *fieldsToReturn, int queryOptions, Message &toSend ); - int DBClientCursor::nextBatchSize(){ + int DBClientCursor::nextBatchSize() { if ( nToReturn == 0 ) return batchSize; if ( batchSize == 0 ) return nToReturn; - + return batchSize < nToReturn ? batchSize : nToReturn; } @@ -41,7 +41,8 @@ namespace mongo { Message toSend; if ( !cursorId ) { assembleRequest( ns, query, nextBatchSize() , nToSkip, fieldsToReturn, opts, toSend ); - } else { + } + else { BufBuilder b; b.appendNum( opts ); b.appendStr( ns ); @@ -49,10 +50,16 @@ namespace mongo { b.appendNum( cursorId ); toSend.setData( dbGetMore, b.buf(), b.len() ); } - if ( !connector->call( toSend, *m, false ) ) + if ( !_client->call( toSend, *m, false ) ) { + // log msg temp? + log() << "DBClientCursor::init call() failed" << endl; return false; - if ( m->empty() ) + } + if ( m->empty() ) { + // log msg temp? 
+ log() << "DBClientCursor::init message from call() was empty" << endl; return false; + } dataReceived(); return true; } @@ -60,7 +67,7 @@ namespace mongo { void DBClientCursor::requestMore() { assert( cursorId && pos == nReturned ); - if (haveLimit){ + if (haveLimit) { nToReturn -= nReturned; assert(nToReturn > 0); } @@ -69,13 +76,13 @@ namespace mongo { b.appendStr(ns); b.appendNum(nextBatchSize()); b.appendNum(cursorId); - + Message toSend; toSend.setData(dbGetMore, b.buf(), b.len()); auto_ptr response(new Message()); - - if ( connector ){ - connector->call( toSend, *response ); + + if ( _client ) { + _client->call( toSend, *response ); m = response; dataReceived(); } @@ -83,10 +90,10 @@ namespace mongo { assert( _scopedHost.size() ); ScopedDbConnection conn( _scopedHost ); conn->call( toSend , *response ); - connector = conn.get(); + _client = conn.get(); m = response; dataReceived(); - connector = 0; + _client = 0; conn.done(); } } @@ -96,8 +103,8 @@ namespace mongo { assert( cursorId && pos == nReturned ); assert( !haveLimit ); auto_ptr response(new Message()); - assert( connector ); - connector->recv(*response); + assert( _client ); + _client->recv(*response); m = response; dataReceived(); } @@ -105,7 +112,7 @@ namespace mongo { void DBClientCursor::dataReceived() { QueryResult *qr = (QueryResult *) m->singleData(); resultFlags = qr->resultFlags(); - + if ( qr->resultFlags() & ResultFlag_CursorNotFound ) { // cursor id no longer valid at the server. assert( qr->cursorId == 0 ); @@ -113,7 +120,7 @@ namespace mongo { if ( ! ( opts & QueryOption_CursorTailable ) ) throw UserException( 13127 , "getMore: cursor didn't exist on server, possible restart or timeout?" ); } - + if ( cursorId == 0 || ! ( opts & QueryOption_CursorTailable ) ) { // only set initially: we don't want to kill it on end of data // if it's a tailable cursor @@ -124,7 +131,7 @@ namespace mongo { pos = 0; data = qr->data(); - connector->checkResponse( data, nReturned ); + _client->checkResponse( data, nReturned ); /* this assert would fire the way we currently work: assert( nReturned || cursorId == 0 ); */ @@ -136,7 +143,7 @@ namespace mongo { if ( !_putBack.empty() ) return true; - + if (haveLimit && pos >= nToReturn) return false; @@ -171,7 +178,7 @@ namespace mongo { int m = atMost; /* - for( stack::iterator i = _putBack.begin(); i != _putBack.end(); i++ ) { + for( stack::iterator i = _putBack.begin(); i != _putBack.end(); i++ ) { if( m == 0 ) return; v.push_back(*i); @@ -190,13 +197,22 @@ namespace mongo { v.push_back(o); } } - - void DBClientCursor::attach( AScopedConnection * conn ){ + + void DBClientCursor::attach( AScopedConnection * conn ) { assert( _scopedHost.size() == 0 ); - assert( conn->get()->isMember( connector ) ); - _scopedHost = conn->getHost(); + assert( conn ); + assert( conn->get() ); + + if ( conn->get()->type() == ConnectionString::SET || + conn->get()->type() == ConnectionString::SYNC ) { + _scopedHost = _client->getServerAddress(); + } + else { + _scopedHost = conn->getHost(); + } + conn->done(); - connector = 0; + _client = 0; } DBClientCursor::~DBClientCursor() { @@ -205,28 +221,28 @@ namespace mongo { DESTRUCTOR_GUARD ( - if ( cursorId && _ownCursor ) { - BufBuilder b; - b.appendNum( (int)0 ); // reserved - b.appendNum( (int)1 ); // number - b.appendNum( cursorId ); - - Message m; - m.setData( dbKillCursors , b.buf() , b.len() ); - - if ( connector ){ - connector->sayPiggyBack( m ); - } - else { - assert( _scopedHost.size() ); - ScopedDbConnection conn( _scopedHost ); - conn->sayPiggyBack( 
m ); - conn.done(); - } + if ( cursorId && _ownCursor ) { + BufBuilder b; + b.appendNum( (int)0 ); // reserved + b.appendNum( (int)1 ); // number + b.appendNum( cursorId ); + + Message m; + m.setData( dbKillCursors , b.buf() , b.len() ); + + if ( _client ) { + _client->sayPiggyBack( m ); + } + else { + assert( _scopedHost.size() ); + ScopedDbConnection conn( _scopedHost ); + conn->sayPiggyBack( m ); + conn.done(); } + } ); } - + } // namespace mongo diff --git a/client/dbclientcursor.h b/client/dbclientcursor.h index 51cdc13..5d795f4 100644 --- a/client/dbclientcursor.h +++ b/client/dbclientcursor.h @@ -1,4 +1,4 @@ -// file dbclientcursor.h +// file dbclientcursor.h /* Copyright 2009 10gen Inc. * @@ -24,41 +24,55 @@ #include namespace mongo { - + class AScopedConnection; - - /** Queries return a cursor object */ - class DBClientCursor : boost::noncopyable { + + /** for mock purposes only -- do not create variants of DBClientCursor, nor hang code here */ + class DBClientCursorInterface { public: - /** If true, safe to call next(). Requests more from server if necessary. */ + virtual ~DBClientCursorInterface() {} + + virtual bool more() = 0; + virtual BSONObj next() = 0; + + // TODO bring more of the DBClientCursor interface to here + + protected: + DBClientCursorInterface() {} + }; + + /** Queries return a cursor object */ + class DBClientCursor : public DBClientCursorInterface { + public: + /** If true, safe to call next(). Requests more from server if necessary. */ bool more(); - /** If true, there is more in our local buffers to be fetched via next(). Returns - false when a getMore request back to server would be required. You can use this - if you want to exhaust whatever data has been fetched to the client already but + /** If true, there is more in our local buffers to be fetched via next(). Returns + false when a getMore request back to server would be required. You can use this + if you want to exhaust whatever data has been fetched to the client already but then perhaps stop. */ int objsLeftInBatch() const { _assertIfNull(); return _putBack.size() + nReturned - pos; } bool moreInCurrentBatch() { return objsLeftInBatch() > 0; } /** next - @return next object in the result cursor. + @return next object in the result cursor. on an error at the remote server, you will get back: { $err: } if you do not want to handle that yourself, call nextSafe(). */ BSONObj next(); - - /** + + /** restore an object previously returned by next() to the cursor */ void putBack( const BSONObj &o ) { _putBack.push( o.getOwned() ); } - /** throws AssertionException if get back { $err : ... } */ + /** throws AssertionException if get back { $err : ... } */ BSONObj nextSafe() { BSONObj o = next(); BSONElement e = o.firstElement(); - if( strcmp(e.fieldName(), "$err") == 0 ) { + if( strcmp(e.fieldName(), "$err") == 0 ) { if( logLevel >= 5 ) log() << "nextSafe() error " << o.toString() << endl; uassert(13106, "nextSafe(): " + o.toString(), false); @@ -67,7 +81,7 @@ namespace mongo { } /** peek ahead at items buffered for future next() calls. - never requests new data from the server. so peek only effective + never requests new data from the server. so peek only effective with what is already buffered. WARNING: no support for _putBack yet! 
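For orientation, typical iteration over a query result with this class looks roughly like the following, assuming c is a connected client and test.people is a made-up namespace:

    auto_ptr<DBClientCursor> cur = c.query( "test.people" , QUERY( "age" << GT << 21 ) );
    while ( cur->more() ) {
        BSONObj p = cur->nextSafe();   // throws if the server returned { $err : ... }
        cout << p.toString() << endl;
    }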
*/ @@ -76,9 +90,9 @@ namespace mongo { /** iterate the rest of the cursor and return the number if items */ - int itcount(){ + int itcount() { int c = 0; - while ( more() ){ + while ( more() ) { next(); c++; } @@ -97,48 +111,48 @@ namespace mongo { bool tailable() const { return (opts & QueryOption_CursorTailable) != 0; } - - /** see ResultFlagType (constants.h) for flag values - mostly these flags are for internal purposes - + + /** see ResultFlagType (constants.h) for flag values + mostly these flags are for internal purposes - ResultFlag_ErrSet is the possible exception to that */ - bool hasResultFlag( int flag ){ + bool hasResultFlag( int flag ) { _assertIfNull(); return (resultFlags & flag) != 0; } - DBClientCursor( DBConnector *_connector, const string &_ns, BSONObj _query, int _nToReturn, + DBClientCursor( DBClientBase* client, const string &_ns, BSONObj _query, int _nToReturn, int _nToSkip, const BSONObj *_fieldsToReturn, int queryOptions , int bs ) : - connector(_connector), - ns(_ns), - query(_query), - nToReturn(_nToReturn), - haveLimit( _nToReturn > 0 && !(queryOptions & QueryOption_CursorTailable)), - nToSkip(_nToSkip), - fieldsToReturn(_fieldsToReturn), - opts(queryOptions), - batchSize(bs==1?2:bs), - m(new Message()), - cursorId(), - nReturned(), - pos(), - data(), - _ownCursor( true ){ + _client(client), + ns(_ns), + query(_query), + nToReturn(_nToReturn), + haveLimit( _nToReturn > 0 && !(queryOptions & QueryOption_CursorTailable)), + nToSkip(_nToSkip), + fieldsToReturn(_fieldsToReturn), + opts(queryOptions), + batchSize(bs==1?2:bs), + m(new Message()), + cursorId(), + nReturned(), + pos(), + data(), + _ownCursor( true ) { + } + + DBClientCursor( DBClientBase* client, const string &_ns, long long _cursorId, int _nToReturn, int options ) : + _client(client), + ns(_ns), + nToReturn( _nToReturn ), + haveLimit( _nToReturn > 0 && !(options & QueryOption_CursorTailable)), + opts( options ), + m(new Message()), + cursorId( _cursorId ), + nReturned(), + pos(), + data(), + _ownCursor( true ) { } - - DBClientCursor( DBConnector *_connector, const string &_ns, long long _cursorId, int _nToReturn, int options ) : - connector(_connector), - ns(_ns), - nToReturn( _nToReturn ), - haveLimit( _nToReturn > 0 && !(options & QueryOption_CursorTailable)), - opts( options ), - m(new Message()), - cursorId( _cursorId ), - nReturned(), - pos(), - data(), - _ownCursor( true ){ - } virtual ~DBClientCursor(); @@ -148,15 +162,15 @@ namespace mongo { message when ~DBClientCursor() is called. This function overrides that. 
*/ void decouple() { _ownCursor = false; } - + void attach( AScopedConnection * conn ); - + private: friend class DBClientBase; friend class DBClientConnection; - bool init(); + bool init(); int nextBatchSize(); - DBConnector *connector; + DBClientBase* _client; string ns; BSONObj query; int nToReturn; @@ -180,8 +194,12 @@ namespace mongo { // Don't call from a virtual function void _assertIfNull() const { uassert(13348, "connection died", this); } + + // non-copyable , non-assignable + DBClientCursor( const DBClientCursor& ); + DBClientCursor& operator=( const DBClientCursor& ); }; - + /** iterate over objects in current batch only - will not cause a network call */ class DBClientCursorBatchIterator { @@ -198,7 +216,7 @@ namespace mongo { DBClientCursor &_c; int _n; }; - + } // namespace mongo #include "undef_macros.h" diff --git a/client/dbclientmockcursor.h b/client/dbclientmockcursor.h new file mode 100644 index 0000000..8d85ff5 --- /dev/null +++ b/client/dbclientmockcursor.h @@ -0,0 +1,40 @@ +//@file dbclientmockcursor.h + +/* Copyright 2010 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "dbclientcursor.h" + +namespace mongo { + + class DBClientMockCursor : public DBClientCursorInterface { + public: + DBClientMockCursor( const BSONArray& mockCollection ) : _iter( mockCollection ) {} + virtual ~DBClientMockCursor() {} + + bool more() { return _iter.more(); } + BSONObj next() { return _iter.next().Obj(); } + + private: + BSONObjIterator _iter; + + // non-copyable , non-assignable + DBClientMockCursor( const DBClientMockCursor& ); + DBClientMockCursor& operator=( const DBClientMockCursor& ); + }; + +} // namespace mongo diff --git a/client/distlock.cpp b/client/distlock.cpp index 05e54c0..9ec98ea 100644 --- a/client/distlock.cpp +++ b/client/distlock.cpp @@ -21,23 +21,36 @@ namespace mongo { - string lockPingNS = "config.lockpings"; + static string lockPingNS = "config.lockpings"; + static string locksNS = "config.locks"; ThreadLocalValue distLockIds(""); - - string getDistLockProcess(){ - static string s; - if ( s.empty() ){ - stringstream ss; - ss << getHostNameCached() << ":" << time(0) << ":" << rand(); - s = ss.str(); - } - return s; + + /* ================== + * Module initialization + */ + + boost::once_flag _init = BOOST_ONCE_INIT; + static string* _cachedProcessString = NULL; + + static void initModule() { + // cache process string + stringstream ss; + ss << getHostName() << ":" << time(0) << ":" << rand(); + _cachedProcessString = new string( ss.str() ); } - string getDistLockId(){ + /* =================== */ + + string getDistLockProcess() { + boost::call_once( initModule, _init ); + assert( _cachedProcessString ); + return *_cachedProcessString; + } + + string getDistLockId() { string s = distLockIds.get(); - if ( s.empty() ){ + if ( s.empty() ) { stringstream ss; ss << getDistLockProcess() << ":" << getThreadName() << ":" << rand(); s = ss.str(); @@ -45,50 +58,95 @@ namespace mongo { } return s; } - - void 
distLockPingThread( ConnectionString addr ){ + + void _distLockPingThread( ConnectionString addr ) { setThreadName( "LockPinger" ); + + log() << "creating dist lock ping thread for: " << addr << endl; static int loops = 0; - while( ! inShutdown() ){ + while( ! inShutdown() ) { + + string process = getDistLockProcess(); + log(4) << "dist_lock about to ping for: " << process << endl; + try { ScopedDbConnection conn( addr ); - - // do ping - conn->update( lockPingNS , - BSON( "_id" << getDistLockProcess() ) , + + // refresh the entry corresponding to this process in the lockpings collection + conn->update( lockPingNS , + BSON( "_id" << process ) , BSON( "$set" << BSON( "ping" << DATENOW ) ) , true ); - - - // remove really old entries - BSONObjBuilder f; - f.appendDate( "$lt" , jsTime() - ( 4 * 86400 * 1000 ) ); - BSONObj r = BSON( "ping" << f.obj() ); - conn->remove( lockPingNS , r ); - + string err = conn->getLastError(); + if ( ! err.empty() ) { + warning() << "dist_lock process: " << process << " pinging: " << addr << " failed: " + << err << endl; + conn.done(); + sleepsecs(30); + continue; + } + + // remove really old entries from the lockpings collection if they're not holding a lock + // (this may happen if an instance of a process was taken down and no new instance came up to + // replace it for a quite a while) + // if the lock is taken, the take-over mechanism should handle the situation + auto_ptr c = conn->query( locksNS , BSONObj() ); + vector pids; + while ( c->more() ) { + BSONObj lock = c->next(); + if ( ! lock["process"].eoo() ) { + pids.push_back( lock["process"].valuestrsafe() ); + } + } + + Date_t fourDays = jsTime() - ( 4 * 86400 * 1000 ); // 4 days + conn->remove( lockPingNS , BSON( "_id" << BSON( "$nin" << pids ) << "ping" << LT << fourDays ) ); + err = conn->getLastError(); + if ( ! err.empty() ) { + warning() << "dist_lock cleanup request from process: " << process << " to: " << addr + << " failed: " << err << endl; + conn.done(); + sleepsecs(30); + continue; + } + // create index so remove is fast even with a lot of servers - if ( loops++ == 0 ){ + if ( loops++ == 0 ) { conn->ensureIndex( lockPingNS , BSON( "ping" << 1 ) ); } - + conn.done(); } - catch ( std::exception& e ){ - log( LL_WARNING ) << "couldn't ping: " << e.what() << endl; + catch ( std::exception& e ) { + warning() << "dist_lock exception during ping: " << e.what() << endl; } + + log( loops % 10 == 0 ? 0 : 1) << "dist_lock pinged successfully for: " << process << endl; sleepsecs(30); } } - - + + void distLockPingThread( ConnectionString addr ) { + try { + _distLockPingThread( addr ); + } + catch ( std::exception& e ) { + error() << "unexpected error in distLockPingThread: " << e.what() << endl; + } + catch ( ... 
) { + error() << "unexpected unknown error in distLockPingThread" << endl; + } + } + + class DistributedLockPinger { public: DistributedLockPinger() - : _mutex( "DistributedLockPinger" ){ + : _mutex( "DistributedLockPinger" ) { } - - void got( const ConnectionString& conn ){ + + void got( const ConnectionString& conn ) { string s = conn.toString(); scoped_lock lk( _mutex ); if ( _seen.count( s ) > 0 ) @@ -96,80 +154,121 @@ namespace mongo { boost::thread t( boost::bind( &distLockPingThread , conn ) ); _seen.insert( s ); } - + set _seen; mongo::mutex _mutex; - + } distLockPinger; - + DistributedLock::DistributedLock( const ConnectionString& conn , const string& name , unsigned takeoverMinutes ) - : _conn(conn),_name(name),_takeoverMinutes(takeoverMinutes){ + : _conn(conn),_name(name),_takeoverMinutes(takeoverMinutes) { _id = BSON( "_id" << name ); _ns = "config.locks"; distLockPinger.got( conn ); } - - bool DistributedLock::lock_try( string why , BSONObj * other ){ + + bool DistributedLock::lock_try( string why , BSONObj * other ) { + // write to dummy if 'other' is null + BSONObj dummyOther; + if ( other == NULL ) + other = &dummyOther; + ScopedDbConnection conn( _conn ); - + BSONObjBuilder queryBuilder; queryBuilder.appendElements( _id ); - queryBuilder.append( "state" , 0 ); + queryBuilder.append( "state" , 0 ); - { // make sure its there so we can use simple update logic below - BSONObj o = conn->findOne( _ns , _id ); - if ( o.isEmpty() ){ + { + // make sure its there so we can use simple update logic below + BSONObj o = conn->findOne( _ns , _id ).getOwned(); + if ( o.isEmpty() ) { try { + log(4) << "dist_lock inserting initial doc in " << _ns << " for lock " << _name << endl; conn->insert( _ns , BSON( "_id" << _name << "state" << 0 << "who" << "" ) ); } - catch ( UserException& ){ + catch ( UserException& e ) { + log() << "dist_lock could not insert initial doc: " << e << endl; } } - else if ( o["state"].numberInt() > 0 ){ + + else if ( o["state"].numberInt() > 0 ) { BSONObj lastPing = conn->findOne( lockPingNS , o["process"].wrap( "_id" ) ); - if ( lastPing.isEmpty() ){ - // TODO: maybe this should clear, not sure yet - log() << "lastPing is empty! this could be bad: " << o << endl; + if ( lastPing.isEmpty() ) { + // if a lock is taken but there's no ping for it, we're in an inconsistent situation + // if the lock holder (mongos or d) does not exist anymore, the lock could safely be removed + // but we'd require analysis of the situation before a manual intervention + error() << "config.locks: " << _name << " lock is taken by old process? 
" + << "remove the following lock if the process is not active anymore: " << o << endl; + *other = o; conn.done(); return false; } - unsigned long long elapsed = jsTime() - lastPing["ping"].Date(); // in ms - elapsed = elapsed / ( 1000 * 60 ); // convert to minutes - - if ( elapsed <= _takeoverMinutes ){ - log(1) << "dist_lock lock failed because taken by: " << o << endl; + unsigned long long now = jsTime(); + unsigned long long pingTime = lastPing["ping"].Date(); + + if ( now < pingTime ) { + // clock skew + warning() << "dist_lock has detected clock skew of " << ( pingTime - now ) << "ms" << endl; + *other = o; conn.done(); return false; } + unsigned long long elapsed = now - pingTime; + elapsed = elapsed / ( 1000 * 60 ); // convert to minutes + + if ( elapsed > ( 60 * 24 * 365 * 100 ) /* 100 years */ ) { + warning() << "distlock elapsed time seems impossible: " << lastPing << endl; + } + + if ( elapsed <= _takeoverMinutes ) { + log(1) << "dist_lock lock failed because taken by: " << o << " elapsed minutes: " << elapsed << endl; + *other = o; + conn.done(); + return false; + } + log() << "dist_lock forcefully taking over from: " << o << " elapsed minutes: " << elapsed << endl; conn->update( _ns , _id , BSON( "$set" << BSON( "state" << 0 ) ) ); + string err = conn->getLastError(); + if ( ! err.empty() ) { + warning() << "dist_lock take over from: " << o << " failed: " << err << endl; + *other = o.getOwned(); + other->getOwned(); + conn.done(); + return false; + } + } - else if ( o["ts"].type() ){ + else if ( o["ts"].type() ) { queryBuilder.append( o["ts"] ); } } - + OID ts; ts.init(); bool gotLock = false; BSONObj now; - - BSONObj whatIWant = BSON( "$set" << BSON( "state" << 1 << - "who" << getDistLockId() << "process" << getDistLockProcess() << - "when" << DATENOW << "why" << why << "ts" << ts ) ); + + BSONObj lockDetails = BSON( "state" << 1 << "who" << getDistLockId() << "process" << getDistLockProcess() << + "when" << DATENOW << "why" << why << "ts" << ts ); + BSONObj whatIWant = BSON( "$set" << lockDetails ); try { + log(4) << "dist_lock about to aquire lock: " << lockDetails << endl; + conn->update( _ns , queryBuilder.obj() , whatIWant ); - + BSONObj o = conn->getLastErrorDetailed(); now = conn->findOne( _ns , _id ); - - if ( o["n"].numberInt() == 0 ){ - if ( other ) - *other = now; + + if ( o["n"].numberInt() == 0 ) { + *other = now; + other->getOwned(); + log() << "dist_lock error trying to aquire lock: " << lockDetails << " error: " << o << endl; gotLock = false; } else { @@ -177,40 +276,40 @@ namespace mongo { } } - catch ( UpdateNotTheSame& up ){ + catch ( UpdateNotTheSame& up ) { // this means our update got through on some, but not others + log(4) << "dist_lock lock did not propagate properly" << endl; - for ( unsigned i=0; ifindOne( _ns , _id ); - if ( now.isEmpty() || now["ts"] < temp2["ts"] ){ + if ( now.isEmpty() || now["ts"] < temp2["ts"] ) { now = temp2.getOwned(); } temp.done(); } - if ( now["ts"].OID() == ts ){ + if ( now["ts"].OID() == ts ) { + log(4) << "dist_lock completed lock propagation" << endl; gotLock = true; conn->update( _ns , _id , whatIWant ); } else { + log() << "dist_lock error trying to complete propagation" << endl; gotLock = false; } } - + conn.done(); - - log(1) << "dist_lock lock gotLock: " << gotLock << " now: " << now << endl; - if ( ! 
gotLock ) - return false; - - return true; + log(2) << "dist_lock lock gotLock: " << gotLock << " now: " << now << endl; + + return gotLock; } - void DistributedLock::unlock(){ + void DistributedLock::unlock() { const int maxAttempts = 3; int attempted = 0; while ( ++attempted <= maxAttempts ) { @@ -218,22 +317,23 @@ namespace mongo { try { ScopedDbConnection conn( _conn ); conn->update( _ns , _id, BSON( "$set" << BSON( "state" << 0 ) ) ); - log(1) << "dist_lock unlock: " << conn->findOne( _ns , _id ) << endl; + log(2) << "dist_lock unlock: " << conn->findOne( _ns , _id ) << endl; conn.done(); return; - - } catch ( std::exception& e) { - log( LL_WARNING ) << "dist_lock " << _name << " failed to contact config server in unlock attempt " + + } + catch ( std::exception& e) { + log( LL_WARNING ) << "dist_lock " << _name << " failed to contact config server in unlock attempt " << attempted << ": " << e.what() << endl; sleepsecs(1 << attempted); } } - log( LL_WARNING ) << "dist_lock couldn't consumate unlock request. " << "Lock " << _name - << " will be taken over after " << _takeoverMinutes << " minutes timeout" << endl; + log( LL_WARNING ) << "dist_lock couldn't consumate unlock request. " << "Lock " << _name + << " will be taken over after " << _takeoverMinutes << " minutes timeout" << endl; } } diff --git a/client/distlock.h b/client/distlock.h index 8a77338..753a241 100644 --- a/client/distlock.h +++ b/client/distlock.h @@ -15,10 +15,7 @@ * limitations under the License. */ - -/** - * distributed locking mechanism - */ +#pragma once #include "../pch.h" #include "dbclient.h" @@ -28,53 +25,71 @@ namespace mongo { + /** + * The distributed lock is a configdb backed way of synchronizing system-wide tasks. A task must be identified by a + * unique name across the system (e.g., "balancer"). A lock is taken by writing a document in the configdb's locks + * collection with that name. + * + * To be maintained, each taken lock needs to be revalidaded ("pinged") within a pre-established amount of time. This + * class does this maintenance automatically once a DistributedLock object was constructed. + */ class DistributedLock { public: /** - * @param takeoverMinutes how long before we steal lock in minutes + * The constructor does not connect to the configdb yet and constructing does not mean the lock was acquired. + * Construction does trigger a lock "pinging" mechanism, though. + * + * @param conn address of config(s) server(s) + * @param name identifier for the lock + * @param takeoverMinutes how long can the log go "unpinged" before a new attempt to lock steals it (in minutes) */ - DistributedLock( const ConnectionString& conn , const string& name , unsigned takeoverMinutes = 10 ); + DistributedLock( const ConnectionString& conn , const string& name , unsigned takeoverMinutes = 15 ); + /** + * Attempts to aquire 'this' lock, checking if it could or should be stolen from the previous holder. Please + * consider using the dist_lock_try construct to acquire this lock in an exception safe way. + * + * @param why human readable description of why the lock is being taken (used to log) + * @param other configdb's lock document that is currently holding the lock, if lock is taken + * @return true if it managed to grab the lock + */ bool lock_try( string why , BSONObj * other = 0 ); + + /** + * Releases a previously taken lock. 
+ */ void unlock(); private: ConnectionString _conn; string _name; unsigned _takeoverMinutes; - + string _ns; BSONObj _id; }; - + class dist_lock_try { public: - dist_lock_try( DistributedLock * lock , string why ) - : _lock(lock){ + : _lock(lock) { _got = _lock->lock_try( why , &_other ); } - ~dist_lock_try(){ - if ( _got ){ + ~dist_lock_try() { + if ( _got ) { _lock->unlock(); } } - bool got() const { - return _got; - } + bool got() const { return _got; } + BSONObj other() const { return _other; } - BSONObj other() const { - return _other; - } - private: DistributedLock * _lock; bool _got; BSONObj _other; - }; } diff --git a/client/distlock_test.cpp b/client/distlock_test.cpp index 0879b6e..83d143f 100644 --- a/client/distlock_test.cpp +++ b/client/distlock_test.cpp @@ -21,60 +21,84 @@ #include "../db/commands.h" namespace mongo { - + class TestDistLockWithSync : public Command { public: - TestDistLockWithSync() : Command( "_testDistLockWithSyncCluster" ){} + TestDistLockWithSync() : Command( "_testDistLockWithSyncCluster" ) {} virtual void help( stringstream& help ) const { help << "should not be calling this directly" << endl; } - + virtual bool slaveOk() const { return false; } virtual bool adminOnly() const { return true; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } - static void runThread(){ - for ( int i=0; i<1000; i++ ){ - if ( current->lock_try( "test" ) ){ - gotit++; - for ( int j=0; j<2000; j++ ){ - count++; + static void runThread() { + while ( keepGoing ) { + if ( current->lock_try( "test" ) ) { + count++; + int before = count; + sleepmillis( 3 ); + int after = count; + + if ( after != before ) { + error() << " before: " << before << " after: " << after << endl; } + current->unlock(); } } } - - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + Timer t; DistributedLock lk( ConnectionString( cmdObj["host"].String() , ConnectionString::SYNC ), "testdistlockwithsync" ); current = &lk; count = 0; gotit = 0; + errors = 0; + keepGoing = true; vector > l; - for ( int i=0; i<4; i++ ){ + for ( int i=0; i<4; i++ ) { l.push_back( shared_ptr( new boost::thread( runThread ) ) ); } + int secs = 10; + if ( cmdObj["secs"].isNumber() ) + secs = cmdObj["secs"].numberInt(); + sleepsecs( secs ); + keepGoing = false; + for ( unsigned i=0; ijoin(); + current = 0; + result.append( "count" , count ); result.append( "gotit" , gotit ); - current = 0; - return count == gotit * 2000; + result.append( "errors" , errors ); + result.append( "timeMS" , t.millis() ); + + return errors == 0; } + // variables for test static DistributedLock * current; - static int count; static int gotit; + static int errors; + static AtomicUInt count; + + static bool keepGoing; } testDistLockWithSyncCmd; DistributedLock * TestDistLockWithSync::current; - int TestDistLockWithSync::count; + AtomicUInt TestDistLockWithSync::count; int TestDistLockWithSync::gotit; + int TestDistLockWithSync::errors; + bool TestDistLockWithSync::keepGoing; } diff --git a/client/examples/authTest.cpp b/client/examples/authTest.cpp index 77ce12d..71cdd39 100644 --- a/client/examples/authTest.cpp +++ b/client/examples/authTest.cpp @@ -22,7 +22,7 @@ using namespace mongo; int main( int argc, const char **argv ) { - + const char *port = "27017"; if ( argc != 1 ) { if ( argc != 3 ) @@ -37,17 +37,18 @@ int main( int argc, const char **argv ) { throw -11; } - { 
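As the header above suggests, callers normally wrap the lock in dist_lock_try instead of pairing lock_try()/unlock() by hand, so the lock is released even if the guarded code throws. A minimal usage sketch; the config server hosts and the "balancer" lock name are illustrative:

    DistributedLock balancerLock( ConnectionString( "cfg1,cfg2,cfg3" , ConnectionString::SYNC ) ,
                                  "balancer" );

    {
        dist_lock_try lk( &balancerLock , "doing a balancing round" );
        if ( lk.got() ) {
            // ... protected work; ~dist_lock_try() unlocks even on exception ...
        }
        else {
            log() << "balancer lock busy, held by: " << lk.other() << endl;
        }
    }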
// clean up old data from any previous tests + { + // clean up old data from any previous tests conn.remove( "test.system.users" , BSONObj() ); } conn.insert( "test.system.users" , BSON( "user" << "eliot" << "pwd" << conn.createPasswordDigest( "eliot" , "bar" ) ) ); - + errmsg.clear(); bool ok = conn.auth( "test" , "eliot" , "bar" , errmsg ); if ( ! ok ) cout << errmsg << endl; - assert( ok ); + MONGO_assert( ok ); - assert( ! conn.auth( "test" , "eliot" , "bars" , errmsg ) ); + MONGO_assert( ! conn.auth( "test" , "eliot" , "bars" , errmsg ) ); } diff --git a/client/examples/clientTest.cpp b/client/examples/clientTest.cpp index 83a556a..bd4432e 100644 --- a/client/examples/clientTest.cpp +++ b/client/examples/clientTest.cpp @@ -19,9 +19,14 @@ * a simple test for the c++ driver */ +// this header should be first to ensure that it includes cleanly in any context +#include "client/dbclient.h" + #include -#include "client/dbclient.h" +#ifndef assert +# define assert(x) MONGO_assert(x) +#endif using namespace std; using namespace mongo; @@ -125,12 +130,14 @@ int main( int argc, const char **argv ) { } - { // ensure index + { + // ensure index assert( conn.ensureIndex( ns , BSON( "name" << 1 ) ) ); assert( ! conn.ensureIndex( ns , BSON( "name" << 1 ) ) ); } - { // hint related tests + { + // hint related tests assert( conn.findOne(ns, "{}")["name"].str() == "sara" ); assert( conn.findOne(ns, "{ name : 'eliot' }")["name"].str() == "eliot" ); @@ -141,7 +148,7 @@ int main( int argc, const char **argv ) { try { conn.findOne(ns, Query("{name:\"eliot\"}").hint("{foo:1}")); } - catch ( ... ){ + catch ( ... ) { asserted = true; } assert( asserted ); @@ -153,7 +160,8 @@ int main( int argc, const char **argv ) { assert( conn.validate( ns ) ); } - { // timestamp test + { + // timestamp test const char * tsns = "test.tstest1"; conn.dropCollection( tsns ); @@ -185,32 +193,33 @@ int main( int argc, const char **argv ) { ( oldTime == found["ts"].timestampTime() && oldInc < found["ts"].timestampInc() ) ); } - - { // check that killcursors doesn't affect last error + + { + // check that killcursors doesn't affect last error assert( conn.getLastError().empty() ); - + BufBuilder b; b.appendNum( (int)0 ); // reserved b.appendNum( (int)-1 ); // invalid # of cursors triggers exception b.appendNum( (int)-1 ); // bogus cursor id - + Message m; m.setData( dbKillCursors, b.buf(), b.len() ); - + // say() is protected in DBClientConnection, so get superclass static_cast< DBConnector* >( &conn )->say( m ); - + assert( conn.getLastError().empty() ); } { list l = conn.getDatabaseNames(); - for ( list::iterator i = l.begin(); i != l.end(); i++ ){ + for ( list::iterator i = l.begin(); i != l.end(); i++ ) { cout << "db name : " << *i << endl; } l = conn.getCollectionNames( "test" ); - for ( list::iterator i = l.begin(); i != l.end(); i++ ){ + for ( list::iterator i = l.begin(); i != l.end(); i++ ) { cout << "coll name : " << *i << endl; } } diff --git a/client/examples/first.cpp b/client/examples/first.cpp index f3b654f..ab5efb3 100644 --- a/client/examples/first.cpp +++ b/client/examples/first.cpp @@ -40,7 +40,7 @@ int main( int argc, const char **argv ) { throw -12; port = argv[ 2 ]; } - + mongo::DBClientConnection conn; string errmsg; if ( ! 
conn.connect( string( "127.0.0.1:" ) + port , errmsg ) ) { @@ -48,14 +48,15 @@ int main( int argc, const char **argv ) { throw -11; } - { // clean up old data from any previous tests + { + // clean up old data from any previous tests mongo::BSONObjBuilder query; conn.remove( "test.people" , query.obj() ); } insert( conn , "eliot" , 15 ); insert( conn , "sara" , 23 ); - + { mongo::BSONObjBuilder query; auto_ptr cursor = conn.query( "test.people" , query.obj() ); @@ -66,14 +67,14 @@ int main( int argc, const char **argv ) { } } - + { mongo::BSONObjBuilder query; query.append( "name" , "eliot" ); mongo::BSONObj res = conn.findOne( "test.people" , query.obj() ); cout << res.isEmpty() << "\t" << res.jsonString() << endl; } - + { mongo::BSONObjBuilder query; query.append( "name" , "asd" ); diff --git a/client/examples/httpClientTest.cpp b/client/examples/httpClientTest.cpp index 5d6c429..4fa5fd8 100644 --- a/client/examples/httpClientTest.cpp +++ b/client/examples/httpClientTest.cpp @@ -23,7 +23,7 @@ using namespace mongo; int main( int argc, const char **argv ) { - + int port = 27017; if ( argc != 1 ) { if ( argc != 3 ) @@ -31,13 +31,13 @@ int main( int argc, const char **argv ) { port = atoi( argv[ 2 ] ); } port += 1000; - + stringstream ss; ss << "http://localhost:" << port << "/"; string url = ss.str(); - + cout << "[" << url << "]" << endl; HttpClient c; - assert( c.get( url ) == 200 ); + MONGO_assert( c.get( url ) == 200 ); } diff --git a/client/examples/rs.cpp b/client/examples/rs.cpp new file mode 100644 index 0000000..7813ec6 --- /dev/null +++ b/client/examples/rs.cpp @@ -0,0 +1,58 @@ +// rs.cpp + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * example of using replica sets from c++ + */ + +#include "client/dbclient.h" +#include + +using namespace mongo; +using namespace std; + +int main( int argc , const char ** argv ) { + string errmsg; + ConnectionString cs = ConnectionString::parse( "foo/127.0.0.1" , errmsg ); + if ( ! cs.isValid() ) { + cout << "error parsing url: " << errmsg << endl; + return 1; + } + + DBClientReplicaSet * conn = (DBClientReplicaSet*)cs.connect( errmsg ); + if ( ! 
conn ) { + cout << "error connecting: " << errmsg << endl; + return 2; + } + + string collName = "test.rs1"; + + conn->dropCollection( collName ); + while ( true ) { + try { + conn->update( collName , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) , true ); + cout << conn->findOne( collName , BSONObj() ) << endl; + cout << "\t A" << conn->slaveConn().findOne( collName , BSONObj() , 0 , QueryOption_SlaveOk ) << endl; + cout << "\t B " << conn->findOne( collName , BSONObj() , 0 , QueryOption_SlaveOk ) << endl; + } + catch ( std::exception& e ) { + cout << "ERROR: " << e.what() << endl; + } + sleepsecs( 1 ); + } + +} diff --git a/client/examples/second.cpp b/client/examples/second.cpp index 68eafaa..6cc2111 100644 --- a/client/examples/second.cpp +++ b/client/examples/second.cpp @@ -23,7 +23,7 @@ using namespace std; using namespace mongo; int main( int argc, const char **argv ) { - + const char *port = "27017"; if ( argc != 1 ) { if ( argc != 3 ) diff --git a/client/examples/tail.cpp b/client/examples/tail.cpp index 3738b4f..90e62d2 100644 --- a/client/examples/tail.cpp +++ b/client/examples/tail.cpp @@ -23,24 +23,24 @@ using namespace mongo; void tail(DBClientBase& conn, const char *ns) { - BSONElement lastId = minKey.firstElement(); - Query query = Query(); - - auto_ptr c = - conn.query(ns, query, 0, 0, 0, QueryOption_CursorTailable); - - while( 1 ) { - if( !c->more() ) { - if( c->isDead() ) { - break; // we need to requery - } - - // all data (so far) exhausted, wait for more - sleepsecs(1); - continue; - } - BSONObj o = c->next(); - lastId = o["_id"]; - cout << o.toString() << endl; - } + BSONElement lastId = minKey.firstElement(); + Query query = Query(); + + auto_ptr c = + conn.query(ns, query, 0, 0, 0, QueryOption_CursorTailable); + + while( 1 ) { + if( !c->more() ) { + if( c->isDead() ) { + break; // we need to requery + } + + // all data (so far) exhausted, wait for more + sleepsecs(1); + continue; + } + BSONObj o = c->next(); + lastId = o["_id"]; + cout << o.toString() << endl; + } } diff --git a/client/examples/tutorial.cpp b/client/examples/tutorial.cpp index 28e1b27..3cdf359 100644 --- a/client/examples/tutorial.cpp +++ b/client/examples/tutorial.cpp @@ -23,45 +23,45 @@ using namespace mongo; void printIfAge(DBClientConnection& c, int age) { - auto_ptr cursor = c.query("tutorial.persons", QUERY( "age" << age ).sort("name") ); - while( cursor->more() ) { - BSONObj p = cursor->next(); - cout << p.getStringField("name") << endl; - } + auto_ptr cursor = c.query("tutorial.persons", QUERY( "age" << age ).sort("name") ); + while( cursor->more() ) { + BSONObj p = cursor->next(); + cout << p.getStringField("name") << endl; + } } void run() { - DBClientConnection c; - c.connect("localhost"); //"192.168.58.1"); - cout << "connected ok" << endl; - BSONObj p = BSON( "name" << "Joe" << "age" << 33 ); - c.insert("tutorial.persons", p); - p = BSON( "name" << "Jane" << "age" << 40 ); - c.insert("tutorial.persons", p); - p = BSON( "name" << "Abe" << "age" << 33 ); - c.insert("tutorial.persons", p); - p = BSON( "name" << "Samantha" << "age" << 21 << "city" << "Los Angeles" << "state" << "CA" ); - c.insert("tutorial.persons", p); + DBClientConnection c; + c.connect("localhost"); //"192.168.58.1"); + cout << "connected ok" << endl; + BSONObj p = BSON( "name" << "Joe" << "age" << 33 ); + c.insert("tutorial.persons", p); + p = BSON( "name" << "Jane" << "age" << 40 ); + c.insert("tutorial.persons", p); + p = BSON( "name" << "Abe" << "age" << 33 ); + c.insert("tutorial.persons", p); + p = BSON( "name" 
<< "Samantha" << "age" << 21 << "city" << "Los Angeles" << "state" << "CA" ); + c.insert("tutorial.persons", p); - c.ensureIndex("tutorial.persons", fromjson("{age:1}")); + c.ensureIndex("tutorial.persons", fromjson("{age:1}")); - cout << "count:" << c.count("tutorial.persons") << endl; + cout << "count:" << c.count("tutorial.persons") << endl; - auto_ptr cursor = c.query("tutorial.persons", BSONObj()); - while( cursor->more() ) { - cout << cursor->next().toString() << endl; - } + auto_ptr cursor = c.query("tutorial.persons", BSONObj()); + while( cursor->more() ) { + cout << cursor->next().toString() << endl; + } - cout << "\nprintifage:\n"; - printIfAge(c, 33); + cout << "\nprintifage:\n"; + printIfAge(c, 33); } -int main() { - try { - run(); - } - catch( DBException &e ) { - cout << "caught " << e.what() << endl; - } - return 0; +int main() { + try { + run(); + } + catch( DBException &e ) { + cout << "caught " << e.what() << endl; + } + return 0; } diff --git a/client/examples/whereExample.cpp b/client/examples/whereExample.cpp index a26d921..ce4174b 100644 --- a/client/examples/whereExample.cpp +++ b/client/examples/whereExample.cpp @@ -23,7 +23,7 @@ using namespace std; using namespace mongo; int main( int argc, const char **argv ) { - + const char *port = "27017"; if ( argc != 1 ) { if ( argc != 3 ) @@ -36,7 +36,7 @@ int main( int argc, const char **argv ) { if ( ! conn.connect( string( "127.0.0.1:" ) + port , errmsg ) ) { cout << "couldn't connect : " << errmsg << endl; throw -11; - } + } const char * ns = "test.where"; @@ -44,9 +44,9 @@ int main( int argc, const char **argv ) { conn.insert( ns , BSON( "name" << "eliot" << "num" << 17 ) ); conn.insert( ns , BSON( "name" << "sara" << "num" << 24 ) ); - + auto_ptr cursor = conn.query( ns , BSONObj() ); - + while ( cursor->more() ) { BSONObj obj = cursor->next(); cout << "\t" << obj.jsonString() << endl; @@ -64,5 +64,5 @@ int main( int argc, const char **argv ) { cout << "\t" << obj.jsonString() << endl; num++; } - assert( num == 1 ); + MONGO_assert( num == 1 ); } diff --git a/client/gridfs.cpp b/client/gridfs.cpp index d740c76..233724a 100644 --- a/client/gridfs.cpp +++ b/client/gridfs.cpp @@ -34,11 +34,11 @@ namespace mongo { const unsigned DEFAULT_CHUNK_SIZE = 256 * 1024; - GridFSChunk::GridFSChunk( BSONObj o ){ + GridFSChunk::GridFSChunk( BSONObj o ) { _data = o; } - GridFSChunk::GridFSChunk( BSONObj fileObject , int chunkNumber , const char * data , int len ){ + GridFSChunk::GridFSChunk( BSONObj fileObject , int chunkNumber , const char * data , int len ) { BSONObjBuilder b; b.appendAs( fileObject["_id"] , "files_id" ); b.append( "n" , chunkNumber ); @@ -47,7 +47,7 @@ namespace mongo { } - GridFS::GridFS( DBClientBase& client , const string& dbName , const string& prefix ) : _client( client ) , _dbName( dbName ) , _prefix( prefix ){ + GridFS::GridFS( DBClientBase& client , const string& dbName , const string& prefix ) : _client( client ) , _dbName( dbName ) , _prefix( prefix ) { _filesNS = dbName + "." + prefix + ".files"; _chunksNS = dbName + "." 
+ prefix + ".chunks"; _chunkSize = DEFAULT_CHUNK_SIZE; @@ -56,7 +56,7 @@ namespace mongo { client.ensureIndex( _chunksNS , BSON( "files_id" << 1 << "n" << 1 ) ); } - GridFS::~GridFS(){ + GridFS::~GridFS() { } @@ -65,7 +65,7 @@ namespace mongo { _chunkSize = size; } - BSONObj GridFS::storeFile( const char* data , size_t length , const string& remoteName , const string& contentType){ + BSONObj GridFS::storeFile( const char* data , size_t length , const string& remoteName , const string& contentType) { char const * const end = data + length; OID id; @@ -73,7 +73,7 @@ namespace mongo { BSONObj idObj = BSON("_id" << id); int chunkNumber = 0; - while (data < end){ + while (data < end) { int chunkLen = MIN(_chunkSize, (unsigned)(end-data)); GridFSChunk c(idObj, chunkNumber, data, chunkLen); _client.insert( _chunksNS.c_str() , c._data ); @@ -86,7 +86,7 @@ namespace mongo { } - BSONObj GridFS::storeFile( const string& fileName , const string& remoteName , const string& contentType){ + BSONObj GridFS::storeFile( const string& fileName , const string& remoteName , const string& contentType) { uassert( 10012 , "file doesn't exist" , fileName == "-" || boost::filesystem::exists( fileName ) ); FILE* fd; @@ -102,12 +102,12 @@ namespace mongo { int chunkNumber = 0; gridfs_offset length = 0; - while (!feof(fd)){ + while (!feof(fd)) { //boost::scoped_arraybuf (new char[_chunkSize+1]); char * buf = new char[_chunkSize+1]; char* bufPos = buf;//.get(); unsigned int chunkLen = 0; // how much in the chunk now - while(chunkLen != _chunkSize && !feof(fd)){ + while(chunkLen != _chunkSize && !feof(fd)) { int readLen = fread(bufPos, 1, _chunkSize - chunkLen, fd); chunkLen += readLen; bufPos += readLen; @@ -125,11 +125,11 @@ namespace mongo { if (fd != stdin) fclose( fd ); - + return insertFile((remoteName.empty() ? fileName : remoteName), id, length, contentType); } - BSONObj GridFS::insertFile(const string& name, const OID& id, gridfs_offset length, const string& contentType){ + BSONObj GridFS::insertFile(const string& name, const OID& id, gridfs_offset length, const string& contentType) { BSONObj res; if ( ! 
_client.runCommand( _dbName.c_str() , BSON( "filemd5" << id << "root" << _prefix ) , res ) ) @@ -143,9 +143,10 @@ namespace mongo { << "md5" << res["md5"] ; - if (length < 1024*1024*1024){ // 2^30 + if (length < 1024*1024*1024) { // 2^30 file << "length" << (int) length; - }else{ + } + else { file << "length" << (long long) length; } @@ -158,9 +159,9 @@ namespace mongo { return ret; } - void GridFS::removeFile( const string& fileName ){ + void GridFS::removeFile( const string& fileName ) { auto_ptr files = _client.query( _filesNS , BSON( "filename" << fileName ) ); - while (files->more()){ + while (files->more()) { BSONObj file = files->next(); BSONElement id = file["_id"]; _client.remove( _filesNS.c_str() , BSON( "_id" << id ) ); @@ -168,38 +169,38 @@ namespace mongo { } } - GridFile::GridFile( GridFS * grid , BSONObj obj ){ + GridFile::GridFile( GridFS * grid , BSONObj obj ) { _grid = grid; _obj = obj; } - GridFile GridFS::findFile( const string& fileName ){ + GridFile GridFS::findFile( const string& fileName ) { return findFile( BSON( "filename" << fileName ) ); }; - GridFile GridFS::findFile( BSONObj query ){ + GridFile GridFS::findFile( BSONObj query ) { query = BSON("query" << query << "orderby" << BSON("uploadDate" << -1)); return GridFile( this , _client.findOne( _filesNS.c_str() , query ) ); } - auto_ptr GridFS::list(){ + auto_ptr GridFS::list() { return _client.query( _filesNS.c_str() , BSONObj() ); } - auto_ptr GridFS::list( BSONObj o ){ + auto_ptr GridFS::list( BSONObj o ) { return _client.query( _filesNS.c_str() , o ); } - BSONObj GridFile::getMetadata(){ + BSONObj GridFile::getMetadata() { BSONElement meta_element = _obj["metadata"]; - if( meta_element.eoo() ){ + if( meta_element.eoo() ) { return BSONObj(); } return meta_element.embeddedObject(); } - GridFSChunk GridFile::getChunk( int n ){ + GridFSChunk GridFile::getChunk( int n ) { _exists(); BSONObjBuilder b; b.appendAs( _obj["_id"] , "files_id" ); @@ -210,12 +211,12 @@ namespace mongo { return GridFSChunk(o); } - gridfs_offset GridFile::write( ostream & out ){ + gridfs_offset GridFile::write( ostream & out ) { _exists(); const int num = getNumChunks(); - for ( int i=0; ifindOne(getNS(), query); conn.done(); - + if ( b.isEmpty() ) return false; - + unserialize(b); _id = b["_id"].wrap().getOwned(); return true; } - void Model::remove( bool safe ){ + void Model::remove( bool safe ) { uassert( 10016 , "_id isn't set - needed for remove()" , _id["_id"].type() ); - + ScopedDbConnection conn( modelServer() ); conn->remove( getNS() , _id ); @@ -46,34 +46,34 @@ namespace mongo { errmsg = conn->getLastError(); conn.done(); - + if ( safe && errmsg.size() ) throw UserException( 9002 , (string)"error on Model::remove: " + errmsg ); } - void Model::save( bool safe ){ + void Model::save( bool safe ) { ScopedDbConnection conn( modelServer() ); BSONObjBuilder b; serialize( b ); - + BSONElement myId; { BSONObjIterator i = b.iterator(); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); - if ( strcmp( e.fieldName() , "_id" ) == 0 ){ + if ( strcmp( e.fieldName() , "_id" ) == 0 ) { myId = e; break; } } } - if ( myId.type() ){ - if ( _id.isEmpty() ){ + if ( myId.type() ) { + if ( _id.isEmpty() ) { _id = myId.wrap(); } - else if ( myId.woCompare( _id.firstElement() ) ){ + else if ( myId.woCompare( _id.firstElement() ) ) { stringstream ss; ss << "_id from serialize and stored differ: "; ss << '[' << myId << "] != "; @@ -82,11 +82,11 @@ namespace mongo { } } - if ( _id.isEmpty() ){ + if ( _id.isEmpty() ) { OID oid; oid.init(); 
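Taken together, the GridFS hunks above give a simple round trip: storeFile() splits the input into chunks in the .chunks collection plus one document in the .files collection, findFile() returns the newest file document for a name, and GridFile::write() streams the chunks back out in order. A usage sketch; the host, file names and the conventional "fs" prefix are illustrative:

    DBClientConnection c;
    c.connect( "localhost" );

    GridFS gfs( c , "test" , "fs" );            // test.fs.files / test.fs.chunks
    gfs.storeFile( "/tmp/report.pdf" , "report.pdf" , "application/pdf" );

    GridFile gf = gfs.findFile( "report.pdf" ); // newest upload for that name
    ofstream out( "/tmp/report.copy.pdf" , ios::binary );
    gf.write( out );                            // writes every chunk to the stream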
b.appendOID( "_id" , &oid ); - + BSONObj o = b.obj(); conn->insert( getNS() , o ); _id = o["_id"].wrap().getOwned(); @@ -94,25 +94,25 @@ namespace mongo { log(4) << "inserted new model " << getNS() << " " << o << endl; } else { - if ( myId.eoo() ){ + if ( myId.eoo() ) { myId = _id["_id"]; b.append( myId ); } - + assert( ! myId.eoo() ); BSONObjBuilder qb; qb.append( myId ); - + BSONObj q = qb.obj(); BSONObj o = b.obj(); log(4) << "updated model" << getNS() << " " << q << " " << o << endl; conn->update( getNS() , q , o , true ); - + } - + string errmsg = ""; if ( safe ) errmsg = conn->getLastError(); @@ -123,13 +123,13 @@ namespace mongo { throw UserException( 9003 , (string)"error on Model::save: " + errmsg ); } - BSONObj Model::toObject(){ + BSONObj Model::toObject() { BSONObjBuilder b; serialize( b ); return b.obj(); } - void Model::append( const char * name , BSONObjBuilder& b ){ + void Model::append( const char * name , BSONObjBuilder& b ) { BSONObjBuilder bb( b.subobjStart( name ) ); serialize( bb ); bb.done(); diff --git a/client/model.h b/client/model.h index 108efc0..7dd3143 100644 --- a/client/model.h +++ b/client/model.h @@ -43,16 +43,16 @@ namespace mongo { virtual void unserialize(const BSONObj& from) = 0; virtual BSONObj toObject(); virtual void append( const char * name , BSONObjBuilder& b ); - + virtual string modelServer() = 0; - - /** Load a single object. + + /** Load a single object. @return true if successful. */ virtual bool load(BSONObj& query); virtual void save( bool safe=false ); virtual void remove( bool safe=false ); - + protected: BSONObj _id; }; diff --git a/client/mongo_client_lib.cpp b/client/mongo_client_lib.cpp new file mode 100644 index 0000000..69f801a --- /dev/null +++ b/client/mongo_client_lib.cpp @@ -0,0 +1,66 @@ +/* @file client_lib.cpp + + MongoDB C++ Driver + + Normally one includes dbclient.h, and links against libmongoclient.a, when connecting to MongoDB + from C++. However, if you have a situation where the pre-built library does not work, you can use + this file instead to build all the necessary symbols. To do so, include client_lib.cpp in your + project. + + For example, to build and run simple_client_demo.cpp with GCC and run it: + + g++ -I .. simple_client_demo.cpp mongo_client_lib.cpp -lboost_thread-mt -lboost_filesystem + ./a.out +*/ + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../util/md5main.cpp" + +#define MONGO_EXPOSE_MACROS +#include "../pch.h" + +#include "../util/assert_util.cpp" +#include "../util/message.cpp" +#include "../util/util.cpp" +#include "../util/background.cpp" +#include "../util/base64.cpp" +#include "../util/sock.cpp" +#include "../util/log.cpp" +#include "../util/password.cpp" + +#include "../util/concurrency/thread_pool.cpp" +#include "../util/concurrency/vars.cpp" +#include "../util/concurrency/task.cpp" + +#include "connpool.cpp" +#include "syncclusterconnection.cpp" +#include "dbclient.cpp" +#include "clientOnly.cpp" +#include "gridfs.cpp" +#include "dbclientcursor.cpp" + +#include "../db/lasterror.cpp" +#include "../db/json.cpp" +#include "../db/jsobj.cpp" +#include "../db/common.cpp" +#include "../db/nonce.cpp" +#include "../db/commands.cpp" + +extern "C" { +#include "../util/md5.c" +} + diff --git a/client/parallel.cpp b/client/parallel.cpp index 92d1b04..c4905e3 100644 --- a/client/parallel.cpp +++ b/client/parallel.cpp @@ -25,10 +25,10 @@ #include "../s/shard.h" namespace mongo { - + // -------- ClusteredCursor ----------- - - ClusteredCursor::ClusteredCursor( QueryMessage& q ){ + + ClusteredCursor::ClusteredCursor( QueryMessage& q ) { _ns = q.ns; _query = q.query.copy(); _options = q.queryOptions; @@ -41,7 +41,7 @@ namespace mongo { _didInit = false; } - ClusteredCursor::ClusteredCursor( const string& ns , const BSONObj& q , int options , const BSONObj& fields ){ + ClusteredCursor::ClusteredCursor( const string& ns , const BSONObj& q , int options , const BSONObj& fields ) { _ns = ns; _query = q.getOwned(); _options = options; @@ -52,94 +52,112 @@ namespace mongo { _didInit = false; } - ClusteredCursor::~ClusteredCursor(){ + ClusteredCursor::~ClusteredCursor() { _done = true; // just in case } - void ClusteredCursor::init(){ + void ClusteredCursor::init() { if ( _didInit ) return; _didInit = true; _init(); } - - auto_ptr ClusteredCursor::query( const string& server , int num , BSONObj extra , int skipLeft ){ + + auto_ptr ClusteredCursor::query( const string& server , int num , BSONObj extra , int skipLeft ) { uassert( 10017 , "cursor already done" , ! _done ); assert( _didInit ); - + BSONObj q = _query; - if ( ! extra.isEmpty() ){ + if ( ! extra.isEmpty() ) { q = concatQuery( q , extra ); } - ShardConnection conn( server , _ns ); - - if ( conn.setVersion() ){ - conn.done(); - throw StaleConfigException( _ns , "ClusteredCursor::query ShardConnection had to change" , true ); - } - - if ( logLevel >= 5 ){ - log(5) << "ClusteredCursor::query (" << type() << ") server:" << server - << " ns:" << _ns << " query:" << q << " num:" << num - << " _fields:" << _fields << " options: " << _options << endl; - } - - auto_ptr cursor = - conn->query( _ns , q , num , 0 , ( _fields.isEmpty() ? 0 : &_fields ) , _options , _batchSize == 0 ? 0 : _batchSize + skipLeft ); - - assert( cursor.get() ); - - if ( cursor->hasResultFlag( ResultFlag_ShardConfigStale ) ){ + try { + ShardConnection conn( server , _ns ); + + if ( conn.setVersion() ) { + conn.done(); + throw StaleConfigException( _ns , "ClusteredCursor::query ShardConnection had to change" , true ); + } + + if ( logLevel >= 5 ) { + log(5) << "ClusteredCursor::query (" << type() << ") server:" << server + << " ns:" << _ns << " query:" << q << " num:" << num + << " _fields:" << _fields << " options: " << _options << endl; + } + + auto_ptr cursor = + conn->query( _ns , q , num , 0 , ( _fields.isEmpty() ? 0 : &_fields ) , _options , _batchSize == 0 ? 
0 : _batchSize + skipLeft ); + + if ( ! cursor.get() && _options & QueryOption_PartialResults ) { + _done = true; + conn.done(); + return cursor; + } + + massert( 13633 , str::stream() << "error querying server: " << server , cursor.get() ); + + if ( cursor->hasResultFlag( ResultFlag_ShardConfigStale ) ) { + conn.done(); + throw StaleConfigException( _ns , "ClusteredCursor::query" ); + } + + if ( cursor->hasResultFlag( ResultFlag_ErrSet ) ) { + conn.done(); + BSONObj o = cursor->next(); + throw UserException( o["code"].numberInt() , o["$err"].String() ); + } + + + cursor->attach( &conn ); + conn.done(); - throw StaleConfigException( _ns , "ClusteredCursor::query" ); + return cursor; } - - if ( cursor->hasResultFlag( ResultFlag_ErrSet ) ){ - conn.done(); - BSONObj o = cursor->next(); - throw UserException( o["code"].numberInt() , o["$err"].String() ); + catch ( SocketException& e ) { + if ( ! ( _options & QueryOption_PartialResults ) ) + throw e; + _done = true; + return auto_ptr(); } - - - cursor->attach( &conn ); - - conn.done(); - return cursor; } - BSONObj ClusteredCursor::explain( const string& server , BSONObj extra ){ + BSONObj ClusteredCursor::explain( const string& server , BSONObj extra ) { BSONObj q = _query; - if ( ! extra.isEmpty() ){ + if ( ! extra.isEmpty() ) { q = concatQuery( q , extra ); } + BSONObj o; + ShardConnection conn( server , _ns ); - BSONObj o = conn->findOne( _ns , Query( q ).explain() ); + auto_ptr cursor = conn->query( _ns , Query( q ).explain() , abs( _batchSize ) * -1 , 0 , _fields.isEmpty() ? 0 : &_fields ); + if ( cursor.get() && cursor->more() ) + o = cursor->next().getOwned(); conn.done(); return o; } - BSONObj ClusteredCursor::concatQuery( const BSONObj& query , const BSONObj& extraFilter ){ + BSONObj ClusteredCursor::concatQuery( const BSONObj& query , const BSONObj& extraFilter ) { if ( ! query.hasField( "query" ) ) return _concatFilter( query , extraFilter ); BSONObjBuilder b; BSONObjIterator i( query ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); - if ( strcmp( e.fieldName() , "query" ) ){ + if ( strcmp( e.fieldName() , "query" ) ) { b.append( e ); continue; } - + b.append( "query" , _concatFilter( e.embeddedObjectUserCheck() , extraFilter ) ); } return b.obj(); } - - BSONObj ClusteredCursor::_concatFilter( const BSONObj& filter , const BSONObj& extra ){ + + BSONObj ClusteredCursor::_concatFilter( const BSONObj& filter , const BSONObj& extra ) { BSONObjBuilder b; b.appendElements( filter ); b.appendElements( extra ); @@ -147,32 +165,41 @@ namespace mongo { // TODO: should do some simplification here if possibl ideally } - BSONObj ClusteredCursor::explain(){ + BSONObj ClusteredCursor::explain() { + // Note: by default we filter out allPlans and oldPlan in the shell's + // explain() function. If you add any recursive structures, make sure to + // edit the JS to make sure everything gets filtered. 
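concatQuery() above has to handle both shapes a query can arrive in: a bare filter, and the wrapped form where the filter sits under a "query" field next to orderby/explain. What the merge produces, with made-up field names:

    // bare filter: the extra shard filter is simply appended
    BSONObj a = ClusteredCursor::concatQuery( BSON( "age" << 30 ) ,
                                              BSON( "x" << LT << 100 ) );
    // -> { age: 30, x: { $lt: 100 } }

    // wrapped form: only the inner "query" element is merged, orderby is kept
    BSONObj b = ClusteredCursor::concatQuery( BSON( "query" << BSON( "age" << 30 ) <<
                                                    "orderby" << BSON( "name" << 1 ) ) ,
                                              BSON( "x" << LT << 100 ) );
    // -> { query: { age: 30, x: { $lt: 100 } }, orderby: { name: 1 } }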
+ BSONObjBuilder b; b.append( "clusteredType" , type() ); - long long nscanned = 0; - long long nscannedObjects = 0; - long long n = 0; long long millis = 0; double numExplains = 0; - + + map counters; + map > out; { _explain( out ); - + BSONObjBuilder x( b.subobjStart( "shards" ) ); - for ( map >::iterator i=out.begin(); i!=out.end(); ++i ){ + for ( map >::iterator i=out.begin(); i!=out.end(); ++i ) { string shard = i->first; list l = i->second; - BSONArrayBuilder y( x.subarrayStart( shard.c_str() ) ); - for ( list::iterator j=l.begin(); j!=l.end(); ++j ){ + BSONArrayBuilder y( x.subarrayStart( shard ) ); + for ( list::iterator j=l.begin(); j!=l.end(); ++j ) { BSONObj temp = *j; y.append( temp ); - nscanned += temp["nscanned"].numberLong(); - nscannedObjects += temp["nscannedObjects"].numberLong(); - n += temp["n"].numberLong(); + BSONObjIterator k( temp ); + while ( k.more() ) { + BSONElement z = k.next(); + if ( z.fieldName()[0] != 'n' ) + continue; + long long& c = counters[z.fieldName()]; + c += z.numberLong(); + } + millis += temp["millis"].numberLong(); numExplains++; } @@ -181,9 +208,9 @@ namespace mongo { x.done(); } - b.appendNumber( "nscanned" , nscanned ); - b.appendNumber( "nscannedObjects" , nscannedObjects ); - b.appendNumber( "n" , n ); + for ( map::iterator i=counters.begin(); i!=counters.end(); ++i ) + b.appendNumber( i->first , i->second ); + b.appendNumber( "millisTotal" , millis ); b.append( "millisAvg" , (int)((double)millis / numExplains ) ); b.append( "numQueries" , (int)numExplains ); @@ -191,37 +218,37 @@ namespace mongo { return b.obj(); } - + // -------- FilteringClientCursor ----------- FilteringClientCursor::FilteringClientCursor( const BSONObj filter ) - : _matcher( filter ) , _done( true ){ + : _matcher( filter ) , _done( true ) { } FilteringClientCursor::FilteringClientCursor( auto_ptr cursor , const BSONObj filter ) - : _matcher( filter ) , _cursor( cursor ) , _done( cursor.get() == 0 ){ + : _matcher( filter ) , _cursor( cursor ) , _done( cursor.get() == 0 ) { } - - FilteringClientCursor::~FilteringClientCursor(){ + + FilteringClientCursor::~FilteringClientCursor() { } - - void FilteringClientCursor::reset( auto_ptr cursor ){ + + void FilteringClientCursor::reset( auto_ptr cursor ) { _cursor = cursor; _next = BSONObj(); _done = _cursor.get() == 0; } - bool FilteringClientCursor::more(){ + bool FilteringClientCursor::more() { if ( ! _next.isEmpty() ) return true; - + if ( _done ) return false; - + _advance(); return ! _next.isEmpty(); } - - BSONObj FilteringClientCursor::next(){ + + BSONObj FilteringClientCursor::next() { assert( ! _next.isEmpty() ); assert( ! _done ); @@ -231,20 +258,20 @@ namespace mongo { return ret; } - BSONObj FilteringClientCursor::peek(){ + BSONObj FilteringClientCursor::peek() { if ( _next.isEmpty() ) _advance(); return _next; } - - void FilteringClientCursor::_advance(){ + + void FilteringClientCursor::_advance() { assert( _next.isEmpty() ); if ( ! _cursor.get() || _done ) return; - - while ( _cursor->more() ){ + + while ( _cursor->more() ) { _next = _cursor->next(); - if ( _matcher.matches( _next ) ){ + if ( _matcher.matches( _next ) ) { if ( ! 
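FilteringClientCursor above wraps a raw DBClientCursor and only hands back documents matching a Matcher, taking an owned copy when a match is the last document of the current batch. A small usage sketch; the namespace and filter are illustrative:

    DBClientConnection c;
    c.connect( "localhost" );

    // server-side cursor over everything, client-side filter layered on top
    FilteringClientCursor filtered( c.query( "test.users" , BSONObj() ) ,
                                    BSON( "age" << LT << 30 ) );
    while ( filtered.more() )
        cout << filtered.next() << endl;   // only documents with age < 30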
_cursor->moreInCurrentBatch() ) _next = _next.getOwned(); return; @@ -253,53 +280,53 @@ namespace mongo { } _done = true; } - + // -------- SerialServerClusteredCursor ----------- - - SerialServerClusteredCursor::SerialServerClusteredCursor( const set& servers , QueryMessage& q , int sortOrder) : ClusteredCursor( q ){ + + SerialServerClusteredCursor::SerialServerClusteredCursor( const set& servers , QueryMessage& q , int sortOrder) : ClusteredCursor( q ) { for ( set::const_iterator i = servers.begin(); i!=servers.end(); i++ ) _servers.push_back( *i ); - + if ( sortOrder > 0 ) sort( _servers.begin() , _servers.end() ); else if ( sortOrder < 0 ) sort( _servers.rbegin() , _servers.rend() ); - + _serverIndex = 0; _needToSkip = q.ntoskip; } - - bool SerialServerClusteredCursor::more(){ - + + bool SerialServerClusteredCursor::more() { + // TODO: optimize this by sending on first query and then back counting // tricky in case where 1st server doesn't have any after // need it to send n skipped - while ( _needToSkip > 0 && _current.more() ){ + while ( _needToSkip > 0 && _current.more() ) { _current.next(); _needToSkip--; } - + if ( _current.more() ) return true; - - if ( _serverIndex >= _servers.size() ){ + + if ( _serverIndex >= _servers.size() ) { return false; } - + ServerAndQuery& sq = _servers[_serverIndex++]; _current.reset( query( sq._server , 0 , sq._extra ) ); return more(); } - - BSONObj SerialServerClusteredCursor::next(){ + + BSONObj SerialServerClusteredCursor::next() { uassert( 10018 , "no more items" , more() ); return _current.next(); } - void SerialServerClusteredCursor::_explain( map< string,list >& out ){ - for ( unsigned i=0; i<_servers.size(); i++ ){ + void SerialServerClusteredCursor::_explain( map< string,list >& out ) { + for ( unsigned i=0; i<_servers.size(); i++ ) { ServerAndQuery& sq = _servers[i]; list & l = out[sq._server]; l.push_back( explain( sq._server , sq._extra ) ); @@ -307,132 +334,142 @@ namespace mongo { } // -------- ParallelSortClusteredCursor ----------- - - ParallelSortClusteredCursor::ParallelSortClusteredCursor( const set& servers , QueryMessage& q , - const BSONObj& sortKey ) - : ClusteredCursor( q ) , _servers( servers ){ + + ParallelSortClusteredCursor::ParallelSortClusteredCursor( const set& servers , QueryMessage& q , + const BSONObj& sortKey ) + : ClusteredCursor( q ) , _servers( servers ) { _sortKey = sortKey.getOwned(); _needToSkip = q.ntoskip; _finishCons(); } - ParallelSortClusteredCursor::ParallelSortClusteredCursor( const set& servers , const string& ns , - const Query& q , - int options , const BSONObj& fields ) - : ClusteredCursor( ns , q.obj , options , fields ) , _servers( servers ){ + ParallelSortClusteredCursor::ParallelSortClusteredCursor( const set& servers , const string& ns , + const Query& q , + int options , const BSONObj& fields ) + : ClusteredCursor( ns , q.obj , options , fields ) , _servers( servers ) { _sortKey = q.getSort().copy(); _needToSkip = 0; _finishCons(); } - void ParallelSortClusteredCursor::_finishCons(){ + void ParallelSortClusteredCursor::_finishCons() { _numServers = _servers.size(); _cursors = 0; - if ( ! _sortKey.isEmpty() && ! _fields.isEmpty() ){ - // we need to make sure the sort key is in the project - bool isNegative = false; + if ( ! _sortKey.isEmpty() && ! 
_fields.isEmpty() ) { + // we need to make sure the sort key is in the projection + + set sortKeyFields; + _sortKey.getFieldNames(sortKeyFields); + BSONObjBuilder b; + bool isNegative = false; { BSONObjIterator i( _fields ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); b.append( e ); - if ( ! e.trueValue() ) - isNegative = true; - } - } - - { - BSONObjIterator i( _sortKey ); - while ( i.more() ){ - BSONElement e = i.next(); - BSONElement f = _fields.getField( e.fieldName() ); - if ( isNegative ){ - uassert( 13431 , "have to have sort key in projection and removing it" , f.eoo() ); + + string fieldName = e.fieldName(); + + // exact field + bool found = sortKeyFields.erase(fieldName); + + // subfields + set::const_iterator begin = sortKeyFields.lower_bound(fieldName + ".\x00"); + set::const_iterator end = sortKeyFields.lower_bound(fieldName + ".\xFF"); + sortKeyFields.erase(begin, end); + + if ( ! e.trueValue() ) { + uassert( 13431 , "have to have sort key in projection and removing it" , !found && begin == end ); } - else if ( f.eoo() ){ - // add to projection - b.append( e ); + else if (!e.isABSONObj()) { + isNegative = true; } } } - + + if (isNegative) { + for (set::const_iterator it(sortKeyFields.begin()), end(sortKeyFields.end()); it != end; ++it) { + b.append(*it, 1); + } + } + _fields = b.obj(); } } - - void ParallelSortClusteredCursor::_init(){ + + void ParallelSortClusteredCursor::_init() { assert( ! _cursors ); _cursors = new FilteringClientCursor[_numServers]; - + // TODO: parellize int num = 0; - for ( set::iterator i = _servers.begin(); i!=_servers.end(); ++i ){ + for ( set::iterator i = _servers.begin(); i!=_servers.end(); ++i ) { const ServerAndQuery& sq = *i; _cursors[num++].reset( query( sq._server , 0 , sq._extra , _needToSkip ) ); } - + } - - ParallelSortClusteredCursor::~ParallelSortClusteredCursor(){ + + ParallelSortClusteredCursor::~ParallelSortClusteredCursor() { delete [] _cursors; _cursors = 0; } - bool ParallelSortClusteredCursor::more(){ + bool ParallelSortClusteredCursor::more() { - if ( _needToSkip > 0 ){ + if ( _needToSkip > 0 ) { int n = _needToSkip; _needToSkip = 0; - while ( n > 0 && more() ){ + while ( n > 0 && more() ) { BSONObj x = next(); n--; } _needToSkip = n; } - - for ( int i=0; i<_numServers; i++ ){ + + for ( int i=0; i<_numServers; i++ ) { if ( _cursors[i].more() ) return true; } return false; } - - BSONObj ParallelSortClusteredCursor::next(){ + + BSONObj ParallelSortClusteredCursor::next() { BSONObj best = BSONObj(); int bestFrom = -1; - - for ( int i=0; i<_numServers; i++){ + + for ( int i=0; i<_numServers; i++) { if ( ! _cursors[i].more() ) continue; - + BSONObj me = _cursors[i].peek(); - if ( best.isEmpty() ){ + if ( best.isEmpty() ) { best = me; bestFrom = i; continue; } - + int comp = best.woSortOrder( me , _sortKey , true ); if ( comp < 0 ) continue; - + best = me; bestFrom = i; } - + uassert( 10019 , "no more elements" , ! 
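ParallelSortClusteredCursor above sends the query to every server and then merge-sorts the per-server streams on the sort key (its _finishCons() also makes sure the sort fields survive an inclusive projection). A minimal usage sketch; the shard hosts and namespace are illustrative:

    set<ServerAndQuery> servers;
    servers.insert( ServerAndQuery( "shard1.example.net:27018" ) );
    servers.insert( ServerAndQuery( "shard2.example.net:27018" ) );

    ParallelSortClusteredCursor cursor( servers , "test.users" ,
                                        Query( BSONObj() ).sort( BSON( "name" << 1 ) ) );
    cursor.init();                       // required before more()/next()
    while ( cursor.more() )
        cout << cursor.next() << endl;   // globally ordered by name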
best.isEmpty() ); _cursors[bestFrom].next(); - + return best; } - void ParallelSortClusteredCursor::_explain( map< string,list >& out ){ - for ( set::iterator i=_servers.begin(); i!=_servers.end(); ++i ){ + void ParallelSortClusteredCursor::_explain( map< string,list >& out ) { + for ( set::iterator i=_servers.begin(); i!=_servers.end(); ++i ) { const ServerAndQuery& sq = *i; list & l = out[sq._server]; l.push_back( explain( sq._server , sq._extra ) ); @@ -444,39 +481,50 @@ namespace mongo { // ---- Future ----- // ----------------- - Future::CommandResult::CommandResult( const string& server , const string& db , const BSONObj& cmd ){ + Future::CommandResult::CommandResult( const string& server , const string& db , const BSONObj& cmd , DBClientBase * conn ) { _server = server; _db = db; _cmd = cmd; + _conn = conn; _done = false; } - bool Future::CommandResult::join(){ + bool Future::CommandResult::join() { _thr->join(); assert( _done ); return _ok; } - void Future::commandThread( shared_ptr res ){ + void Future::commandThread(shared_ptr res) { setThreadName( "future" ); try { - ScopedDbConnection conn( res->_server ); + DBClientBase * conn = res->_conn; + + scoped_ptr myconn; + if ( ! conn ){ + myconn.reset( new ScopedDbConnection( res->_server ) ); + conn = myconn->get(); + } + res->_ok = conn->runCommand( res->_db , res->_cmd , res->_res ); - conn.done(); + + if ( myconn ) + myconn->done(); + } - catch ( std::exception& e ){ + catch ( std::exception& e ) { error() << "Future::commandThread exception: " << e.what() << endl; res->_ok = false; } res->_done = true; } - shared_ptr Future::spawnCommand( const string& server , const string& db , const BSONObj& cmd ){ - shared_ptr res( new Future::CommandResult( server , db , cmd ) ); - res->_thr.reset( new boost::thread( boost::bind( Future::commandThread , res ) ) ); + shared_ptr Future::spawnCommand( const string& server , const string& db , const BSONObj& cmd , DBClientBase * conn ) { + shared_ptr res (new Future::CommandResult( server , db , cmd , conn )); + res->_thr.reset( new boost::thread( boost::bind(Future::commandThread, res) ) ); + return res; } - - + } diff --git a/client/parallel.h b/client/parallel.h index 603cfe7..0809376 100644 --- a/client/parallel.h +++ b/client/parallel.h @@ -24,6 +24,7 @@ #include "redef_macros.h" #include "../db/dbmessage.h" #include "../db/matcher.h" +#include "../util/concurrency/mvar.h" namespace mongo { @@ -32,14 +33,14 @@ namespace mongo { */ class ServerAndQuery { public: - ServerAndQuery( const string& server , BSONObj extra = BSONObj() , BSONObj orderObject = BSONObj() ) : - _server( server ) , _extra( extra.getOwned() ) , _orderObject( orderObject.getOwned() ){ + ServerAndQuery( const string& server , BSONObj extra = BSONObj() , BSONObj orderObject = BSONObj() ) : + _server( server ) , _extra( extra.getOwned() ) , _orderObject( orderObject.getOwned() ) { } - bool operator<( const ServerAndQuery& other ) const{ + bool operator<( const ServerAndQuery& other ) const { if ( ! 
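Future::spawnCommand() above runs a command on its own thread so several servers can be queried concurrently; join() waits for the thread and reports whether the command succeeded. A sketch firing the same illustrative command at three config servers and collecting the outcomes:

    vector< shared_ptr<Future::CommandResult> > futures;
    futures.push_back( Future::spawnCommand( "cfg1:27019" , "admin" , BSON( "dbstats" << 1 ) ) );
    futures.push_back( Future::spawnCommand( "cfg2:27019" , "admin" , BSON( "dbstats" << 1 ) ) );
    futures.push_back( Future::spawnCommand( "cfg3:27019" , "admin" , BSON( "dbstats" << 1 ) ) );

    for ( unsigned i = 0; i < futures.size(); i++ ) {
        if ( ! futures[i]->join() )      // blocks until the background thread finishes
            warning() << "command failed on " << futures[i]->getServer() << endl;
    }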
_orderObject.isEmpty() ) return _orderObject.woCompare( other._orderObject ) < 0; - + if ( _server < other._server ) return true; if ( other._server > _server ) @@ -71,28 +72,28 @@ namespace mongo { ClusteredCursor( QueryMessage& q ); ClusteredCursor( const string& ns , const BSONObj& q , int options=0 , const BSONObj& fields=BSONObj() ); virtual ~ClusteredCursor(); - + /** call before using */ void init(); - + virtual bool more() = 0; virtual BSONObj next() = 0; - + static BSONObj concatQuery( const BSONObj& query , const BSONObj& extraFilter ); - + virtual string type() const = 0; virtual BSONObj explain(); protected: - + virtual void _init() = 0; auto_ptr query( const string& server , int num = 0 , BSONObj extraFilter = BSONObj() , int skipLeft = 0 ); BSONObj explain( const string& server , BSONObj extraFilter = BSONObj() ); - + static BSONObj _concatFilter( const BSONObj& filter , const BSONObj& extraFilter ); - + virtual void _explain( map< string,list >& out ) = 0; string _ns; @@ -112,19 +113,19 @@ namespace mongo { FilteringClientCursor( const BSONObj filter = BSONObj() ); FilteringClientCursor( auto_ptr cursor , const BSONObj filter = BSONObj() ); ~FilteringClientCursor(); - + void reset( auto_ptr cursor ); - + bool more(); BSONObj next(); - + BSONObj peek(); private: void _advance(); - + Matcher _matcher; auto_ptr _cursor; - + BSONObj _next; bool _done; }; @@ -132,22 +133,22 @@ namespace mongo { class Servers { public: - Servers(){ + Servers() { } - - void add( const ServerAndQuery& s ){ + + void add( const ServerAndQuery& s ) { add( s._server , s._extra ); } - - void add( const string& server , const BSONObj& filter ){ + + void add( const string& server , const BSONObj& filter ) { vector& mine = _filters[server]; mine.push_back( filter.getOwned() ); } - + // TOOO: pick a less horrible name class View { - View( const Servers* s ){ - for ( map >::const_iterator i=s->_filters.begin(); i!=s->_filters.end(); ++i ){ + View( const Servers* s ) { + for ( map >::const_iterator i=s->_filters.begin(); i!=s->_filters.end(); ++i ) { _servers.push_back( i->first ); _filters.push_back( i->second ); } @@ -164,7 +165,7 @@ namespace mongo { vector getFilter( int n ) const { return _filters[ n ]; } - + private: vector _servers; vector< vector > _filters; @@ -175,7 +176,7 @@ namespace mongo { View view() const { return View( this ); } - + private: map > _filters; @@ -198,13 +199,13 @@ namespace mongo { protected: virtual void _explain( map< string,list >& out ); - void _init(){} + void _init() {} vector _servers; unsigned _serverIndex; - + FilteringClientCursor _current; - + int _needToSkip; }; @@ -212,11 +213,11 @@ namespace mongo { /** * runs a query in parellel across N servers * sots - */ + */ class ParallelSortClusteredCursor : public ClusteredCursor { public: ParallelSortClusteredCursor( const set& servers , QueryMessage& q , const BSONObj& sortKey ); - ParallelSortClusteredCursor( const set& servers , const string& ns , + ParallelSortClusteredCursor( const set& servers , const string& ns , const Query& q , int options=0, const BSONObj& fields=BSONObj() ); virtual ~ParallelSortClusteredCursor(); virtual bool more(); @@ -231,7 +232,7 @@ namespace mongo { int _numServers; set _servers; BSONObj _sortKey; - + FilteringClientCursor * _cursors; int _needToSkip; }; @@ -245,11 +246,11 @@ namespace mongo { public: class CommandResult { public: - + string getServer() const { return _server; } bool isDone() const { return _done; } - + bool ok() const { assert( _done ); return _ok; @@ -265,30 +266,37 @@ 
namespace mongo { returns ok() */ bool join(); - + private: - - CommandResult( const string& server , const string& db , const BSONObj& cmd ); - + + CommandResult( const string& server , const string& db , const BSONObj& cmd , DBClientBase * conn ); + string _server; string _db; BSONObj _cmd; + DBClientBase * _conn; scoped_ptr _thr; - + BSONObj _res; - bool _done; bool _ok; - + bool _done; + friend class Future; }; + + static void commandThread(shared_ptr res); - static void commandThread( shared_ptr res ); - - static shared_ptr spawnCommand( const string& server , const string& db , const BSONObj& cmd ); + /** + * @param server server name + * @param db db name + * @param cmd cmd to exec + * @param conn optional connection to use. will use standard pooled if non-specified + */ + static shared_ptr spawnCommand( const string& server , const string& db , const BSONObj& cmd , DBClientBase * conn = 0 ); }; - + } #include "undef_macros.h" diff --git a/client/redef_macros.h b/client/redef_macros.h index dd2e66f..a4cb1c9 100644 --- a/client/redef_macros.h +++ b/client/redef_macros.h @@ -50,6 +50,9 @@ #define RARELY MONGO_RARELY #define ONCE MONGO_ONCE +// util/log.h +#define LOG MONGO_LOG + #undef MONGO_MACROS_CLEANED #endif diff --git a/client/simple_client_demo.cpp b/client/simple_client_demo.cpp new file mode 100644 index 0000000..fa2f4a8 --- /dev/null +++ b/client/simple_client_demo.cpp @@ -0,0 +1,36 @@ +/* simple_client_demo.cpp + + See also : http://www.mongodb.org/pages/viewpage.action?pageId=133415 + + How to build and run: + + (1) Using the mongoclient: + g++ simple_client_demo.cpp -lmongoclient -lboost_thread-mt -lboost_filesystem -lboost_program_options + ./a.out + + (2) using client_lib.cpp: + g++ -I .. simple_client_demo.cpp mongo_client_lib.cpp -lboost_thread-mt -lboost_filesystem + ./a.out +*/ + +#include +#include "dbclient.h" // the mongo c++ driver + +using namespace std; +using namespace mongo; +using namespace bson; + +int main() { + cout << "connecting to localhost..." 
<< endl; + DBClientConnection c; + c.connect("localhost"); + cout << "connected ok" << endl; + unsigned long long count = c.count("test.foo"); + cout << "count of exiting documents in collection test.foo : " << count << endl; + + bo o = BSON( "hello" << "world" ); + c.insert("test.foo", o); + + return 0; +} + diff --git a/client/syncclusterconnection.cpp b/client/syncclusterconnection.cpp index 99f6067..4fafdc1 100644 --- a/client/syncclusterconnection.cpp +++ b/client/syncclusterconnection.cpp @@ -37,11 +37,11 @@ namespace mongo { for( list::const_iterator i = L.begin(); i != L.end(); i++ ) _connect( i->toString() ); } - + SyncClusterConnection::SyncClusterConnection( string commaSeperated ) : _mutex("SyncClusterConnection") { _address = commaSeperated; string::size_type idx; - while ( ( idx = commaSeperated.find( ',' ) ) != string::npos ){ + while ( ( idx = commaSeperated.find( ',' ) ) != string::npos ) { string h = commaSeperated.substr( 0 , idx ); commaSeperated = commaSeperated.substr( idx + 1 ); _connect( h ); @@ -50,7 +50,7 @@ namespace mongo { uassert( 8004 , "SyncClusterConnection needs 3 servers" , _conns.size() == 3 ); } - SyncClusterConnection::SyncClusterConnection( string a , string b , string c ) : _mutex("SyncClusterConnection") { + SyncClusterConnection::SyncClusterConnection( string a , string b , string c ) : _mutex("SyncClusterConnection") { _address = a + "," + b + "," + c; // connect to all even if not working _connect( a ); @@ -62,52 +62,55 @@ namespace mongo { assert(0); } - SyncClusterConnection::~SyncClusterConnection(){ + SyncClusterConnection::~SyncClusterConnection() { for ( size_t i=0; i<_conns.size(); i++ ) delete _conns[i]; _conns.clear(); } - bool SyncClusterConnection::prepare( string& errmsg ){ + bool SyncClusterConnection::prepare( string& errmsg ) { _lastErrors.clear(); return fsync( errmsg ); } - - bool SyncClusterConnection::fsync( string& errmsg ){ + + bool SyncClusterConnection::fsync( string& errmsg ) { bool ok = true; errmsg = ""; - for ( size_t i=0; i<_conns.size(); i++ ){ + for ( size_t i=0; i<_conns.size(); i++ ) { BSONObj res; try { if ( _conns[i]->simpleCommand( "admin" , 0 , "fsync" ) ) continue; } - catch ( std::exception& e ){ + catch ( DBException& e ) { + errmsg += e.toString(); + } + catch ( std::exception& e ) { errmsg += e.what(); } - catch ( ... ){ + catch ( ... ) { } ok = false; - errmsg += _conns[i]->toString() + ":" + res.toString(); + errmsg += " " + _conns[i]->toString() + ":" + res.toString(); } return ok; } - void SyncClusterConnection::_checkLast(){ + void SyncClusterConnection::_checkLast() { _lastErrors.clear(); vector errors; - for ( size_t i=0; i<_conns.size(); i++ ){ + for ( size_t i=0; i<_conns.size(); i++ ) { BSONObj res; string err; try { if ( ! _conns[i]->runCommand( "admin" , BSON( "getlasterror" << 1 << "fsync" << 1 ) , res ) ) err = "cmd failed: "; } - catch ( std::exception& e ){ + catch ( std::exception& e ) { err += e.what(); } - catch ( ... ){ + catch ( ... 
) { err += "unknown failure"; } _lastErrors.push_back( res.getOwned() ); @@ -115,13 +118,13 @@ namespace mongo { } assert( _lastErrors.size() == errors.size() && _lastErrors.size() == _conns.size() ); - + stringstream err; bool ok = true; - - for ( size_t i = 0; i<_conns.size(); i++ ){ + + for ( size_t i = 0; i<_conns.size(); i++ ) { BSONObj res = _lastErrors[i]; - if ( res["ok"].trueValue() && res["fsyncFiles"].numberInt() > 0 ) + if ( res["ok"].trueValue() && (res["fsyncFiles"].numberInt() > 0 || res.hasElement("waited"))) continue; ok = false; err << _conns[i]->toString() << ": " << res << " " << errors[i]; @@ -132,13 +135,13 @@ namespace mongo { throw UserException( 8001 , (string)"SyncClusterConnection write op failed: " + err.str() ); } - BSONObj SyncClusterConnection::getLastErrorDetailed(){ + BSONObj SyncClusterConnection::getLastErrorDetailed() { if ( _lastErrors.size() ) return _lastErrors[0]; return DBClientBase::getLastErrorDetailed(); } - void SyncClusterConnection::_connect( string host ){ + void SyncClusterConnection::_connect( string host ) { log() << "SyncClusterConnection connecting to [" << host << "]" << endl; DBClientConnection * c = new DBClientConnection( true ); string errmsg; @@ -148,40 +151,42 @@ namespace mongo { _conns.push_back( c ); } - bool SyncClusterConnection::callRead( Message& toSend , Message& response ){ + bool SyncClusterConnection::callRead( Message& toSend , Message& response ) { // TODO: need to save state of which one to go back to somehow... return _conns[0]->callRead( toSend , response ); } BSONObj SyncClusterConnection::findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn, int queryOptions) { - - if ( ns.find( ".$cmd" ) != string::npos ){ + + if ( ns.find( ".$cmd" ) != string::npos ) { string cmdName = query.obj.firstElement().fieldName(); int lockType = _lockType( cmdName ); - if ( lockType > 0 ){ // write $cmd + if ( lockType > 0 ) { // write $cmd string errmsg; if ( ! 
prepare( errmsg ) ) throw UserException( 13104 , (string)"SyncClusterConnection::findOne prepare failed: " + errmsg ); - + vector all; - for ( size_t i=0; i<_conns.size(); i++ ){ + for ( size_t i=0; i<_conns.size(); i++ ) { all.push_back( _conns[i]->findOne( ns , query , 0 , queryOptions ).getOwned() ); } - + _checkLast(); - for ( size_t i=0; itoString(); + ss << " ns: " << ns; + ss << " cmd: " << query.toString(); throw UserException( 13105 , ss.str() ); } - + return all[0]; } } @@ -191,9 +196,9 @@ namespace mongo { auto_ptr SyncClusterConnection::query(const string &ns, Query query, int nToReturn, int nToSkip, - const BSONObj *fieldsToReturn, int queryOptions, int batchSize ){ + const BSONObj *fieldsToReturn, int queryOptions, int batchSize ) { _lastErrors.clear(); - if ( ns.find( ".$cmd" ) != string::npos ){ + if ( ns.find( ".$cmd" ) != string::npos ) { string cmdName = query.obj.firstElement().fieldName(); int lockType = _lockType( cmdName ); uassert( 13054 , (string)"write $cmd not supported in SyncClusterConnection::query for:" + cmdName , lockType <= 0 ); @@ -202,7 +207,7 @@ namespace mongo { return _queryOnActive( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions , batchSize ); } - bool SyncClusterConnection::_commandOnActive(const string &dbname, const BSONObj& cmd, BSONObj &info, int options ){ + bool SyncClusterConnection::_commandOnActive(const string &dbname, const BSONObj& cmd, BSONObj &info, int options ) { auto_ptr cursor = _queryOnActive( dbname + ".$cmd" , cmd , 1 , 0 , 0 , options , 0 ); if ( cursor->more() ) info = cursor->next().copy(); @@ -210,153 +215,164 @@ namespace mongo { info = BSONObj(); return isOk( info ); } - + auto_ptr SyncClusterConnection::_queryOnActive(const string &ns, Query query, int nToReturn, int nToSkip, - const BSONObj *fieldsToReturn, int queryOptions, int batchSize ){ - - for ( size_t i=0; i<_conns.size(); i++ ){ + const BSONObj *fieldsToReturn, int queryOptions, int batchSize ) { + + for ( size_t i=0; i<_conns.size(); i++ ) { try { - auto_ptr cursor = + auto_ptr cursor = _conns[i]->query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions , batchSize ); if ( cursor.get() ) return cursor; log() << "query failed to: " << _conns[i]->toString() << " no data" << endl; } - catch ( ... ){ + catch ( ... ) { log() << "query failed to: " << _conns[i]->toString() << " exception" << endl; } } throw UserException( 8002 , "all servers down!" ); } - - auto_ptr SyncClusterConnection::getMore( const string &ns, long long cursorId, int nToReturn, int options ){ - uassert( 10022 , "SyncClusterConnection::getMore not supported yet" , 0); + + auto_ptr SyncClusterConnection::getMore( const string &ns, long long cursorId, int nToReturn, int options ) { + uassert( 10022 , "SyncClusterConnection::getMore not supported yet" , 0); auto_ptr c; return c; } - - void SyncClusterConnection::insert( const string &ns, BSONObj obj ){ - uassert( 13119 , (string)"SyncClusterConnection::insert obj has to have an _id: " + obj.jsonString() , + void SyncClusterConnection::insert( const string &ns, BSONObj obj ) { + + uassert( 13119 , (string)"SyncClusterConnection::insert obj has to have an _id: " + obj.jsonString() , ns.find( ".system.indexes" ) != string::npos || obj["_id"].type() ); - + string errmsg; if ( ! 
prepare( errmsg ) ) throw UserException( 8003 , (string)"SyncClusterConnection::insert prepare failed: " + errmsg ); - for ( size_t i=0; i<_conns.size(); i++ ){ + for ( size_t i=0; i<_conns.size(); i++ ) { _conns[i]->insert( ns , obj ); } - + _checkLast(); } - - void SyncClusterConnection::insert( const string &ns, const vector< BSONObj >& v ){ - uassert( 10023 , "SyncClusterConnection bulk insert not implemented" , 0); + + void SyncClusterConnection::insert( const string &ns, const vector< BSONObj >& v ) { + uassert( 10023 , "SyncClusterConnection bulk insert not implemented" , 0); } - void SyncClusterConnection::remove( const string &ns , Query query, bool justOne ){ + void SyncClusterConnection::remove( const string &ns , Query query, bool justOne ) { string errmsg; if ( ! prepare( errmsg ) ) throw UserException( 8020 , (string)"SyncClusterConnection::remove prepare failed: " + errmsg ); - - for ( size_t i=0; i<_conns.size(); i++ ){ + + for ( size_t i=0; i<_conns.size(); i++ ) { _conns[i]->remove( ns , query , justOne ); } - + _checkLast(); } - void SyncClusterConnection::update( const string &ns , Query query , BSONObj obj , bool upsert , bool multi ){ + void SyncClusterConnection::update( const string &ns , Query query , BSONObj obj , bool upsert , bool multi ) { - if ( upsert ){ + if ( upsert ) { uassert( 13120 , "SyncClusterConnection::update upsert query needs _id" , query.obj["_id"].type() ); } - if ( _writeConcern ){ + if ( _writeConcern ) { string errmsg; if ( ! prepare( errmsg ) ) throw UserException( 8005 , (string)"SyncClusterConnection::udpate prepare failed: " + errmsg ); } - for ( size_t i=0; i<_conns.size(); i++ ){ + for ( size_t i=0; i<_conns.size(); i++ ) { try { _conns[i]->update( ns , query , obj , upsert , multi ); } - catch ( std::exception& e ){ + catch ( std::exception& e ) { if ( _writeConcern ) throw e; } } - - if ( _writeConcern ){ + + if ( _writeConcern ) { _checkLast(); assert( _lastErrors.size() > 1 ); - + int a = _lastErrors[0]["n"].numberInt(); - for ( unsigned i=1; i<_lastErrors.size(); i++ ){ + for ( unsigned i=1; i<_lastErrors.size(); i++ ) { int b = _lastErrors[i]["n"].numberInt(); if ( a == b ) continue; - - throw UpdateNotTheSame( 8017 , "update not consistent" , _connAddresses , _lastErrors ); + + throw UpdateNotTheSame( 8017 , + str::stream() + << "update not consistent " + << " ns: " << ns + << " query: " << query.toString() + << " update: " << obj + << " gle1: " << _lastErrors[0] + << " gle2: " << _lastErrors[i] , + _connAddresses , _lastErrors ); } } } - string SyncClusterConnection::_toString() const { + string SyncClusterConnection::_toString() const { stringstream ss; ss << "SyncClusterConnection [" << _address << "]"; return ss.str(); } - bool SyncClusterConnection::call( Message &toSend, Message &response, bool assertOk ){ - uassert( 8006 , "SyncClusterConnection::call can only be used directly for dbQuery" , + bool SyncClusterConnection::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) { + uassert( 8006 , "SyncClusterConnection::call can only be used directly for dbQuery" , toSend.operation() == dbQuery ); - + DbMessage d( toSend ); uassert( 8007 , "SyncClusterConnection::call can't handle $cmd" , strstr( d.getns(), "$cmd" ) == 0 ); - for ( size_t i=0; i<_conns.size(); i++ ){ + for ( size_t i=0; i<_conns.size(); i++ ) { try { bool ok = _conns[i]->call( toSend , response , assertOk ); - if ( ok ) + if ( ok ) { + if ( actualServer ) + *actualServer = _connAddresses[i]; return ok; + } log() << "call failed to: 
" << _conns[i]->toString() << " no data" << endl; } - catch ( ... ){ + catch ( ... ) { log() << "call failed to: " << _conns[i]->toString() << " exception" << endl; } } throw UserException( 8008 , "all servers down!" ); } - - void SyncClusterConnection::say( Message &toSend ){ + + void SyncClusterConnection::say( Message &toSend ) { string errmsg; if ( ! prepare( errmsg ) ) throw UserException( 13397 , (string)"SyncClusterConnection::say prepare failed: " + errmsg ); - for ( size_t i=0; i<_conns.size(); i++ ){ + for ( size_t i=0; i<_conns.size(); i++ ) { _conns[i]->say( toSend ); } - + _checkLast(); } - - void SyncClusterConnection::sayPiggyBack( Message &toSend ){ + + void SyncClusterConnection::sayPiggyBack( Message &toSend ) { assert(0); } - int SyncClusterConnection::_lockType( const string& name ){ + int SyncClusterConnection::_lockType( const string& name ) { { scoped_lock lk(_mutex); map::iterator i = _lockTypes.find( name ); if ( i != _lockTypes.end() ) return i->second; } - + BSONObj info; - uassert( 13053 , "help failed" , _commandOnActive( "admin" , BSON( name << "1" << "help" << 1 ) , info ) ); + uassert( 13053 , str::stream() << "help failed: " << info , _commandOnActive( "admin" , BSON( name << "1" << "help" << 1 ) , info ) ); int lockType = info["lockType"].numberInt(); @@ -365,20 +381,9 @@ namespace mongo { return lockType; } - void SyncClusterConnection::killCursor( long long cursorID ){ + void SyncClusterConnection::killCursor( long long cursorID ) { // should never need to do this assert(0); } - bool SyncClusterConnection::isMember( const DBConnector * conn ) const { - if ( conn == this ) - return true; - - for ( unsigned i=0; i<_conns.size(); i++ ) - if ( _conns[i]->isMember( conn ) ) - return true; - - return false; - } - } diff --git a/client/syncclusterconnection.h b/client/syncclusterconnection.h index 4292e3d..c946073 100644 --- a/client/syncclusterconnection.h +++ b/client/syncclusterconnection.h @@ -16,6 +16,7 @@ * limitations under the License. */ +#pragma once #include "../pch.h" #include "dbclient.h" @@ -26,15 +27,15 @@ namespace mongo { /** * This is a connection to a cluster of servers that operate as one * for super high durability. - * + * * Write operations are two-phase. First, all nodes are asked to fsync. If successful - * everywhere, the write is sent everywhere and then followed by an fsync. There is no - * rollback if a problem occurs during the second phase. Naturally, with all these fsyncs, + * everywhere, the write is sent everywhere and then followed by an fsync. There is no + * rollback if a problem occurs during the second phase. Naturally, with all these fsyncs, * these operations will be quite slow -- use sparingly. - * + * * Read operations are sent to a single random node. - * - * The class checks if a command is read or write style, and sends to a single + * + * The class checks if a command is read or write style, and sends to a single * node if a read lock command and to all in two phases with a write style command. 
*/ class SyncClusterConnection : public DBClientBase { @@ -46,7 +47,7 @@ namespace mongo { SyncClusterConnection( string commaSeparated ); SyncClusterConnection( string a , string b , string c ); ~SyncClusterConnection(); - + /** * @return true if all servers are up and ready for writes */ @@ -65,36 +66,34 @@ namespace mongo { const BSONObj *fieldsToReturn, int queryOptions, int batchSize ); virtual auto_ptr getMore( const string &ns, long long cursorId, int nToReturn, int options ); - + virtual void insert( const string &ns, BSONObj obj ); - + virtual void insert( const string &ns, const vector< BSONObj >& v ); virtual void remove( const string &ns , Query query, bool justOne ); virtual void update( const string &ns , Query query , BSONObj obj , bool upsert , bool multi ); - virtual bool call( Message &toSend, Message &response, bool assertOk ); + virtual bool call( Message &toSend, Message &response, bool assertOk , string * actualServer ); virtual void say( Message &toSend ); virtual void sayPiggyBack( Message &toSend ); virtual void killCursor( long long cursorID ); - + virtual string getServerAddress() const { return _address; } virtual bool isFailed() const { return false; } virtual string toString() { return _toString(); } - virtual BSONObj getLastErrorDetailed(); + virtual BSONObj getLastErrorDetailed(); virtual bool callRead( Message& toSend , Message& response ); - virtual ConnectionString::ConnectionType type() const { return ConnectionString::SYNC; } - - virtual bool isMember( const DBConnector * conn ) const; + virtual ConnectionString::ConnectionType type() const { return ConnectionString::SYNC; } private: SyncClusterConnection( SyncClusterConnection& prev ); - string _toString() const; + string _toString() const; bool _commandOnActive(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0); auto_ptr _queryOnActive(const string &ns, Query query, int nToReturn, int nToSkip, const BSONObj *fieldsToReturn, int queryOptions, int batchSize ); @@ -107,17 +106,17 @@ namespace mongo { vector _conns; map _lockTypes; mongo::mutex _mutex; - + vector _lastErrors; }; - + class UpdateNotTheSame : public UserException { public: UpdateNotTheSame( int code , const string& msg , const vector& addrs , const vector& lastErrors ) - : UserException( code , msg ) , _addrs( addrs ) , _lastErrors( lastErrors ){ + : UserException( code , msg ) , _addrs( addrs ) , _lastErrors( lastErrors ) { assert( _addrs.size() == _lastErrors.size() ); } - + virtual ~UpdateNotTheSame() throw() { } @@ -134,7 +133,7 @@ namespace mongo { vector _addrs; vector _lastErrors; }; - + }; #include "undef_macros.h" diff --git a/client/undef_macros.h b/client/undef_macros.h index cce8692..bc59a84 100644 --- a/client/undef_macros.h +++ b/client/undef_macros.h @@ -54,5 +54,8 @@ #undef RARELY #undef ONCE +// util/log.h +#undef LOG + #define MONGO_MACROS_CLEANED #endif diff --git a/db/background.h b/db/background.h index 24ea1cb..ea424c9 100644 --- a/db/background.h +++ b/db/background.h @@ -21,16 +21,16 @@ #pragma once -namespace mongo { +namespace mongo { - /* these are administrative operations / jobs - for a namespace running in the background, and that only one + /* these are administrative operations / jobs + for a namespace running in the background, and that only one at a time per namespace is permitted, and that if in progress, you aren't allowed to do other NamespaceDetails major manipulations - (such as dropping ns or db) even in the foreground and must - instead uassert. 
+ (such as dropping ns or db) even in the foreground and must + instead uassert. - It's assumed this is not for super-high RPS things, so we don't do + It's assumed this is not for super-high RPS things, so we don't do anything special in the implementation here to be fast. */ class BackgroundOperation : public boost::noncopyable { diff --git a/db/btree.cpp b/db/btree.cpp index d646de8..d547a1b 100644 --- a/db/btree.cpp +++ b/db/btree.cpp @@ -24,48 +24,92 @@ #include "clientcursor.h" #include "client.h" #include "dbhelpers.h" -#include "curop.h" +#include "curop-inl.h" #include "stats/counters.h" +#include "dur_commitjob.h" namespace mongo { #define VERIFYTHISLOC dassert( thisLoc.btree() == this ); + /** + * give us a writable version of the btree bucket (declares write intent). + * note it is likely more efficient to declare write intent on something smaller when you can. + */ + BtreeBucket* DiskLoc::btreemod() const { + assert( _a != -1 ); + BtreeBucket *b = const_cast< BtreeBucket * >( btree() ); + return static_cast< BtreeBucket* >( getDur().writingPtr( b, BucketSize ) ); + } + + _KeyNode& _KeyNode::writing() const { + return *getDur().writing( const_cast< _KeyNode* >( this ) ); + } + KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) : - prevChildBucket(k.prevChildBucket), - recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) + prevChildBucket(k.prevChildBucket), + recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) { } - const int KeyMax = BucketSize / 10; + // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. + static const int KeyMax = BucketSize / 10; + + // We define this value as the maximum number of bytes such that, if we have + // fewer than this many bytes, we must be able to either merge with or receive + // keys from any neighboring node. If our utilization goes below this value we + // know we can bring up the utilization with a simple operation. Ignoring the + // 90/10 split policy which is sometimes employed and our 'unused' nodes, this + // is a lower bound on bucket utilization for non root buckets. + // + // Note that the exact value here depends on the implementation of + // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as + // follows: We know we cannot merge with the neighbor, so the total data size + // for us, the neighbor, and the separator must be at least + // BtreeBucket::bodySize() + 1. We must be able to accept one key of any + // allowed size, so our size plus storage for that additional key must be + // <= BtreeBucket::bodySize() / 2. This way, with the extra key we'll have a + // new bucket data size < half the total data size and by the implementation + // of rebalancedSeparatorPos() the key must be added. + static const int lowWaterMark = BtreeBucket::bodySize() / 2 - KeyMax - sizeof( _KeyNode ) + 1; + + static const int split_debug = 0; + static const int insert_debug = 0; extern int otherTraceLevel; - const int split_debug = 0; - const int insert_debug = 0; - static void alreadyInIndex() { + /** + * this error is ok/benign when doing a background indexing -- that logic in pdfile checks explicitly + * for the 10287 error code. 
+ */ + static void alreadyInIndex() { // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord() throw MsgAssertionException(10287, "btree: key+recloc already in index"); } /* BucketBasics --------------------------------------------------- */ - inline void BucketBasics::modified(const DiskLoc& thisLoc) { - VERIFYTHISLOC - btreeStore->modified(thisLoc); + void BucketBasics::assertWritable() { + if( cmdLine.dur ) + dur::assertAlreadyDeclared(this, sizeof(*this)); + } + + string BtreeBucket::bucketSummary() const { + stringstream ss; + ss << " Bucket info:" << endl; + ss << " n: " << n << endl; + ss << " parent: " << parent.toString() << endl; + ss << " nextChild: " << parent.toString() << endl; + ss << " flags:" << flags << endl; + ss << " emptySize: " << emptySize << " topSize: " << topSize << endl; + return ss.str(); } int BucketBasics::Size() const { assert( _wasSize == BucketSize ); return BucketSize; } - inline void BucketBasics::setNotPacked() { - flags &= ~Packed; - } - inline void BucketBasics::setPacked() { - flags |= Packed; - } - void BucketBasics::_shape(int level, stringstream& ss) { + void BucketBasics::_shape(int level, stringstream& ss) const { for ( int i = 0; i < level; i++ ) ss << ' '; ss << "*\n"; for ( int i = 0; i < n; i++ ) @@ -78,13 +122,13 @@ namespace mongo { int bt_fv=0; int bt_dmp=0; - void BucketBasics::dumpTree(DiskLoc thisLoc, const BSONObj &order) { + void BtreeBucket::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const { bt_dmp=1; fullValidate(thisLoc, order); bt_dmp=0; } - int BucketBasics::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount) { + int BtreeBucket::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount, bool strict) const { { bool f = false; assert( f = true ); @@ -93,8 +137,6 @@ namespace mongo { killCurrentOp.checkForInterrupt(); assertValid(order, true); -// if( bt_fv==0 ) -// return; if ( bt_dmp ) { out() << thisLoc.toString() << ' '; @@ -105,26 +147,37 @@ namespace mongo { int kc = 0; for ( int i = 0; i < n; i++ ) { - _KeyNode& kn = k(i); + const _KeyNode& kn = k(i); if ( kn.isUsed() ) { kc++; - } else { + } + else { if ( unusedCount ) { ++( *unusedCount ); } } if ( !kn.prevChildBucket.isNull() ) { DiskLoc left = kn.prevChildBucket; - BtreeBucket *b = left.btree(); - wassert( b->parent == thisLoc ); - kc += b->fullValidate(kn.prevChildBucket, order, unusedCount); + const BtreeBucket *b = left.btree(); + if ( strict ) { + assert( b->parent == thisLoc ); + } + else { + wassert( b->parent == thisLoc ); + } + kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict); } } if ( !nextChild.isNull() ) { - BtreeBucket *b = nextChild.btree(); - wassert( b->parent == thisLoc ); - kc += b->fullValidate(nextChild, order, unusedCount); + const BtreeBucket *b = nextChild.btree(); + if ( strict ) { + assert( b->parent == thisLoc ); + } + else { + wassert( b->parent == thisLoc ); + } + kc += b->fullValidate(nextChild, order, unusedCount, strict); } return kc; @@ -132,12 +185,20 @@ namespace mongo { int nDumped = 0; - void BucketBasics::assertValid(const Ordering &order, bool force) { + void BucketBasics::assertValid(const Ordering &order, bool force) const { if ( !debug && !force ) return; wassert( n >= 0 && n < Size() ); wassert( emptySize >= 0 && emptySize < BucketSize ); wassert( topSize >= n && topSize <= BucketSize ); + + // this is very slow so don't do often + { + static int _k; + if( ++_k % 128 ) + return; + } + DEV { // slow: for ( int 
i = 0; i < n-1; i++ ) { @@ -204,15 +265,16 @@ namespace mongo { reserved = 0; } - /* see _alloc */ + /** see _alloc */ inline void BucketBasics::_unalloc(int bytes) { topSize -= bytes; emptySize += bytes; } - /* we allocate space from the end of the buffer for data. - the keynodes grow from the front. - */ + /** + * we allocate space from the end of the buffer for data. + * the keynodes grow from the front. + */ inline int BucketBasics::_alloc(int bytes) { topSize += bytes; emptySize -= bytes; @@ -221,21 +283,23 @@ namespace mongo { return ofs; } - void BucketBasics::_delKeyAtPos(int keypos) { + void BucketBasics::_delKeyAtPos(int keypos, bool mayEmpty) { assert( keypos >= 0 && keypos <= n ); assert( childForPos(keypos).isNull() ); + // TODO audit cases where nextChild is null + assert( ( mayEmpty && n > 0 ) || n > 1 || nextChild.isNull() ); + emptySize += sizeof(_KeyNode); n--; - assert( n > 0 || nextChild.isNull() ); for ( int j = keypos; j < n; j++ ) k(j) = k(j+1); - emptySize += sizeof(_KeyNode); setNotPacked(); } - /* pull rightmost key from the bucket. this version requires its right child to be null so it - does not bother returning that value. - */ - void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) { + /** + * pull rightmost key from the bucket. this version requires its right child to be null so it + * does not bother returning that value. + */ + void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) { massert( 10282 , "n==0 in btree popBack()", n > 0 ); assert( k(n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that KeyNode kn = keyNode(n-1); @@ -243,18 +307,18 @@ namespace mongo { key = kn.key; int keysize = kn.key.objsize(); - massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull()); + massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull()); - /* weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. */ - nextChild = kn.prevChildBucket; + // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. + nextChild = kn.prevChildBucket; n--; emptySize += sizeof(_KeyNode); _unalloc(keysize); } - /* add a key. must be > all existing. be careful to set next ptr right. */ - bool BucketBasics::_pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild) { + /** add a key. must be > all existing. be careful to set next ptr right. */ + bool BucketBasics::_pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) { int bytesNeeded = key.objsize() + sizeof(_KeyNode); if ( bytesNeeded > emptySize ) return false; @@ -269,38 +333,96 @@ namespace mongo { memcpy(p, key.objdata(), key.objsize()); return true; } - /*void BucketBasics::pushBack(const DiskLoc& recordLoc, BSONObj& key, const BSONObj &order, DiskLoc prevChild, DiskLoc nextChild) { - pushBack(recordLoc, key, order, prevChild); - childForPos(n) = nextChild; - }*/ - /* insert a key in a bucket with no complexity -- no splits required */ - bool BucketBasics::basicInsert(const DiskLoc& thisLoc, int &keypos, const DiskLoc& recordLoc, const BSONObj& key, const Ordering &order) { - modified(thisLoc); + /* durability note + we do separate intent declarations herein. arguably one could just declare + the whole bucket given we do group commits. this is something we could investigate + later as to what is faster under what situations. 
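       As an editorial illustration only (a sketch built from the journaling calls that
       appear elsewhere in this patch, not an excerpt from the tree), the two options
       discussed above look like:

           // coarse: declare intent on the whole bucket up front
           BtreeBucket *b = thisLoc.btreemod();                       // getDur().writingPtr( b, BucketSize )

           // fine-grained, as basicInsert() below does: declare only the bytes touched
           // (shiftOfs/shiftLen are placeholders for the shifted keynode range)
           BucketBasics *bb = (BucketBasics*) getDur().writingAtOffset( (void *) this, shiftOfs, shiftLen );
           getDur().declareWriteIntent( &bb->emptySize, 12 );         // the 12 header bytes [emptySize..n] we update

       The fine-grained form writes less to the journal per insert at the cost of extra
       declarations, which is exactly the tradeoff this note leaves open.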
+ */ + /** insert a key in a bucket with no complexity -- no splits required + @return false if a split is required. + */ + bool BucketBasics::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const { assert( keypos >= 0 && keypos <= n ); int bytesNeeded = key.objsize() + sizeof(_KeyNode); if ( bytesNeeded > emptySize ) { - pack( order, keypos ); + _pack(thisLoc, order, keypos); if ( bytesNeeded > emptySize ) return false; } - for ( int j = n; j > keypos; j-- ) // make room - k(j) = k(j-1); - n++; - emptySize -= sizeof(_KeyNode); - _KeyNode& kn = k(keypos); + + BucketBasics *b; + { + const char *p = (const char *) &k(keypos); + const char *q = (const char *) &k(n+1); + // declare that we will write to [k(keypos),k(n)] + // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so + // we can log a very small amount. + b = (BucketBasics*) getDur().writingAtOffset((void *) this, p-(char*)this, q-p); + + // e.g. n==3, keypos==2 + // 1 4 9 + // -> + // 1 4 _ 9 + for ( int j = n; j > keypos; j-- ) // make room + b->k(j) = b->k(j-1); + } + + getDur().declareWriteIntent(&b->emptySize, 12); // [b->emptySize..b->n] is 12 bytes and we are going to write those + b->emptySize -= sizeof(_KeyNode); + b->n++; + + _KeyNode& kn = b->k(keypos); kn.prevChildBucket.Null(); kn.recordLoc = recordLoc; - kn.setKeyDataOfs((short) _alloc(key.objsize()) ); - char *p = dataAt(kn.keyDataOfs()); + kn.setKeyDataOfs((short) b->_alloc(key.objsize()) ); + char *p = b->dataAt(kn.keyDataOfs()); + getDur().declareWriteIntent(p, key.objsize()); memcpy(p, key.objdata(), key.objsize()); return true; } - /* when we delete things we just leave empty space until the node is - full and then we repack it. - */ - void BucketBasics::pack( const Ordering &order, int &refPos ) { + /** with this implementation, refPos == 0 disregards effect of refPos */ + bool BucketBasics::mayDropKey( int index, int refPos ) const { + return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull(); + } + + int BucketBasics::packedDataSize( int refPos ) const { + if ( flags & Packed ) { + return BucketSize - emptySize - headerSize(); + } + int size = 0; + for( int j = 0; j < n; ++j ) { + if ( mayDropKey( j, refPos ) ) { + continue; + } + size += keyNode( j ).key.objsize() + sizeof( _KeyNode ); + } + return size; + } + + /** + * when we delete things we just leave empty space until the node is + * full and then we repack it. + */ + void BucketBasics::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const { + if ( flags & Packed ) + return; + + VERIFYTHISLOC + + /** TODO perhaps this can be optimized. for example if packing does no write, we can skip intent decl. + an empirical approach is probably best than just adding new code : perhaps the bucket would need + declaration anyway within the group commit interval, in which case we would just be adding + code and complexity without benefit. 
+ */ + thisLoc.btreemod()->_packReadyForMod(order, refPos); + } + + /** version when write intent already declared */ + void BucketBasics::_packReadyForMod( const Ordering &order, int &refPos ) { + assertWritable(); + if ( flags & Packed ) return; @@ -310,7 +432,7 @@ namespace mongo { topSize = 0; int i = 0; for ( int j = 0; j < n; j++ ) { - if( j > 0 && ( j != refPos ) && k( j ).isUnused() && k( j ).prevChildBucket.isNull() ) { + if( mayDropKey( j, refPos ) ) { continue; // key is unused and has no children - drop it } if( i != j ) { @@ -333,26 +455,104 @@ namespace mongo { n = i; int dataUsed = tdz - ofs; memcpy(data + ofs, temp + ofs, dataUsed); + + // assertWritable(); + // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this)); + emptySize = tdz - dataUsed - n * sizeof(_KeyNode); assert( emptySize >= 0 ); setPacked(); + assertValid( order ); } inline void BucketBasics::truncateTo(int N, const Ordering &order, int &refPos) { + dbMutex.assertWriteLocked(); + assertWritable(); + n = N; setNotPacked(); - pack( order, refPos ); + _packReadyForMod( order, refPos ); + } + + /** + * In the standard btree algorithm, we would split based on the + * existing keys _and_ the new key. But that's more work to + * implement, so we split the existing keys and then add the new key. + * + * There are several published heuristic algorithms for doing splits, + * but basically what you want are (1) even balancing between the two + * sides and (2) a small split key so the parent can have a larger + * branching factor. + * + * We just have a simple algorithm right now: if a key includes the + * halfway point (or 10% way point) in terms of bytes, split on that key; + * otherwise split on the key immediately to the left of the halfway + * point. + * + * This function is expected to be called on a packed bucket. + */ + int BucketBasics::splitPos( int keypos ) const { + assert( n > 2 ); + int split = 0; + int rightSize = 0; + // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split. + // see SERVER-983 + int rightSizeLimit = ( topSize + sizeof( _KeyNode ) * n ) / ( keypos == n ? 10 : 2 ); + for( int i = n - 1; i > -1; --i ) { + rightSize += keyNode( i ).key.objsize() + sizeof( _KeyNode ); + if ( rightSize > rightSizeLimit ) { + split = i; + break; + } + } + // safeguards - we must not create an empty bucket + if ( split < 1 ) { + split = 1; + } + else if ( split > n - 2 ) { + split = n - 2; + } + + return split; + } + + void BucketBasics::reserveKeysFront( int nAdd ) { + assert( emptySize >= int( sizeof( _KeyNode ) * nAdd ) ); + emptySize -= sizeof( _KeyNode ) * nAdd; + for( int i = n - 1; i > -1; --i ) { + k( i + nAdd ) = k( i ); + } + n += nAdd; + } + + void BucketBasics::setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket ) { + _KeyNode &kn = k( i ); + kn.recordLoc = recordLoc; + kn.prevChildBucket = prevChildBucket; + short ofs = (short) _alloc( key.objsize() ); + kn.setKeyDataOfs( ofs ); + char *p = dataAt( ofs ); + memcpy( p, key.objdata(), key.objsize() ); + } + + void BucketBasics::dropFront( int nDrop, const Ordering &order, int &refpos ) { + for( int i = nDrop; i < n; ++i ) { + k( i - nDrop ) = k( i ); + } + n -= nDrop; + setNotPacked(); + _packReadyForMod( order, refpos ); } /* - BtreeBucket --------------------------------------------------- */ - /* return largest key in the subtree. */ + /** @return largest key in the subtree. 
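       (Editorial aside on splitPos() above, using made-up numbers for illustration:
       with ten equal-sized keys in a packed bucket, an insert in the middle uses a
       rightSizeLimit of about half the data, so the scan from the right crosses the
       limit near i == 4 and the bucket splits roughly evenly; an insert at the end
       (keypos == n) uses a limit of about one tenth, so the scan crosses it at i == 8,
       leaving most (8 of the 10) keys in the left bucket -- the 90/10 behavior
       referenced for SERVER-983 with monotonically increasing keys.)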
*/ void BtreeBucket::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) { DiskLoc loc = thisLoc; while ( 1 ) { - BtreeBucket *b = loc.btree(); + const BtreeBucket *b = loc.btree(); if ( !b->nextChild.isNull() ) { loc = b->nextChild; continue; @@ -365,23 +565,34 @@ namespace mongo { break; } } - - int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, const vector< const BSONElement * > &rEnd, const Ordering &o ) { + + /** + * NOTE Currently the Ordering implementation assumes a compound index will + * not have more keys than an unsigned variable has bits. The same + * assumption is used in the implementation below with respect to the 'mask' + * variable. + */ + int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) { BSONObjIterator ll( l ); BSONObjIterator rr( rBegin ); vector< const BSONElement * >::const_iterator rr2 = rEnd.begin(); + vector< bool >::const_iterator inc = rEndInclusive.begin(); unsigned mask = 1; for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) { BSONElement lll = ll.next(); BSONElement rrr = rr.next(); ++rr2; - + ++inc; + int x = lll.woCompare( rrr, false ); if ( o.descending( mask ) ) x = -x; if ( x != 0 ) return x; } + if ( rSup ) { + return -direction; + } for( ; ll.more(); mask <<= 1 ) { BSONElement lll = ll.next(); BSONElement rrr = **rr2; @@ -391,11 +602,15 @@ namespace mongo { x = -x; if ( x != 0 ) return x; + if ( !*inc ) { + return -direction; + } + ++inc; } return 0; } - bool BtreeBucket::exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, const Ordering& order) { + bool BtreeBucket::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const { int pos; bool found; DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); @@ -404,8 +619,8 @@ namespace mongo { while ( 1 ) { if( b.isNull() ) break; - BtreeBucket *bucket = b.btree(); - _KeyNode& kn = bucket->k(pos); + const BtreeBucket *bucket = b.btree(); + const _KeyNode& kn = bucket->k(pos); if ( kn.isUsed() ) return bucket->keyAt(pos).woEqual(key); b = bucket->advance(b, pos, 1, "BtreeBucket::exists"); @@ -413,22 +628,22 @@ namespace mongo { return false; } - /* @param self - don't complain about ourself already being in the index case. - @return true = there is a duplicate. - */ + /** + * @param self - don't complain about ourself already being in the index case. + * @return true = there is a duplicate. 
+ */ bool BtreeBucket::wouldCreateDup( - const IndexDetails& idx, DiskLoc thisLoc, + const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order, - DiskLoc self) - { + const DiskLoc &self) const { int pos; bool found; DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); while ( !b.isNull() ) { // we skip unused keys - BtreeBucket *bucket = b.btree(); - _KeyNode& kn = bucket->k(pos); + const BtreeBucket *bucket = b.btree(); + const _KeyNode& kn = bucket->k(pos); if ( kn.isUsed() ) { if( bucket->keyAt(pos).woEqual(key) ) return kn.recordLoc != self; @@ -440,7 +655,7 @@ namespace mongo { return false; } - string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ){ + string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ) { stringstream ss; ss << "E11000 duplicate key error "; ss << "index: " << idx.indexNamespace() << " "; @@ -448,37 +663,38 @@ namespace mongo { return ss.str(); } - /* Find a key withing this btree bucket. - - When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the - key. That assures that even when there are many duplicates (e.g., 1 million) for a key, - our performance is still good. - - assertIfDup: if the key exists (ignoring the recordLoc), uassert - - pos: for existing keys k0...kn-1. - returns # it goes BEFORE. so key[pos-1] < key < key[pos] - returns n if it goes after the last existing key. - note result might be an Unused location! - */ - char foo; - bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, DiskLoc recordLoc, const Ordering &order, int& pos, bool assertIfDup) { + /** + * Find a key withing this btree bucket. + * + * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the + * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, + * our performance is still good. + * + * assertIfDup: if the key exists (ignoring the recordLoc), uassert + * + * pos: for existing keys k0...kn-1. + * returns # it goes BEFORE. so key[pos-1] < key < key[pos] + * returns n if it goes after the last existing key. + * note result might be an Unused location! + */ + char foo; + bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const { #if defined(_EXPERIMENT1) - { - char *z = (char *) this; - int i = 0; - while( 1 ) { - i += 4096; - if( i >= BucketSize ) - break; - foo += z[i]; - } - } + { + char *z = (char *) this; + int i = 0; + while( 1 ) { + i += 4096; + if( i >= BucketSize ) + break; + foo += z[i]; + } + } #endif - + globalIndexCounters.btree( (char*)this ); - - /* binary search for this key */ + + // binary search for this key bool dupsChecked = false; int l=0; int h=n-1; @@ -486,13 +702,13 @@ namespace mongo { int m = (l+h)/2; KeyNode M = keyNode(m); int x = key.woCompare(M.key, order); - if ( x == 0 ) { + if ( x == 0 ) { if( assertIfDup ) { - if( k(m).isUnused() ) { - // ok that key is there if unused. but we need to check that there aren't other - // entries for the key then. as it is very rare that we get here, we don't put any + if( k(m).isUnused() ) { + // ok that key is there if unused. but we need to check that there aren't other + // entries for the key then. 
as it is very rare that we get here, we don't put any // coding effort in here to make this particularly fast - if( !dupsChecked ) { + if( !dupsChecked ) { dupsChecked = true; if( idx.head.btree()->exists(idx, idx.head, key, order) ) { if( idx.head.btree()->wouldCreateDup(idx, idx.head, key, order, recordLoc) ) @@ -503,7 +719,7 @@ namespace mongo { } } else { - if( M.recordLoc == recordLoc ) + if( M.recordLoc == recordLoc ) alreadyInIndex(); uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) ); } @@ -537,86 +753,378 @@ namespace mongo { return false; } - void BtreeBucket::delBucket(const DiskLoc& thisLoc, IndexDetails& id) { + void BtreeBucket::delBucket(const DiskLoc thisLoc, const IndexDetails& id) { ClientCursor::informAboutToDeleteBucket(thisLoc); // slow... assert( !isHead() ); - BtreeBucket *p = parent.btreemod(); - if ( p->nextChild == thisLoc ) { - p->nextChild.Null(); - } - else { - for ( int i = 0; i < p->n; i++ ) { - if ( p->k(i).prevChildBucket == thisLoc ) { - p->k(i).prevChildBucket.Null(); - goto found; - } - } - out() << "ERROR: can't find ref to deleted bucket.\n"; - out() << "To delete:\n"; - dump(); - out() << "Parent:\n"; - p->dump(); - assert(false); - } -found: + const BtreeBucket *p = parent.btree(); + int parentIdx = indexInParent( thisLoc ); + p->childForPos( parentIdx ).writing().Null(); deallocBucket( thisLoc, id ); } - - void BtreeBucket::deallocBucket(const DiskLoc &thisLoc, IndexDetails &id) { + + void BtreeBucket::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) { #if 0 - /* as a temporary defensive measure, we zap the whole bucket, AND don't truly delete - it (meaning it is ineligible for reuse). - */ + // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete + // it (meaning it is ineligible for reuse). memset(this, 0, Size()); - modified(thisLoc); #else - //defensive: + // defensive: n = -1; parent.Null(); string ns = id.indexNamespace(); - btreeStore->deleteRecord(ns.c_str(), thisLoc); + theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc); #endif } - /* note: may delete the entire bucket! this invalid upon return sometimes. */ - void BtreeBucket::delKeyAtPos(const DiskLoc& thisLoc, IndexDetails& id, int p) { - modified(thisLoc); + /** note: may delete the entire bucket! this invalid upon return sometimes. */ + void BtreeBucket::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) { assert(n>0); DiskLoc left = childForPos(p); if ( n == 1 ) { if ( left.isNull() && nextChild.isNull() ) { - if ( isHead() ) - _delKeyAtPos(p); // we don't delete the top bucket ever - else - delBucket(thisLoc, id); + _delKeyAtPos(p); + if ( isHead() ) { + // we don't delete the top bucket ever + } + else { + if ( !mayBalanceWithNeighbors( thisLoc, id, order ) ) { + // An empty bucket is only allowed as a transient state. If + // there are no neighbors to balance with, we delete ourself. + // This condition is only expected in legacy btrees. + delBucket(thisLoc, id); + } + } return; } - markUnused(p); + deleteInternalKey( thisLoc, p, id, order ); return; } - if ( left.isNull() ) + if ( left.isNull() ) { _delKeyAtPos(p); - else - markUnused(p); + mayBalanceWithNeighbors( thisLoc, id, order ); + } + else { + deleteInternalKey( thisLoc, p, id, order ); + } } - int qqq = 0; + /** + * This function replaces the specified key (k) by either the prev or next + * key in the btree (k'). We require that k have either a left or right + * child. 
If k has a left child, we set k' to the prev key of k, which must + * be a leaf present in the left child. If k does not have a left child, we + * set k' to the next key of k, which must be a leaf present in the right + * child. When we replace k with k', we copy k' over k (which may cause a + * split) and then remove k' from its original location. Because k' is + * stored in a descendent of k, replacing k by k' will not modify the + * storage location of the original k', and we can easily remove k' from + * its original location. + * + * This function is only needed in cases where k has a left or right child; + * in other cases a simpler key removal implementation is possible. + * + * NOTE on legacy btree structures: + * In legacy btrees, k' can be a nonleaf. In such a case we 'delete' k by + * marking it as an unused node rather than replacing it with k'. Also, k' + * may be a leaf but marked as an unused node. In such a case we replace + * k by k', preserving the key's unused marking. This function is only + * expected to mark a key as unused when handling a legacy btree. + */ + void BtreeBucket::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) { + DiskLoc lchild = childForPos( keypos ); + DiskLoc rchild = childForPos( keypos + 1 ); + assert( !lchild.isNull() || !rchild.isNull() ); + int advanceDirection = lchild.isNull() ? 1 : -1; + int advanceKeyOfs = keypos; + DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ ); + + if ( !advanceLoc.btree()->childForPos( advanceKeyOfs ).isNull() || + !advanceLoc.btree()->childForPos( advanceKeyOfs + 1 ).isNull() ) { + // only expected with legacy btrees, see note above + markUnused( keypos ); + return; + } - /* remove a key from the index */ - bool BtreeBucket::unindex(const DiskLoc& thisLoc, IndexDetails& id, BSONObj& key, const DiskLoc& recordLoc ) { - if ( key.objsize() > KeyMax ) { - OCCASIONALLY problem() << "unindex: key too large to index, skipping " << id.indexNamespace() << /* ' ' << key.toString() << */ endl; + KeyNode kn = advanceLoc.btree()->keyNode( advanceKeyOfs ); + setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, childForPos( keypos ), childForPos( keypos + 1 ), id ); + advanceLoc.btreemod()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order ); + } + + void BtreeBucket::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) { + assert( n == 0 && !nextChild.isNull() ); + if ( parent.isNull() ) { + assert( id.head == thisLoc ); + id.head.writing() = nextChild; + } + else { + parent.btree()->childForPos( indexInParent( thisLoc ) ).writing() = nextChild; + } + nextChild.btree()->parent.writing() = parent; + ClientCursor::informAboutToDeleteBucket( thisLoc ); + deallocBucket( thisLoc, id ); + } + + bool BtreeBucket::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const { + assert( leftIndex >= 0 && leftIndex < n ); + DiskLoc leftNodeLoc = childForPos( leftIndex ); + DiskLoc rightNodeLoc = childForPos( leftIndex + 1 ); + if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) { + // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway return false; } + int pos = 0; + { + const BtreeBucket *l = leftNodeLoc.btree(); + const BtreeBucket *r = rightNodeLoc.btree(); + if ( ( headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.objsize() + sizeof(_KeyNode) > unsigned( BucketSize ) ) ) { + return false; + } + } + return true; + } + /** + * This 
implementation must respect the meaning and value of lowWaterMark. + * Also see comments in splitPos(). + */ + int BtreeBucket::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const { + int split = -1; + int rightSize = 0; + const BtreeBucket *l = childForPos( leftIndex ).btree(); + const BtreeBucket *r = childForPos( leftIndex + 1 ).btree(); + + int KNS = sizeof( _KeyNode ); + int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.objsize() + KNS + r->topSize + r->n * KNS ) / 2; + // This constraint should be ensured by only calling this function + // if we go below the low water mark. + assert( rightSizeLimit < BtreeBucket::bodySize() ); + for( int i = r->n - 1; i > -1; --i ) { + rightSize += r->keyNode( i ).key.objsize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = l->n + 1 + i; + break; + } + } + if ( split == -1 ) { + rightSize += keyNode( leftIndex ).key.objsize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = l->n; + } + } + if ( split == -1 ) { + for( int i = l->n - 1; i > -1; --i ) { + rightSize += l->keyNode( i ).key.objsize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = i; + break; + } + } + } + // safeguards - we must not create an empty bucket + if ( split < 1 ) { + split = 1; + } + else if ( split > l->n + 1 + r->n - 2 ) { + split = l->n + 1 + r->n - 2; + } + + return split; + } + + void BtreeBucket::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { + DiskLoc leftNodeLoc = childForPos( leftIndex ); + DiskLoc rightNodeLoc = childForPos( leftIndex + 1 ); + BtreeBucket *l = leftNodeLoc.btreemod(); + BtreeBucket *r = rightNodeLoc.btreemod(); + int pos = 0; + l->_packReadyForMod( order, pos ); + r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys + + int oldLNum = l->n; + { + KeyNode kn = keyNode( leftIndex ); + l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child + } + for( int i = 0; i < r->n; ++i ) { + KeyNode kn = r->keyNode( i ); + l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket ); + } + l->nextChild = r->nextChild; + l->fixParentPtrs( leftNodeLoc, oldLNum ); + r->delBucket( rightNodeLoc, id ); + childForPos( leftIndex + 1 ) = leftNodeLoc; + childForPos( leftIndex ) = DiskLoc(); + _delKeyAtPos( leftIndex, true ); + if ( n == 0 ) { + // will trash this and thisLoc + // TODO To ensure all leaves are of equal height, we should ensure + // this is only called on the root. + replaceWithNextChild( thisLoc, id ); + } + else { + // balance recursively - maybe we should do this even when n == 0? + mayBalanceWithNeighbors( thisLoc, id, order ); + } + } + + int BtreeBucket::indexInParent( const DiskLoc &thisLoc ) const { + assert( !parent.isNull() ); + const BtreeBucket *p = parent.btree(); + if ( p->nextChild == thisLoc ) { + return p->n; + } + else { + for( int i = 0; i < p->n; ++i ) { + if ( p->k( i ).prevChildBucket == thisLoc ) { + return i; + } + } + } + out() << "ERROR: can't find ref to child bucket.\n"; + out() << "child: " << thisLoc << "\n"; + dump(); + out() << "Parent: " << parent << "\n"; + p->dump(); + assert(false); + return -1; // just to compile + } + + bool BtreeBucket::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const { + // If we can merge, then we must merge rather than balance to preserve + // bucket utilization constraints. 
+ if ( canMergeChildren( thisLoc, leftIndex ) ) { + return false; + } + thisLoc.btreemod()->doBalanceChildren( thisLoc, leftIndex, id, order ); + return true; + } + + void BtreeBucket::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket *l, const DiskLoc lchild, + BtreeBucket *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ) { + // TODO maybe do some audits the same way pushBack() does? + int rAdd = l->n - split; + r->reserveKeysFront( rAdd ); + for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) { + KeyNode kn = l->keyNode( i ); + r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket ); + } + { + KeyNode kn = keyNode( leftIndex ); + r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child + } + r->fixParentPtrs( rchild, 0, rAdd - 1 ); + { + KeyNode kn = l->keyNode( split ); + l->nextChild = kn.prevChildBucket; + setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); + } + int zeropos = 0; + l->truncateTo( split, order, zeropos ); + } + + void BtreeBucket::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket *l, const DiskLoc lchild, + BtreeBucket *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ) { + int lN = l->n; + { + KeyNode kn = keyNode( leftIndex ); + l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child + } + for( int i = 0; i < split - lN - 1; ++i ) { + KeyNode kn = r->keyNode( i ); + l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket ); + } + { + KeyNode kn = r->keyNode( split - lN - 1 ); + l->nextChild = kn.prevChildBucket; + l->fixParentPtrs( lchild, lN + 1, l->n ); + setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); + } + int zeropos = 0; + r->dropFront( split - lN, order, zeropos ); + } + + void BtreeBucket::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { + DiskLoc lchild = childForPos( leftIndex ); + DiskLoc rchild = childForPos( leftIndex + 1 ); + int zeropos = 0; + BtreeBucket *l = lchild.btreemod(); + l->_packReadyForMod( order, zeropos ); + BtreeBucket *r = rchild.btreemod(); + r->_packReadyForMod( order, zeropos ); + int split = rebalancedSeparatorPos( thisLoc, leftIndex ); + + // By definition, if we are below the low water mark and cannot merge + // then we must actively balance. + assert( split != l->n ); + if ( split < l->n ) { + doBalanceLeftToRight( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order ); + } + else { + doBalanceRightToLeft( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order ); + } + } + + bool BtreeBucket::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const { + if ( parent.isNull() ) { // we are root, there are no neighbors + return false; + } + + if ( packedDataSize( 0 ) >= lowWaterMark ) { + return false; + } + + const BtreeBucket *p = parent.btree(); + int parentIdx = indexInParent( thisLoc ); + + // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so? 
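        // (Editorial worked scenario, not patch code: suppose delKeyAtPos() drops this
        //  bucket's packedDataSize() below lowWaterMark. The parent is asked to shift
        //  keys from a right or left sibling first -- tryBalanceChildren() refuses only
        //  when canMergeChildren() says both children plus the separator fit in a single
        //  bucket -- and only in that case do we fall through to the heavier
        //  doMergeChildren() path, which may have to re-split later.)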
+ bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() ); + bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() ); + + // Balance if possible on one side - we merge only if absolutely necessary + // to preserve btree bucket utilization constraints since that's a more + // heavy duty operation (especially if we must re-split later). + if ( mayBalanceRight && + p->tryBalanceChildren( parent, parentIdx, id, order ) ) { + return true; + } + if ( mayBalanceLeft && + p->tryBalanceChildren( parent, parentIdx - 1, id, order ) ) { + return true; + } + + BtreeBucket *pm = parent.btreemod(); + if ( mayBalanceRight ) { + pm->doMergeChildren( parent, parentIdx, id, order ); + return true; + } + else if ( mayBalanceLeft ) { + pm->doMergeChildren( parent, parentIdx - 1, id, order ); + return true; + } + + return false; + } + + /** remove a key from the index */ + bool BtreeBucket::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const { int pos; bool found; DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1); if ( found ) { - loc.btree()->delKeyAtPos(loc, id, pos); + + if ( key.objsize() > KeyMax ) { + OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl; + } + + loc.btreemod()->delKeyAtPos(loc, id, pos, Ordering::make(id.keyPattern())); + return true; } return false; @@ -628,40 +1136,68 @@ found: return b; } - inline void fix(const DiskLoc& thisLoc, const DiskLoc& child) { + inline void BtreeBucket::fix(const DiskLoc thisLoc, const DiskLoc child) { if ( !child.isNull() ) { if ( insert_debug ) out() << " " << child.toString() << ".parent=" << thisLoc.toString() << endl; - child.btreemod()->parent = thisLoc; + child.btree()->parent.writing() = thisLoc; } } - /* this sucks. maybe get rid of parent ptrs. */ - void BtreeBucket::fixParentPtrs(const DiskLoc& thisLoc) { + /** this sucks. maybe get rid of parent ptrs. */ + void BtreeBucket::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const { VERIFYTHISLOC - fix(thisLoc, nextChild); - for ( int i = 0; i < n; i++ ) - fix(thisLoc, k(i).prevChildBucket); + if ( lastIndex == -1 ) { + lastIndex = n; + } + for ( int i = firstIndex; i <= lastIndex; i++ ) { + fix(thisLoc, childForPos(i)); + } } - /* insert a key in this bucket, splitting if necessary. - keypos - where to insert the key i3n range 0..n. 0=make leftmost, n=make rightmost. - NOTE this function may free some data, and as a result the value passed for keypos may - be invalid after calling insertHere() - */ - void BtreeBucket::insertHere(DiskLoc thisLoc, int keypos, - DiskLoc recordLoc, const BSONObj& key, const Ordering& order, - DiskLoc lchild, DiskLoc rchild, IndexDetails& idx) - { - modified(thisLoc); + void BtreeBucket::setInternalKey( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const BSONObj &key, const Ordering &order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) { + childForPos( keypos ).Null(); + + // This may leave the bucket empty (n == 0) which is ok only as a + // transient state. In the instant case, the implementation of + // insertHere behaves correctly when n == 0 and as a side effect + // increments n. + _delKeyAtPos( keypos, true ); + + // Ensure we do not orphan neighbor's old child. 
+ assert( childForPos( keypos ) == rchild ); + + // Just set temporarily - required to pass validation in insertHere() + childForPos( keypos ) = lchild; + + insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx ); + } + + /** + * insert a key in this bucket, splitting if necessary. + * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. + * NOTE this function may free some data, and as a result the value passed for keypos may + * be invalid after calling insertHere() + */ + void BtreeBucket::insertHere( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const { if ( insert_debug ) out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' ' - << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl; + << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl; DiskLoc oldLoc = thisLoc; - if ( basicInsert(thisLoc, keypos, recordLoc, key, order) ) { - _KeyNode& kn = k(keypos); + if ( !basicInsert(thisLoc, keypos, recordLoc, key, order) ) { + thisLoc.btreemod()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx); + return; + } + + { + const _KeyNode *_kn = &k(keypos); + _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert() if ( keypos+1 == n ) { // last key if ( nextChild != lchild ) { out() << "ERROR nextChild != lchild" << endl; @@ -671,22 +1207,16 @@ found: out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; out() << " key: " << key.toString() << endl; dump(); -#if 0 - out() << "\n\nDUMPING FULL INDEX" << endl; - bt_dmp=1; - bt_fv=1; - idx.head.btree()->fullValidate(idx.head); -#endif assert(false); } - kn.prevChildBucket = nextChild; - assert( kn.prevChildBucket == lchild ); - nextChild = rchild; + kn->prevChildBucket = nextChild; + assert( kn->prevChildBucket == lchild ); + nextChild.writing() = rchild; if ( !rchild.isNull() ) - rchild.btreemod()->parent = thisLoc; + rchild.btree()->parent.writing() = thisLoc; } else { - k(keypos).prevChildBucket = lchild; + kn->prevChildBucket = lchild; if ( k(keypos+1).prevChildBucket != lchild ) { out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl; out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl; @@ -695,33 +1225,24 @@ found: out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; out() << " key: " << key.toString() << endl; dump(); -#if 0 - out() << "\n\nDUMPING FULL INDEX" << endl; - bt_dmp=1; - bt_fv=1; - idx.head.btree()->fullValidate(idx.head); -#endif assert(false); } - k(keypos+1).prevChildBucket = rchild; + const DiskLoc *pc = &k(keypos+1).prevChildBucket; + *getDur().alreadyDeclared((DiskLoc*) pc) = rchild; // declared in basicInsert() if ( !rchild.isNull() ) - rchild.btreemod()->parent = thisLoc; + rchild.btree()->parent.writing() = thisLoc; } return; } + } - /* ---------- split ---------------- */ + void BtreeBucket::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) { + assertWritable(); if ( split_debug ) out() << " " << thisLoc.toString() << ".split" << endl; - int split = n / 2; - if ( keypos == n ) { // see SERVER-983 - split = (int) (0.9 * n); - if ( split > n 
- 2 ) - split = n - 2; - } - + int split = splitPos( keypos ); DiskLoc rLoc = addBucket(idx); BtreeBucket *r = rLoc.btreemod(); if ( split_debug ) @@ -753,15 +1274,14 @@ found: p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc); p->nextChild = rLoc; p->assertValid( order ); - parent = idx.head = L; + parent = idx.head.writing() = L; if ( split_debug ) out() << " we were root, making new root:" << hex << parent.getOfs() << dec << endl; - rLoc.btreemod()->parent = parent; + rLoc.btree()->parent.writing() = parent; } else { - /* set this before calling _insert - if it splits it will do fixParent() logic and change the value. - */ - rLoc.btreemod()->parent = parent; + // set this before calling _insert - if it splits it will do fixParent() logic and change the value. + rLoc.btree()->parent.writing() = parent; if ( split_debug ) out() << " promoting splitkey key " << splitkey.key.toString() << endl; parent.btree()->_insert(parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx); @@ -769,16 +1289,17 @@ found: } int newpos = keypos; + // note this may trash splitkey.key. thus we had to promote it before finishing up here. truncateTo(split, order, newpos); // note this may trash splitkey.key. thus we had to promote it before finishing up here. // add our new key, there is room now { - if ( keypos <= split ) { if ( split_debug ) out() << " keypos=0); rLoc.btree()->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx); @@ -789,26 +1310,27 @@ found: out() << " split end " << hex << thisLoc.getOfs() << dec << endl; } - /* start a new index off, empty */ - DiskLoc BtreeBucket::addBucket(IndexDetails& id) { - DiskLoc loc = btreeStore->insert(id.indexNamespace().c_str(), 0, BucketSize, true); + /** start a new index off, empty */ + DiskLoc BtreeBucket::addBucket(const IndexDetails& id) { + string ns = id.indexNamespace(); + DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, BucketSize, true); BtreeBucket *b = loc.btreemod(); b->init(); return loc; } void BtreeBucket::renameIndexNamespace(const char *oldNs, const char *newNs) { - btreeStore->rename( oldNs, newNs ); + renameNamespace( oldNs, newNs ); } - DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) { + const DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) const { DiskLoc p = thisLoc; while ( !p.btree()->isHead() ) p = p.btree()->parent; return p; } - DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) { + DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const { if ( keyOfs < 0 || keyOfs >= n ) { out() << "ASSERT failure BtreeBucket::advance, caller: " << caller << endl; out() << " thisLoc: " << thisLoc.toString() << endl; @@ -841,7 +1363,7 @@ found: while ( 1 ) { if ( ancestor.isNull() ) break; - BtreeBucket *an = ancestor.btree(); + const BtreeBucket *an = ancestor.btree(); for ( int i = 0; i < an->n; i++ ) { if ( an->childForPos(i+adj) == childLoc ) { keyOfs = i; @@ -857,7 +1379,7 @@ found: return DiskLoc(); } - DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, DiskLoc recordLoc, int direction) { + DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const { int p; found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false); if ( found ) { @@ -880,7 +1402,7 @@ 
found: return pos == n ? DiskLoc() /*theend*/ : thisLoc; } - bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) { + bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const { while( 1 ) { if ( l + 1 == h ) { keyOfs = ( direction > 0 ) ? h : l; @@ -889,101 +1411,123 @@ found: bestParent = make_pair( thisLoc, keyOfs ); thisLoc = next; return true; - } else { + } + else { return false; } } int m = l + ( h - l ) / 2; - int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, keyEnd, order ); + int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); if ( cmp < 0 ) { l = m; - } else if ( cmp > 0 ) { + } + else if ( cmp > 0 ) { h = m; - } else { + } + else { if ( direction < 0 ) { l = m; - } else { + } + else { h = m; } } - } + } } - - // find smallest/biggest value greater-equal/less-equal than specified - // starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd - // All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient - void BtreeBucket::advanceTo(const IndexDetails &id, DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction ) { + + /** + * find smallest/biggest value greater-equal/less-equal than specified + * starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd + * All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient + */ + void BtreeBucket::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const { int l,h; bool dontGoUp; if ( direction > 0 ) { l = keyOfs; h = n - 1; - dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 ); - } else { + dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); + } + else { l = 0; h = keyOfs; - dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 ); + dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); } pair< DiskLoc, int > bestParent; if ( dontGoUp ) { // this comparison result assures h > l - if ( !customFind( l, h, keyBegin, keyBeginLen, keyEnd, order, direction, thisLoc, keyOfs, bestParent ) ) { + if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) { return; } - } else { + } + else { // go up parents until rightmost/leftmost node is >=/<= target or at top while( !thisLoc.btree()->parent.isNull() ) { thisLoc = thisLoc.btree()->parent; if ( direction > 0 ) { - if ( customBSONCmp( 
thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 ) { + if ( customBSONCmp( thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) { break; } - } else { - if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 ) { + } + else { + if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) { break; - } + } } } } + customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent ); + } + + void BtreeBucket::customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const { + if ( thisLoc.btree()->n == 0 ) { + thisLoc = DiskLoc(); + return; + } // go down until find smallest/biggest >=/<= target while( 1 ) { - l = 0; - h = thisLoc.btree()->n - 1; + int l = 0; + int h = thisLoc.btree()->n - 1; // leftmost/rightmost key may possibly be >=/<= search key bool firstCheck; if ( direction > 0 ) { - firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 ); - } else { - firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 ); + firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); + } + else { + firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); } if ( firstCheck ) { DiskLoc next; if ( direction > 0 ) { next = thisLoc.btree()->k( 0 ).prevChildBucket; keyOfs = 0; - } else { + } + else { next = thisLoc.btree()->nextChild; keyOfs = h; } if ( !next.isNull() ) { - bestParent = make_pair( thisLoc, keyOfs ); + bestParent = pair< DiskLoc, int >( thisLoc, keyOfs ); thisLoc = next; continue; - } else { + } + else { return; } } bool secondCheck; if ( direction > 0 ) { - secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) < 0 ); - } else { - secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) > 0 ); + secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) < 0 ); + } + else { + secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) > 0 ); } if ( secondCheck ) { DiskLoc next; if ( direction > 0 ) { next = thisLoc.btree()->nextChild; - } else { + } + else { next = thisLoc.btree()->k( 0 ).prevChildBucket; } if ( next.isNull() ) { @@ -991,23 +1535,23 @@ found: thisLoc = bestParent.first; keyOfs = bestParent.second; return; - } else { + } + else { thisLoc = next; continue; } } - if ( !customFind( l, h, keyBegin, keyBeginLen, keyEnd, order, direction, thisLoc, keyOfs, bestParent ) ) { + if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) { return; } } } - - /* @thisLoc disk location of *this - */ - int BtreeBucket::_insert(DiskLoc thisLoc, DiskLoc recordLoc, + + /** 
@thisLoc disk location of *this */ + int BtreeBucket::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, bool dupsAllowed, - DiskLoc lChild, DiskLoc rChild, IndexDetails& idx) { + const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const { if ( key.objsize() > KeyMax ) { problem() << "ERROR: key too large len:" << key.objsize() << " max:" << KeyMax << ' ' << key.objsize() << ' ' << idx.indexNamespace() << endl; return 2; @@ -1018,34 +1562,34 @@ found: bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed); if ( insert_debug ) { out() << " " << thisLoc.toString() << '.' << "_insert " << - key.toString() << '/' << recordLoc.toString() << - " l:" << lChild.toString() << " r:" << rChild.toString() << endl; + key.toString() << '/' << recordLoc.toString() << + " l:" << lChild.toString() << " r:" << rChild.toString() << endl; out() << " found:" << found << " pos:" << pos << " n:" << n << endl; } if ( found ) { - _KeyNode& kn = k(pos); + const _KeyNode& kn = k(pos); if ( kn.isUnused() ) { log(4) << "btree _insert: reusing unused key" << endl; massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull()); massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull()); - kn.setUsed(); + kn.writing().setUsed(); return 0; } - DEV { - out() << "_insert(): key already exists in index (ok for background:true)\n"; - out() << " " << idx.indexNamespace().c_str() << " thisLoc:" << thisLoc.toString() << '\n'; - out() << " " << key.toString() << '\n'; - out() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; - out() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; - out() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; + DEV { + log() << "_insert(): key already exists in index (ok for background:true)\n"; + log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n'; + log() << " " << key.toString() << '\n'; + log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; + log() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; + log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; } alreadyInIndex(); } DEBUGGING out() << "TEMP: key: " << key.toString() << endl; - DiskLoc& child = childForPos(pos); + DiskLoc child = childForPos(pos); if ( insert_debug ) out() << " getChild(" << pos << "): " << child.toString() << endl; if ( child.isNull() || !rChild.isNull() /* means an 'internal' insert */ ) { @@ -1056,28 +1600,27 @@ found: return child.btree()->bt_insert(child, recordLoc, key, order, dupsAllowed, idx, /*toplevel*/false); } - void BtreeBucket::dump() { + void BtreeBucket::dump() const { out() << "DUMP btreebucket n:" << n; out() << " parent:" << hex << parent.getOfs() << dec; for ( int i = 0; i < n; i++ ) { out() << '\n'; KeyNode k = keyNode(i); out() << '\t' << i << '\t' << k.key.toString() << "\tleft:" << hex << - k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec; + k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec; if ( this->k(i).isUnused() ) out() << " UNUSED"; } out() << " right:" << hex << nextChild.getOfs() << dec << endl; } - /* todo: meaning of return code unclear clean up */ - int BtreeBucket::bt_insert(DiskLoc thisLoc, DiskLoc recordLoc, - const BSONObj& key, const Ordering &order, bool dupsAllowed, - IndexDetails& idx, bool toplevel) - { + 
/** todo: meaning of return code unclear clean up */ + int BtreeBucket::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, + const BSONObj& key, const Ordering &order, bool dupsAllowed, + IndexDetails& idx, bool toplevel) const { if ( toplevel ) { if ( key.objsize() > KeyMax ) { - problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace().c_str() << ' ' << key.objsize() << ' ' << key.toString() << endl; + problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.objsize() << ' ' << key.toString() << endl; return 3; } } @@ -1088,22 +1631,30 @@ found: return x; } - void BtreeBucket::shape(stringstream& ss) { + void BtreeBucket::shape(stringstream& ss) const { _shape(0, ss); } - - DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ){ + + int BtreeBucket::getLowWaterMark() { + return lowWaterMark; + } + + int BtreeBucket::getKeyMax() { + return KeyMax; + } + + DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const { int pos; bool found; - /* TODO: is it really ok here that the order is a default? */ + // TODO: is it really ok here that the order is a default? Ordering o = Ordering::make(BSONObj()); DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc ); if ( bucket.isNull() ) return bucket; - BtreeBucket *b = bucket.btree(); - while ( 1 ){ - _KeyNode& knraw = b->k(pos); + const BtreeBucket *b = bucket.btree(); + while ( 1 ) { + const _KeyNode& knraw = b->k(pos); if ( knraw.isUsed() ) break; bucket = b->advance( bucket , pos , 1 , "findSingle" ); @@ -1125,7 +1676,7 @@ found: namespace mongo { void BtreeBucket::a_test(IndexDetails& id) { - BtreeBucket *b = id.head.btree(); + BtreeBucket *b = id.head.btreemod(); // record locs for testing DiskLoc A(1, 20); @@ -1171,26 +1722,37 @@ namespace mongo { /* --- BtreeBuilder --- */ - BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) : - dupsAllowed(_dupsAllowed), - idx(_idx), - n(0), - order( idx.keyPattern() ), - ordering( Ordering::make(idx.keyPattern()) ) - { + BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) : + dupsAllowed(_dupsAllowed), + idx(_idx), + n(0), + order( idx.keyPattern() ), + ordering( Ordering::make(idx.keyPattern()) ) { first = cur = BtreeBucket::addBucket(idx); b = cur.btreemod(); committed = false; } - void BtreeBuilder::newBucket() { + void BtreeBuilder::newBucket() { DiskLoc L = BtreeBucket::addBucket(idx); b->tempNext() = L; cur = L; b = cur.btreemod(); } - void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) { + void BtreeBuilder::mayCommitProgressDurably() { + if ( getDur().commitIfNeeded() ) { + b = cur.btreemod(); + } + } + + void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) { + if ( key.objsize() > KeyMax ) { + problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() + << ' ' << key.objsize() << ' ' << key.toString() << endl; + return; + } + if( !dupsAllowed ) { if( n > 0 ) { int cmp = keyLast.woCompare(key, order); @@ -1203,26 +1765,21 @@ namespace mongo { keyLast = key; } - if ( ! 
b->_pushBack(loc, key, ordering, DiskLoc()) ){ - // no room - if ( key.objsize() > KeyMax ) { - problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace().c_str() << ' ' << key.objsize() << ' ' << key.toString() << endl; - } - else { - // bucket was full - newBucket(); - b->pushBack(loc, key, ordering, DiskLoc()); - } + if ( ! b->_pushBack(loc, key, ordering, DiskLoc()) ) { + // bucket was full + newBucket(); + b->pushBack(loc, key, ordering, DiskLoc()); } n++; + mayCommitProgressDurably(); } - void BtreeBuilder::buildNextLevel(DiskLoc loc) { + void BtreeBuilder::buildNextLevel(DiskLoc loc) { int levels = 1; - while( 1 ) { - if( loc.btree()->tempNext().isNull() ) { + while( 1 ) { + if( loc.btree()->tempNext().isNull() ) { // only 1 bucket at this level. we are done. - idx.head = loc; + getDur().writingDiskLoc(idx.head) = loc; break; } levels++; @@ -1232,59 +1789,70 @@ namespace mongo { BtreeBucket *up = upLoc.btreemod(); DiskLoc xloc = loc; - while( !xloc.isNull() ) { + while( !xloc.isNull() ) { + if ( getDur().commitIfNeeded() ) { + b = cur.btreemod(); + up = upLoc.btreemod(); + } + BtreeBucket *x = xloc.btreemod(); - BSONObj k; + BSONObj k; DiskLoc r; x->popBack(r,k); bool keepX = ( x->n != 0 ); DiskLoc keepLoc = keepX ? xloc : x->nextChild; - if ( ! up->_pushBack(r, k, ordering, keepLoc) ){ + if ( ! up->_pushBack(r, k, ordering, keepLoc) ) { // current bucket full DiskLoc n = BtreeBucket::addBucket(idx); up->tempNext() = n; - upLoc = n; + upLoc = n; up = upLoc.btreemod(); up->pushBack(r, k, ordering, keepLoc); } - DiskLoc nextLoc = x->tempNext(); /* get next in chain at current level */ + DiskLoc nextLoc = x->tempNext(); // get next in chain at current level if ( keepX ) { - x->parent = upLoc; - } else { + x->parent = upLoc; + } + else { if ( !x->nextChild.isNull() ) x->nextChild.btreemod()->parent = upLoc; x->deallocBucket( xloc, idx ); } xloc = nextLoc; } - + loc = upStart; + mayCommitProgressDurably(); } if( levels > 1 ) log(2) << "btree levels: " << levels << endl; } - /* when all addKeys are done, we then build the higher levels of the tree */ - void BtreeBuilder::commit() { + /** when all addKeys are done, we then build the higher levels of the tree */ + void BtreeBuilder::commit() { buildNextLevel(first); committed = true; } - BtreeBuilder::~BtreeBuilder() { - if( !committed ) { - log(2) << "Rolling back partially built index space" << endl; - DiskLoc x = first; - while( !x.isNull() ) { - DiskLoc next = x.btree()->tempNext(); - btreeStore->deleteRecord(idx.indexNamespace().c_str(), x); - x = next; + BtreeBuilder::~BtreeBuilder() { + DESTRUCTOR_GUARD( + if( !committed ) { + log(2) << "Rolling back partially built index space" << endl; + DiskLoc x = first; + while( !x.isNull() ) { + DiskLoc next = x.btree()->tempNext(); + string ns = idx.indexNamespace(); + theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x); + x = next; + getDur().commitIfNeeded(); + } + assert( idx.head.isNull() ); + log(2) << "done rollback" << endl; } - assert( idx.head.isNull() ); - log(2) << "done rollback" << endl; - } + ) } } diff --git a/db/btree.h b/db/btree.h index 233b4dc..bced95e 100644 --- a/db/btree.h +++ b/db/btree.h @@ -25,8 +25,12 @@ namespace mongo { + const int BucketSize = 8192; + #pragma pack(1) struct _KeyNode { + /** Signals that we are writing this _KeyNode and casts away const */ + _KeyNode& writing() const; DiskLoc prevChildBucket; // the lchild DiskLoc recordLoc; // location of the record associated with the key short keyDataOfs() 
const { @@ -41,15 +45,12 @@ namespace mongo { _kdo = s; assert(s>=0); } - void setUsed() { - recordLoc.GETOFS() &= ~1; - } + void setUsed() { recordLoc.GETOFS() &= ~1; } void setUnused() { - /* Setting ofs to odd is the sentinel for unused, as real recordLoc's are always - even numbers. - Note we need to keep its value basically the same as we use the recordLoc - as part of the key in the index (to handle duplicate keys efficiently). - */ + // Setting ofs to odd is the sentinel for unused, as real recordLoc's are always + // even numbers. + // Note we need to keep its value basically the same as we use the recordLoc + // as part of the key in the index (to handle duplicate keys efficiently). recordLoc.GETOFS() |= 1; } int isUnused() const { @@ -63,7 +64,12 @@ namespace mongo { class BucketBasics; - /* wrapper - this is our in memory representation of the key. _KeyNode is the disk representation. */ + /** + * wrapper - this is our in memory representation of the key. + * _KeyNode is the disk representation. + * + * This object and its bson key will become invalid if the key is moved. + */ class KeyNode { public: KeyNode(const BucketBasics& bb, const _KeyNode &k); @@ -73,51 +79,111 @@ namespace mongo { }; #pragma pack(1) - /* this class is all about the storage management */ - class BucketBasics { + class BtreeData { + protected: + DiskLoc parent; + DiskLoc nextChild; // child bucket off and to the right of the highest key. + unsigned short _wasSize; // can be reused, value is 8192 in current pdfile version Apr2010 + unsigned short _reserved1; // zero + int flags; + + // basicInsert() assumes these three are together and in this order: + int emptySize; // size of the empty region + int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom') + int n; // # of keys so far. + + int reserved; + char data[4]; + }; + + /** + * This class is all about the storage management + * + * Const member functions of this class are those which may be called on + * an object for which writing has not been signaled. Non const member + * functions may only be called on objects for which writing has been + * signaled. Note that currently some const functions write to the + * underlying memory representation of this bucket using optimized methods + * to signal write operations. + * + * DiskLoc parameters that may shadow references within the btree should + * be passed by value rather than by reference to non const member + * functions or const member functions which may perform writes. This way + * a callee need not worry that write operations will change or invalidate + * its arguments. + * + * The current policy for dealing with bson arguments is the opposite of + * what is described above for DiskLoc arguments. We do + * not want to want to copy bson into memory as an intermediate step for + * btree changes, so if bson is to be moved it must be copied to the new + * location before the old location is invalidated. 
+ */ + class BucketBasics : public BtreeData { friend class BtreeBuilder; friend class KeyNode; public: - void dumpTree(DiskLoc thisLoc, const BSONObj &order); - bool isHead() { return parent.isNull(); } - void assertValid(const Ordering &order, bool force = false); - void assertValid(const BSONObj &orderObj, bool force = false) { - return assertValid(Ordering::make(orderObj),force); - } - int fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount = 0); /* traverses everything */ + /** assert write intent declared for this bucket already */ + void assertWritable(); - KeyNode keyNode(int i) const { - if ( i >= n ){ + void assertValid(const Ordering &order, bool force = false) const; + void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); } + + /** + * @return KeyNode for key at index i. The KeyNode will become invalid + * if the key is moved or reassigned, or if the node is packed. + */ + const KeyNode keyNode(int i) const { + if ( i >= n ) { massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << n ).jsonString() , i < n ); } return KeyNode(*this, k(i)); } - protected: + static int headerSize() { + const BucketBasics *d = 0; + return (char*)&(d->data) - (char*)&(d->parent); + } + static int bodySize() { return BucketSize - headerSize(); } - void modified(const DiskLoc& thisLoc); + // for testing + int nKeys() const { return n; } + const DiskLoc getNextChild() const { return nextChild; } - char * dataAt(short ofs) { - return data + ofs; - } + protected: + char * dataAt(short ofs) { return data + ofs; } void init(); // initialize a new node - /* returns false if node is full and must be split - keypos is where to insert -- inserted after that key #. so keypos=0 is the leftmost one. - */ - bool basicInsert(const DiskLoc& thisLoc, int &keypos, const DiskLoc& recordLoc, const BSONObj& key, const Ordering &order); - /** - * @return true if works, false if not enough space + * @return false if node is full and must be split + * @keypos is where to insert -- inserted before that key #. so keypos=0 is the leftmost one. + * keypos will be updated if keys are moved as a result of pack() + * This function will modify the btree bucket memory representation even + * though it is marked const. */ - bool _pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild); - void pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild){ + bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const; + + /** @return true if works, false if not enough space */ + bool _pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild); + void pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) { bool ok = _pushBack( recordLoc , key , order , prevChild ); assert(ok); } + + /** + * This is a special purpose function used by BtreeBuilder. The + * interface is quite dangerous if you're not careful. The bson key + * returned here points to bucket memory that has been invalidated but + * not yet reclaimed. + * + * TODO Maybe this could be replaced with two functions, one which + * returns the last key without deleting it and another which simply + * deletes the last key. Then the caller would have enough control to + * ensure proper memory integrity. 
+ */ void popBack(DiskLoc& recLoc, BSONObj& key); - void _delKeyAtPos(int keypos); // low level version that doesn't deal with child ptrs. + + void _delKeyAtPos(int keypos, bool mayEmpty = false); // low level version that doesn't deal with child ptrs. /* !Packed means there is deleted fragment space within the bucket. We "repack" when we run out of space before considering the node @@ -125,145 +191,257 @@ namespace mongo { */ enum Flags { Packed=1 }; - DiskLoc& childForPos(int p) { - return p == n ? nextChild : k(p).prevChildBucket; - } + const DiskLoc& childForPos(int p) const { return p == n ? nextChild : k(p).prevChildBucket; } + DiskLoc& childForPos(int p) { return p == n ? nextChild : k(p).prevChildBucket; } int totalDataSize() const; - void pack( const Ordering &order, int &refPos); - void setNotPacked(); - void setPacked(); + /** @return true if the key may be dropped by pack() */ + bool mayDropKey( int index, int refPos ) const; + + /** + * Pack the bucket to reclaim space from invalidated memory. + * @refPos is an index in the bucket which will may be updated if we + * delete keys from the bucket + * This function may cast away const and perform a write. + */ + void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const; + /** Pack when already writable */ + void _packReadyForMod(const Ordering &order, int &refPos); + + /** + * @return the size of non header data in this bucket if we were to + * call pack(). + */ + int packedDataSize( int refPos ) const; + void setNotPacked() { flags &= ~Packed; } + void setPacked() { flags |= Packed; } int _alloc(int bytes); void _unalloc(int bytes); void truncateTo(int N, const Ordering &order, int &refPos); + /** drop specified number of keys from beginning of key array, and pack */ + void dropFront(int nDrop, const Ordering &order, int &refPos); void markUnused(int keypos); - /* BtreeBuilder uses the parent var as a temp place to maintain a linked list chain. - we use tempNext() when we do that to be less confusing. (one might have written a union in C) - */ + /** + * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain. + * we use tempNext() when we do that to be less confusing. (one might have written a union in C) + */ + const DiskLoc& tempNext() const { return parent; } DiskLoc& tempNext() { return parent; } - public: - DiskLoc parent; - - string bucketSummary() const { - stringstream ss; - ss << " Bucket info:" << endl; - ss << " n: " << n << endl; - ss << " parent: " << parent.toString() << endl; - ss << " nextChild: " << parent.toString() << endl; - ss << " flags:" << flags << endl; - ss << " emptySize: " << emptySize << " topSize: " << topSize << endl; - return ss.str(); - } - - bool isUsed( int i ) const { - return k(i).isUsed(); - } + void _shape(int level, stringstream&) const; + int Size() const; + const _KeyNode& k(int i) const { return ((const _KeyNode*)data)[i]; } + _KeyNode& k(int i) { return ((_KeyNode*)data)[i]; } - protected: - void _shape(int level, stringstream&); - DiskLoc nextChild; // child bucket off and to the right of the highest key. + /** @return the key position where a split should occur on insert */ + int splitPos( int keypos ) const; - private: - unsigned short _wasSize; // can be reused, value is 8192 in current pdfile version Apr2010 - unsigned short _reserved1; // zero + /** + * Adds new entries to beginning of key array, shifting existing + * entries to the right. After this is called, setKey() must be called + * on all the newly created entries in the key array. 
+ */ + void reserveKeysFront( int nAdd ); - protected: - int Size() const; - int flags; - int emptySize; // size of the empty region - int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom') - int n; // # of keys so far. - int reserved; - const _KeyNode& k(int i) const { - return ((_KeyNode*)data)[i]; - } - _KeyNode& k(int i) { - return ((_KeyNode*)data)[i]; - } - char data[4]; + /** + * Sets an existing key using the given parameters. + * @i index of key to set + */ + void setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket ); }; -#pragma pack() -#pragma pack(1) + /** + * This class adds functionality for manipulating buckets that are assembled + * in a tree. The requirements for const and non const functions and + * arguments are generally the same as in BtreeBucket. Because this class + * deals with tree structure, some functions that are marked const may + * trigger modification of another node in the btree or potentially of the + * current node. In such cases, the function's implementation explicitly + * casts away const when indicating an intent to write to the durability + * layer. The DiskLocs provided to such functions should be passed by + * value if they shadow pointers within the btree. + * + * To clarify enforcement of referential integrity in this implementation, + * we use the following pattern when deleting data we have a persistent + * pointer to. The pointer is cleared or removed explicitly, then the data + * it pointed to is cleaned up with a helper function. + * + * TODO It might make sense to put some of these functions in a class + * representing a full btree instead of a single btree bucket. That would + * allow us to use the const qualifier in a manner more consistent with + * standard usage. Right now the interface is for both a node and a tree, + * so assignment of const is sometimes nonideal. + * + * TODO There are several cases in which the this pointer is invalidated + * as a result of deallocation. A seperate class representing a btree would + * alleviate some fragile cases where the implementation must currently + * behave correctly if the this pointer is suddenly invalidated by a + * callee. + */ class BtreeBucket : public BucketBasics { friend class BtreeCursor; public: - void dump(); + bool isHead() const { return parent.isNull(); } + void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const; + int fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount = 0, bool strict = false) const; /* traverses everything */ - /* @return true if key exists in index + bool isUsed( int i ) const { return k(i).isUsed(); } + string bucketSummary() const; + void dump() const; - order - indicates order of keys in the index. this is basically the index's key pattern, e.g.: - BSONObj order = ((IndexDetails&)idx).keyPattern(); - likewise below in bt_insert() etc. - */ - bool exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, const Ordering& order); + /** + * @return true if key exists in index + * + * @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.: + * BSONObj order = ((IndexDetails&)idx).keyPattern(); + * likewise below in bt_insert() etc. 
+ */ + bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const; bool wouldCreateDup( - const IndexDetails& idx, DiskLoc thisLoc, + const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order, - DiskLoc self); + const DiskLoc &self) const; + + static DiskLoc addBucket(const IndexDetails&); /* start a new index off, empty */ + /** invalidates 'this' and thisLoc */ + void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id); - static DiskLoc addBucket(IndexDetails&); /* start a new index off, empty */ - void deallocBucket(const DiskLoc &thisLoc, IndexDetails &id); - static void renameIndexNamespace(const char *oldNs, const char *newNs); - int bt_insert(DiskLoc thisLoc, DiskLoc recordLoc, - const BSONObj& key, const Ordering &order, bool dupsAllowed, - IndexDetails& idx, bool toplevel = true); + /** This function may change the btree root */ + int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, + const BSONObj& key, const Ordering &order, bool dupsAllowed, + IndexDetails& idx, bool toplevel = true) const; - bool unindex(const DiskLoc& thisLoc, IndexDetails& id, BSONObj& key, const DiskLoc& recordLoc); + /** This function may change the btree root */ + bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const; - /* locate may return an "unused" key that is just a marker. so be careful. - looks for a key:recordloc pair. + /** + * locate may return an "unused" key that is just a marker. so be careful. + * looks for a key:recordloc pair. + * + * @found - returns true if exact match found. note you can get back a position + * result even if found is false. + */ + DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, + int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const; - found - returns true if exact match found. note you can get back a position - result even if found is false. 
- */ - DiskLoc locate(const IndexDetails& , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, - int& pos, bool& found, DiskLoc recordLoc, int direction=1); - /** * find the first instance of the key * does not handle dups - * returned DiskLock isNull if can't find anything with that + * returned DiskLoc isNull if can't find anything with that + * @return the record location of the first match */ - DiskLoc findSingle( const IndexDetails& , const DiskLoc& thisLoc, const BSONObj& key ); + DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const; + + /** advance one key position in the index: */ + DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const; - /* advance one key position in the index: */ - DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller); - - void advanceTo(const IndexDetails &id, DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction ); - - DiskLoc getHead(const DiskLoc& thisLoc); + void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const; + void customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const; - /* get tree shape */ - void shape(stringstream&); + const DiskLoc getHead(const DiskLoc& thisLoc) const; + + /** get tree shape */ + void shape(stringstream&) const; static void a_test(IndexDetails&); - private: - void fixParentPtrs(const DiskLoc& thisLoc); - void delBucket(const DiskLoc& thisLoc, IndexDetails&); - void delKeyAtPos(const DiskLoc& thisLoc, IndexDetails& id, int p); - BSONObj keyAt(int keyOfs) { + static int getLowWaterMark(); + static int getKeyMax(); + + protected: + /** + * Fix parent pointers for children + * @firstIndex first index to modify + * @lastIndex last index to modify (-1 means last index is n) + */ + void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const; + + /** invalidates this and thisLoc */ + void delBucket(const DiskLoc thisLoc, const IndexDetails&); + /** may invalidate this and thisLoc */ + void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order); + + /** + * May balance utilization of this bucket with a neighbor, either by + * merging the buckets or shifting nodes. + * @return true iff balancing was performed. + * NOTE This function may invalidate thisLoc. 
+ */ + bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const; + + /** @return true if balance succeeded */ + bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const; + void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ); + void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket *l, const DiskLoc lchild, + BtreeBucket *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ); + void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket *l, const DiskLoc lchild, + BtreeBucket *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ); + + /** may invalidate this and thisLoc */ + void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order); + + /** will invalidate this and thisLoc */ + void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ); + + /** @return true iff left and right child can be merged into one node */ + bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const; + + /** + * @return index of the rebalanced separator; the index value is + * determined as if we had an array + * .push( ).concat( ) + * This is only expected to be called if the left and right child + * cannot be merged. + * This function is expected to be called on packed buckets, see also + * comments for splitPos(). + */ + int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const; + + int indexInParent( const DiskLoc &thisLoc ) const; + BSONObj keyAt(int keyOfs) const { return keyOfs >= n ? BSONObj() : keyNode(keyOfs).key; } static BtreeBucket* allocTemp(); /* caller must release with free() */ - void insertHere(DiskLoc thisLoc, int keypos, - DiskLoc recordLoc, const BSONObj& key, const Ordering &order, - DiskLoc lchild, DiskLoc rchild, IndexDetails&); - int _insert(DiskLoc thisLoc, DiskLoc recordLoc, + + /** split bucket */ + void split(const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const BSONObj& key, + const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx); + + void insertHere(const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const; + + int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, bool dupsAllowed, - DiskLoc lChild, DiskLoc rChild, IndexDetails&); - bool find(const IndexDetails& idx, const BSONObj& key, DiskLoc recordLoc, const Ordering &order, int& pos, bool assertIfDup); - bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ); + const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const; + bool find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const; + bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const; static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey); - static 
int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, const vector< const BSONElement * > &rEnd, const Ordering &o ); + static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ); + static void fix(const DiskLoc thisLoc, const DiskLoc child); + + /** Replaces an existing key with the new specified key, splitting if necessary */ + void setInternalKey( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const BSONObj &key, const Ordering &order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx); + + /** + * Deletes the specified key, replacing it with the key immediately + * preceding or succeeding it in the btree. Either the left or right + * child of the specified key must be non null. + */ + void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ); public: - // simply builds and returns a dup key error message string + /** simply builds and returns a dup key error message string */ static string dupKeyError( const IndexDetails& idx , const BSONObj& key ); }; #pragma pack() @@ -271,76 +449,59 @@ namespace mongo { class BtreeCursor : public Cursor { public: BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction ); - BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ); - ~BtreeCursor(){ - } - virtual bool ok() { - return !bucket.isNull(); - } - bool eof() { - return !ok(); - } + virtual bool ok() { return !bucket.isNull(); } virtual bool advance(); - virtual void noteLocation(); // updates keyAtKeyOfs... virtual void checkLocation(); virtual bool supportGetMore() { return true; } virtual bool supportYields() { return true; } - /* used for multikey index traversal to avoid sending back dups. see Matcher::matches(). - if a multikey index traversal: - if loc has already been sent, returns true. - otherwise, marks loc as sent. - @return true if the loc has not been seen - */ + /** + * used for multikey index traversal to avoid sending back dups. see Matcher::matches(). + * if a multikey index traversal: + * if loc has already been sent, returns true. + * otherwise, marks loc as sent. 
+ * @return true if the loc has not been seen + */ virtual bool getsetdup(DiskLoc loc) { - if( multikey ) { - pair::iterator, bool> p = dups.insert(loc); + if( _multikey ) { + pair::iterator, bool> p = _dups.insert(loc); return !p.second; } return false; } - _KeyNode& _currKeyNode() { + virtual bool modifiedKeys() const { return _multikey; } + virtual bool isMultiKey() const { return _multikey; } + + const _KeyNode& _currKeyNode() const { assert( !bucket.isNull() ); - _KeyNode& kn = bucket.btree()->k(keyOfs); + const _KeyNode& kn = bucket.btree()->k(keyOfs); assert( kn.isUsed() ); return kn; } - KeyNode currKeyNode() const { + const KeyNode currKeyNode() const { assert( !bucket.isNull() ); return bucket.btree()->keyNode(keyOfs); } - virtual BSONObj currKey() const { - return currKeyNode().key; - } - virtual BSONObj indexKeyPattern() { - return indexDetails.keyPattern(); - } + virtual BSONObj currKey() const { return currKeyNode().key; } + virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); } virtual void aboutToDeleteBucket(const DiskLoc& b) { if ( bucket == b ) keyOfs = -1; } - virtual DiskLoc currLoc() { - return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); - } - virtual DiskLoc refLoc() { - return currLoc(); - } - virtual Record* _current() { - return currLoc().rec(); - } - virtual BSONObj current() { - return BSONObj(_current()); - } + virtual DiskLoc currLoc() { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); } + virtual DiskLoc refLoc() { return currLoc(); } + virtual Record* _current() { return currLoc().rec(); } + virtual BSONObj current() { return BSONObj(_current()); } virtual string toString() { string s = string("BtreeCursor ") + indexDetails.indexName(); - if ( direction < 0 ) s += " reverse"; - if ( bounds_.get() && bounds_->size() > 1 ) s += " multi"; + if ( _direction < 0 ) s += " reverse"; + if ( _bounds.get() && _bounds->size() > 1 ) s += " multi"; return s; } @@ -351,77 +512,81 @@ namespace mongo { virtual BSONObj prettyIndexBounds() const { if ( !_independentFieldRanges ) { return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) ); - } else { - return bounds_->obj(); + } + else { + return _bounds->obj(); } } - + void forgetEndKey() { endKey = BSONObj(); } virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); } - - virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { - _matcher = matcher; - } - // for debugging only - DiskLoc getBucket() const { return bucket; } - + virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; } + + virtual long long nscanned() { return _nscanned; } + + /** for debugging only */ + const DiskLoc getBucket() const { return bucket; } + private: - /* Our btrees may (rarely) have "unused" keys when items are deleted. - Skip past them. - */ + /** + * Our btrees may (rarely) have "unused" keys when items are deleted. + * Skip past them. 
+ */ bool skipUnusedKeys( bool mayJump ); bool skipOutOfRangeKeysAndCheckEnd(); void skipAndCheck(); void checkEnd(); - // selective audits on construction + /** selective audits on construction */ void audit(); - // set initial bucket + /** set initial bucket */ void init(); - void advanceTo( const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd); - + /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */ + void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive ); + friend class BtreeBucket; - set dups; - NamespaceDetails *d; - int idxNo; - + + set _dups; + NamespaceDetails * const d; + const int idxNo; BSONObj startKey; BSONObj endKey; - bool endKeyInclusive_; - - bool multikey; // note this must be updated every getmore batch in case someone added a multikey... - + bool _endKeyInclusive; + bool _multikey; // this must be updated every getmore batch in case someone added a multikey const IndexDetails& indexDetails; - BSONObj order; - Ordering _ordering; + const BSONObj _order; + const Ordering _ordering; DiskLoc bucket; int keyOfs; - int direction; // 1=fwd,-1=reverse + const int _direction; // 1=fwd,-1=reverse BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call DiskLoc locAtKeyOfs; - shared_ptr< FieldRangeVector > bounds_; + const shared_ptr< FieldRangeVector > _bounds; auto_ptr< FieldRangeVector::Iterator > _boundsIterator; const IndexSpec& _spec; shared_ptr< CoveredIndexMatcher > _matcher; bool _independentFieldRanges; + long long _nscanned; }; - inline bool IndexDetails::hasKey(const BSONObj& key) { + inline bool IndexDetails::hasKey(const BSONObj& key) { return head.btree()->exists(*this, head, key, Ordering::make(keyPattern())); } - inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) { + inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) { return head.btree()->wouldCreateDup(*this, head, key, Ordering::make(keyPattern()), self); } - /* build btree from the bottom up */ - /* _ TODO dropDups */ + /** + * build btree from the bottom up + * _ TODO dropDups + */ class BtreeBuilder { - bool dupsAllowed; + bool dupsAllowed; IndexDetails& idx; unsigned long long n; BSONObj keyLast; @@ -434,18 +599,20 @@ namespace mongo { void newBucket(); void buildNextLevel(DiskLoc); + void mayCommitProgressDurably(); public: ~BtreeBuilder(); BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx); - /* keys must be added in order */ + /** keys must be added in order */ void addKey(BSONObj& key, DiskLoc loc); - /* commit work. if not called, destructor will clean up partially completed work - (in case exception has happened). - */ + /** + * commit work. if not called, destructor will clean up partially completed work + * (in case exception has happened). 
+ */ void commit(); unsigned long long getn() { return n; } diff --git a/db/btreecursor.cpp b/db/btreecursor.cpp index d6d0c09..9cab95f 100644 --- a/db/btreecursor.cpp +++ b/db/btreecursor.cpp @@ -20,54 +20,56 @@ #include "btree.h" #include "pdfile.h" #include "jsobj.h" -#include "curop.h" +#include "curop-inl.h" namespace mongo { extern int otherTraceLevel; - BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id, + BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id, const BSONObj &_startKey, const BSONObj &_endKey, bool endKeyInclusive, int _direction ) : - d(_d), idxNo(_idxNo), - startKey( _startKey ), - endKey( _endKey ), - endKeyInclusive_( endKeyInclusive ), - multikey( d->isMultikey( idxNo ) ), - indexDetails( _id ), - order( _id.keyPattern() ), - _ordering( Ordering::make( order ) ), - direction( _direction ), - _spec( _id.getSpec() ), - _independentFieldRanges( false ) - { + d(_d), idxNo(_idxNo), + startKey( _startKey ), + endKey( _endKey ), + _endKeyInclusive( endKeyInclusive ), + _multikey( d->isMultikey( idxNo ) ), + indexDetails( _id ), + _order( _id.keyPattern() ), + _ordering( Ordering::make( _order ) ), + _direction( _direction ), + _spec( _id.getSpec() ), + _independentFieldRanges( false ), + _nscanned( 0 ) { audit(); init(); - DEV assert( dups.size() == 0 ); + dassert( _dups.size() == 0 ); } BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ) : - d(_d), idxNo(_idxNo), - endKeyInclusive_( true ), - multikey( d->isMultikey( idxNo ) ), - indexDetails( _id ), - order( _id.keyPattern() ), - _ordering( Ordering::make( order ) ), - direction( _direction ), - bounds_( ( assert( _bounds.get() ), _bounds ) ), - _boundsIterator( new FieldRangeVector::Iterator( *bounds_ ) ), - _spec( _id.getSpec() ), - _independentFieldRanges( true ) - { + d(_d), idxNo(_idxNo), + _endKeyInclusive( true ), + _multikey( d->isMultikey( idxNo ) ), + indexDetails( _id ), + _order( _id.keyPattern() ), + _ordering( Ordering::make( _order ) ), + _direction( _direction ), + _bounds( ( assert( _bounds.get() ), _bounds ) ), + _boundsIterator( new FieldRangeVector::Iterator( *_bounds ) ), + _spec( _id.getSpec() ), + _independentFieldRanges( true ), + _nscanned( 0 ) { massert( 13384, "BtreeCursor FieldRangeVector constructor doesn't accept special indexes", !_spec.getType() ); audit(); - startKey = bounds_->startKey(); - bool found; + startKey = _bounds->startKey(); _boundsIterator->advance( startKey ); // handles initialization - bucket = indexDetails.head.btree()-> - locate(indexDetails, indexDetails.head, startKey, _ordering, keyOfs, found, direction > 0 ? minDiskLoc : maxDiskLoc, direction); + _boundsIterator->prepDive(); + pair< DiskLoc, int > noBestParent; + bucket = indexDetails.head; + keyOfs = 0; + indexDetails.head.btree()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent ); skipAndCheck(); - DEV assert( dups.size() == 0 ); + dassert( _dups.size() == 0 ); } void BtreeCursor::audit() { @@ -76,7 +78,7 @@ namespace mongo { if ( otherTraceLevel >= 12 ) { if ( otherTraceLevel >= 200 ) { out() << "::BtreeCursor() qtl>200. validating entire index." << endl; - indexDetails.head.btree()->fullValidate(indexDetails.head, order); + indexDetails.head.btree()->fullValidate(indexDetails.head, _order); } else { out() << "BTreeCursor(). 
dumping head bucket" << endl; @@ -86,17 +88,20 @@ namespace mongo { } void BtreeCursor::init() { - if ( _spec.getType() ){ + if ( _spec.getType() ) { startKey = _spec.getType()->fixKey( startKey ); endKey = _spec.getType()->fixKey( endKey ); } bool found; bucket = indexDetails.head.btree()-> - locate(indexDetails, indexDetails.head, startKey, _ordering, keyOfs, found, direction > 0 ? minDiskLoc : maxDiskLoc, direction); + locate(indexDetails, indexDetails.head, startKey, _ordering, keyOfs, found, _direction > 0 ? minDiskLoc : maxDiskLoc, _direction); + if ( ok() ) { + _nscanned = 1; + } skipUnusedKeys( false ); checkEnd(); } - + void BtreeCursor::skipAndCheck() { skipUnusedKeys( true ); while( 1 ) { @@ -109,7 +114,7 @@ namespace mongo { } } } - + bool BtreeCursor::skipOutOfRangeKeysAndCheckEnd() { if ( !ok() ) { return false; @@ -118,25 +123,30 @@ namespace mongo { if ( ret == -2 ) { bucket = DiskLoc(); return false; - } else if ( ret == -1 ) { + } + else if ( ret == -1 ) { + ++_nscanned; return false; } - advanceTo( currKeyNode().key, ret, _boundsIterator->cmp() ); + ++_nscanned; + advanceTo( currKeyNode().key, ret, _boundsIterator->after(), _boundsIterator->cmp(), _boundsIterator->inc() ); return true; } - + /* skip unused keys. */ bool BtreeCursor::skipUnusedKeys( bool mayJump ) { int u = 0; while ( 1 ) { if ( !ok() ) break; - BtreeBucket *b = bucket.btree(); - _KeyNode& kn = b->k(keyOfs); + const BtreeBucket *b = bucket.btree(); + const _KeyNode& kn = b->k(keyOfs); if ( kn.isUsed() ) break; - bucket = b->advance(bucket, keyOfs, direction, "skipUnusedKeys"); + bucket = b->advance(bucket, keyOfs, _direction, "skipUnusedKeys"); u++; + //don't include unused keys in nscanned + //++_nscanned; if ( mayJump && ( u % 10 == 0 ) ) { skipOutOfRangeKeysAndCheckEnd(); } @@ -158,31 +168,34 @@ namespace mongo { if ( bucket.isNull() ) return; if ( !endKey.isEmpty() ) { - int cmp = sgn( endKey.woCompare( currKey(), order ) ); - if ( ( cmp != 0 && cmp != direction ) || - ( cmp == 0 && !endKeyInclusive_ ) ) + int cmp = sgn( endKey.woCompare( currKey(), _order ) ); + if ( ( cmp != 0 && cmp != _direction ) || + ( cmp == 0 && !_endKeyInclusive ) ) bucket = DiskLoc(); } } - - void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd) { - bucket.btree()->advanceTo( indexDetails, bucket, keyOfs, keyBegin, keyBeginLen, keyEnd, _ordering, direction ); + + void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive) { + bucket.btree()->advanceTo( bucket, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, _ordering, _direction ); } - + bool BtreeCursor::advance() { killCurrentOp.checkForInterrupt(); if ( bucket.isNull() ) return false; - bucket = bucket.btree()->advance(bucket, keyOfs, direction, "BtreeCursor::advance"); + bucket = bucket.btree()->advance(bucket, keyOfs, _direction, "BtreeCursor::advance"); if ( !_independentFieldRanges ) { skipUnusedKeys( false ); checkEnd(); - return ok(); + if ( ok() ) { + ++_nscanned; + } + } + else { + skipAndCheck(); } - - skipAndCheck(); return ok(); } @@ -207,10 +220,10 @@ namespace mongo { if ( eof() ) return; - multikey = d->isMultikey(idxNo); + _multikey = d->isMultikey(idxNo); if ( keyOfs >= 0 ) { - BtreeBucket *b = bucket.btree(); + const BtreeBucket *b = bucket.btree(); assert( !keyAtKeyOfs.isEmpty() ); @@ -219,17 +232,17 @@ namespace mongo { int x = 0; while( 1 ) { if ( 
b->keyAt(keyOfs).woEqual(keyAtKeyOfs) && - b->k(keyOfs).recordLoc == locAtKeyOfs ) { - if ( !b->k(keyOfs).isUsed() ) { - /* we were deleted but still exist as an unused - marker key. advance. - */ - skipUnusedKeys( false ); - } - return; + b->k(keyOfs).recordLoc == locAtKeyOfs ) { + if ( !b->k(keyOfs).isUsed() ) { + /* we were deleted but still exist as an unused + marker key. advance. + */ + skipUnusedKeys( false ); + } + return; } - /* we check one key earlier too, in case a key was just deleted. this is + /* we check one key earlier too, in case a key was just deleted. this is important so that multi updates are reasonably fast. */ if( keyOfs == 0 || x++ ) @@ -245,7 +258,7 @@ namespace mongo { bool found; /* TODO: Switch to keep indexdetails and do idx.head! */ - bucket = indexDetails.head.btree()->locate(indexDetails, indexDetails.head, keyAtKeyOfs, _ordering, keyOfs, found, locAtKeyOfs, direction); + bucket = indexDetails.head.btree()->locate(indexDetails, indexDetails.head, keyAtKeyOfs, _ordering, keyOfs, found, locAtKeyOfs, _direction); RARELY log() << " key seems to have moved in the index, refinding. found:" << found << endl; if ( ! bucket.isNull() ) skipUnusedKeys( false ); diff --git a/db/cap.cpp b/db/cap.cpp index c676429..198bd54 100644 --- a/db/cap.cpp +++ b/db/cap.cpp @@ -1,4 +1,5 @@ -// @file cap.cpp capped collection related +// @file cap.cpp capped collection related +// the "old" version (<= v1.6) /** * Copyright (C) 2008 10gen Inc. @@ -49,7 +50,7 @@ namespace mongo { /* combine adjacent deleted records *for the current extent* of the capped collection - + this is O(n^2) but we call it for capped tables where typically n==1 or 2! (or 3...there will be a little unused sliver at the end of the extent.) */ @@ -62,7 +63,8 @@ namespace mongo { DiskLoc i = cappedFirstDeletedInCurExtent(); for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted ) drecs.push_back( i ); - cappedFirstDeletedInCurExtent() = i; + + getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i; // This is the O(n^2) part. drecs.sort(); @@ -80,7 +82,7 @@ namespace mongo { DiskLoc b = *j; while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) { // a & b are adjacent. merge. 
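As an aside on the compact() hunk above: the coalescing rule applied to a capped collection's deleted records is simply "same data file, and the first record's offset plus its lengthWithHeaders lands exactly on the second record's offset". Below is a minimal standalone sketch of that rule, assuming invented stand-in types (Drec, mergeAdjacent); it is illustration only and not code from this patch.

#include <algorithm>
#include <iostream>
#include <vector>

struct Drec {      // stand-in for a deleted record reference
    int fileNo;    // DiskLoc::a() in the real code
    int ofs;       // DiskLoc::getOfs()
    int len;       // DeletedRecord::lengthWithHeaders
};

// Sort by (file, offset), then fold together runs that touch end-to-end,
// mirroring: a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs()
std::vector<Drec> mergeAdjacent(std::vector<Drec> drecs) {
    std::sort(drecs.begin(), drecs.end(), [](const Drec& x, const Drec& y) {
        return x.fileNo != y.fileNo ? x.fileNo < y.fileNo : x.ofs < y.ofs;
    });
    std::vector<Drec> out;
    for (const Drec& d : drecs) {
        if (!out.empty() && out.back().fileNo == d.fileNo &&
                out.back().ofs + out.back().len == d.ofs)
            out.back().len += d.len;   // adjacent: merge the second record into the first
        else
            out.push_back(d);
    }
    return out;
}

int main() {
    std::vector<Drec> drecs = { {0, 5000, 200}, {0, 4800, 200}, {0, 6000, 100} };
    for (const Drec& d : mergeAdjacent(drecs))
        std::cout << d.fileNo << ':' << d.ofs << " len " << d.len << '\n';
    // prints "0:4800 len 400" then "0:6000 len 100": the two touching records merged
    return 0;
}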
- a.drec()->lengthWithHeaders += b.drec()->lengthWithHeaders; + getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders; j++; if ( j == drecs.end() ) { DEBUGGING out() << "temp: compact adddelrec2\n"; @@ -106,8 +108,8 @@ namespace mongo { // migrate old NamespaceDetails format assert( capped ); if ( capExtent.a() == 0 && capExtent.getOfs() == 0 ) { - capFirstNewRecord = DiskLoc(); - capFirstNewRecord.setInvalid(); + //capFirstNewRecord = DiskLoc(); + capFirstNewRecord.writing().setInvalid(); // put all the DeletedRecords in cappedListOfAllDeletedRecords() for ( int i = 1; i < Buckets; ++i ) { DiskLoc first = deletedList[ i ]; @@ -115,14 +117,14 @@ namespace mongo { continue; DiskLoc last = first; for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted ); - last.drec()->nextDeleted = cappedListOfAllDeletedRecords(); - cappedListOfAllDeletedRecords() = first; - deletedList[ i ] = DiskLoc(); + last.drec()->nextDeleted.writing() = cappedListOfAllDeletedRecords(); + cappedListOfAllDeletedRecords().writing() = first; + deletedList[i].writing() = DiskLoc(); } // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above // Last, in case we're killed before getting here - capExtent = firstExtent; + capExtent.writing() = firstExtent; } } @@ -144,20 +146,20 @@ namespace mongo { // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent // (or DiskLoc() if new capExtent == firstExtent) if ( capExtent == lastExtent ) - cappedLastDelRecLastExtent() = DiskLoc(); + getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc(); else { DiskLoc i = cappedFirstDeletedInCurExtent(); for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted ); - cappedLastDelRecLastExtent() = i; + getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = i; } - capExtent = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext; + getDur().writingDiskLoc( capExtent ) = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext; /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */ //dassert( theCapExtent()->ns == ns ); theCapExtent()->assertOk(); - capFirstNewRecord = DiskLoc(); + getDur().writingDiskLoc( capFirstNewRecord ) = DiskLoc(); } DiskLoc NamespaceDetails::__capAlloc( int len ) { @@ -176,25 +178,25 @@ namespace mongo { /* unlink ourself from the deleted list */ if ( !ret.isNull() ) { if ( prev.isNull() ) - cappedListOfAllDeletedRecords() = ret.drec()->nextDeleted; + cappedListOfAllDeletedRecords().writing() = ret.drec()->nextDeleted; else - prev.drec()->nextDeleted = ret.drec()->nextDeleted; - ret.drec()->nextDeleted.setInvalid(); // defensive. + prev.drec()->nextDeleted.writing() = ret.drec()->nextDeleted; + ret.drec()->nextDeleted.writing().setInvalid(); // defensive. assert( ret.drec()->extentOfs < ret.getOfs() ); } return ret; } - DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) { + DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) { // signal done allocating new extents. 
if ( !cappedLastDelRecLastExtent().isValid() ) - cappedLastDelRecLastExtent() = DiskLoc(); - + getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc(); + assert( len < 400000000 ); int passes = 0; int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog - if ( maxPasses < 5000 ){ + if ( maxPasses < 5000 ) { // this is for bacwards safety since 5000 was the old value maxPasses = 5000; } @@ -208,7 +210,7 @@ namespace mongo { theCapExtent()->assertOk(); DiskLoc firstEmptyExtent; while ( 1 ) { - if ( nrecords < max ) { + if ( stats.nrecords < max ) { loc = __capAlloc( len ); if ( !loc.isNull() ) break; @@ -217,8 +219,9 @@ namespace mongo { // If on first iteration through extents, don't delete anything. if ( !capFirstNewRecord.isValid() ) { advanceCapExtent( ns ); + if ( capExtent != firstExtent ) - capFirstNewRecord.setInvalid(); + capFirstNewRecord.writing().setInvalid(); // else signal done with first iteration through extents. continue; } @@ -247,14 +250,14 @@ namespace mongo { compact(); if( ++passes > maxPasses ) { log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n'; - log() << "passes max:" << max << " nrecords:" << nrecords << " datasize: " << datasize << endl; + log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl; massert( 10345 , "passes >= maxPasses in capped collection alloc", false ); } } // Remember first record allocated on this iteration through capExtent. if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() ) - capFirstNewRecord = loc; + getDur().writingDiskLoc(capFirstNewRecord) = loc; return loc; } @@ -269,123 +272,179 @@ namespace mongo { } } - void NamespaceDetails::cappedDumpDelInfo() { + void NamespaceDetails::cappedDumpDelInfo() { cout << "dl[0]: " << deletedList[0].toString() << endl; - for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) { - cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders << - " ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl; + for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) { + cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders << + " ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl; } cout << "dl[1]: " << deletedList[1].toString() << endl; } - /* everything from end on, eliminate from the capped collection. - @param inclusive if true, deletes end (i.e. closed or open range) - */ + void NamespaceDetails::cappedTruncateLastDelUpdate() { + if ( capExtent == firstExtent ) { + // Only one extent of the collection is in use, so there + // is no deleted record in a previous extent, so nullify + // cappedLastDelRecLastExtent(). + cappedLastDelRecLastExtent().writing() = DiskLoc(); + } + else { + // Scan through all deleted records in the collection + // until the last deleted record for the extent prior + // to the new capExtent is found. Then set + // cappedLastDelRecLastExtent() to that deleted record. + DiskLoc i = cappedListOfAllDeletedRecords(); + for( ; + !i.drec()->nextDeleted.isNull() && + !inCapExtent( i.drec()->nextDeleted ); + i = i.drec()->nextDeleted ); + // In our capped storage model, every extent must have at least one + // deleted record. Here we check that 'i' is not the last deleted + // record. (We expect that there will be deleted records in the new + // capExtent as well.) 
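A quick worked example of the retry bound computed in cappedAlloc() above, for readers who do not want to do the arithmetic: maxPasses is roughly one pass per 30 bytes requested (30 being about the smallest oplog entry), floored at the pre-1.8 constant of 5000. This is a standalone illustration, not code from the tree.

#include <iostream>

// Same arithmetic as NamespaceDetails::cappedAlloc() above.
int cappedAllocMaxPasses(int len) {
    int maxPasses = (len / 30) + 2;   // ~one pass per 30 bytes requested
    if (maxPasses < 5000)
        maxPasses = 5000;             // backwards-safety floor (the old fixed value)
    return maxPasses;
}

int main() {
    std::cout << cappedAllocMaxPasses(100)     << '\n';  // 5000   (floor applies)
    std::cout << cappedAllocMaxPasses(1000000) << '\n';  // 33335  (1 MB allocation)
    return 0;
}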
+ assert( !i.drec()->nextDeleted.isNull() ); + cappedLastDelRecLastExtent().writing() = i; + } + } + void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) { DEV assert( this == nsdetails(ns) ); assert( cappedLastDelRecLastExtent().isValid() ); - + + // We iteratively remove the newest document until the newest document + // is 'end', then we remove 'end' if requested. bool foundLast = false; while( 1 ) { if ( foundLast ) { + // 'end' has been found and removed, so break. break; } + // 'curr' will point to the newest document in the collection. DiskLoc curr = theCapExtent()->lastRecord; assert( !curr.isNull() ); if ( curr == end ) { if ( inclusive ) { + // 'end' has been found, so break next iteration. foundLast = true; - } else { + } + else { + // 'end' has been found, so break. break; } } - - uassert( 13415, "emptying the collection is not allowed", nrecords > 1 ); - + + // TODO The algorithm used in this function cannot generate an + // empty collection, but we could call emptyCappedCollection() in + // this case instead of asserting. + uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 ); + + // Delete the newest record, and coalesce the new deleted + // record with existing deleted records. + theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true); + compact(); + + // This is the case where we have not yet had to remove any + // documents to make room for other documents, and we are allocating + // documents from free space in fresh extents instead of reusing + // space from familiar extents. if ( !capLooped() ) { - theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true); - compact(); + + // We just removed the last record from the 'capExtent', and + // the 'capExtent' can't be empty, so we set 'capExtent' to + // capExtent's prev extent. if ( theCapExtent()->lastRecord.isNull() ) { assert( !theCapExtent()->xprev.isNull() ); - capExtent = theCapExtent()->xprev; + // NOTE Because we didn't delete the last document, and + // capLooped() is false, capExtent is not the first extent + // so xprev will be nonnull. + capExtent.writing() = theCapExtent()->xprev; theCapExtent()->assertOk(); - if ( capExtent == firstExtent ) { - cappedLastDelRecLastExtent() = DiskLoc(); - } else { - // slow - there's no prev ptr for deleted rec - DiskLoc i = cappedListOfAllDeletedRecords(); - for( ; - !i.drec()->nextDeleted.isNull() && - !inCapExtent( i.drec()->nextDeleted ); - i = i.drec()->nextDeleted ); - assert( !i.drec()->nextDeleted.isNull() ); // I believe there is always at least one drec per extent - cappedLastDelRecLastExtent() = i; - } + + // update cappedLastDelRecLastExtent() + cappedTruncateLastDelUpdate(); } continue; } - theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true); - compact(); - if ( curr == capFirstNewRecord ) { // invalid, but can compare locations - capExtent = ( capExtent == firstExtent ) ? 
lastExtent : theCapExtent()->xprev; - theCapExtent()->assertOk(); - assert( !theCapExtent()->firstRecord.isNull() ); - capFirstNewRecord = theCapExtent()->firstRecord; - if ( capExtent == firstExtent ) { - cappedLastDelRecLastExtent() = DiskLoc(); - } else { - // slow - there's no prev ptr for deleted rec - DiskLoc i = cappedListOfAllDeletedRecords(); - for( ; - !i.drec()->nextDeleted.isNull() && - !inCapExtent( i.drec()->nextDeleted ); - i = i.drec()->nextDeleted ); - assert( !i.drec()->nextDeleted.isNull() ); // I believe there is always at least one drec per extent - cappedLastDelRecLastExtent() = i; + // This is the case where capLooped() is true, and we just deleted + // from capExtent, and we just deleted capFirstNewRecord, which was + // the last record on the fresh side of capExtent. + // NOTE In this comparison, curr and potentially capFirstNewRecord + // may point to invalid data, but we can still compare the + // references themselves. + if ( curr == capFirstNewRecord ) { + + // Set 'capExtent' to the first nonempty extent prior to the + // initial capExtent. There must be such an extent because we + // have not deleted the last document in the collection. It is + // possible that all extents other than the capExtent are empty. + // In this case we will keep the initial capExtent and specify + // that all records contained within are on the fresh rather than + // stale side of the extent. + DiskLoc newCapExtent = capExtent; + do { + // Find the previous extent, looping if necessary. + newCapExtent = ( newCapExtent == firstExtent ) ? lastExtent : newCapExtent.ext()->xprev; + newCapExtent.ext()->assertOk(); } + while ( newCapExtent.ext()->firstRecord.isNull() ); + capExtent.writing() = newCapExtent; + + // Place all documents in the new capExtent on the fresh side + // of the capExtent by setting capFirstNewRecord to the first + // document in the new capExtent. + capFirstNewRecord.writing() = theCapExtent()->firstRecord; + + // update cappedLastDelRecLastExtent() + cappedTruncateLastDelUpdate(); } } } - + void NamespaceDetails::emptyCappedCollection( const char *ns ) { DEV assert( this == nsdetails(ns) ); massert( 13424, "collection must be capped", capped ); - massert( 13425, "background index build in progress", !backgroundIndexBuildInProgress ); + massert( 13425, "background index build in progress", !indexBuildInProgress ); massert( 13426, "indexes present", nIndexes == 0 ); + // Clear all references to this namespace. ClientCursor::invalidate( ns ); - NamespaceDetailsTransient::clearForPrefix( ns ); + NamespaceDetailsTransient::clearForPrefix( ns ); + + // Get a writeable reference to 'this' and reset all pertinent + // attributes. 
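To summarize the cappedTruncateAfter() hunk above in isolation: the function repeatedly deletes the newest record until it reaches 'end', deletes 'end' itself only when 'inclusive' is set, and refuses to empty the collection entirely. Below is a minimal standalone sketch of that control flow over a plain vector of ids; the helper name and types are invented for illustration and elide all of the extent and capFirstNewRecord bookkeeping the real code performs.

#include <cassert>
#include <iostream>
#include <vector>

// docs.back() plays the role of theCapExtent()->lastRecord (the newest document).
void cappedTruncateAfterSketch(std::vector<int>& docs, int end, bool inclusive) {
    bool foundLast = false;
    while (true) {
        if (foundLast)
            break;                        // 'end' was deleted on the previous pass
        int curr = docs.back();           // newest remaining document
        if (curr == end) {
            if (inclusive)
                foundLast = true;         // delete 'end' now, stop on the next pass
            else
                break;                    // keep 'end', stop immediately
        }
        assert(docs.size() > 1 && "emptying the collection is not allowed");
        docs.pop_back();                  // delete the newest document
    }
}

int main() {
    std::vector<int> docs = {1, 2, 3, 4, 5};
    cappedTruncateAfterSketch(docs, 3, /*inclusive=*/false);
    for (int d : docs) std::cout << d << ' ';   // prints: 1 2 3
    std::cout << '\n';
    return 0;
}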
+ NamespaceDetails *t = writingWithoutExtra(); + + t->cappedLastDelRecLastExtent() = DiskLoc(); + t->cappedListOfAllDeletedRecords() = DiskLoc(); - cappedLastDelRecLastExtent() = DiskLoc(); - cappedListOfAllDeletedRecords() = DiskLoc(); - // preserve firstExtent/lastExtent - capExtent = firstExtent; - datasize = nrecords = 0; + t->capExtent = firstExtent; + t->stats.datasize = stats.nrecords = 0; // lastExtentSize preserve // nIndexes preserve 0 // capped preserve true // max preserve - paddingFactor = 1.0; - flags = 0; - capFirstNewRecord = DiskLoc(); - capFirstNewRecord.setInvalid(); - cappedLastDelRecLastExtent().setInvalid(); + t->paddingFactor = 1.0; + t->flags = 0; + t->capFirstNewRecord = DiskLoc(); + t->capFirstNewRecord.setInvalid(); + t->cappedLastDelRecLastExtent().setInvalid(); // dataFileVersion preserve // indexFileVersion preserve - multiKeyIndexBits = 0; - reservedA = 0; - extraOffset = 0; - // backgroundIndexBuildInProgress preserve 0 - memset(reserved, 0, sizeof(reserved)); + t->multiKeyIndexBits = 0; + t->reservedA = 0; + t->extraOffset = 0; + // indexBuildInProgress preserve 0 + memset(t->reserved, 0, sizeof(t->reserved)); + // Reset all existing extents and recreate the deleted list. for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) { DiskLoc prev = ext.ext()->xprev; DiskLoc next = ext.ext()->xnext; DiskLoc empty = ext.ext()->reuse( ns ); - ext.ext()->xprev = prev; - ext.ext()->xnext = next; + ext.ext()->xprev.writing() = prev; + ext.ext()->xnext.writing() = next; addDeletedRec( empty.drec(), empty ); } } diff --git a/db/client.cpp b/db/client.cpp index f9653f5..e4fd4b9 100644 --- a/db/client.cpp +++ b/db/client.cpp @@ -16,14 +16,14 @@ * along with this program. If not, see . */ -/* Client represents a connection to the database (the server-side) and corresponds +/* Client represents a connection to the database (the server-side) and corresponds to an open socket (or logical connection if pooling on sockets) from a client. */ #include "pch.h" #include "db.h" #include "client.h" -#include "curop.h" +#include "curop-inl.h" #include "json.h" #include "security.h" #include "commands.h" @@ -40,20 +40,31 @@ namespace mongo { set Client::clients; // always be in clientsMutex when manipulating this boost::thread_specific_ptr currentClient; - Client::Client(const char *desc, MessagingPort *p) : - _context(0), - _shutdown(false), - _desc(desc), - _god(0), - _lastOp(0), - _mp(p) - { + /* each thread which does db operations has a Client object in TLS. + call this when your thread starts. + */ + Client& Client::initThread(const char *desc, MessagingPort *mp) { + assert( currentClient.get() == 0 ); + Client *c = new Client(desc, mp); + currentClient.reset(c); + mongo::lastError.initThread(); + return *c; + } + + Client::Client(const char *desc, MessagingPort *p) : + _context(0), + _shutdown(false), + _desc(desc), + _god(0), + _lastOp(0), + _mp(p) { + _connectionId = setThreadName(desc); _curOp = new CurOp( this ); scoped_lock bl(clientsMutex); clients.insert(this); } - Client::~Client() { + Client::~Client() { _god = 0; if ( _context ) @@ -62,90 +73,33 @@ namespace mongo { if ( ! _shutdown ) { error() << "Client::shutdown not called: " << _desc << endl; } - + scoped_lock bl(clientsMutex); if ( ! _shutdown ) clients.erase(this); delete _curOp; } - - void Client::_dropns( const string& ns ){ - Top::global.collectionDropped( ns ); - - dblock l; - Client::Context ctx( ns ); - if ( ! 
nsdetails( ns.c_str() ) ) - return; - - try { - string err; - BSONObjBuilder b; - dropCollection( ns , err , b ); - } - catch ( ... ){ - warning() << "error dropping temp collection: " << ns << endl; - } - - } - - void Client::_invalidateDB( const string& db ) { - assert( db.find( '.' ) == string::npos ); - - set::iterator min = _tempCollections.lower_bound( db + "." ); - set::iterator max = _tempCollections.lower_bound( db + "|" ); - - _tempCollections.erase( min , max ); - - } - - void Client::invalidateDB(const string& db) { - scoped_lock bl(clientsMutex); - for ( set::iterator i = clients.begin(); i!=clients.end(); i++ ){ - Client* cli = *i; - cli->_invalidateDB(db); - } - } - void Client::invalidateNS( const string& ns ){ - scoped_lock bl(clientsMutex); - for ( set::iterator i = clients.begin(); i!=clients.end(); i++ ){ - Client* cli = *i; - cli->_tempCollections.erase( ns ); - } - } - - - void Client::addTempCollection( const string& ns ) { - _tempCollections.insert( ns ); - } - - bool Client::shutdown(){ + bool Client::shutdown() { _shutdown = true; if ( inShutdown() ) return false; { scoped_lock bl(clientsMutex); clients.erase(this); - } - - bool didAnything = false; - - if ( _tempCollections.size() ){ - didAnything = true; - for ( set::iterator i = _tempCollections.begin(); i!=_tempCollections.end(); i++ ){ - _dropns( *i ); + if ( isSyncThread() ) { + syncThread = 0; } - _tempCollections.clear(); } - - return didAnything; + + return false; } - BSONObj CurOp::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}"); + BSONObj CachedBSONObj::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}"); AtomicUInt CurOp::_nextOpNum; - + Client::Context::Context( string ns , Database * db, bool doauth ) - : _client( currentClient.get() ) , _oldContext( _client->_context ) , + : _client( currentClient.get() ) , _oldContext( _client->_context ) , _path( dbpath ) , _lock(0) , _justCreated(false) { assert( db && db->isOk() ); _ns = ns; @@ -155,20 +109,36 @@ namespace mongo { _auth(); } - void Client::Context::_finishInit( bool doauth ){ + Client::Context::Context(const string& ns, string path , mongolock * lock , bool doauth ) + : _client( currentClient.get() ) , _oldContext( _client->_context ) , + _path( path ) , _lock( lock ) , + _ns( ns ), _db(0) { + _finishInit( doauth ); + } + + /* this version saves the context but doesn't yet set the new one: */ + + Client::Context::Context() + : _client( currentClient.get() ) , _oldContext( _client->_context ), + _path( dbpath ) , _lock(0) , _justCreated(false), _db(0) { + _client->_context = this; + clear(); + } + + void Client::Context::_finishInit( bool doauth ) { int lockState = dbMutex.getState(); assert( lockState ); - + _db = dbHolder.get( _ns , _path ); - if ( _db ){ + if ( _db ) { _justCreated = false; } - else if ( dbMutex.getState() > 0 ){ + else if ( dbMutex.getState() > 0 ) { // already in a write lock _db = dbHolder.getOrCreate( _ns , _path , _justCreated ); assert( _db ); } - else if ( dbMutex.getState() < -1 ){ + else if ( dbMutex.getState() < -1 ) { // nested read lock :( assert( _lock ); _lock->releaseAndWriteLock(); @@ -181,50 +151,52 @@ namespace mongo { // to do that, we're going to unlock, then get a write lock // this is so that if this is the first query and its long doesn't block db // we just have to check that the db wasn't closed in the interim where we unlock - for ( int x=0; x<2; x++ ){ - { + for ( int x=0; x<2; x++ ) { + { dbtemprelease unlock; writelock lk( _ns ); dbHolder.getOrCreate( _ns , 
_path , _justCreated ); } - + _db = dbHolder.get( _ns , _path ); - + if ( _db ) break; - + log() << "db was closed on us right after we opened it: " << _ns << endl; } - + uassert( 13005 , "can't create db, keeps getting closed" , _db ); } - - _client->_context = this; - _client->_curOp->enter( this ); - if ( doauth ) - _auth( lockState ); - switch ( _client->_curOp->getOp() ){ + switch ( _client->_curOp->getOp() ) { case dbGetMore: // getMore's are special and should be handled else where case dbUpdate: // update & delete check shard version in instance.cpp, so don't check here as well - case dbDelete: + case dbDelete: break; default: { string errmsg; - if ( ! shardVersionOk( _ns , lockState > 0 , errmsg ) ){ - msgasserted( StaleConfigInContextCode , (string)"[" + _ns + "] shard version not ok in Client::Context: " + errmsg ); + if ( ! shardVersionOk( _ns , lockState > 0 , errmsg ) ) { + ostringstream os; + os << "[" << _ns << "] shard version not ok in Client::Context: " << errmsg; + msgassertedNoTrace( StaleConfigInContextCode , os.str().c_str() ); } } } + + _client->_context = this; + _client->_curOp->enter( this ); + if ( doauth ) + _auth( lockState ); } - - void Client::Context::_auth( int lockState ){ + + void Client::Context::_auth( int lockState ) { if ( _client->_ai.isAuthorizedForLock( _db->name , lockState ) ) return; // before we assert, do a little cleanup _client->_context = _oldContext; // note: _oldContext may be null - + stringstream ss; ss << "unauthorized db:" << _db->name << " lock type:" << lockState << " client:" << _client->clientAddress(); uasserted( 10057 , ss.str() ); @@ -236,9 +208,35 @@ namespace mongo { _client->_context = _oldContext; // note: _oldContext may be null } - string Client::clientAddress() const { + bool Client::Context::inDB( const string& db , const string& path ) const { + if ( _path != path ) + return false; + + if ( db == _ns ) + return true; + + string::size_type idx = _ns.find( db ); + if ( idx != 0 ) + return false; + + return _ns[db.size()] == '.'; + } + + void Client::appendLastOp( BSONObjBuilder& b ) const { + if( theReplSet ) { + b.append("lastOp" , (long long) _lastOp); + } + else { + OpTime lo(_lastOp); + if ( ! 
lo.isNull() ) + b.appendTimestamp( "lastOp" , lo.asDate() ); + } + } + + + string Client::clientAddress(bool includePort) const { if( _curOp ) - return _curOp->getRemoteString(false); + return _curOp->getRemoteString(includePort); return ""; } @@ -249,63 +247,75 @@ namespace mongo { return ss.str(); } - string sayClientState(){ + string sayClientState() { Client* c = currentClient.get(); if ( !c ) return "no client"; return c->toString(); } - - void curopWaitingForLock( int type ){ + + Client* curopWaitingForLock( int type ) { Client * c = currentClient.get(); assert( c ); CurOp * co = c->curop(); - if ( co ){ + if ( co ) { co->waitingForLock( type ); } + return c; } - void curopGotLock(){ - Client * c = currentClient.get(); + void curopGotLock(Client *c) { assert(c); CurOp * co = c->curop(); - if ( co ){ + if ( co ) co->gotLock(); - } } - CurOp::~CurOp(){ - if ( _wrapped ){ - scoped_lock bl(Client::clientsMutex); - _client->_curOp = _wrapped; + void KillCurrentOp::interruptJs( AtomicUInt *op ) { + if ( !globalScriptEngine ) + return; + if ( !op ) { + globalScriptEngine->interruptAll(); } - - _client = 0; + else { + globalScriptEngine->interrupt( *op ); + } + } + + void KillCurrentOp::killAll() { + _globalKill = true; + interruptJs( 0 ); } - BSONObj CurOp::query( bool threadSafe ) { - if( querySize() == 1 ) { - return _tooBig; + void KillCurrentOp::kill(AtomicUInt i) { + bool found = false; + { + scoped_lock l( Client::clientsMutex ); + for( set< Client* >::const_iterator j = Client::clients.begin(); !found && j != Client::clients.end(); ++j ) { + for( CurOp *k = ( *j )->curop(); !found && k; k = k->parent() ) { + if ( k->opNum() == i ) { + k->kill(); + for( CurOp *l = ( *j )->curop(); l != k; l = l->parent() ) { + l->kill(); + } + found = true; + } + } + } } - - if ( ! threadSafe ){ - BSONObj o(_queryBuf); - return o; + if ( found ) { + interruptJs( &i ); } - - int size = querySize(); - int before = checksum( _queryBuf , size ); - BSONObj a(_queryBuf); - BSONObj b = a.copy(); - int after = checksum( _queryBuf , size ); - - if ( before == after ) - return b; - - return BSON( "msg" << "query changed while capturing" ); } + CurOp::~CurOp() { + if ( _wrapped ) { + scoped_lock bl(Client::clientsMutex); + _client->_curOp = _wrapped; + } + _client = 0; + } - BSONObj CurOp::infoNoauth( int attempt ) { + BSONObj CurOp::infoNoauth() { BSONObjBuilder b; b.append("opid", _opNum); bool a = _active && _start; @@ -313,40 +323,16 @@ namespace mongo { if ( _lockType ) b.append("lockType" , _lockType > 0 ? "write" : "read" ); b.append("waitingForLock" , _waitingForLock ); - - if( a ){ + + if( a ) { b.append("secs_running", elapsedSeconds() ); } - + b.append( "op" , opToString( _op ) ); - + b.append("ns", _ns); - - { - int size = querySize(); - if ( size == 0 ){ - // do nothing - } - else if ( size == 1 ){ - b.append( "query" , _tooBig ); - } - else if ( attempt > 2 ){ - b.append( "query" , BSON( "err" << "can't get a clean object" ) ); - log( LL_WARNING ) << "CurOp changing too much to get reading" << endl; - - } - else { - int before = checksum( _queryBuf , size ); - b.appendObject( "query" , _queryBuf , size ); - int after = checksum( _queryBuf , size ); - - if ( after != before ){ - // this means something changed - // going to retry - return infoNoauth( attempt + 1 ); - } - } - } + + _query.append( b , "query" ); // b.append("inLock", ?? stringstream clientStr; @@ -355,9 +341,9 @@ namespace mongo { if ( _client ) b.append( "desc" , _client->desc() ); - - if ( ! 
_message.empty() ){ - if ( _progressMeter.isActive() ){ + + if ( ! _message.empty() ) { + if ( _progressMeter.isActive() ) { StringBuilder buf(128); buf << _message.toString() << " " << _progressMeter.toString(); b.append( "msg" , buf.str() ); @@ -370,7 +356,7 @@ namespace mongo { return b.obj(); } - void Client::gotHandshake( const BSONObj& o ){ + void Client::gotHandshake( const BSONObj& o ) { BSONObjIterator i(o); { @@ -378,7 +364,7 @@ namespace mongo { assert( id.type() ); _remoteId = id.wrap( "_id" ); } - + BSONObjBuilder b; while ( i.more() ) b.append( i.next() ); @@ -388,31 +374,31 @@ namespace mongo { class HandshakeCmd : public Command { public: void help(stringstream& h) const { h << "internal"; } - HandshakeCmd() : Command( "handshake" ){} - virtual LockType locktype() const { return NONE; } + HandshakeCmd() : Command( "handshake" ) {} + virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } virtual bool adminOnly() const { return false; } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { Client& c = cc(); c.gotHandshake( cmdObj ); return 1; - } + } } handshakeCmd; class ClientListPlugin : public WebStatusPlugin { public: - ClientListPlugin() : WebStatusPlugin( "clients" , 20 ){} - virtual void init(){} - - virtual void run( stringstream& ss ){ + ClientListPlugin() : WebStatusPlugin( "clients" , 20 ) {} + virtual void init() {} + + virtual void run( stringstream& ss ) { using namespace mongoutils::html; ss << "\n"; ss << "" << th( a("", "Connections to the database, both internal and external.", "Client") ) << th( a("http://www.mongodb.org/display/DOCS/Viewing+and+Terminating+Current+Operation", "", "OpId") ) - << "" + << "" << "" << "" << "" @@ -426,11 +412,11 @@ namespace mongo { << "\n"; { scoped_lock bl(Client::clientsMutex); - for( set::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) { + for( set::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) { Client *c = *i; CurOp& co = *(c->curop()); ss << ""; - + tablecell( ss , co.opNum() ); tablecell( ss , co.active() ); { @@ -447,8 +433,9 @@ namespace mongo { tablecell( ss , "" ); tablecell( ss , co.getOp() ); tablecell( ss , co.getNS() ); - if ( co.haveQuery() ) - tablecell( ss , co.query( true ) ); + if ( co.haveQuery() ) { + tablecell( ss , co.query() ); + } else tablecell( ss , "" ); tablecell( ss , co.getRemoteString() ); @@ -463,18 +450,18 @@ namespace mongo { ss << "
ActiveActiveLockTypeWaitingSecsRunning
" << c->desc() << "
\n"; } - + } clientListPlugin; - int Client::recommendedYieldMicros( int * writers , int * readers ){ + int Client::recommendedYieldMicros( int * writers , int * readers ) { int num = 0; int w = 0; int r = 0; { scoped_lock bl(clientsMutex); - for ( set::iterator i=clients.begin(); i!=clients.end(); ++i ){ + for ( set::iterator i=clients.begin(); i!=clients.end(); ++i ) { Client* c = *i; - if ( c->curop()->isWaitingForLock() ){ + if ( c->curop()->isWaitingForLock() ) { num++; if ( c->curop()->getLockType() > 0 ) w++; @@ -483,15 +470,44 @@ namespace mongo { } } } - + if ( writers ) *writers = w; if ( readers ) *readers = r; - if ( num > 50 ) - num = 50; + int time = r * 100; + time += w * 500; + + time = min( time , 1000000 ); + + // there has been a kill request for this op - we should yield to allow the op to stop + // This function returns empty string if we aren't interrupted + if ( killCurrentOp.checkForInterruptNoAssert( false )[0] != '\0' ) { + return 100; + } + + return time; + } + + int Client::getActiveClientCount( int& writers, int& readers ) { + writers = 0; + readers = 0; + + scoped_lock bl(clientsMutex); + for ( set::iterator i=clients.begin(); i!=clients.end(); ++i ) { + Client* c = *i; + if ( ! c->curop()->active() ) + continue; + + int l = c->curop()->getLockType(); + if ( l > 0 ) + writers++; + else if ( l < 0 ) + readers++; + + } - return num * 100; + return writers + readers; } } diff --git a/db/client.h b/db/client.h index d0600e3..4e8589e 100644 --- a/db/client.h +++ b/db/client.h @@ -16,7 +16,7 @@ * along with this program. If not, see . */ -/* Client represents a connection to the database (the server-side) and corresponds +/* Client represents a connection to the database (the server-side) and corresponds to an open socket (or logical connection if pooling on sockets) from a client. todo: switch to asio...this will fit nicely with that. @@ -26,11 +26,11 @@ #include "../pch.h" #include "security.h" -#include "namespace.h" +#include "namespace-inl.h" #include "lasterror.h" #include "stats/top.h" -namespace mongo { +namespace mongo { extern class ReplSet *theReplSet; class AuthenticationInfo; @@ -42,18 +42,83 @@ namespace mongo { extern boost::thread_specific_ptr currentClient; - class Client : boost::noncopyable { + typedef long long ConnectionId; + + class Client : boost::noncopyable { public: + class Context; + + static mongo::mutex clientsMutex; + static set clients; // always be in clientsMutex when manipulating this + static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 ); + static int getActiveClientCount( int& writers , int& readers ); + static Client *syncThread; - void iAmSyncThread() { + + + /* each thread which does db operations has a Client object in TLS. + call this when your thread starts. 
+ */ + static Client& initThread(const char *desc, MessagingPort *mp = 0); + + /* + this has to be called as the client goes away, but before thread termination + @return true if anything was done + */ + bool shutdown(); + + + ~Client(); + + void iAmSyncThread() { wassert( syncThread == 0 ); - syncThread = this; + syncThread = this; } bool isSyncThread() const { return this == syncThread; } // true if this client is the replication secondary pull thread - static mongo::mutex clientsMutex; - static set clients; // always be in clientsMutex when manipulating this - static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 ); + + string clientAddress(bool includePort=false) const; + AuthenticationInfo * getAuthenticationInfo() { return &_ai; } + bool isAdmin() { return _ai.isAuthorized( "admin" ); } + CurOp* curop() const { return _curOp; } + Context* getContext() const { return _context; } + Database* database() const { return _context ? _context->db() : 0; } + const char *ns() const { return _context->ns(); } + const char *desc() const { return _desc; } + void setLastOp( ReplTime op ) { _lastOp = op; } + ReplTime getLastOp() const { return _lastOp; } + + /* report what the last operation was. used by getlasterror */ + void appendLastOp( BSONObjBuilder& b ) const; + + bool isGod() const { return _god; } /* this is for map/reduce writes */ + string toString() const; + void gotHandshake( const BSONObj& o ); + BSONObj getRemoteID() const { return _remoteId; } + BSONObj getHandshake() const { return _handshake; } + + MessagingPort * port() const { return _mp; } + + ConnectionId getConnectionId() const { return _connectionId; } + + private: + ConnectionId _connectionId; // > 0 for things "conn", 0 otherwise + CurOp * _curOp; + Context * _context; + bool _shutdown; + const char *_desc; + bool _god; + AuthenticationInfo _ai; + ReplTime _lastOp; + BSONObj _handshake; + BSONObj _remoteId; + MessagingPort * const _mp; + + Client(const char *desc, MessagingPort *p = 0); + + friend class CurOp; + + public: /* set _god=true temporarily, safely */ class GodScope { @@ -63,201 +128,99 @@ namespace mongo { ~GodScope(); }; + /* Set database we want to use, then, restores when we finish (are out of scope) Note this is also helpful if an exception happens as the state if fixed up. 
*/ - class Context : boost::noncopyable{ - Client * _client; - Context * _oldContext; - - string _path; - mongolock * _lock; - bool _justCreated; - - string _ns; - Database * _db; - + class Context : boost::noncopyable { + public: /** - * at this point _client, _oldContext and _ns have to be set - * _db should not have been touched - * this will set _db and create if needed - * will also set _client->_context to this + * this is the main constructor + * use this unless there is a good reason not to */ - void _finishInit( bool doauth=true); - - void _auth( int lockState = dbMutex.getState() ); - public: - Context(const string& ns, string path=dbpath, mongolock * lock = 0 , bool doauth=true ) - : _client( currentClient.get() ) , _oldContext( _client->_context ) , - _path( path ) , _lock( lock ) , - _ns( ns ), _db(0){ - _finishInit( doauth ); - } - + Context(const string& ns, string path=dbpath, mongolock * lock = 0 , bool doauth=true ); + /* this version saves the context but doesn't yet set the new one: */ - - Context() - : _client( currentClient.get() ) , _oldContext( _client->_context ), - _path( dbpath ) , _lock(0) , _justCreated(false), _db(0){ - _client->_context = this; - clear(); - } - + Context(); + /** * if you are doing this after allowing a write there could be a race condition * if someone closes that db. this checks that the DB is still valid */ Context( string ns , Database * db, bool doauth=true ); - + ~Context(); - Client* getClient() const { return _client; } + Client* getClient() const { return _client; } Database* db() const { return _db; } - const char * ns() const { return _ns.c_str(); } + const char * ns() const { return _ns.c_str(); } + + /** @return if the db was created by this Context */ bool justCreated() const { return _justCreated; } - bool equals( const string& ns , const string& path=dbpath ) const { - return _ns == ns && _path == path; - } + bool equals( const string& ns , const string& path=dbpath ) const { return _ns == ns && _path == path; } - bool inDB( const string& db , const string& path=dbpath ) const { - if ( _path != path ) - return false; - - if ( db == _ns ) - return true; - - string::size_type idx = _ns.find( db ); - if ( idx != 0 ) - return false; - - return _ns[db.size()] == '.'; - } + /** + * @return true iff the current Context is using db/path + */ + bool inDB( const string& db , const string& path=dbpath ) const; - void clear(){ - _ns = ""; - _db = 0; - } + void clear() { _ns = ""; _db = 0; } /** * call before unlocking, so clear any non-thread safe state */ - void unlocked(){ - _db = 0; - } + void unlocked() { _db = 0; } /** * call after going back into the lock, will re-establish non-thread safe stuff */ - void relocked(){ - _finishInit(); - } + void relocked() { _finishInit(); } friend class CurOp; - }; // class Client::Context - - private: - void _dropns( const string& ns ); - - CurOp * _curOp; - Context * _context; - bool _shutdown; - set _tempCollections; - const char *_desc; - bool _god; - AuthenticationInfo _ai; - ReplTime _lastOp; - BSONObj _handshake; - BSONObj _remoteId; - - public: - MessagingPort * const _mp; - string clientAddress() const; - AuthenticationInfo * getAuthenticationInfo(){ return &_ai; } - bool isAdmin() { return _ai.isAuthorized( "admin" ); } - CurOp* curop() { return _curOp; } - Context* getContext(){ return _context; } - Database* database() { return _context ? 
_context->db() : 0; } - const char *ns() const { return _context->ns(); } - const char *desc() const { return _desc; } - - Client(const char *desc, MessagingPort *p = 0); - ~Client(); + private: + /** + * at this point _client, _oldContext and _ns have to be set + * _db should not have been touched + * this will set _db and create if needed + * will also set _client->_context to this + */ + void _finishInit( bool doauth=true); - void addTempCollection( const string& ns ); - - void _invalidateDB(const string& db); - static void invalidateDB(const string& db); - static void invalidateNS( const string& ns ); + void _auth( int lockState = dbMutex.getState() ); - void setLastOp( ReplTime op ) { _lastOp = op; } - ReplTime getLastOp() const { return _lastOp; } + Client * _client; + Context * _oldContext; - /* report what the last operation was. used by getlasterror */ - void appendLastOp( BSONObjBuilder& b ) { - if( theReplSet ) { - b.append("lastOp" , (long long) _lastOp); - } - else { - OpTime lo(_lastOp); - if ( ! lo.isNull() ) - b.appendTimestamp( "lastOp" , lo.asDate() ); - } - } + string _path; + mongolock * _lock; + bool _justCreated; - /* each thread which does db operations has a Client object in TLS. - call this when your thread starts. - */ - static Client& initThread(const char *desc, MessagingPort *mp = 0); + string _ns; + Database * _db; - /* - this has to be called as the client goes away, but before thread termination - @return true if anything was done - */ - bool shutdown(); - - /* this is for map/reduce writes */ - bool isGod() const { return _god; } + }; // class Client::Context - friend class CurOp; - string toString() const; - void gotHandshake( const BSONObj& o ); - BSONObj getRemoteID() const { return _remoteId; } - BSONObj getHandshake() const { return _handshake; } }; - + /** get the Client object for this thread. */ - inline Client& cc() { + inline Client& cc() { Client * c = currentClient.get(); assert( c ); return *c; } - /* each thread which does db operations has a Client object in TLS. - call this when your thread starts. - */ - inline Client& Client::initThread(const char *desc, MessagingPort *mp) { - setThreadName(desc); - assert( currentClient.get() == 0 ); - Client *c = new Client(desc, mp); - currentClient.reset(c); - mongo::lastError.initThread(); - return *c; - } - - inline Client::GodScope::GodScope(){ + inline Client::GodScope::GodScope() { _prev = cc()._god; cc()._god = true; } - inline Client::GodScope::~GodScope(){ - cc()._god = _prev; - } + inline Client::GodScope::~GodScope() { cc()._god = _prev; } - /* this unlocks, does NOT upgrade. that works for our current usage */ - inline void mongolock::releaseAndWriteLock() { + /* this unlocks, does NOT upgrade. 
that works for our current usage */ + inline void mongolock::releaseAndWriteLock() { if( !_writelock ) { #if BOOST_VERSION >= 103500 @@ -278,6 +241,6 @@ namespace mongo { } string sayClientState(); - + inline bool haveClient() { return currentClient.get() > 0; } }; diff --git a/db/clientcursor.cpp b/db/clientcursor.cpp index 23ef529..bc09457 100644 --- a/db/clientcursor.cpp +++ b/db/clientcursor.cpp @@ -32,18 +32,18 @@ namespace mongo { - typedef multimap CCByLoc; - CCById ClientCursor::clientCursorsById; boost::recursive_mutex ClientCursor::ccmutex; long long ClientCursor::numberTimedOut = 0; - /*static*/ void ClientCursor::assertNoCursors() { + void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ); // from s/d_logic.h + + /*static*/ void ClientCursor::assertNoCursors() { recursive_scoped_lock lock(ccmutex); - if( clientCursorsById.size() ) { + if( clientCursorsById.size() ) { log() << "ERROR clientcursors exist but should not at this point" << endl; ClientCursor *cc = clientCursorsById.begin()->second; - log() << "first one: " << cc->cursorid << ' ' << cc->ns << endl; + log() << "first one: " << cc->_cursorid << ' ' << cc->_ns << endl; clientCursorsById.clear(); assert(false); } @@ -51,18 +51,19 @@ namespace mongo { void ClientCursor::setLastLoc_inlock(DiskLoc L) { + assert( _pos != -2 ); // defensive - see ~ClientCursor + if ( L == _lastLoc ) return; CCByLoc& bl = byLoc(); + if ( !_lastLoc.isNull() ) { - CCByLoc::iterator i = kv_find(bl, _lastLoc, this); - if ( i != bl.end() ) - bl.erase(i); + bl.erase( ByLocKey( _lastLoc, _cursorid ) ); } if ( !L.isNull() ) - bl.insert( make_pair(L, this) ); + bl[ByLocKey(L,_cursorid)] = this; _lastLoc = L; } @@ -74,8 +75,8 @@ namespace mongo { /* todo: this implementation is incomplete. we use it as a prefix for dropDatabase, which works fine as the prefix will end with '.'. however, when used with drop and - dropIndexes, this could take out cursors that belong to something else -- if you - drop "foo", currently, this will kill cursors for "foobar". + dropIndexes, this could take out cursors that belong to something else -- if you + drop "foo", currently, this will kill cursors for "foobar". */ void ClientCursor::invalidate(const char *nsPrefix) { vector toDelete; @@ -84,6 +85,7 @@ namespace mongo { assert( len > 0 && strchr(nsPrefix, '.') ); { + //cout << "\nTEMP invalidate " << nsPrefix << endl; recursive_scoped_lock lock(ccmutex); Database *db = cc().database(); @@ -92,18 +94,18 @@ namespace mongo { for( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) { ClientCursor *cc = i->second; - if( cc->_db != db ) + if( cc->_db != db ) continue; - if ( strncmp(nsPrefix, cc->ns.c_str(), len) == 0 ) { + if ( strncmp(nsPrefix, cc->_ns.c_str(), len) == 0 ) { toDelete.push_back(i->second); } } /* note : we can't iterate byloc because clientcursors may exist with a loc of null in which case - they are not in the map. perhaps they should not exist though in the future? something to + they are not in the map. perhaps they should not exist though in the future? something to change??? 
- + CCByLoc& bl = db->ccByLoc; for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); ++i ) { ClientCursor *cc = i->second; @@ -115,10 +117,16 @@ namespace mongo { for ( vector::iterator i = toDelete.begin(); i != toDelete.end(); ++i ) delete (*i); + + /*cout << "TEMP after invalidate " << endl; + for( auto i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) { + cout << " " << i->second->ns << endl; + } + cout << "TEMP after invalidate done" << endl;*/ } } - bool ClientCursor::shouldTimeout( unsigned millis ){ + bool ClientCursor::shouldTimeout( unsigned millis ) { _idleAgeMillis += millis; return _idleAgeMillis > 600000 && _pinValue == 0; } @@ -130,9 +138,9 @@ namespace mongo { for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) { CCById::iterator j = i; i++; - if( j->second->shouldTimeout( millis ) ){ + if( j->second->shouldTimeout( millis ) ) { numberTimedOut++; - log(1) << "killing old cursor " << j->second->cursorid << ' ' << j->second->ns + log(1) << "killing old cursor " << j->second->_cursorid << ' ' << j->second->_ns << " idle:" << j->second->idleTime() << "ms\n"; delete j->second; } @@ -150,10 +158,10 @@ namespace mongo { log() << "perf warning: byLoc.size=" << bl.size() << " in aboutToDeleteBucket\n"; } for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); i++ ) - i->second->c->aboutToDeleteBucket(b); + i->second->_c->aboutToDeleteBucket(b); } void aboutToDeleteBucket(const DiskLoc& b) { - ClientCursor::informAboutToDeleteBucket(b); + ClientCursor::informAboutToDeleteBucket(b); } /* must call this on a delete so we clean up the cursors. */ @@ -162,9 +170,12 @@ namespace mongo { Database *db = cc().database(); assert(db); + + aboutToDeleteForSharding( db , dl ); + CCByLoc& bl = db->ccByLoc; - CCByLoc::iterator j = bl.lower_bound(dl); - CCByLoc::iterator stop = bl.upper_bound(dl); + CCByLoc::iterator j = bl.lower_bound(ByLocKey::min(dl)); + CCByLoc::iterator stop = bl.upper_bound(ByLocKey::max(dl)); if ( j == stop ) return; @@ -172,26 +183,45 @@ namespace mongo { while ( 1 ) { toAdvance.push_back(j->second); - DEV assert( j->first == dl ); + DEV assert( j->first.loc == dl ); ++j; if ( j == stop ) break; } - wassert( toAdvance.size() < 5000 ); - - for ( vector::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ){ + if( toAdvance.size() >= 3000 ) { + log() << "perf warning MPW101: " << toAdvance.size() << " cursors for one diskloc " + << dl.toString() + << ' ' << toAdvance[1000]->_ns + << ' ' << toAdvance[2000]->_ns + << ' ' << toAdvance[1000]->_pinValue + << ' ' << toAdvance[2000]->_pinValue + << ' ' << toAdvance[1000]->_pos + << ' ' << toAdvance[2000]->_pos + << ' ' << toAdvance[1000]->_idleAgeMillis + << ' ' << toAdvance[2000]->_idleAgeMillis + << ' ' << toAdvance[1000]->_doingDeletes + << ' ' << toAdvance[2000]->_doingDeletes + << endl; + //wassert( toAdvance.size() < 5000 ); + } + + for ( vector::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ) { ClientCursor* cc = *i; wassert(cc->_db == db); - + if ( cc->_doingDeletes ) continue; - Cursor *c = cc->c.get(); - if ( c->capped() ){ + Cursor *c = cc->_c.get(); + if ( c->capped() ) { + /* note we cannot advance here. if this condition occurs, writes to the oplog + have "caught" the reader. skipping ahead, the reader would miss postentially + important data. 
+ */ delete cc; continue; } - + c->checkLocation(); DiskLoc tmp1 = c->refLoc(); if ( tmp1 != dl ) { @@ -213,53 +243,131 @@ namespace mongo { } void aboutToDelete(const DiskLoc& dl) { ClientCursor::aboutToDelete(dl); } + ClientCursor::ClientCursor(int queryOptions, const shared_ptr& c, const string& ns, BSONObj query ) : + _ns(ns), _db( cc().database() ), + _c(c), _pos(0), + _query(query), _queryOptions(queryOptions), + _idleAgeMillis(0), _pinValue(0), + _doingDeletes(false), _yieldSometimesTracker(128,10) { + assert( _db ); + assert( str::startsWith(_ns, _db->name) ); + if( queryOptions & QueryOption_NoCursorTimeout ) + noTimeout(); + recursive_scoped_lock lock(ccmutex); + _cursorid = allocCursorId_inlock(); + clientCursorsById.insert( make_pair(_cursorid, this) ); + + if ( ! _c->modifiedKeys() ) { + // store index information so we can decide if we can + // get something out of the index key rather than full object + + int x = 0; + BSONObjIterator i( _c->indexKeyPattern() ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( e.isNumber() ) { + // only want basic index fields, not "2d" etc + _indexedFields[e.fieldName()] = x; + } + x++; + } + } + + } + + ClientCursor::~ClientCursor() { - assert( pos != -2 ); + assert( _pos != -2 ); { recursive_scoped_lock lock(ccmutex); setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap - clientCursorsById.erase(cursorid); + clientCursorsById.erase(_cursorid); // defensive: - (CursorId&) cursorid = -1; - pos = -2; + (CursorId&)_cursorid = -1; + _pos = -2; + } + } + + bool ClientCursor::getFieldsDotted( const string& name, BSONElementSet &ret ) { + + map::const_iterator i = _indexedFields.find( name ); + if ( i == _indexedFields.end() ) { + current().getFieldsDotted( name , ret ); + return false; + } + + int x = i->second; + + BSONObjIterator it( currKey() ); + while ( x && it.more() ) { + it.next(); + x--; } + assert( x == 0 ); + ret.insert( it.next() ); + return true; + } + + BSONElement ClientCursor::getFieldDotted( const string& name , bool * fromKey ) { + + map::const_iterator i = _indexedFields.find( name ); + if ( i == _indexedFields.end() ) { + if ( fromKey ) + *fromKey = false; + return current().getFieldDotted( name ); + } + + int x = i->second; + + BSONObjIterator it( currKey() ); + while ( x && it.more() ) { + it.next(); + x--; + } + assert( x == 0 ); + + if ( fromKey ) + *fromKey = true; + return it.next(); } + /* call when cursor's location changes so that we can update the cursorsbylocation map. if you are locked and internally iterating, only need to call when you are ready to "unlock". */ void ClientCursor::updateLocation() { - assert( cursorid ); + assert( _cursorid ); _idleAgeMillis = 0; - DiskLoc cl = c->refLoc(); + DiskLoc cl = _c->refLoc(); if ( lastLoc() == cl ) { //log() << "info: lastloc==curloc " << ns << '\n'; - } else { + } + else { recursive_scoped_lock lock(ccmutex); setLastLoc_inlock(cl); } // may be necessary for MultiCursor even when cl hasn't changed - c->noteLocation(); + _c->noteLocation(); } - + int ClientCursor::yieldSuggest() { int writers = 0; int readers = 0; - + int micros = Client::recommendedYieldMicros( &writers , &readers ); - - if ( micros > 0 && writers == 0 && dbMutex.getState() <= 0 ){ + + if ( micros > 0 && writers == 0 && dbMutex.getState() <= 0 ) { // we have a read lock, and only reads are coming on, so why bother unlocking micros = 0; } - + return micros; } - - bool ClientCursor::yieldSometimes(){ + + bool ClientCursor::yieldSometimes() { if ( ! 
_yieldSometimesTracker.ping() ) return true; @@ -267,82 +375,83 @@ namespace mongo { return ( micros > 0 ) ? yield( micros ) : true; } - void ClientCursor::staticYield( int micros ) { + void ClientCursor::staticYield( int micros , const StringData& ns ) { + killCurrentOp.checkForInterrupt( false ); { dbtempreleasecond unlock; - if ( unlock.unlocked() ){ + if ( unlock.unlocked() ) { if ( micros == -1 ) micros = Client::recommendedYieldMicros(); if ( micros > 0 ) - sleepmicros( micros ); + sleepmicros( micros ); } else { - log( LL_WARNING ) << "ClientCursor::yield can't unlock b/c of recursive lock" << endl; + warning() << "ClientCursor::yield can't unlock b/c of recursive lock ns: " << ns << endl; } - } + } } - + bool ClientCursor::prepareToYield( YieldData &data ) { - if ( ! c->supportYields() ) + if ( ! _c->supportYields() ) return false; // need to store in case 'this' gets deleted - data._id = cursorid; - + data._id = _cursorid; + data._doingDeletes = _doingDeletes; _doingDeletes = false; - + updateLocation(); - + { - /* a quick test that our temprelease is safe. - todo: make a YieldingCursor class + /* a quick test that our temprelease is safe. + todo: make a YieldingCursor class and then make the following code part of a unit test. */ const int test = 0; static bool inEmpty = false; - if( test && !inEmpty ) { + if( test && !inEmpty ) { inEmpty = true; log() << "TEST: manipulate collection during cc:yield" << endl; - if( test == 1 ) - Helpers::emptyCollection(ns.c_str()); + if( test == 1 ) + Helpers::emptyCollection(_ns.c_str()); else if( test == 2 ) { BSONObjBuilder b; string m; - dropCollection(ns.c_str(), m, b); + dropCollection(_ns.c_str(), m, b); } - else { - dropDatabase(ns.c_str()); + else { + dropDatabase(_ns.c_str()); } } - } + } return true; } - + bool ClientCursor::recoverFromYield( const YieldData &data ) { ClientCursor *cc = ClientCursor::find( data._id , false ); - if ( cc == 0 ){ + if ( cc == 0 ) { // id was deleted return false; } - + cc->_doingDeletes = data._doingDeletes; - cc->c->checkLocation(); - return true; + cc->_c->checkLocation(); + return true; } - + bool ClientCursor::yield( int micros ) { - if ( ! c->supportYields() ) + if ( ! _c->supportYields() ) return true; - YieldData data; + YieldData data; prepareToYield( data ); - - staticYield( micros ); + + staticYield( micros , _ns ); return ClientCursor::recoverFromYield( data ); } int ctmLast = 0; // so we don't have to do find() which is a little slow very often. long long ClientCursor::allocCursorId_inlock() { - if( 0 ) { + if( 0 ) { static long long z; ++z; cout << "TEMP alloccursorid " << z << endl; @@ -362,32 +471,32 @@ namespace mongo { return x; } - void ClientCursor::storeOpForSlave( DiskLoc last ){ + void ClientCursor::storeOpForSlave( DiskLoc last ) { if ( ! 
( _queryOptions & QueryOption_OplogReplay )) return; if ( last.isNull() ) return; - + BSONElement e = last.obj()["ts"]; if ( e.type() == Date || e.type() == Timestamp ) _slaveReadTill = e._opTime(); } - - void ClientCursor::updateSlaveLocation( CurOp& curop ){ + + void ClientCursor::updateSlaveLocation( CurOp& curop ) { if ( _slaveReadTill.isNull() ) return; - mongo::updateSlaveLocation( curop , ns.c_str() , _slaveReadTill ); + mongo::updateSlaveLocation( curop , _ns.c_str() , _slaveReadTill ); } - void ClientCursor::appendStats( BSONObjBuilder& result ){ + void ClientCursor::appendStats( BSONObjBuilder& result ) { recursive_scoped_lock lock(ccmutex); - result.appendNumber("totalOpen", (int)clientCursorsById.size() ); + result.appendNumber("totalOpen", clientCursorsById.size() ); result.appendNumber("clientCursors_size", (int) numCursors()); - result.appendNumber("timedOut" , (int)numberTimedOut); + result.appendNumber("timedOut" , numberTimedOut); } - + // QUESTION: Restrict to the namespace from which this command was issued? // Alternatively, make this command admin-only? class CmdCursorInfo : public Command { @@ -398,19 +507,19 @@ namespace mongo { help << " example: { cursorInfo : 1 }"; } virtual LockType locktype() const { return NONE; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { ClientCursor::appendStats( result ); return true; } } cmdCursorInfo; - - void ClientCursorMonitor::run(){ + + void ClientCursorMonitor::run() { Client::initThread("clientcursormon"); Client& client = cc(); - + unsigned old = curTimeMillis(); - while ( ! inShutdown() ){ + while ( ! inShutdown() ) { unsigned now = curTimeMillis(); ClientCursor::idleTimeReport( now - old ); old = now; @@ -420,15 +529,28 @@ namespace mongo { client.shutdown(); } - void ClientCursor::find( const string& ns , set& all ){ + void ClientCursor::find( const string& ns , set& all ) { recursive_scoped_lock lock(ccmutex); - - for ( CCById::iterator i=clientCursorsById.begin(); i!=clientCursorsById.end(); ++i ){ - if ( i->second->ns == ns ) + + for ( CCById::iterator i=clientCursorsById.begin(); i!=clientCursorsById.end(); ++i ) { + if ( i->second->_ns == ns ) all.insert( i->first ); } } + int ClientCursor::erase(int n, long long *ids) { + int found = 0; + for ( int i = 0; i < n; i++ ) { + if ( erase(ids[i]) ) + found++; + + if ( inShutdown() ) + break; + } + return found; + + } + ClientCursorMonitor clientCursorMonitor; diff --git a/db/clientcursor.h b/db/clientcursor.h index b895c17..f1d107f 100644 --- a/db/clientcursor.h +++ b/db/clientcursor.h @@ -33,6 +33,7 @@ #include "dbhelpers.h" #include "matcher.h" #include "../client/dbclient.h" +#include "projection.h" namespace mongo { @@ -41,31 +42,35 @@ namespace mongo { class ClientCursor; class ParsedQuery; + struct ByLocKey { + + ByLocKey( const DiskLoc & l , const CursorId& i ) : loc(l), id(i) {} + + static ByLocKey min( const DiskLoc& l ) { return ByLocKey( l , numeric_limits::min() ); } + static ByLocKey max( const DiskLoc& l ) { return ByLocKey( l , numeric_limits::max() ); } + + bool operator<( const ByLocKey &other ) const { + int x = loc.compare( other.loc ); + if ( x ) + return x < 0; + return id < other.id; + } + + DiskLoc loc; + CursorId id; + + }; + /* todo: make this map be per connection. this will prevent cursor hijacking security attacks perhaps. 
+ * ERH: 9/2010 this may not work since some drivers send getMore over a different connection */ typedef map CCById; + typedef map CCByLoc; extern BSONObj id_obj; class ClientCursor { friend class CmdCursorInfo; - DiskLoc _lastLoc; // use getter and setter not this (important) - unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time - - /* 0 = normal - 1 = no timeout allowed - 100 = in use (pinned) -- see Pointer class - */ - unsigned _pinValue; - - bool _doingDeletes; - ElapsedTracker _yieldSometimesTracker; - - static CCById clientCursorsById; - static long long numberTimedOut; - static boost::recursive_mutex ccmutex; // must use this for all statics above! - static CursorId allocCursorId_inlock(); - public: static void assertNoCursors(); @@ -75,32 +80,38 @@ namespace mongo { at the same time - which might be bad. That should never happen, but if a client driver had a bug, it could (or perhaps some sort of attack situation). */ - class Pointer : boost::noncopyable { - public: + class Pointer : boost::noncopyable { ClientCursor *_c; + public: + ClientCursor * c() { return _c; } void release() { if( _c ) { assert( _c->_pinValue >= 100 ); _c->_pinValue -= 100; + _c = 0; } + } + /** + * call this if during a yield, the cursor got deleted + * if so, we don't want to use the point address + */ + void deleted() { _c = 0; } + ~Pointer() { release(); } Pointer(long long cursorid) { recursive_scoped_lock lock(ccmutex); _c = ClientCursor::find_inlock(cursorid, true); if( _c ) { if( _c->_pinValue >= 100 ) { _c = 0; - uassert(12051, "clientcursor already in use? driver problem?", false); + uasserted(12051, "clientcursor already in use? driver problem?"); } _c->_pinValue += 100; } } - ~Pointer() { - release(); - } - }; - + }; + // This object assures safe and reliable cleanup of the ClientCursor. // The implementation assumes that there will be no duplicate ids among cursors // (which is assured if cursors must last longer than 1 second). 
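
Aside (a sketch by the editor, not part of the upstream patch): the ByLocKey comparator and the CCByLoc typedef in the clientcursor.h hunk above order open cursors by disk location first and cursor id second, with min()/max() acting as range sentinels for a given location. The following minimal, self-contained sketch shows that ordering idea only; FakeDiskLoc, the plain long long cursor id, and the string payloads are simplified stand-ins assumed for the example, not the real mongo::DiskLoc or ClientCursor types.

// Sketch only: a (location, id) composite key supporting range scans over a std::map,
// in the spirit of ByLocKey / CCByLoc above. Stand-in types, not the mongo classes.
#include <iostream>
#include <limits>
#include <map>
#include <string>

typedef long long CursorId;

struct FakeDiskLoc {                    // stand-in for mongo::DiskLoc
    int a;                              // data file number
    int ofs;                            // offset within the file
    int compare( const FakeDiskLoc& r ) const {
        if ( a != r.a )     return a < r.a ? -1 : 1;
        if ( ofs != r.ofs ) return ofs < r.ofs ? -1 : 1;
        return 0;
    }
};

struct ByLocKey {                       // same ordering rule as the patch: loc, then id
    FakeDiskLoc loc;
    CursorId id;
    ByLocKey( const FakeDiskLoc& l , CursorId i ) : loc(l), id(i) {}
    static ByLocKey min( const FakeDiskLoc& l ) { return ByLocKey( l , std::numeric_limits<CursorId>::min() ); }
    static ByLocKey max( const FakeDiskLoc& l ) { return ByLocKey( l , std::numeric_limits<CursorId>::max() ); }
    bool operator<( const ByLocKey& other ) const {
        int x = loc.compare( other.loc );
        if ( x ) return x < 0;          // order by location first
        return id < other.id;           // then by cursor id
    }
};

int main() {
    std::map<ByLocKey, std::string> byLoc;          // plays the role of CCByLoc
    FakeDiskLoc l1 = { 0, 100 };
    FakeDiskLoc l2 = { 0, 200 };
    byLoc.insert( std::make_pair( ByLocKey( l1, 7 ), "cursor 7" ) );
    byLoc.insert( std::make_pair( ByLocKey( l1, 9 ), "cursor 9" ) );
    byLoc.insert( std::make_pair( ByLocKey( l2, 3 ), "cursor 3" ) );

    // every cursor positioned on l1, whatever its id: bracket with the sentinels
    std::map<ByLocKey, std::string>::const_iterator it  = byLoc.lower_bound( ByLocKey::min( l1 ) );
    std::map<ByLocKey, std::string>::const_iterator end = byLoc.upper_bound( ByLocKey::max( l1 ) );
    for ( ; it != end; ++it )
        std::cout << it->second << std::endl;       // prints "cursor 7" then "cursor 9"
    return 0;
}

Bracketing with lower_bound( ByLocKey::min( loc ) ) and upper_bound( ByLocKey::max( loc ) ) yields exactly the cursors parked on one location, which is presumably the lookup pattern location-based invalidation (e.g. aboutToDelete) performs against a map keyed this way.
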
@@ -108,19 +119,17 @@ namespace mongo { public: CleanupPointer() : _c( 0 ), _id( -1 ) {} void reset( ClientCursor *c = 0 ) { - if ( c == _c ) { + if ( c == _c ) return; - } - if ( _c ) { // be careful in case cursor was deleted by someone else ClientCursor::erase( _id ); } - if ( c ) { _c = c; - _id = c->cursorid; - } else { + _id = c->_cursorid; + } + else { _c = 0; _id = -1; } @@ -135,40 +144,19 @@ namespace mongo { CursorId _id; }; - /*const*/ CursorId cursorid; - const string ns; - const shared_ptr c; - int pos; // # objects into the cursor so far - const BSONObj query; // used for logging diags only; optional in constructor - const int _queryOptions; // see enum QueryOptions dbclient.h - OpTime _slaveReadTill; - Database * const _db; - - ClientCursor(int queryOptions, shared_ptr& _c, const string& _ns, BSONObj _query = BSONObj()) : - _idleAgeMillis(0), _pinValue(0), - _doingDeletes(false), _yieldSometimesTracker(128,10), - ns(_ns), c(_c), - pos(0), query(_query), - _queryOptions(queryOptions), - _db( cc().database() ) - { - assert( _db ); - assert( str::startsWith(_ns, _db->name) ); - if( queryOptions & QueryOption_NoCursorTimeout ) - noTimeout(); - recursive_scoped_lock lock(ccmutex); - cursorid = allocCursorId_inlock(); - clientCursorsById.insert( make_pair(cursorid, this) ); - } + ClientCursor(int queryOptions, const shared_ptr& c, const string& ns, BSONObj query = BSONObj() ); + ~ClientCursor(); - DiskLoc lastLoc() const { - return _lastLoc; - } + // *************** basic accessors ******************* - shared_ptr< ParsedQuery > pq; - shared_ptr< FieldMatcher > fields; // which fields query wants returned - Message originalMessage; // this is effectively an auto ptr for data the matcher points to + CursorId cursorid() const { return _cursorid; } + string ns() const { return _ns; } + Database * db() const { return _db; } + const BSONObj& query() const { return _query; } + int queryOptions() const { return _queryOptions; } + + DiskLoc lastLoc() const { return _lastLoc; } /* Get rid of cursors for namespaces that begin with nsprefix. Used by drop, dropIndexes, dropDatabase. @@ -176,14 +164,14 @@ namespace mongo { static void invalidate(const char *nsPrefix); /** - * @param microsToSleep -1 : ask client + * @param microsToSleep -1 : ask client * >=0 : sleep for that amount - * do a dbtemprelease - * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic - + * do a dbtemprelease + * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic - * we don't do herein as this->matcher (above) is only initialized for true queries/getmore. * (ie not set for remote/update) - * @return if the cursor is still valid. - * if false is returned, then this ClientCursor should be considered deleted - + * @return if the cursor is still valid. + * if false is returned, then this ClientCursor should be considered deleted - * in fact, the whole database could be gone. 
*/ bool yield( int microsToSleep = -1 ); @@ -192,72 +180,82 @@ namespace mongo { * @return same as yield() */ bool yieldSometimes(); - + static int yieldSuggest(); - static void staticYield( int micros ); - + static void staticYield( int micros , const StringData& ns ); + struct YieldData { CursorId _id; bool _doingDeletes; }; bool prepareToYield( YieldData &data ); static bool recoverFromYield( const YieldData &data ); struct YieldLock : boost::noncopyable { explicit YieldLock( ptr cc ) - : _canYield(cc->c->supportYields()) { - if ( _canYield ){ + : _canYield(cc->_c->supportYields()) { + if ( _canYield ) { cc->prepareToYield( _data ); _unlock.reset(new dbtempreleasecond()); } } - ~YieldLock(){ - if ( _unlock ){ + ~YieldLock() { + if ( _unlock ) { log( LL_WARNING ) << "ClientCursor::YieldLock not closed properly" << endl; relock(); } } - - bool stillOk(){ + bool stillOk() { if ( ! _canYield ) return true; - relock(); - return ClientCursor::recoverFromYield( _data ); } - - void relock(){ + void relock() { _unlock.reset(); } - private: - bool _canYield; + const bool _canYield; YieldData _data; - scoped_ptr _unlock; - }; // --- some pass through helpers for Cursor --- - BSONObj indexKeyPattern() { - return c->indexKeyPattern(); - } + Cursor* c() const { return _c.get(); } + int pos() const { return _pos; } - bool ok(){ - return c->ok(); - } + void incPos( int n ) { _pos += n; } // TODO: this is bad + void setPos( int n ) { _pos = n; } // TODO : this is bad too - bool advance(){ - return c->advance(); - } + BSONObj indexKeyPattern() { return _c->indexKeyPattern(); } + bool modifiedKeys() const { return _c->modifiedKeys(); } + bool isMultiKey() const { return _c->isMultiKey(); } - bool currentMatches(){ - if ( ! c->matcher() ) - return true; - return c->matcher()->matchesCurrent( c.get() ); - } + bool ok() { return _c->ok(); } + bool advance() { return _c->advance(); } + BSONObj current() { return _c->current(); } + DiskLoc currLoc() { return _c->currLoc(); } + BSONObj currKey() const { return _c->currKey(); } + + + /** + * same as BSONObj::getFieldsDotted + * if it can be retrieved from key, it is + * @return if this was retrieved from key + */ + bool getFieldsDotted( const string& name, BSONElementSet &ret ); + + /** + * same as BSONObj::getFieldDotted + * if it can be retrieved from key, it is + * @return if this was retrieved from key + */ + BSONElement getFieldDotted( const string& name , bool * fromKey = 0 ); + + bool currentIsDup() { return _c->getsetdup( _c->currLoc() ); } - BSONObj current(){ - return c->current(); + bool currentMatches() { + if ( ! _c->matcher() ) + return true; + return _c->matcher()->matchesCurrent( _c.get() ); } private: @@ -273,12 +271,12 @@ namespace mongo { return it->second; } public: - static ClientCursor* find(CursorId id, bool warn = true) { + static ClientCursor* find(CursorId id, bool warn = true) { recursive_scoped_lock lock(ccmutex); ClientCursor *c = find_inlock(id, warn); - // if this asserts, your code was not thread safe - you either need to set no timeout - // for the cursor or keep a ClientCursor::Pointer in scope for it. - massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue ); + // if this asserts, your code was not thread safe - you either need to set no timeout + // for the cursor or keep a ClientCursor::Pointer in scope for it. 
+ massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue ); return c; } @@ -293,6 +291,11 @@ namespace mongo { return false; } + /** + * @return number of cursors found + */ + static int erase( int n , long long * ids ); + /* call when cursor's location changes so that we can update the cursorsbylocation map. if you are locked and internally iterating, only need to call when you are ready to "unlock". @@ -314,43 +317,82 @@ namespace mongo { void storeOpForSlave( DiskLoc last ); void updateSlaveLocation( CurOp& curop ); - - unsigned idleTime(){ - return _idleAgeMillis; - } + + unsigned idleTime() const { return _idleAgeMillis; } + + void setDoingDeletes( bool doingDeletes ) {_doingDeletes = doingDeletes; } + + void slaveReadTill( const OpTime& t ) { _slaveReadTill = t; } + + public: // static methods static void idleTimeReport(unsigned millis); -private: - // cursors normally timeout after an inactivy period to prevent excess memory use - // setting this prevents timeout of the cursor in question. - void noTimeout() { - _pinValue++; - } - multimap& byLoc() { - return _db->ccByLoc; - } -public: - void setDoingDeletes( bool doingDeletes ){ - _doingDeletes = doingDeletes; - } - static void appendStats( BSONObjBuilder& result ); - static unsigned numCursors() { return clientCursorsById.size(); } - static void informAboutToDeleteBucket(const DiskLoc& b); static void aboutToDelete(const DiskLoc& dl); - static void find( const string& ns , set& all ); + + + private: // methods + + // cursors normally timeout after an inactivy period to prevent excess memory use + // setting this prevents timeout of the cursor in question. + void noTimeout() { _pinValue++; } + + CCByLoc& byLoc() { return _db->ccByLoc; } + + private: + + CursorId _cursorid; + + const string _ns; + Database * _db; + + const shared_ptr _c; + map _indexedFields; // map from indexed field to offset in key object + int _pos; // # objects into the cursor so far + + const BSONObj _query; // used for logging diags only; optional in constructor + int _queryOptions; // see enum QueryOptions dbclient.h + + OpTime _slaveReadTill; + + DiskLoc _lastLoc; // use getter and setter not this (important) + unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time + + /* 0 = normal + 1 = no timeout allowed + 100 = in use (pinned) -- see Pointer class + */ + unsigned _pinValue; + + bool _doingDeletes; + ElapsedTracker _yieldSometimesTracker; + + public: + shared_ptr pq; + shared_ptr fields; // which fields query wants returned + Message originalMessage; // this is effectively an auto ptr for data the matcher points to + + + + private: // static members + + static CCById clientCursorsById; + static long long numberTimedOut; + static boost::recursive_mutex ccmutex; // must use this for all statics above! 
+ static CursorId allocCursorId_inlock(); + }; class ClientCursorMonitor : public BackgroundJob { public: + string name() const { return "ClientCursorMonitor"; } void run(); - string name() { return "ClientCursorMonitor"; } }; extern ClientCursorMonitor clientCursorMonitor; - + } // namespace mongo diff --git a/db/cloner.cpp b/db/cloner.cpp index 9177a00..fe57463 100644 --- a/db/cloner.cpp +++ b/db/cloner.cpp @@ -31,7 +31,7 @@ namespace mongo { void ensureHaveIdIndex(const char *ns); - bool replAuthenticate(DBClientConnection *); + bool replAuthenticate(DBClientBase *); class Cloner: boost::noncopyable { auto_ptr< DBClientWithCommands > conn; @@ -40,7 +40,7 @@ namespace mongo { struct Fun; public: Cloner() { } - + /* slaveOk - if true it is ok if the source of the data is !ismaster. useReplAuth - use the credentials we normally use as a replication slave for the cloning snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower. @@ -92,14 +92,14 @@ namespace mongo { if ( context ) { context->relocked(); } - + while( i.moreInCurrentBatch() ) { if ( n % 128 == 127 /*yield some*/ ) { dbtemprelease t; } - + BSONObj tmp = i.nextSafe(); - + /* assure object is valid. note this will slow us down a little. */ if ( !tmp.valid() ) { stringstream ss; @@ -109,15 +109,15 @@ namespace mongo { e.validate(); ss << " firstElement: " << e; } - catch( ... ){ + catch( ... ) { ss << " firstElement corrupt"; } out() << ss.str() << endl; continue; } - + ++n; - + BSONObj js = tmp; if ( isindex ) { assert( strstr(from_collection, "system.indexes") ); @@ -125,16 +125,18 @@ namespace mongo { storedForLater->push_back( js.getOwned() ); continue; } - - try { + + try { theDataFileMgr.insertWithObjMod(to_collection, js); if ( logForRepl ) logOp("i", to_collection, js); + + getDur().commitIfNeeded(); } - catch( UserException& e ) { + catch( UserException& e ) { log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n'; } - + RARELY if ( time( 0 ) - saveLast > 60 ) { log() << n << " objects cloned so far from collection " << from_collection << endl; saveLast = time( 0 ); @@ -146,17 +148,17 @@ namespace mongo { const char *from_collection; const char *to_collection; time_t saveLast; - list *storedForLater; + list *storedForLater; bool logForRepl; Client::Context *context; }; - + /* copy the specified collection isindex - if true, this is system.indexes collection, in which we do some transformation when copying. */ void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, Query query) { list storedForLater; - + Fun f; f.n = 0; f.isindex = isindex; @@ -165,7 +167,7 @@ namespace mongo { f.saveLast = time( 0 ); f.storedForLater = &storedForLater; f.logForRepl = logForRepl; - + int options = QueryOption_NoCursorTimeout | ( slaveOk ? 
QueryOption_SlaveOk : 0 ); { dbtemprelease r; @@ -173,7 +175,9 @@ namespace mongo { DBClientConnection *remote = dynamic_cast< DBClientConnection* >( conn.get() ); if ( remote ) { remote->query( boost::function( f ), from_collection, query, 0, options ); - } else { // no exhaust mode for direct client, so we have this hack + } + else { + // there is no exhaust mode for direct client, so we have this hack auto_ptr c = conn->query( from_collection, query, 0, 0, 0, options ); assert( c.get() ); while( c->more() ) { @@ -182,16 +186,18 @@ namespace mongo { } } } - - if ( storedForLater.size() ){ - for ( list::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ){ + + if ( storedForLater.size() ) { + for ( list::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ) { BSONObj js = *i; - try { + try { theDataFileMgr.insertWithObjMod(to_collection, js); if ( logForRepl ) logOp("i", to_collection, js); + + getDur().commitIfNeeded(); } - catch( UserException& e ) { + catch( UserException& e ) { log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n'; } } @@ -210,40 +216,44 @@ namespace mongo { return false; conn.reset( myconn.release() ); - + writelock lk(ns); // TODO: make this lower down Client::Context ctx(ns); - { // config + { + // config string temp = ctx.db()->name + ".system.namespaces"; BSONObj config = conn->findOne( temp , BSON( "name" << ns ) ); if ( config["options"].isABSONObj() ) if ( ! userCreateNS( ns.c_str() , config["options"].Obj() , errmsg, true , 0 ) ) return false; } - - { // main data + + { + // main data copy( ns.c_str() , ns.c_str() , /*isindex*/false , logForRepl , false , true , Query(query).snapshot() ); } - + /* TODO : copyIndexes bool does not seem to be implemented! */ - if( !copyIndexes ) { + if( !copyIndexes ) { log() << "ERROR copy collection copyIndexes not implemented? " << ns << endl; } - { // indexes + { + // indexes string temp = ctx.db()->name + ".system.indexes"; copy( temp.c_str() , temp.c_str() , /*isindex*/true , logForRepl , false , true , BSON( "ns" << ns ) ); } + getDur().commitIfNeeded(); return true; } - + extern bool inDBRepair; void ensureIdIndexForNewNs(const char *ns); bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot) { - massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl ); + massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl ); string todb = cc().database()->name; stringstream a,b; @@ -263,23 +273,26 @@ namespace mongo { */ string ns = fromdb + ".system.namespaces"; list toClone; - { + { dbtemprelease r; - + // just using exhaust for collection copying right now auto_ptr c; { if ( conn.get() ) { // nothing to do - } else if ( !masterSameProcess ) { - auto_ptr< DBClientConnection > c( new DBClientConnection() ); - if ( !c->connect( masterHost, errmsg ) ) + } + else if ( !masterSameProcess ) { + ConnectionString cs = ConnectionString::parse( masterHost, errmsg ); + auto_ptr con( cs.connect( errmsg )); + if ( !con.get() ) return false; - if( !replAuthenticate(c.get()) ) + if( !replAuthenticate(con.get()) ) return false; - - conn = c; - } else { + + conn = con; + } + else { conn.reset( new DBDirectClient() ); } c = conn->query( ns.c_str(), BSONObj(), 0, 0, 0, slaveOk ? 
QueryOption_SlaveOk : 0 ); @@ -289,8 +302,8 @@ namespace mongo { errmsg = "query failed " + ns; return false; } - - while ( c->more() ){ + + while ( c->more() ) { BSONObj collection = c->next(); log(2) << "\t cloner got " << collection << endl; @@ -304,23 +317,23 @@ namespace mongo { assert( e.type() == String ); const char *from_name = e.valuestr(); - if( strstr(from_name, ".system.") ) { + if( strstr(from_name, ".system.") ) { /* system.users and s.js is cloned -- but nothing else from system. * system.indexes is handled specially at the end*/ - if( legalClientSystemNS( from_name , true ) == 0 ){ + if( legalClientSystemNS( from_name , true ) == 0 ) { log(2) << "\t\t not cloning because system collection" << endl; continue; } } - if( ! nsDollarCheck( from_name ) ){ + if( ! isANormalNSName( from_name ) ) { log(2) << "\t\t not cloning because has $ " << endl; continue; - } + } toClone.push_back( collection.getOwned() ); } } - for ( list::iterator i=toClone.begin(); i != toClone.end(); i++ ){ + for ( list::iterator i=toClone.begin(); i != toClone.end(); i++ ) { { dbtemprelease r; } @@ -328,7 +341,7 @@ namespace mongo { log(2) << " really will clone: " << collection << endl; const char * from_name = collection["name"].valuestr(); BSONObj options = collection.getObjectField("options"); - + /* change name ".collection" -> .collection */ const char *p = strchr(from_name, '.'); assert(p); @@ -338,17 +351,17 @@ namespace mongo { { string err; const char *toname = to_name.c_str(); - /* we defer building id index for performance - building it in batch is much faster */ + /* we defer building id index for performance - building it in batch is much faster */ userCreateNS(toname, options, err, logForRepl, &wantIdIndex); } log(1) << "\t\t cloning " << from_name << " -> " << to_name << endl; Query q; - if( snapshot ) + if( snapshot ) q.snapshot(); copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, q); if( wantIdIndex ) { - /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations + /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations that occur during the initial sync. inDBRepair makes dropDups be true. */ bool old = inDBRepair; @@ -357,7 +370,7 @@ namespace mongo { ensureIdIndexForNewNs(to_name.c_str()); inDBRepair = old; } - catch(...) { + catch(...) { inDBRepair = old; throw; } @@ -368,27 +381,26 @@ namespace mongo { string system_indexes_from = fromdb + ".system.indexes"; string system_indexes_to = todb + ".system.indexes"; - /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix - rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this + /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix + rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this is dubious here at the moment. */ copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, BSON( "name" << NE << "_id_" ) ); return true; } - + /* slaveOk - if true it is ok if the source of the data is !ismaster. useReplAuth - use the credentials we normally use as a replication slave for the cloning snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower. 
for example repairDatabase need not use it. */ - bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, - bool slaveOk, bool useReplAuth, bool snapshot) - { + bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, + bool slaveOk, bool useReplAuth, bool snapshot) { Cloner c; return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot); } - + /* Usage: mydb.$cmd.findOne( { clone: "fromhost" } ); */ @@ -410,11 +422,11 @@ namespace mongo { /* replication note: we must logOp() not the command, but the cloned data -- if the slave were to clone it would get a different point-in-time and not match. */ - return cloneFrom(from.c_str(), errmsg, dbname, + return cloneFrom(from.c_str(), errmsg, dbname, /*logForReplication=*/!fromRepl, /*slaveok*/false, /*usereplauth*/false, /*snapshot*/true); } } cmdclone; - + class CmdCloneCollection : public Command { public: virtual bool slaveOk() const { @@ -424,10 +436,10 @@ namespace mongo { CmdCloneCollection() : Command("cloneCollection") { } virtual void help( stringstream &help ) const { help << "{ cloneCollection: , from: [,query: ] [,copyIndexes:] }" - "\nCopies a collection from one server to another. Do not use on a single server as the destination " - "is placed at the same db.collection (namespace) as the source.\n" - "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there." - ; + "\nCopies a collection from one server to another. Do not use on a single server as the destination " + "is placed at the same db.collection (namespace) as the source.\n" + "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there." + ; } virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string fromhost = cmdObj.getStringField("from"); @@ -437,7 +449,7 @@ namespace mongo { } { HostAndPort h(fromhost); - if( h.isSelf() ) { + if( h.isSelf() ) { errmsg = "can't cloneCollection from self"; return false; } @@ -450,13 +462,13 @@ namespace mongo { BSONObj query = cmdObj.getObjectField("query"); if ( query.isEmpty() ) query = BSONObj(); - + BSONElement copyIndexesSpec = cmdObj.getField("copyindexes"); bool copyIndexes = copyIndexesSpec.isBoolean() ? copyIndexesSpec.boolean() : true; - - log() << "cloneCollection. db:" << dbname << " collection:" << collection << " from: " << fromhost + + log() << "cloneCollection. db:" << dbname << " collection:" << collection << " from: " << fromhost << " query: " << query << " " << ( copyIndexes ? 
"" : ", not copying indexes" ) << endl; - + Cloner c; return c.copyCollection( fromhost , collection , query, errmsg , copyIndexes ); } @@ -557,7 +569,7 @@ namespace mongo { return res; } } cmdcopydb; - + class CmdRenameCollection : public Command { public: CmdRenameCollection() : Command( "renameCollection" ) {} @@ -581,7 +593,7 @@ namespace mongo { errmsg = "invalid command syntax"; return false; } - + bool capped = false; long long size = 0; { @@ -593,10 +605,10 @@ namespace mongo { for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext ) size += i.ext()->length; } - + Client::Context ctx( target ); - - if ( nsdetails( target.c_str() ) ){ + + if ( nsdetails( target.c_str() ) ) { uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() ); BSONObjBuilder bb( result.subobjStart( "dropTarget" ) ); dropCollection( target , errmsg , bb ); @@ -623,7 +635,7 @@ namespace mongo { } if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) ) return false; - + auto_ptr< DBClientCursor > c; DBDirectClient bridge; @@ -638,7 +650,7 @@ namespace mongo { BSONObj o = c->next(); theDataFileMgr.insertWithObjMod( target.c_str(), o ); } - + char cl[256]; nsToDatabase( source.c_str(), cl ); string sourceIndexes = string( cl ) + ".system.indexes"; @@ -661,7 +673,8 @@ namespace mongo { break; if ( strcmp( e.fieldName(), "ns" ) == 0 ) { b.append( "ns", target ); - } else { + } + else { b.append( e ); } } diff --git a/db/cmdline.cpp b/db/cmdline.cpp index 65ee179..900a782 100644 --- a/db/cmdline.cpp +++ b/db/cmdline.cpp @@ -20,47 +20,92 @@ #include "cmdline.h" #include "commands.h" #include "../util/processinfo.h" +#include "security_key.h" + +#ifdef _WIN32 +#include +#endif namespace po = boost::program_options; +namespace fs = boost::filesystem; namespace mongo { - void setupSignals(); + void setupSignals( bool inFork ); + string getHostNameCached(); BSONArray argvArray; - void CmdLine::addGlobalOptions( boost::program_options::options_description& general , - boost::program_options::options_description& hidden ){ + void CmdLine::addGlobalOptions( boost::program_options::options_description& general , + boost::program_options::options_description& hidden ) { /* support for -vv -vvvv etc. */ for (string s = "vv"; s.length() <= 12; s.append("v")) { hidden.add_options()(s.c_str(), "verbose"); } - + general.add_options() - ("help,h", "show this usage information") - ("version", "show version information") - ("config,f", po::value(), "configuration file specifying additional options") - ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)") - ("quiet", "quieter output") - ("port", po::value(&cmdLine.port), "specify port number") - ("bind_ip", po::value(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default") - ("logpath", po::value() , "file to send all output to instead of stdout" ) - ("logappend" , "append to logpath instead of over-writing" ) - ("pidfilepath", po::value(), "full path to pidfile (if not set, no pidfile is created)") + ("help,h", "show this usage information") + ("version", "show version information") + ("config,f", po::value(), "configuration file specifying additional options") + ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. 
-vvvvv)") + ("quiet", "quieter output") + ("port", po::value(&cmdLine.port), "specify port number") + ("bind_ip", po::value(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default") + ("logpath", po::value() , "log file to send write to instead of stdout - has to be a file, not directory" ) + ("logappend" , "append to logpath instead of over-writing" ) + ("pidfilepath", po::value(), "full path to pidfile (if not set, no pidfile is created)") + ("keyFile", po::value(), "private key for cluster authentication (only for replica sets)") #ifndef _WIN32 - ("fork" , "fork server process" ) + ("unixSocketPrefix", po::value(), "alternative directory for UNIX domain sockets (defaults to /tmp)") + ("fork" , "fork server process" ) #endif - ; - + ; + } - bool CmdLine::store( int argc , char ** argv , +#if defined(_WIN32) + void CmdLine::addWindowsOptions( boost::program_options::options_description& windows , + boost::program_options::options_description& hidden ) { + windows.add_options() + ("install", "install mongodb service") + ("remove", "remove mongodb service") + ("reinstall", "reinstall mongodb service (equivilant of mongod --remove followed by mongod --install)") + ("serviceName", po::value(), "windows service name") + ("serviceDisplayName", po::value(), "windows service display name") + ("serviceDescription", po::value(), "windows service description") + ("serviceUser", po::value(), "user name service executes as") + ("servicePassword", po::value(), "password used to authenticate serviceUser") + ; + hidden.add_options()("service", "start mongodb service"); + } +#endif + + + bool CmdLine::store( int argc , char ** argv , boost::program_options::options_description& visible, boost::program_options::options_description& hidden, boost::program_options::positional_options_description& positional, - boost::program_options::variables_map ¶ms ){ - + boost::program_options::variables_map ¶ms ) { + + + { + // setup binary name + cmdLine.binaryName = argv[0]; + size_t i = cmdLine.binaryName.rfind( '/' ); + if ( i != string::npos ) + cmdLine.binaryName = cmdLine.binaryName.substr( i + 1 ); + + // setup cwd + char buffer[1024]; +#ifdef _WIN32 + assert( _getcwd( buffer , 1000 ) ); +#else + assert( getcwd( buffer , 1000 ) ); +#endif + cmdLine.cwd = buffer; + } + /* don't allow guessing - creates ambiguities when some options are * prefixes of others. allow long disguises and don't allow guessing * to get away with our vvvvvvv trick. */ @@ -69,7 +114,7 @@ namespace mongo { po::command_line_style::allow_long_disguise) ^ po::command_line_style::allow_sticky); - + try { po::options_description all; @@ -80,26 +125,27 @@ namespace mongo { .options( all ) .positional( positional ) .style( style ) - .run(), + .run(), params ); - if ( params.count("config") ){ + if ( params.count("config") ) { ifstream f( params["config"].as().c_str() ); - if ( ! f.is_open() ){ + if ( ! 
f.is_open() ) { cout << "ERROR: could not read from config file" << endl << endl; cout << visible << endl; return false; } - + po::store( po::parse_config_file( f , all ) , params ); f.close(); } - + po::notify(params); - } + } catch (po::error &e) { - cout << "ERROR: " << e.what() << endl << endl; - cout << visible << endl; + cout << "error command line: " << e.what() << endl; + cout << "use --help for help" << endl; + //cout << visible << endl; return false; } @@ -120,44 +166,51 @@ namespace mongo { string logpath; #ifndef _WIN32 + if (params.count("unixSocketPrefix")) { + cmdLine.socket = params["unixSocketPrefix"].as(); + if (!fs::is_directory(cmdLine.socket)) { + cout << cmdLine.socket << " must be a directory" << endl; + ::exit(-1); + } + } + if (params.count("fork")) { - if ( ! params.count( "logpath" ) ){ + if ( ! params.count( "logpath" ) ) { cout << "--fork has to be used with --logpath" << endl; ::exit(-1); } - - { // test logpath + + { + // test logpath logpath = params["logpath"].as(); assert( logpath.size() ); - if ( logpath[0] != '/' ){ - char temp[256]; - assert( getcwd( temp , 256 ) ); - logpath = (string)temp + "/" + logpath; + if ( logpath[0] != '/' ) { + logpath = cmdLine.cwd + "/" + logpath; } FILE * test = fopen( logpath.c_str() , "a" ); - if ( ! test ){ + if ( ! test ) { cout << "can't open [" << logpath << "] for log file: " << errnoWithDescription() << endl; ::exit(-1); } fclose( test ); } - + cout.flush(); cerr.flush(); pid_t c = fork(); - if ( c ){ + if ( c ) { _exit(0); } - if ( chdir("/") < 0 ){ + if ( chdir("/") < 0 ) { cout << "Cant chdir() while forking server process: " << strerror(errno) << endl; ::exit(-1); } setsid(); - + pid_t c2 = fork(); - if ( c2 ){ + if ( c2 ) { cout << "forked process: " << c2 << endl; _exit(0); } @@ -170,19 +223,19 @@ namespace mongo { fclose(stdin); FILE* f = freopen("/dev/null", "w", stderr); - if ( f == NULL ){ + if ( f == NULL ) { cout << "Cant reassign stderr while forking server process: " << strerror(errno) << endl; ::exit(-1); } f = freopen("/dev/null", "r", stdin); - if ( f == NULL ){ + if ( f == NULL ) { cout << "Cant reassign stdin while forking server process: " << strerror(errno) << endl; ::exit(-1); } setupCoreSignals(); - setupSignals(); + setupSignals( true ); } #endif if (params.count("logpath")) { @@ -196,6 +249,18 @@ namespace mongo { writePidFile( params["pidfilepath"].as() ); } + if (params.count("keyFile")) { + const string f = params["keyFile"].as(); + + if (!setUpSecurityKey(f)) { + // error message printed in setUpPrivateKey + dbexit(EXIT_BADOPTIONS); + } + + noauth = false; + } + + { BSONArrayBuilder b; for (int i=0; i < argc; i++) @@ -205,29 +270,51 @@ namespace mongo { return true; } - - void ignoreSignal( int signal ){ - } - void setupCoreSignals(){ + void ignoreSignal( int sig ) {} + + void setupCoreSignals() { #if !defined(_WIN32) assert( signal(SIGUSR1 , rotateLogs ) != SIG_ERR ); assert( signal(SIGHUP , ignoreSignal ) != SIG_ERR ); #endif } - class CmdGetCmdLineOpts : Command{ - public: + class CmdGetCmdLineOpts : Command { + public: CmdGetCmdLineOpts(): Command("getCmdLineOpts") {} void help(stringstream& h) const { h << "get argv"; } virtual LockType locktype() const { return NONE; } virtual bool adminOnly() const { return true; } virtual bool slaveOk() const { return true; } - virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { 
result.append("argv", argvArray); return true; } } cmdGetCmdLineOpts; + + string prettyHostName() { + StringBuilder s(128); + s << getHostNameCached(); + if( cmdLine.port != CmdLine::DefaultDBPort ) + s << ':' << mongo::cmdLine.port; + return s.str(); + } + + ParameterValidator::ParameterValidator( const string& name ) : _name( name ) { + if ( ! _all ) + _all = new map(); + (*_all)[_name] = this; + } + + ParameterValidator * ParameterValidator::get( const string& name ) { + map::iterator i = _all->find( name ); + if ( i == _all->end() ) + return NULL; + return i->second; + } + map * ParameterValidator::_all = 0; + } diff --git a/db/cmdline.h b/db/cmdline.h index ef1bd57..4c8c7c4 100644 --- a/db/cmdline.h +++ b/db/cmdline.h @@ -17,72 +17,134 @@ #pragma once #include "../pch.h" +#include "jsobj.h" namespace mongo { - - /* command line options + + /* command line options */ /* concurrency: OK/READ */ - struct CmdLine { + struct CmdLine { + + CmdLine() : + port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), smallfiles(sizeof(int*) == 4), + quota(false), quotaFiles(8), cpu(false), durOptions(0), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ), + syncdelay(60), socket("/tmp") { + // default may change for this later. +#if defined(_DURABLEDEFAULTON) + dur = true; +#else + dur = false; +#endif + } + + string binaryName; // mongod or mongos + string cwd; // cwd of when process started + int port; // --port + enum { + DefaultDBPort = 27017, + ConfigServerPort = 27019, + ShardServerPort = 27018 + }; + bool isDefaultPort() const { return port == DefaultDBPort; } + string bind_ip; // --bind_ip bool rest; // --rest + bool jsonp; // --jsonp string _replSet; // --replSet[/] - string ourSetName() const { + string ourSetName() const { string setname; size_t sl = _replSet.find('/'); if( sl == string::npos ) return _replSet; return _replSet.substr(0, sl); } + bool usingReplSets() const { return !_replSet.empty(); } + // for master/slave replication string source; // --source string only; // --only - + bool quiet; // --quiet - bool notablescan; // --notablescan - bool prealloc; // --noprealloc - bool smallfiles; // --smallfiles - + bool noTableScan; // --notablescan no table scans allowed + bool prealloc; // --noprealloc no preallocation of data files + bool smallfiles; // --smallfiles allocate smaller data files + bool quota; // --quota int quotaFiles; // --quotaFiles bool cpu; // --cpu show cpu time periodically + bool dur; // --dur durability + + /** --durOptions 7 dump journal and terminate without doing anything further + --durOptions 4 recover and terminate without listening + */ + enum { // bits to be ORed + DurDumpJournal = 1, // dump diagnostics on the journal during recovery + DurScanOnly = 2, // don't do any real work, just scan and dump if dump specified + DurRecoverOnly = 4, // terminate after recovery step + DurParanoid = 8, // paranoid mode enables extra checks + DurAlwaysCommit = 16 // do a group commit every time the writelock is released + }; + int durOptions; // --durOptions for debugging + long long oplogSize; // --oplogSize int defaultProfile; // --profile int slowMS; // --time in ms that is "slow" int pretouch; // --pretouch for replication application (experimental) - bool moveParanoia; // for move chunk paranoia + bool moveParanoia; // for move chunk paranoia + double syncdelay; // seconds between fsyncs - enum { - DefaultDBPort = 27017, - ConfigServerPort = 27019, - ShardServerPort = 27018 - }; + string 
socket; // UNIX domain socket directory - CmdLine() : - port(DefaultDBPort), rest(false), quiet(false), notablescan(false), prealloc(true), smallfiles(false), - quota(false), quotaFiles(8), cpu(false), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ) - { } - - - static void addGlobalOptions( boost::program_options::options_description& general , + static void addGlobalOptions( boost::program_options::options_description& general , boost::program_options::options_description& hidden ); - + static void addWindowsOptions( boost::program_options::options_description& windows , + boost::program_options::options_description& hidden ); + + /** * @return true if should run program, false if should exit */ - static bool store( int argc , char ** argv , + static bool store( int argc , char ** argv , boost::program_options::options_description& visible, boost::program_options::options_description& hidden, boost::program_options::positional_options_description& positional, boost::program_options::variables_map &output ); }; - + extern CmdLine cmdLine; - + void setupCoreSignals(); + + string prettyHostName(); + + + /** + * used for setParameter + * so you can write validation code that lives with code using it + * rather than all in the command place + * also lets you have mongos or mongod specific code + * without pulling it all sorts of things + */ + class ParameterValidator { + public: + ParameterValidator( const string& name ); + virtual ~ParameterValidator() {} + + virtual bool isValid( BSONElement e , string& errmsg ) = 0; + + static ParameterValidator * get( const string& name ); + + private: + string _name; + + // don't need to lock since this is all done in static init + static map * _all; + }; + } diff --git a/db/commands.cpp b/db/commands.cpp index ef219fe..770d035 100644 --- a/db/commands.cpp +++ b/db/commands.cpp @@ -38,7 +38,7 @@ namespace mongo { } ss << "\n"; bool web = _webCommands->count(name) != 0; - if( web ) ss << ""; + if( web ) ss << ""; ss << name; if( web ) ss << ""; ss << "\n"; @@ -55,7 +55,7 @@ namespace mongo { ss << ""; if( helpStr != "no help defined" ) { const char *p = helpStr.c_str(); - while( *p ) { + while( *p ) { if( *p == '<' ) { ss << "<"; p++; continue; @@ -67,7 +67,7 @@ namespace mongo { p++; continue; } - if( strncmp(p, "http:", 5) == 0 ) { + if( strncmp(p, "http:", 5) == 0 ) { ss << ""; @@ -120,7 +120,7 @@ namespace mongo { void Command::help( stringstream& help ) const { help << "no help defined"; } - + bool Command::runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder) { const char *p = strchr(ns, '.'); if ( !p ) return false; @@ -145,7 +145,7 @@ namespace mongo { ok = false; errmsg = "access denied - use admin db"; } - else if ( jsobj.getBoolField( "help" ) ){ + else if ( jsobj.getBoolField( "help" ) ) { stringstream help; help << "help for: " << e.fieldName() << " "; c->help( help ); @@ -161,18 +161,18 @@ namespace mongo { if (!have_ok) anObjBuilder.append( "ok" , ok ? 
1.0 : 0.0 ); - + if ( !ok && !have_errmsg) { anObjBuilder.append("errmsg", errmsg); uassert_nothrow(errmsg.c_str()); } return true; } - + return false; } - Command* Command::findCommand( const string& name ){ + Command* Command::findCommand( const string& name ) { map::iterator i = _commands->find( name ); if ( i == _commands->end() ) return 0; @@ -180,7 +180,7 @@ namespace mongo { } - Command::LockType Command::locktype( const string& name ){ + Command::LockType Command::locktype( const string& name ) { Command * c = findCommand( name ); if ( ! c ) return WRITE; @@ -189,10 +189,10 @@ namespace mongo { void Command::logIfSlow( const Timer& timer, const string& msg ) { int ms = timer.millis(); - if ( ms > cmdLine.slowMS ){ + if ( ms > cmdLine.slowMS ) { out() << msg << " took " << ms << " ms." << endl; } } - - + + } // namespace mongo diff --git a/db/commands.h b/db/commands.h index a8a61c4..42e46a0 100644 --- a/db/commands.h +++ b/db/commands.h @@ -18,7 +18,9 @@ #pragma once #include "../pch.h" + #include "jsobj.h" +#include "../util/timer.h" namespace mongo { @@ -32,7 +34,7 @@ namespace mongo { */ class Command { public: - + enum LockType { READ = -1 , NONE = 0 , WRITE = 1 }; const string name; @@ -47,11 +49,11 @@ namespace mongo { */ virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) = 0; - /* - note: logTheTop() MUST be false if READ + /* + note: logTheTop() MUST be false if READ if NONE, can't use Client::Context setup use with caution - */ + */ virtual LockType locktype() const = 0; /* Return true if only the admin ns has privileges to run this command. */ @@ -61,7 +63,7 @@ namespace mongo { void htmlHelp(stringstream&) const; - /* Like adminOnly, but even stricter: we must either be authenticated for admin db, + /* Like adminOnly, but even stricter: we must either be authenticated for admin db, or, if running without auth, on the local interface. When localHostOnlyIfNoAuth() is true, adminOnly() must also be true. @@ -72,7 +74,7 @@ namespace mongo { (the command directly from a client -- if fromRepl, always allowed). */ virtual bool slaveOk() const = 0; - + /* Return true if the client force a command to be run on a slave by turning on the 'slaveok' option in the command query. */ @@ -89,12 +91,12 @@ namespace mongo { virtual void help( stringstream& help ) const; - /* Return true if authentication and security applies to the commands. Some commands + /* Return true if authentication and security applies to the commands. Some commands (e.g., getnonce, authenticate) can be done by anyone even unauthorized. 
*/ virtual bool requiresAuth() { return true; } - /** @param webUI expose the command in the web ui as localhost:28017/ + /** @param webUI expose the command in the web ui as localhost:28017/ @param oldName an optional old, deprecated name for the command */ Command(const char *_name, bool webUI = false, const char *oldName = 0); @@ -102,7 +104,7 @@ namespace mongo { virtual ~Command() {} protected: - BSONObj getQuery( const BSONObj& cmdObj ){ + BSONObj getQuery( const BSONObj& cmdObj ) { if ( cmdObj["query"].type() == Object ) return cmdObj["query"].embeddedObject(); if ( cmdObj["q"].type() == Object ) diff --git a/db/commands/distinct.cpp b/db/commands/distinct.cpp new file mode 100644 index 0000000..2e26bcd --- /dev/null +++ b/db/commands/distinct.cpp @@ -0,0 +1,150 @@ +// distinct.cpp + +/** +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#include "pch.h" +#include "../commands.h" +#include "../instance.h" +#include "../queryoptimizer.h" +#include "../clientcursor.h" + +namespace mongo { + + class DistinctCommand : public Command { + public: + DistinctCommand() : Command("distinct") {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return READ; } + virtual void help( stringstream &help ) const { + help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }"; + } + + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + Timer t; + string ns = dbname + '.' + cmdObj.firstElement().valuestr(); + + string key = cmdObj["key"].valuestrsafe(); + BSONObj keyPattern = BSON( key << 1 ); + + BSONObj query = getQuery( cmdObj ); + + int bufSize = BSONObjMaxUserSize - 4096; + BufBuilder bb( bufSize ); + char * start = bb.buf(); + + BSONArrayBuilder arr( bb ); + BSONElementSet values; + + long long nscanned = 0; // locations looked at + long long nscannedObjects = 0; // full objects looked at + long long n = 0; // matches + MatchDetails md; + + NamespaceDetails * d = nsdetails( ns.c_str() ); + + if ( ! d ) { + result.appendArray( "values" , BSONObj() ); + result.append( "stats" , BSON( "n" << 0 << "nscanned" << 0 << "nscannedObjects" << 0 ) ); + return true; + } + + shared_ptr cursor; + if ( ! query.isEmpty() ) { + cursor = bestGuessCursor(ns.c_str() , query , BSONObj() ); + } + else { + + // query is empty, so lets see if we can find an index + // with the key so we don't have to hit the raw data + NamespaceDetails::IndexIterator ii = d->ii(); + while ( ii.more() ) { + IndexDetails& idx = ii.next(); + + if ( d->isMultikey( ii.pos() - 1 ) ) + continue; + + if ( idx.inKeyPattern( key ) ) { + cursor = bestGuessCursor( ns.c_str() , BSONObj() , idx.keyPattern() ); + break; + } + + } + + if ( ! 
cursor.get() ) + cursor = bestGuessCursor(ns.c_str() , query , BSONObj() ); + + } + + + + scoped_ptr cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns)); + + while ( cursor->ok() ) { + nscanned++; + bool loadedObject = false; + + if ( !cursor->matcher() || cursor->matcher()->matchesCurrent( cursor.get() , &md ) ) { + n++; + + BSONElementSet temp; + loadedObject = ! cc->getFieldsDotted( key , temp ); + + for ( BSONElementSet::iterator i=temp.begin(); i!=temp.end(); ++i ) { + BSONElement e = *i; + if ( values.count( e ) ) + continue; + + int now = bb.len(); + + uassert(10044, "distinct too big, 4mb cap", ( now + e.size() + 1024 ) < bufSize ); + + arr.append( e ); + BSONElement x( start + now ); + + values.insert( x ); + } + } + + if ( loadedObject || md.loadedObject ) + nscannedObjects++; + + cursor->advance(); + + if (!cc->yieldSometimes()) + break; + + RARELY killCurrentOp.checkForInterrupt(); + } + + assert( start == bb.buf() ); + + result.appendArray( "values" , arr.done() ); + + { + BSONObjBuilder b; + b.appendNumber( "n" , n ); + b.appendNumber( "nscanned" , nscanned ); + b.appendNumber( "nscannedObjects" , nscannedObjects ); + b.appendNumber( "timems" , t.millis() ); + result.append( "stats" , b.obj() ); + } + + return true; + } + + } distinctCmd; + +} diff --git a/db/commands/group.cpp b/db/commands/group.cpp new file mode 100644 index 0000000..0cc6ab3 --- /dev/null +++ b/db/commands/group.cpp @@ -0,0 +1,202 @@ +// group.cpp + +/** +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . 
+*/ + +#include "pch.h" +#include "../commands.h" +#include "../instance.h" +#include "../queryoptimizer.h" + +namespace mongo { + + class GroupCommand : public Command { + public: + GroupCommand() : Command("group") {} + virtual LockType locktype() const { return READ; } + virtual bool slaveOk() const { return false; } + virtual bool slaveOverrideOk() { return true; } + virtual void help( stringstream &help ) const { + help << "http://www.mongodb.org/display/DOCS/Aggregation"; + } + + BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ) { + if ( func ) { + BSONObjBuilder b( obj.objsize() + 32 ); + b.append( "0" , obj ); + int res = s->invoke( func , b.obj() ); + uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 ); + int type = s->type("return"); + uassert( 10042 , "return of $key has to be an object" , type == Object ); + return s->getObject( "return" ); + } + return obj.extractFields( keyPattern , true ); + } + + bool group( string realdbname , const string& ns , const BSONObj& query , + BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope , + BSONObj initial , string finalize , + string& errmsg , BSONObjBuilder& result ) { + + + auto_ptr s = globalScriptEngine->getPooledScope( realdbname ); + s->localConnect( realdbname.c_str() ); + + if ( reduceScope ) + s->init( reduceScope ); + + s->setObject( "$initial" , initial , true ); + + s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 ); + s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 ); + ScriptingFunction f = s->createFunction( + "function(){ " + " if ( $arr[n] == null ){ " + " next = {}; " + " Object.extend( next , $key ); " + " Object.extend( next , $initial , true ); " + " $arr[n] = next; " + " next = null; " + " } " + " $reduce( obj , $arr[n] ); " + "}" ); + + ScriptingFunction keyFunction = 0; + if ( keyFunctionCode.size() ) { + keyFunction = s->createFunction( keyFunctionCode.c_str() ); + } + + + double keysize = keyPattern.objsize() * 3; + double keynum = 1; + + map map; + list blah; + + shared_ptr cursor = bestGuessCursor(ns.c_str() , query , BSONObj() ); + + while ( cursor->ok() ) { + if ( cursor->matcher() && ! 
cursor->matcher()->matchesCurrent( cursor.get() ) ) { + cursor->advance(); + continue; + } + + BSONObj obj = cursor->current(); + cursor->advance(); + + BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() ); + keysize += key.objsize(); + keynum++; + + int& n = map[key]; + if ( n == 0 ) { + n = map.size(); + s->setObject( "$key" , key , true ); + + uassert( 10043 , "group() can't handle more than 20000 unique keys" , n <= 20000 ); + } + + s->setObject( "obj" , obj , true ); + s->setNumber( "n" , n - 1 ); + if ( s->invoke( f , BSONObj() , 0 , true ) ) { + throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() ); + } + } + + if (!finalize.empty()) { + s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 ); + ScriptingFunction g = s->createFunction( + "function(){ " + " for(var i=0; i < $arr.length; i++){ " + " var ret = $finalize($arr[i]); " + " if (ret !== undefined) " + " $arr[i] = ret; " + " } " + "}" ); + s->invoke( g , BSONObj() , 0 , true ); + } + + result.appendArray( "retval" , s->getObject( "$arr" ) ); + result.append( "count" , keynum - 1 ); + result.append( "keys" , (int)(map.size()) ); + s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 ); + s->gc(); + + return true; + } + + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + + /* db.$cmd.findOne( { group :

} ) */ + const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck(); + + BSONObj q; + if ( p["cond"].type() == Object ) + q = p["cond"].embeddedObject(); + else if ( p["condition"].type() == Object ) + q = p["condition"].embeddedObject(); + else + q = getQuery( p ); + + if ( p["ns"].type() != String ) { + errmsg = "ns has to be set"; + return false; + } + + string ns = dbname + "." + p["ns"].String(); + + BSONObj key; + string keyf; + if ( p["key"].type() == Object ) { + key = p["key"].embeddedObjectUserCheck(); + if ( ! p["$keyf"].eoo() ) { + errmsg = "can't have key and $keyf"; + return false; + } + } + else if ( p["$keyf"].type() ) { + keyf = p["$keyf"]._asCode(); + } + else { + // no key specified, will use entire object as key + } + + BSONElement reduce = p["$reduce"]; + if ( reduce.eoo() ) { + errmsg = "$reduce has to be set"; + return false; + } + + BSONElement initial = p["initial"]; + if ( initial.type() != Object ) { + errmsg = "initial has to be an object"; + return false; + } + + + string finalize; + if (p["finalize"].type()) + finalize = p["finalize"]._asCode(); + + return group( dbname , ns , q , + key , keyf , reduce._asCode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() , + initial.embeddedObject() , finalize , + errmsg , result ); + } + + } cmdGroup; + + +} // namespace mongo diff --git a/db/commands/isself.cpp b/db/commands/isself.cpp new file mode 100644 index 0000000..b97f51e --- /dev/null +++ b/db/commands/isself.cpp @@ -0,0 +1,220 @@ +// isself.cpp + +#include "pch.h" +#include "../../util/message.h" +#include "../commands.h" +#include "../../client/dbclient.h" + +#ifndef _WIN32 +# ifndef __sunos__ +# include +# endif +# include +# include +#endif + + +namespace mongo { + +#if !defined(_WIN32) && !defined(__sunos__) + + vector getMyAddrs() { + ifaddrs * addrs; + + int status = getifaddrs(&addrs); + massert(13469, "getifaddrs failure: " + errnoWithDescription(errno), status == 0); + + vector out; + + // based on example code from linux getifaddrs manpage + for (ifaddrs * addr = addrs; addr != NULL; addr = addr->ifa_next) { + if ( addr->ifa_addr == NULL ) continue; + int family = addr->ifa_addr->sa_family; + char host[NI_MAXHOST]; + + if (family == AF_INET || family == AF_INET6) { + status = getnameinfo(addr->ifa_addr, + (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)), + host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); + if ( status != 0 ) { + freeifaddrs( addrs ); + addrs = NULL; + msgasserted( 13470, string("getnameinfo() failed: ") + gai_strerror(status) ); + } + + out.push_back(host); + } + + } + + freeifaddrs( addrs ); + addrs = NULL; + + if (logLevel >= 1) { + log(1) << "getMyAddrs():"; + for (vector::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) { + log(1) << " [" << *it << ']'; + } + log(1) << endl; + } + + return out; + } + + vector getAllIPs(StringData iporhost) { + addrinfo* addrs = NULL; + addrinfo hints; + memset(&hints, 0, sizeof(addrinfo)); + hints.ai_socktype = SOCK_STREAM; + hints.ai_family = (IPv6Enabled() ? 
AF_UNSPEC : AF_INET); + + static string portNum = BSONObjBuilder::numStr(cmdLine.port); + + vector out; + + int ret = getaddrinfo(iporhost.data(), portNum.c_str(), &hints, &addrs); + if ( ret ) { + warning() << "getaddrinfo(\"" << iporhost.data() << "\") failed: " << gai_strerror(ret) << endl; + return out; + } + + for (addrinfo* addr = addrs; addr != NULL; addr = addr->ai_next) { + int family = addr->ai_family; + char host[NI_MAXHOST]; + + if (family == AF_INET || family == AF_INET6) { + int status = getnameinfo(addr->ai_addr, addr->ai_addrlen, host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); + + massert(13472, string("getnameinfo() failed: ") + gai_strerror(status), status == 0); + + out.push_back(host); + } + + } + + freeaddrinfo(addrs); + + if (logLevel >= 1) { + log(1) << "getallIPs(\"" << iporhost << "\"):"; + for (vector::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) { + log(1) << " [" << *it << ']'; + } + log(1) << endl; + } + + return out; + } +#endif + + + class IsSelfCommand : public Command { + public: + IsSelfCommand() : Command("_isSelf") , _cacheLock( "IsSelfCommand::_cacheLock" ) {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream &help ) const { + help << "{ _isSelf : 1 } INTERNAL ONLY"; + } + + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + init(); + result.append( "id" , _id ); + return true; + } + + void init() { + scoped_lock lk( _cacheLock ); + if ( ! _id.isSet() ) + _id.init(); + } + + OID _id; + + mongo::mutex _cacheLock; + map _cache; + } isSelfCommand; + + bool HostAndPort::isSelf() const { + + int p = _port == -1 ? CmdLine::DefaultDBPort : _port; + + if( p != cmdLine.port ) { + // shortcut - ports have to match at the very least + return false; + } + + string host = str::stream() << _host << ":" << p; + + { + // check cache for this host + // debatably something _could_ change, but I'm not sure right now (erh 10/14/2010) + scoped_lock lk( isSelfCommand._cacheLock ); + map::const_iterator i = isSelfCommand._cache.find( host ); + if ( i != isSelfCommand._cache.end() ) + return i->second; + } + +#if !defined(_WIN32) && !defined(__sunos__) + // on linux and os x we can do a quick check for an ip match + + const vector myaddrs = getMyAddrs(); + const vector addrs = getAllIPs(_host); + + for (vector::const_iterator i=myaddrs.begin(), iend=myaddrs.end(); i!=iend; ++i) { + for (vector::const_iterator j=addrs.begin(), jend=addrs.end(); j!=jend; ++j) { + string a = *i; + string b = *j; + + if ( a == b || + ( str::startsWith( a , "127." ) && str::startsWith( b , "127." ) ) // 127. is all loopback + ) { + + // add to cache + scoped_lock lk( isSelfCommand._cacheLock ); + isSelfCommand._cache[host] = true; + return true; + } + } + } + +#endif + + if ( ! Listener::getTimeTracker() ) { + // this ensures we are actually running a server + // this may return true later, so may want to retry + return false; + } + + + try { + + isSelfCommand.init(); + + DBClientConnection conn; + string errmsg; + if ( ! conn.connect( host , errmsg ) ) { + // should this go in the cache? 
+ return false; + } + + BSONObj out; + bool ok = conn.simpleCommand( "admin" , &out , "_isSelf" ); + + bool me = ok && out["id"].type() == jstOID && isSelfCommand._id == out["id"].OID(); + + // add to cache + scoped_lock lk( isSelfCommand._cacheLock ); + isSelfCommand._cache[host] = me; + + return me; + } + catch ( std::exception& e ) { + warning() << "could't check isSelf (" << host << ") " << e.what() << endl; + } + + return false; + } + + + +} diff --git a/db/commands/mr.cpp b/db/commands/mr.cpp new file mode 100644 index 0000000..16c604a --- /dev/null +++ b/db/commands/mr.cpp @@ -0,0 +1,1074 @@ +// mr.cpp + +/** + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "pch.h" +#include "../db.h" +#include "../instance.h" +#include "../commands.h" +#include "../../scripting/engine.h" +#include "../../client/dbclient.h" +#include "../../client/connpool.h" +#include "../../client/parallel.h" +#include "../queryoptimizer.h" +#include "../matcher.h" +#include "../clientcursor.h" +#include "../replpair.h" +#include "../../s/d_chunk_manager.h" +#include "../../s/d_logic.h" + +#include "mr.h" + +namespace mongo { + + namespace mr { + + AtomicUInt Config::JOB_NUMBER; + + JSFunction::JSFunction( string type , const BSONElement& e ) { + _type = type; + _code = e._asCode(); + + if ( e.type() == CodeWScope ) + _wantedScope = e.codeWScopeObject(); + } + + void JSFunction::init( State * state ) { + _scope = state->scope(); + assert( _scope ); + _scope->init( &_wantedScope ); + + _func = _scope->createFunction( _code.c_str() ); + uassert( 13598 , str::stream() << "couldn't compile code for: " << _type , _func ); + } + + void JSMapper::init( State * state ) { + _func.init( state ); + _params = state->config().mapParams; + } + + /** + * Applies the map function to an object, which should internally call emit() + */ + void JSMapper::map( const BSONObj& o ) { + Scope * s = _func.scope(); + assert( s ); + s->setThis( &o ); + if ( s->invoke( _func.func() , _params , 0 , true ) ) + throw UserException( 9014, str::stream() << "map invoke failed: " + s->getError() ); + } + + /** + * Applies the finalize function to a tuple obj (key, val) + * Returns tuple obj {_id: key, value: newval} + */ + BSONObj JSFinalizer::finalize( const BSONObj& o ) { + Scope * s = _func.scope(); + + Scope::NoDBAccess no = s->disableDBAccess( "can't access db inside finalize" ); + s->invokeSafe( _func.func() , o ); + + // don't want to use o.objsize() to size b + // since there are many cases where the point of finalize + // is converting many fields to 1 + BSONObjBuilder b; + b.append( o.firstElement() ); + s->append( b , "value" , "return" ); + return b.obj(); + } + + /** + * Reduces a list of tuple objects (key, value) to a single tuple {"0": key, "1": value} + */ + BSONObj JSReducer::reduce( const BSONList& tuples ) { + if (tuples.size() <= 1) + return tuples[0]; + BSONObj key; + int endSizeEstimate = 16; + _reduce( tuples , key , endSizeEstimate ); + + BSONObjBuilder b(endSizeEstimate); + 
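            // Worked example (values are made up): if _reduce() collapsed the tuples
            //   { "0" : "a", "1" : 1 }  and  { "0" : "a", "1" : 2 }
            // with a summing reduce function, then key.firstElement() is "a" and the
            // scope's "return" slot holds 3, so the object built just below is
            //   { "0" : "a", "1" : 3 }.
            // finalReduce() later emits the user-visible form { _id : "a", value : 3 }.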
b.appendAs( key.firstElement() , "0" ); + _func.scope()->append( b , "1" , "return" ); + return b.obj(); + } + + /** + * Reduces a list of tuple object (key, value) to a single tuple {_id: key, value: val} + * Also applies a finalizer method if present. + */ + BSONObj JSReducer::finalReduce( const BSONList& tuples , Finalizer * finalizer ) { + + BSONObj res; + BSONObj key; + + if (tuples.size() == 1) { + // 1 obj, just use it + key = tuples[0]; + BSONObjBuilder b(key.objsize()); + BSONObjIterator it(key); + b.appendAs( it.next() , "_id" ); + b.appendAs( it.next() , "value" ); + res = b.obj(); + } + else { + // need to reduce + int endSizeEstimate = 16; + _reduce( tuples , key , endSizeEstimate ); + BSONObjBuilder b(endSizeEstimate); + b.appendAs( key.firstElement() , "_id" ); + _func.scope()->append( b , "value" , "return" ); + res = b.obj(); + } + + if ( finalizer ) { + res = finalizer->finalize( res ); + } + + return res; + } + + /** + * actually applies a reduce, to a list of tuples (key, value). + * After the call, tuples will hold a single tuple {"0": key, "1": value} + */ + void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) { + uassert( 10074 , "need values" , tuples.size() ); + + int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128; + + // need to build the reduce args: ( key, [values] ) + BSONObjBuilder reduceArgs( sizeEstimate ); + boost::scoped_ptr valueBuilder; + int sizeSoFar = 0; + unsigned n = 0; + for ( ; n BSONObjMaxUserSize ) { + assert( n > 1 ); // if not, inf. loop + break; + } + + valueBuilder->append( ee ); + sizeSoFar += ee.size(); + } + assert(valueBuilder); + valueBuilder->done(); + BSONObj args = reduceArgs.obj(); + + Scope * s = _func.scope(); + + s->invokeSafe( _func.func() , args ); + + if ( s->type( "return" ) == Array ) { + uasserted( 10075 , "reduce -> multiple not supported yet"); + return; + } + + endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() ); + + if ( n == tuples.size() ) + return; + + // the input list was too large, add the rest of elmts to new tuples and reduce again + // note: would be better to use loop instead of recursion to avoid stack overflow + BSONList x; + for ( ; n < tuples.size(); n++ ) { + x.push_back( tuples[n] ); + } + BSONObjBuilder temp( endSizeEstimate ); + temp.append( key.firstElement() ); + s->append( temp , "1" , "return" ); + x.push_back( temp.obj() ); + _reduce( x , key , endSizeEstimate ); + } + + Config::Config( const string& _dbname , const BSONObj& cmdObj ) { + + dbname = _dbname; + ns = dbname + "." 
+ cmdObj.firstElement().valuestr(); + + verbose = cmdObj["verbose"].trueValue(); + + uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() ); + + if ( cmdObj["out"].type() == String ) { + finalShort = cmdObj["out"].String(); + outType = REPLACE; + } + else if ( cmdObj["out"].type() == Object ) { + BSONObj o = cmdObj["out"].embeddedObject(); + + BSONElement e = o.firstElement(); + string t = e.fieldName(); + + if ( t == "normal" || t == "replace" ) { + outType = REPLACE; + finalShort = e.String(); + } + else if ( t == "merge" ) { + outType = MERGE; + finalShort = e.String(); + } + else if ( t == "reduce" ) { + outType = REDUCE; + finalShort = e.String(); + } + else if ( t == "inline" ) { + outType = INMEMORY; + } + else { + uasserted( 13522 , str::stream() << "unknown out specifier [" << t << "]" ); + } + + if (o.hasElement("db")) { + outDB = o["db"].String(); + } + } + else { + uasserted( 13606 , "'out' has to be a string or an object" ); + } + + if ( outType != INMEMORY ) { // setup names + tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << finalShort << "_" << JOB_NUMBER++; + + incLong = tempLong + "_inc"; + + finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort; + } + + { + // scope and code + + if ( cmdObj["scope"].type() == Object ) + scopeSetup = cmdObj["scope"].embeddedObjectUserCheck(); + + mapper.reset( new JSMapper( cmdObj["map"] ) ); + reducer.reset( new JSReducer( cmdObj["reduce"] ) ); + if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() ) + finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) ); + + if ( cmdObj["mapparams"].type() == Array ) { + mapParams = cmdObj["mapparams"].embeddedObjectUserCheck(); + } + + } + + { + // query options + BSONElement q = cmdObj["query"]; + if ( q.type() == Object ) + filter = q.embeddedObjectUserCheck(); + else + uassert( 13608 , "query has to be blank or an Object" , ! q.trueValue() ); + + + BSONElement s = cmdObj["sort"]; + if ( s.type() == Object ) + sort = s.embeddedObjectUserCheck(); + else + uassert( 13609 , "sort has to be blank or an Object" , ! s.trueValue() ); + + if ( cmdObj["limit"].isNumber() ) + limit = cmdObj["limit"].numberLong(); + else + limit = 0; + } + } + + /** + * Create temporary collection, set up indexes + */ + void State::prepTempCollection() { + if ( ! _onDisk ) + return; + + _db.dropCollection( _config.tempLong ); + + { + // create + writelock lock( _config.tempLong.c_str() ); + Client::Context ctx( _config.tempLong.c_str() ); + string errmsg; + if ( ! userCreateNS( _config.tempLong.c_str() , BSONObj() , errmsg , true ) ) { + uasserted( 13630 , str::stream() << "userCreateNS failed for mr tempLong ns: " << _config.tempLong << " err: " << errmsg ); + } + } + + + { + // copy indexes + auto_ptr idx = _db.getIndexes( _config.finalLong ); + while ( idx->more() ) { + BSONObj i = idx->next(); + + BSONObjBuilder b( i.objsize() + 16 ); + b.append( "ns" , _config.tempLong ); + BSONObjIterator j( i ); + while ( j.more() ) { + BSONElement e = j.next(); + if ( str::equals( e.fieldName() , "_id" ) || + str::equals( e.fieldName() , "ns" ) ) + continue; + + b.append( e ); + } + + BSONObj indexToInsert = b.obj(); + insert( Namespace( _config.tempLong.c_str() ).getSisterNS( "system.indexes" ).c_str() , indexToInsert ); + } + + } + + } + + /** + * For inline mode, appends results to output object. 
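        // Sketch of a command document the Config constructor above would accept.
        // Collection, database and field names here are invented, and the map/reduce
        // bodies are ordinary JavaScript carried as string elements (the _asCode()
        // accessor used above accepts Code as well as plain strings):
        //
        //     BSONObj mrCmd = BSON( "mapreduce" << "events"
        //                        << "map"     << "function() { emit( this.user , 1 ); }"
        //                        << "reduce"  << "function( k , vals ) { var n = 0; for ( var i = 0; i < vals.length; i++ ) n += vals[i]; return n; }"
        //                        << "query"   << BSON( "type" << "click" )
        //                        << "out"     << BSON( "merge" << "clicks_by_user" << "db" << "reports" )
        //                        << "verbose" << true );
        //
        // Other forms of "out" parsed above are a bare collection name or
        // { replace : ... } (REPLACE), { reduce : ... } (REDUCE), and { inline : 1 }
        // for purely in-memory results returned in the command reply.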
+ * Makes sure (key, value) tuple is formatted as {_id: key, value: val} + */ + void State::appendResults( BSONObjBuilder& final ) { + if ( _onDisk ) + return; + + uassert( 13604 , "too much data for in memory map/reduce" , _size < ( BSONObjMaxUserSize / 2 ) ); + + BSONArrayBuilder b( (int)(_size * 1.2) ); // _size is data size, doesn't count overhead and keys + + for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) { + BSONObj key = i->first; + BSONList& all = i->second; + + assert( all.size() == 1 ); + + BSONObjIterator vi( all[0] ); + vi.next(); + + BSONObjBuilder temp( b.subobjStart() ); + temp.appendAs( key.firstElement() , "_id" ); + temp.appendAs( vi.next() , "value" ); + temp.done(); + } + + BSONArray res = b.arr(); + uassert( 13605 , "too much data for in memory map/reduce" , res.objsize() < ( BSONObjMaxUserSize * 2 / 3 ) ); + + final.append( "results" , res ); + } + + /** + * Does post processing on output collection. + * This may involve replacing, merging or reducing. + */ + long long State::postProcessCollection() { + if ( _onDisk == false || _config.outType == Config::INMEMORY ) + return _temp->size(); + + dblock lock; + + if ( _config.finalLong == _config.tempLong ) + return _db.count( _config.finalLong ); + + if ( _config.outType == Config::REPLACE || _db.count( _config.finalLong ) == 0 ) { + // replace: just rename from temp to final collection name, dropping previous collection + _db.dropCollection( _config.finalLong ); + BSONObj info; + uassert( 10076 , "rename failed" , + _db.runCommand( "admin" , BSON( "renameCollection" << _config.tempLong << "to" << _config.finalLong ) , info ) ); + _db.dropCollection( _config.tempLong ); + } + else if ( _config.outType == Config::MERGE ) { + // merge: upsert new docs into old collection + auto_ptr cursor = _db.query( _config.tempLong , BSONObj() ); + while ( cursor->more() ) { + BSONObj o = cursor->next(); + Helpers::upsert( _config.finalLong , o ); + getDur().commitIfNeeded(); + } + _db.dropCollection( _config.tempLong ); + } + else if ( _config.outType == Config::REDUCE ) { + // reduce: apply reduce op on new result and existing one + BSONList values; + + auto_ptr cursor = _db.query( _config.tempLong , BSONObj() ); + while ( cursor->more() ) { + BSONObj temp = cursor->next(); + BSONObj old; + + bool found; + { + Client::Context tx( _config.finalLong ); + found = Helpers::findOne( _config.finalLong.c_str() , temp["_id"].wrap() , old , true ); + } + + if ( found ) { + // need to reduce + values.clear(); + values.push_back( temp ); + values.push_back( old ); + Helpers::upsert( _config.finalLong , _config.reducer->finalReduce( values , _config.finalizer.get() ) ); + } + else { + Helpers::upsert( _config.finalLong , temp ); + } + getDur().commitIfNeeded(); + } + _db.dropCollection( _config.tempLong ); + } + + return _db.count( _config.finalLong ); + } + + /** + * Insert doc in collection + */ + void State::insert( const string& ns , BSONObj& o ) { + assert( _onDisk ); + + writelock l( ns ); + Client::Context ctx( ns ); + + theDataFileMgr.insertAndLog( ns.c_str() , o , false ); + } + + /** + * Insert doc into the inc collection + */ + void State::_insertToInc( BSONObj& o ) { + assert( _onDisk ); + theDataFileMgr.insertWithObjMod( _config.incLong.c_str() , o , true ); + getDur().commitIfNeeded(); + } + + State::State( const Config& c ) : _config( c ), _size(0), _numEmits(0) { + _temp.reset( new InMemory() ); + _onDisk = _config.outType != Config::INMEMORY; + } + + bool State::sourceExists() { + return _db.exists( _config.ns 
); + } + + long long State::incomingDocuments() { + return _db.count( _config.ns , _config.filter , QueryOption_SlaveOk , (unsigned) _config.limit ); + } + + State::~State() { + if ( _onDisk ) { + try { + _db.dropCollection( _config.tempLong ); + _db.dropCollection( _config.incLong ); + } + catch ( std::exception& e ) { + error() << "couldn't cleanup after map reduce: " << e.what() << endl; + } + } + } + + /** + * Initialize the mapreduce operation, creating the inc collection + */ + void State::init() { + // setup js + _scope.reset(globalScriptEngine->getPooledScope( _config.dbname ).release() ); + _scope->localConnect( _config.dbname.c_str() ); + + if ( ! _config.scopeSetup.isEmpty() ) + _scope->init( &_config.scopeSetup ); + + _config.mapper->init( this ); + _config.reducer->init( this ); + if ( _config.finalizer ) + _config.finalizer->init( this ); + + _scope->injectNative( "emit" , fast_emit ); + + if ( _onDisk ) { + // clear temp collections + _db.dropCollection( _config.tempLong ); + _db.dropCollection( _config.incLong ); + + // create the inc collection and make sure we have index on "0" key + { + writelock l( _config.incLong ); + Client::Context ctx( _config.incLong ); + string err; + if ( ! userCreateNS( _config.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ) { + uasserted( 13631 , str::stream() << "userCreateNS failed for mr incLong ns: " << _config.incLong << " err: " << err ); + } + } + + BSONObj sortKey = BSON( "0" << 1 ); + _db.ensureIndex( _config.incLong , sortKey ); + + } + + } + + /** + * Applies last reduce and finalize on a list of tuples (key, val) + * Inserts single result {_id: key, value: val} into temp collection + */ + void State::finalReduce( BSONList& values ) { + if ( !_onDisk || values.size() == 0 ) + return; + + BSONObj res = _config.reducer->finalReduce( values , _config.finalizer.get() ); + insert( _config.tempLong , res ); + } + + /** + * Applies last reduce and finalize. + * After calling this method, the temp collection will be completed. + * If inline, the results will be in the in memory map + */ + void State::finalReduce( CurOp * op , ProgressMeterHolder& pm ) { + if ( ! 
_onDisk ) { + // all data has already been reduced, just finalize + if ( _config.finalizer ) { + long size = 0; + for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) { + BSONObj key = i->first; + BSONList& all = i->second; + + assert( all.size() == 1 ); + + BSONObj res = _config.finalizer->finalize( all[0] ); + + all.clear(); + all.push_back( res ); + size += res.objsize(); + } + _size = size; + } + return; + } + + // use index on "0" to pull sorted data + assert( _temp->size() == 0 ); + BSONObj sortKey = BSON( "0" << 1 ); + { + bool foundIndex = false; + + auto_ptr idx = _db.getIndexes( _config.incLong ); + while ( idx.get() && idx->more() ) { + BSONObj x = idx->next(); + if ( sortKey.woCompare( x["key"].embeddedObject() ) == 0 ) { + foundIndex = true; + break; + } + } + + assert( foundIndex ); + } + + readlock rl( _config.incLong.c_str() ); + Client::Context ctx( _config.incLong ); + + BSONObj prev; + BSONList all; + + assert( pm == op->setMessage( "m/r: (3/3) final reduce to collection" , _db.count( _config.incLong, BSONObj(), QueryOption_SlaveOk ) ) ); + + shared_ptr temp = bestGuessCursor( _config.incLong.c_str() , BSONObj() , sortKey ); + auto_ptr cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , _config.incLong.c_str() ) ); + + // iterate over all sorted objects + while ( cursor->ok() ) { + BSONObj o = cursor->current().getOwned(); + cursor->advance(); + + pm.hit(); + + if ( o.woSortOrder( prev , sortKey ) == 0 ) { + // object is same as previous, add to array + all.push_back( o ); + if ( pm->hits() % 1000 == 0 ) { + if ( ! cursor->yield() ) { + cursor.release(); + break; + } + killCurrentOp.checkForInterrupt(); + } + continue; + } + + ClientCursor::YieldLock yield (cursor.get()); + // reduce an finalize array + finalReduce( all ); + + all.clear(); + prev = o; + all.push_back( o ); + + if ( ! yield.stillOk() ) { + cursor.release(); + break; + } + + killCurrentOp.checkForInterrupt(); + } + + // we need to release here since we temp release below + cursor.release(); + + { + dbtempreleasecond tl; + if ( ! tl.unlocked() ) + log( LL_WARNING ) << "map/reduce can't temp release" << endl; + // reduce and finalize last array + finalReduce( all ); + } + + pm.finished(); + } + + /** + * Attempts to reduce objects in the memory map. + * A new memory map will be created to hold the results. + * If applicable, objects with unique key may be dumped to inc collection. + * Input and output objects are both {"0": key, "1": val} + */ + void State::reduceInMemory() { + + auto_ptr n( new InMemory() ); // for new data + long nSize = 0; + long dupCount = 0; + + for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) { + BSONObj key = i->first; + BSONList& all = i->second; + + if ( all.size() == 1 ) { + // only 1 value for this key + if ( _onDisk ) { + // this key has low cardinality, so just write to collection + writelock l(_config.incLong); + Client::Context ctx(_config.incLong.c_str()); + _insertToInc( *(all.begin()) ); + } + else { + // add to new map + _add( n.get() , all[0] , nSize, dupCount ); + } + } + else if ( all.size() > 1 ) { + // several values, reduce and add to map + BSONObj res = _config.reducer->reduce( all ); + _add( n.get() , res , nSize, dupCount ); + } + } + + // swap maps + _temp.reset( n.release() ); + _size = nSize; + _dupCount = dupCount; + } + + /** + * Dumps the entire in memory map to the inc collection. + */ + void State::dumpToInc() { + if ( ! 
_onDisk ) + return; + + writelock l(_config.incLong); + Client::Context ctx(_config.incLong); + + for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ) { + BSONList& all = i->second; + if ( all.size() < 1 ) + continue; + + for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ ) + _insertToInc( *j ); + } + _temp->clear(); + _size = 0; + + } + + /** + * Adds object to in memory map + */ + void State::emit( const BSONObj& a ) { + _numEmits++; + _add( _temp.get() , a , _size, _dupCount ); + } + + void State::_add( InMemory* im, const BSONObj& a , long& size, long& dupCount ) { + BSONList& all = (*im)[a]; + all.push_back( a ); + size += a.objsize() + 16; + if (all.size() > 1) + ++dupCount; + } + + /** + * this method checks the size of in memory map and potentially flushes to disk + */ + void State::checkSize() { + if ( _size < 1024 * 50 ) + return; + + // attempt to reduce in memory map, if we've seen duplicates + if ( _dupCount > 0) { + long before = _size; + reduceInMemory(); + log(1) << " mr: did reduceInMemory " << before << " -->> " << _size << endl; + } + + if ( ! _onDisk || _size < 1024 * 100 ) + return; + + dumpToInc(); + log(1) << " mr: dumping to db" << endl; + } + + boost::thread_specific_ptr _tl; + + /** + * emit that will be called by js function + */ + BSONObj fast_emit( const BSONObj& args ) { + uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 ); + uassert( 13069 , "an emit can't be more than half max bson size" , args.objsize() < ( BSONObjMaxUserSize / 2 ) ); + (*_tl)->emit( args ); + return BSONObj(); + } + + /** + * This class represents a map/reduce command executed on a single server + */ + class MapReduceCommand : public Command { + public: + MapReduceCommand() : Command("mapReduce", false, "mapreduce") {} + virtual bool slaveOk() const { return !replSet; } + virtual bool slaveOverrideOk() { return true; } + + virtual void help( stringstream &help ) const { + help << "Run a map/reduce operation on the server.\n"; + help << "Note this is used for aggregation, not querying, in MongoDB.\n"; + help << "http://www.mongodb.org/display/DOCS/MapReduce"; + } + virtual LockType locktype() const { return NONE; } + bool run(const string& dbname , BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + Timer t; + Client::GodScope cg; + Client& client = cc(); + CurOp * op = client.curop(); + + Config config( dbname , cmd ); + + log(1) << "mr ns: " << config.ns << endl; + + bool shouldHaveData = false; + + long long num = 0; + long long inReduce = 0; + + BSONObjBuilder countsBuilder; + BSONObjBuilder timingBuilder; + State state( config ); + + if ( ! 
state.sourceExists() ) { + errmsg = "ns doesn't exist"; + return false; + } + + if (replSet && state.isOnDisk()) { + // this means that it will be doing a write operation, make sure we are on Master + // ideally this check should be in slaveOk(), but at that point config is not known + if (!isMaster(dbname.c_str())) { + errmsg = "not master"; + return false; + } + } + + try { + state.init(); + + { + State** s = new State*(); + s[0] = &state; + _tl.reset( s ); + } + + wassert( config.limit < 0x4000000 ); // see case on next line to 32 bit unsigned + ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , state.incomingDocuments() ) ); + long long mapTime = 0; + { + readlock lock( config.ns ); + Client::Context ctx( config.ns ); + + ShardChunkManagerPtr chunkManager; + if ( shardingState.needShardChunkManager( config.ns ) ) { + chunkManager = shardingState.getShardChunkManager( config.ns ); + } + + // obtain cursor on data to apply mr to, sorted + shared_ptr temp = bestGuessCursor( config.ns.c_str(), config.filter, config.sort ); + auto_ptr cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , config.ns.c_str() ) ); + + Timer mt; + // go through each doc + while ( cursor->ok() ) { + // make sure we dont process duplicates in case data gets moved around during map + if ( cursor->currentIsDup() ) { + cursor->advance(); + continue; + } + + if ( ! cursor->currentMatches() ) { + cursor->advance(); + continue; + } + + BSONObj o = cursor->current(); + cursor->advance(); + + // check to see if this is a new object we don't own yet + // because of a chunk migration + if ( chunkManager && ! chunkManager->belongsToMe( o ) ) + continue; + + // do map + if ( config.verbose ) mt.reset(); + config.mapper->map( o ); + if ( config.verbose ) mapTime += mt.micros(); + + num++; + if ( num % 100 == 0 ) { + // try to yield lock regularly + ClientCursor::YieldLock yield (cursor.get()); + Timer t; + // check if map needs to be dumped to disk + state.checkSize(); + inReduce += t.micros(); + + if ( ! yield.stillOk() ) { + cursor.release(); + break; + } + + killCurrentOp.checkForInterrupt(); + } + pm.hit(); + + if ( config.limit && num >= config.limit ) + break; + } + } + pm.finished(); + + killCurrentOp.checkForInterrupt(); + // update counters + countsBuilder.appendNumber( "input" , num ); + countsBuilder.appendNumber( "emit" , state.numEmits() ); + if ( state.numEmits() ) + shouldHaveData = true; + + timingBuilder.append( "mapTime" , mapTime / 1000 ); + timingBuilder.append( "emitLoop" , t.millis() ); + + op->setMessage( "m/r: (2/3) final reduce in memory" ); + // do reduce in memory + // this will be the last reduce needed for inline mode + state.reduceInMemory(); + // if not inline: dump the in memory map to inc collection, all data is on disk + state.dumpToInc(); + state.prepTempCollection(); + // final reduce + state.finalReduce( op , pm ); + + _tl.reset(); + } + catch ( ... 
) { + log() << "mr failed, removing collection" << endl; + throw; + } + + long long finalCount = state.postProcessCollection(); + state.appendResults( result ); + + timingBuilder.append( "total" , t.millis() ); + + if (!config.outDB.empty()) { + BSONObjBuilder loc; + if ( !config.outDB.empty()) + loc.append( "db" , config.outDB ); + if ( !config.finalShort.empty() ) + loc.append( "collection" , config.finalShort ); + result.append("result", loc.obj()); + } + else { + if ( !config.finalShort.empty() ) + result.append( "result" , config.finalShort ); + } + result.append( "timeMillis" , t.millis() ); + countsBuilder.appendNumber( "output" , finalCount ); + if ( config.verbose ) result.append( "timing" , timingBuilder.obj() ); + result.append( "counts" , countsBuilder.obj() ); + + if ( finalCount == 0 && shouldHaveData ) { + result.append( "cmd" , cmd ); + errmsg = "there were emits but no data!"; + return false; + } + + return true; + } + + } mapReduceCommand; + + /** + * This class represents a map/reduce command executed on the output server of a sharded env + */ + class MapReduceFinishCommand : public Command { + public: + MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ) {} + virtual bool slaveOk() const { return !replSet; } + virtual bool slaveOverrideOk() { return true; } + + virtual LockType locktype() const { return NONE; } + bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe(); + + Config config( dbname , cmdObj.firstElement().embeddedObjectUserCheck() ); + config.incLong = config.tempLong; + + set servers; + + BSONObjBuilder shardCounts; + map counts; + + BSONObj shards = cmdObj["shards"].embeddedObjectUserCheck(); + vector< auto_ptr > shardCursors; + + { + // parse per shard results + BSONObjIterator i( shards ); + while ( i.more() ) { + BSONElement e = i.next(); + string shard = e.fieldName(); + + BSONObj res = e.embeddedObjectUserCheck(); + + uassert( 10078 , "something bad happened" , shardedOutputCollection == res["result"].valuestrsafe() ); + servers.insert( shard ); + shardCounts.appendAs( res["counts"] , shard ); + + BSONObjIterator j( res["counts"].embeddedObjectUserCheck() ); + while ( j.more() ) { + BSONElement temp = j.next(); + counts[temp.fieldName()] += temp.numberLong(); + } + + } + + } + + State state(config); + state.prepTempCollection(); + + { + // reduce from each stream + + BSONObj sortKey = BSON( "_id" << 1 ); + + ParallelSortClusteredCursor cursor( servers , dbname + "." 
+ shardedOutputCollection , + Query().sort( sortKey ) ); + cursor.init(); + state.init(); + + BSONList values; + if (!config.outDB.empty()) { + BSONObjBuilder loc; + if ( !config.outDB.empty()) + loc.append( "db" , config.outDB ); + if ( !config.finalShort.empty() ) + loc.append( "collection" , config.finalShort ); + result.append("result", loc.obj()); + } + else { + if ( !config.finalShort.empty() ) + result.append( "result" , config.finalShort ); + } + + while ( cursor.more() ) { + BSONObj t = cursor.next().getOwned(); + + if ( values.size() == 0 ) { + values.push_back( t ); + continue; + } + + if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) { + values.push_back( t ); + continue; + } + + + state.emit( config.reducer->finalReduce( values , config.finalizer.get() ) ); + values.clear(); + values.push_back( t ); + } + + if ( values.size() ) + state.emit( config.reducer->finalReduce( values , config.finalizer.get() ) ); + } + + + state.dumpToInc(); + state.postProcessCollection(); + state.appendResults( result ); + + for ( set::iterator i=servers.begin(); i!=servers.end(); i++ ) { + ScopedDbConnection conn( i->_server ); + conn->dropCollection( dbname + "." + shardedOutputCollection ); + conn.done(); + } + + result.append( "shardCounts" , shardCounts.obj() ); + + { + BSONObjBuilder c; + for ( map::iterator i=counts.begin(); i!=counts.end(); i++ ) { + c.append( i->first , i->second ); + } + result.append( "counts" , c.obj() ); + } + + return 1; + } + } mapReduceFinishCommand; + + } + +} + diff --git a/db/commands/mr.h b/db/commands/mr.h new file mode 100644 index 0000000..f505a45 --- /dev/null +++ b/db/commands/mr.h @@ -0,0 +1,291 @@ +// mr.h + +/** + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#pragma once + +#include "pch.h" + +namespace mongo { + + namespace mr { + + typedef vector BSONList; + + class State; + + // ------------ function interfaces ----------- + + class Mapper : boost::noncopyable { + public: + virtual ~Mapper() {} + virtual void init( State * state ) = 0; + + virtual void map( const BSONObj& o ) = 0; + }; + + class Finalizer : boost::noncopyable { + public: + virtual ~Finalizer() {} + virtual void init( State * state ) = 0; + + /** + * this takes a tuple and returns a tuple + */ + virtual BSONObj finalize( const BSONObj& tuple ) = 0; + }; + + class Reducer : boost::noncopyable { + public: + virtual ~Reducer() {} + virtual void init( State * state ) = 0; + + virtual BSONObj reduce( const BSONList& tuples ) = 0; + /** this means its a final reduce, even if there is no finalizer */ + virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ) = 0; + }; + + // ------------ js function implementations ----------- + + /** + * used as a holder for Scope and ScriptingFunction + * visitor like pattern as Scope is gotten from first access + */ + class JSFunction : boost::noncopyable { + public: + /** + * @param type (map|reduce|finalize) + */ + JSFunction( string type , const BSONElement& e ); + virtual ~JSFunction() {} + + virtual void init( State * state ); + + Scope * scope() const { return _scope; } + ScriptingFunction func() const { return _func; } + + private: + string _type; + string _code; // actual javascript code + BSONObj _wantedScope; // this is for CodeWScope + + Scope * _scope; // this is not owned by us, and might be shared + ScriptingFunction _func; + }; + + class JSMapper : public Mapper { + public: + JSMapper( const BSONElement & code ) : _func( "map" , code ) {} + virtual void map( const BSONObj& o ); + virtual void init( State * state ); + + private: + JSFunction _func; + BSONObj _params; + }; + + class JSReducer : public Reducer { + public: + JSReducer( const BSONElement& code ) : _func( "reduce" , code ) {} + virtual void init( State * state ) { _func.init( state ); } + + virtual BSONObj reduce( const BSONList& tuples ); + virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ); + + private: + + /** + * result in "return" + * @param key OUT + * @param endSizeEstimate OUT + */ + void _reduce( const BSONList& values , BSONObj& key , int& endSizeEstimate ); + + JSFunction _func; + + }; + + class JSFinalizer : public Finalizer { + public: + JSFinalizer( const BSONElement& code ) : _func( "finalize" , code ) {} + virtual BSONObj finalize( const BSONObj& o ); + virtual void init( State * state ) { _func.init( state ); } + private: + JSFunction _func; + + }; + + // ----------------- + + + class TupleKeyCmp { + public: + TupleKeyCmp() {} + bool operator()( const BSONObj &l, const BSONObj &r ) const { + return l.firstElement().woCompare( r.firstElement() ) < 0; + } + }; + + typedef map< BSONObj,BSONList,TupleKeyCmp > InMemory; // from key to list of tuples + + /** + * holds map/reduce config information + */ + class Config { + public: + Config( const string& _dbname , const BSONObj& cmdObj ); + + string dbname; + string ns; + + // options + bool verbose; + + // query options + + BSONObj filter; + BSONObj sort; + long long limit; + + // functions + + scoped_ptr mapper; + scoped_ptr reducer; + scoped_ptr finalizer; + + BSONObj mapParams; + BSONObj scopeSetup; + + // output tables + string incLong; + string tempLong; + + string finalShort; + string finalLong; + + string outDB; + + enum { REPLACE , // atomically 
replace the collection + MERGE , // merge keys, override dups + REDUCE , // merge keys, reduce dups + INMEMORY // only store in memory, limited in size + } outType; + + static AtomicUInt JOB_NUMBER; + }; // end MRsetup + + /** + * stores information about intermediate map reduce state + * controls flow of data from map->reduce->finalize->output + */ + class State { + public: + State( const Config& c ); + ~State(); + + void init(); + + // ---- prep ----- + bool sourceExists(); + + long long incomingDocuments(); + + // ---- map stage ---- + + /** + * stages on in in-memory storage + */ + void emit( const BSONObj& a ); + + /** + * if size is big, run a reduce + * if its still big, dump to temp collection + */ + void checkSize(); + + /** + * run reduce on _temp + */ + void reduceInMemory(); + + /** + * transfers in memory storage to temp collection + */ + void dumpToInc(); + + // ------ reduce stage ----------- + + void prepTempCollection(); + + void finalReduce( BSONList& values ); + + void finalReduce( CurOp * op , ProgressMeterHolder& pm ); + + // ------- cleanup/data positioning ---------- + + /** + @return number objects in collection + */ + long long postProcessCollection(); + + /** + * if INMEMORY will append + * may also append stats or anything else it likes + */ + void appendResults( BSONObjBuilder& b ); + + // -------- util ------------ + + /** + * inserts with correct replication semantics + */ + void insert( const string& ns , BSONObj& o ); + + // ------ simple accessors ----- + + /** State maintains ownership, do no use past State lifetime */ + Scope* scope() { return _scope.get(); } + + const Config& config() { return _config; } + + const bool isOnDisk() { return _onDisk; } + + long long numEmits() const { return _numEmits; } + + protected: + + void _insertToInc( BSONObj& o ); + static void _add( InMemory* im , const BSONObj& a , long& size, long& dupCount ); + + scoped_ptr _scope; + const Config& _config; + bool _onDisk; // if the end result of this map reduce is disk or not + + DBDirectClient _db; + + scoped_ptr _temp; + long _size; // bytes in _temp + long _dupCount; // number of duplicate key entries + + long long _numEmits; + }; + + BSONObj fast_emit( const BSONObj& args ); + + } // end mr namespace +} + + diff --git a/db/common.cpp b/db/common.cpp index b7883f5..44bc54d 100644 --- a/db/common.cpp +++ b/db/common.cpp @@ -26,4 +26,8 @@ namespace mongo { /* we use new here so we don't have to worry about destructor orders at program shutdown */ MongoMutex &dbMutex( *(new MongoMutex("rw:dbMutex")) ); + MongoMutex::MongoMutex(const char *name) : _m(name) { + _remapPrivateViewRequested = false; + } + } diff --git a/db/compact.cpp b/db/compact.cpp new file mode 100644 index 0000000..6bafd91 --- /dev/null +++ b/db/compact.cpp @@ -0,0 +1,199 @@ +/* @file compact.cpp + compaction of deleted space in pdfiles (datafiles) +*/ + +/* NOTE 6Oct2010 : this file PRELIMINARY, EXPERIMENTAL, NOT DONE, NOT USED YET (not in SConstruct) */ + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful,b +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. 
+* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#include "pch.h" +#include "pdfile.h" +#include "concurrency.h" +#include "commands.h" +#include "curop-inl.h" +#include "../util/concurrency/task.h" + +namespace mongo { + + class CompactJob : public task::Task { + public: + CompactJob(string ns) : _ns(ns) { } + private: + virtual string name() const { return "compact"; } + virtual void doWork(); + NamespaceDetails * beginBlock(); + void doBatch(); + void prep(); + const string _ns; + unsigned long long _nrecords; + unsigned long long _ncompacted; + DiskLoc _firstExtent; + }; + + // lock & set context first. this checks that collection still exists, and that it hasn't + // morphed into a capped collection between locks (which is possible) + NamespaceDetails * CompactJob::beginBlock() { + NamespaceDetails *nsd = nsdetails(_ns.c_str()); + if( nsd == 0 ) throw "ns no longer present"; + if( nsd->firstExtent.isNull() ) + throw "no first extent"; + if( nsd->capped ) + throw "capped collection"; + return nsd; + } + + void CompactJob::doBatch() { + unsigned n = 0; + { + /* pre-touch records in a read lock so that paging happens in read not write lock. + note we are only touching the records though; if indexes aren't in RAM, they will + page later. So the concept is only partial. + */ + readlock lk; + Timer t; + Client::Context ctx(_ns); + NamespaceDetails *nsd = beginBlock(); + if( nsd->firstExtent != _firstExtent ) { + // TEMP DEV - stop after 1st extent + throw "change of first extent"; + } + DiskLoc loc = nsd->firstExtent.ext()->firstRecord; + while( !loc.isNull() ) { + Record *r = loc.rec(); + loc = r->getNext(loc); + if( ++n >= 100 || (n % 8 == 0 && t.millis() > 50) ) + break; + } + } + { + writelock lk; + Client::Context ctx(_ns); + NamespaceDetails *nsd = beginBlock(); + for( unsigned i = 0; i < n; i++ ) { + if( nsd->firstExtent != _firstExtent ) { + // TEMP DEV - stop after 1st extent + throw "change of first extent (or it is now null)"; + } + DiskLoc loc = nsd->firstExtent.ext()->firstRecord; + Record *rec = loc.rec(); + BSONObj o = loc.obj().getOwned(); // todo: inefficient, double mem copy... + try { + theDataFileMgr.deleteRecord(_ns.c_str(), rec, loc, false); + } + catch(DBException&) { throw "error deleting record"; } + try { + theDataFileMgr.insertNoReturnVal(_ns.c_str(), o); + } + catch(DBException&) { + /* todo: save the record somehow??? try again with 'avoid' logic? */ + log() << "compact: error re-inserting record ns:" << _ns << " n:" << _nrecords << " _id:" << o["_id"].toString() << endl; + throw "error re-inserting record"; + } + ++_ncompacted; + if( killCurrentOp.globalInterruptCheck() ) + throw "interrupted"; + } + } + } + + void CompactJob::prep() { + readlock lk; + Client::Context ctx(_ns); + NamespaceDetails *nsd = beginBlock(); + DiskLoc L = nsd->firstExtent; + assert( !L.isNull() ); + _firstExtent = L; + _nrecords = nsd->stats.nrecords; + _ncompacted = 0; + } + + static mutex m("compact"); + static volatile bool running; + + void CompactJob::doWork() { + Client::initThread("compact"); + cc().curop()->reset(); + cc().curop()->setNS(_ns.c_str()); + cc().curop()->markCommand(); + sleepsecs(60); + try { + prep(); + while( _ncompacted < _nrecords ) + doBatch(); + } + catch(const char *p) { + log() << "info: exception compact " << p << endl; + } + catch(...) 
{ + log() << "info: exception compact" << endl; + } + mongo::running = false; + cc().shutdown(); + } + + /* --- CompactCmd --- */ + + class CompactCmd : public Command { + public: + virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string coll = cmdObj.firstElement().valuestr(); + if( coll.empty() || db.empty() ) { + errmsg = "no collection name specified"; + return false; + } + string ns = db + '.' + coll; + assert( isANormalNSName(ns.c_str()) ); + { + readlock lk; + Client::Context ctx(ns); + if( nsdetails(ns.c_str()) == 0 ) { + errmsg = "namespace " + ns + " does not exist"; + return false; + } + } + { + scoped_lock lk(m); + if( running ) { + errmsg = "a compaction is already running"; + return false; + } + running = true; + task::fork( new CompactJob(ns) ); + return true; + } + errmsg = "not done"; + return false; + } + + virtual LockType locktype() const { return NONE; } + virtual bool adminOnly() const { return false; } + virtual bool slaveOk() const { return true; } + virtual bool logTheOp() { return false; } + virtual void help( stringstream& help ) const { + help << "compact / defragment a collection in the background, slowly, attempting to minimize disruptions to other operations\n" + "{ compact : }"; + } + virtual bool requiresAuth() { return true; } + + /** @param webUI expose the command in the web ui as localhost:28017/ + @param oldName an optional old, deprecated name for the command + */ + CompactCmd() : Command("compact") { } + }; + static CompactCmd compactCmd; + +} diff --git a/db/concurrency.h b/db/concurrency.h index 9b91b0f..39cd853 100644 --- a/db/concurrency.h +++ b/db/concurrency.h @@ -1,3 +1,5 @@ +// @file concurrency.h + /* * Copyright (C) 2010 10gen Inc. * @@ -14,9 +16,7 @@ * along with this program. If not, see . */ -/* concurrency.h - - mongod concurrency rules & notes will be placed here. +/*mongod concurrency rules & notes will be placed here. Mutex heirarchy (1 = "leaf") name level @@ -31,19 +31,22 @@ #include "../util/concurrency/rwlock.h" #include "../util/mmap.h" +#include "../util/time_support.h" namespace mongo { string sayClientState(); bool haveClient(); - - void curopWaitingForLock( int type ); - void curopGotLock(); + + class Client; + Client* curopWaitingForLock( int type ); + void curopGotLock(Client*); /* mutex time stats */ class MutexInfo { - unsigned long long start, enter, timeLocked; // all in microseconds + unsigned long long enter, timeLocked; // microseconds int locked; + unsigned long long start; // last as we touch this least often public: MutexInfo() : timeLocked(0) , locked(0) { @@ -61,215 +64,53 @@ namespace mongo { if ( locked == 0 ) timeLocked += curTimeMicros64() - enter; } - int isLocked() const { - return locked; - } + int isLocked() const { return locked; } void getTimingInfo(unsigned long long &s, unsigned long long &tl) const { s = start; tl = timeLocked; } - unsigned long long getTimeLocked() const { - return timeLocked; - } + unsigned long long getTimeLocked() const { return timeLocked; } }; - class MongoMutex { - MutexInfo _minfo; - RWLock _m; - ThreadLocalValue _state; - - /* we use a separate TLS value for releasedEarly - that is ok as - our normal/common code path, we never even touch it. 
- */ - ThreadLocalValue _releasedEarly; - public: - MongoMutex(const char * name) : _m(name) { } - - /** - * @return - * > 0 write lock - * = 0 no lock - * < 0 read lock - */ - int getState() { return _state.get(); } - bool isWriteLocked() { return getState() > 0; } - void assertWriteLocked() { - assert( getState() > 0 ); - DEV assert( !_releasedEarly.get() ); - } - bool atLeastReadLocked() { return _state.get() != 0; } - void assertAtLeastReadLocked() { assert(atLeastReadLocked()); } - - bool _checkWriteLockAlready(){ - //DEV cout << "LOCK" << endl; - DEV assert( haveClient() ); - - int s = _state.get(); - if( s > 0 ) { - _state.set(s+1); - return true; - } - - massert( 10293 , (string)"internal error: locks are not upgradeable: " + sayClientState() , s == 0 ); - - return false; - } - - void lock() { - if ( _checkWriteLockAlready() ) - return; - - _state.set(1); - - curopWaitingForLock( 1 ); - _m.lock(); - curopGotLock(); - - _minfo.entered(); - - MongoFile::lockAll(); - } - - bool lock_try( int millis ) { - if ( _checkWriteLockAlready() ) - return true; - - curopWaitingForLock( 1 ); - bool got = _m.lock_try( millis ); - curopGotLock(); - - if ( got ){ - _minfo.entered(); - _state.set(1); - MongoFile::lockAll(); - } - - return got; - } - - - void unlock() { - //DEV cout << "UNLOCK" << endl; - int s = _state.get(); - if( s > 1 ) { - _state.set(s-1); - return; - } - if( s != 1 ) { - if( _releasedEarly.get() ) { - _releasedEarly.set(false); - return; - } - massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false); - } - - MongoFile::unlockAll(); - - _state.set(0); - _minfo.leaving(); - _m.unlock(); - } - - /* unlock (write lock), and when unlock() is called later, - be smart then and don't unlock it again. - */ - void releaseEarly() { - assert( getState() == 1 ); // must not be recursive - assert( !_releasedEarly.get() ); - _releasedEarly.set(true); - unlock(); - } - - void lock_shared() { - //DEV cout << " LOCKSHARED" << endl; - int s = _state.get(); - if( s ) { - if( s > 0 ) { - // already in write lock - just be recursive and stay write locked - _state.set(s+1); - return; - } - else { - // already in read lock - recurse - _state.set(s-1); - return; - } - } - _state.set(-1); - curopWaitingForLock( -1 ); - _m.lock_shared(); - curopGotLock(); - } - - bool lock_shared_try( int millis ) { - int s = _state.get(); - if ( s ){ - // we already have a lock, so no need to try - lock_shared(); - return true; - } +} - bool got = _m.lock_shared_try( millis ); - if ( got ) - _state.set(-1); - return got; - } - - void unlock_shared() { - //DEV cout << " UNLOCKSHARED" << endl; - int s = _state.get(); - if( s > 0 ) { - assert( s > 1 ); /* we must have done a lock write first to have s > 1 */ - _state.set(s-1); - return; - } - if( s < -1 ) { - _state.set(s+1); - return; - } - assert( s == -1 ); - _state.set(0); - _m.unlock_shared(); - } - - MutexInfo& info() { return _minfo; } - }; +#include "mongomutex.h" - extern MongoMutex &dbMutex; +namespace mongo { inline void dbunlocking_write() { } inline void dbunlocking_read() { } struct writelock { - writelock(const string& ns) { - dbMutex.lock(); - } - ~writelock() { + writelock() { dbMutex.lock(); } + writelock(const string& ns) { dbMutex.lock(); } + ~writelock() { DESTRUCTOR_GUARD( dbunlocking_write(); dbMutex.unlock(); ); } }; - + struct readlock { readlock(const string& ns) { dbMutex.lock_shared(); } - ~readlock() { + readlock() { dbMutex.lock_shared(); } + ~readlock() { DESTRUCTOR_GUARD( dbunlocking_read(); 
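        // Usage sketch of these RAII helpers (the same pattern State::insert in
        // db/commands/mr.cpp uses earlier in this patch; the namespace string and
        // document are illustrative):
        //
        //     {
        //         writelock l( "test.events" );          // exclusive dbMutex for the write
        //         Client::Context ctx( "test.events" );  // select the database context
        //         theDataFileMgr.insertAndLog( "test.events" , doc , false );
        //     }   // ~writelock releases dbMutex inside DESTRUCTOR_GUARD
        //
        //     {
        //         readlock r( "test.events" );           // shared dbMutex for reads
        //         Client::Context ctx( "test.events" );
        //         // ... cursor / count / findOne work ...
        //     }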
dbMutex.unlock_shared(); ); } - }; + }; struct readlocktry { - readlocktry( const string&ns , int tryms ){ + readlocktry( const string&ns , int tryms ) { _got = dbMutex.lock_shared_try( tryms ); } ~readlocktry() { - if ( _got ){ + if ( _got ) { dbunlocking_read(); dbMutex.unlock_shared(); } @@ -280,11 +121,11 @@ namespace mongo { }; struct writelocktry { - writelocktry( const string&ns , int tryms ){ + writelocktry( const string&ns , int tryms ) { _got = dbMutex.lock_try( tryms ); } ~writelocktry() { - if ( _got ){ + if ( _got ) { dbunlocking_read(); dbMutex.unlock(); } @@ -294,10 +135,10 @@ namespace mongo { bool _got; }; - struct readlocktryassert : public readlocktry { - readlocktryassert(const string& ns, int tryms) : - readlocktry(ns,tryms) { - uassert(13142, "timeout getting readlock", got()); + struct readlocktryassert : public readlocktry { + readlocktryassert(const string& ns, int tryms) : + readlocktry(ns,tryms) { + uassert(13142, "timeout getting readlock", got()); } }; @@ -305,12 +146,12 @@ namespace mongo { if you have a write lock, that's ok too. */ struct atleastreadlock { - atleastreadlock( const string& ns ){ + atleastreadlock( const string& ns ) { _prev = dbMutex.getState(); if ( _prev == 0 ) dbMutex.lock_shared(); } - ~atleastreadlock(){ + ~atleastreadlock() { if ( _prev == 0 ) dbMutex.unlock_shared(); } @@ -318,6 +159,9 @@ namespace mongo { int _prev; }; + /* parameterized choice of read or write locking + use readlock and writelock instead of this when statically known which you want + */ class mongolock { bool _writelock; public: @@ -328,27 +172,28 @@ namespace mongo { else dbMutex.lock_shared(); } - ~mongolock() { + ~mongolock() { DESTRUCTOR_GUARD( - if( _writelock ) { - dbunlocking_write(); - dbMutex.unlock(); - } else { - dbunlocking_read(); - dbMutex.unlock_shared(); - } + if( _writelock ) { + dbunlocking_write(); + dbMutex.unlock(); + } + else { + dbunlocking_read(); + dbMutex.unlock_shared(); + } ); } /* this unlocks, does NOT upgrade. that works for our current usage */ void releaseAndWriteLock(); }; - - /* use writelock and readlock instead */ + + /* deprecated - use writelock and readlock instead */ struct dblock : public writelock { dblock() : writelock("") { } }; - // eliminate + // eliminate this - we should just type "dbMutex.assertWriteLocked();" instead inline void assertInWriteLock() { dbMutex.assertWriteLocked(); } } diff --git a/db/curop-inl.h b/db/curop-inl.h new file mode 100644 index 0000000..21d6f0a --- /dev/null +++ b/db/curop-inl.h @@ -0,0 +1,42 @@ +// @file curop-inl.h + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "curop.h" + +namespace mongo { + + // todo : move more here + + inline CurOp::CurOp( Client * client , CurOp * wrapped ) { + _client = client; + _wrapped = wrapped; + if ( _wrapped ) + _client->_curOp = this; + _start = _checkpoint = 0; + _active = false; + _reset(); + _op = 0; + // These addresses should never be written to again. 
The zeroes are + // placed here as a precaution because currentOp may be accessed + // without the db mutex. + memset(_ns, 0, sizeof(_ns)); + } + +} diff --git a/db/curop.h b/db/curop.h index bf06a69..c6e949b 100644 --- a/db/curop.h +++ b/db/curop.h @@ -1,4 +1,5 @@ -// curop.h +// @file curop.h + /* * Copyright (C) 2010 10gen Inc. * @@ -18,152 +19,188 @@ #pragma once -#include "namespace.h" +#include "namespace-inl.h" #include "client.h" #include "../bson/util/atomic_int.h" +#include "../util/concurrency/spin_lock.h" +#include "../util/time_support.h" #include "db.h" +#include "../scripting/engine.h" -namespace mongo { +namespace mongo { /* lifespan is different than CurOp because of recursives with DBDirectClient */ class OpDebug { public: StringBuilder str; - - void reset(){ - str.reset(); - } + void reset() { str.reset(); } }; - - /* Current operation (for the current Client). - an embedded member of Client class, and typically used from within the mutex there. */ - class CurOp : boost::noncopyable { - static AtomicUInt _nextOpNum; + + /** + * stores a copy of a bson obj in a fixed size buffer + * if its too big for the buffer, says "too big" + * useful for keeping a copy around indefinitely without wasting a lot of space or doing malloc + */ + class CachedBSONObj { + public: + enum { TOO_BIG_SENTINEL = 1 } ; static BSONObj _tooBig; // { $msg : "query not recording (too large)" } - - Client * _client; - CurOp * _wrapped; - unsigned long long _start; - unsigned long long _checkpoint; - unsigned long long _end; + CachedBSONObj() { + _size = (int*)_buf; + reset(); + } - bool _active; - int _op; - bool _command; - int _lockType; // see concurrency.h for values - bool _waitingForLock; - int _dbprofile; // 0=off, 1=slow, 2=all - AtomicUInt _opNum; - char _ns[Namespace::MaxNsLen+2]; - struct SockAddr _remote; - char _queryBuf[256]; - - void resetQuery(int x=0) { *((int *)_queryBuf) = x; } - - OpDebug _debug; - - ThreadSafeString _message; - ProgressMeter _progressMeter; + void reset( int sz = 0 ) { + _lock.lock(); + _reset( sz ); + _lock.unlock(); + } + + void set( const BSONObj& o ) { + _lock.lock(); + try { + int sz = o.objsize(); + + if ( sz > (int) sizeof(_buf) ) { + _reset(TOO_BIG_SENTINEL); + } + else { + memcpy(_buf, o.objdata(), sz ); + } + + _lock.unlock(); + } + catch ( ... ) { + _lock.unlock(); + throw; + } - void _reset(){ - _command = false; - _lockType = 0; - _dbprofile = 0; - _end = 0; - _waitingForLock = false; - _message = ""; - _progressMeter.finished(); } - void setNS(const char *ns) { - strncpy(_ns, ns, Namespace::MaxNsLen); + int size() const { return *_size; } + bool have() const { return size() > 0; } + + BSONObj get() { + _lock.lock(); + BSONObj o; + try { + o = _get(); + _lock.unlock(); + } + catch ( ... ) { + _lock.unlock(); + throw; + } + return o; + } + + void append( BSONObjBuilder& b , const StringData& name ) { + _lock.lock(); + try { + BSONObj temp = _get(); + b.append( name , temp ); + _lock.unlock(); + } + catch ( ... ) { + _lock.unlock(); + throw; + } } + private: + /** you have to be locked when you call this */ + BSONObj _get() { + int sz = size(); + if ( sz == 0 ) + return BSONObj(); + if ( sz == TOO_BIG_SENTINEL ) + return _tooBig; + return BSONObj( _buf ).copy(); + } + + /** you have to be locked when you call this */ + void _reset( int sz ) { _size[0] = sz; } + + SpinLock _lock; + int * _size; + char _buf[512]; + }; + + /* Current operation (for the current Client). + an embedded member of Client class, and typically used from within the mutex there. 
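       A typical access pattern from the owning thread (illustrative) is
           cc().curop()->markCommand();
           cc().curop()->setMessage( "my phase" );
       while other threads only inspect the structure through the currentOp/killOp
       paths, which is why _ns and the cached query below are kept safe to read
       without the db mutex.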
+ */ + class CurOp : boost::noncopyable { public: - - int querySize() const { return *((int *) _queryBuf); } - bool haveQuery() const { return querySize() != 0; } + CurOp( Client * client , CurOp * wrapped = 0 ); + ~CurOp(); - BSONObj query( bool threadSafe = false); + bool haveQuery() const { return _query.have(); } + BSONObj query() { return _query.get(); } - void ensureStarted(){ + void ensureStarted() { if ( _start == 0 ) - _start = _checkpoint = curTimeMicros64(); + _start = _checkpoint = curTimeMicros64(); } - void enter( Client::Context * context ){ + void enter( Client::Context * context ) { ensureStarted(); setNS( context->ns() ); if ( context->_db && context->_db->profile > _dbprofile ) _dbprofile = context->_db->profile; } - void leave( Client::Context * context ){ + void leave( Client::Context * context ) { unsigned long long now = curTimeMicros64(); Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command ); _checkpoint = now; } - void reset(){ + void reset() { _reset(); _start = _checkpoint = 0; _active = true; _opNum = _nextOpNum++; _ns[0] = '?'; // just in case not set later _debug.reset(); - resetQuery(); + _query.reset(); } - + void reset( const SockAddr & remote, int op ) { reset(); _remote = remote; _op = op; } - - void markCommand(){ - _command = true; - } - void waitingForLock( int type ){ + void markCommand() { _command = true; } + + void waitingForLock( int type ) { _waitingForLock = true; if ( type > 0 ) _lockType = 1; else _lockType = -1; } - void gotLock(){ - _waitingForLock = false; - } - - OpDebug& debug(){ - return _debug; - } - - int profileLevel() const { - return _dbprofile; - } - - const char * getNS() const { - return _ns; - } + void gotLock() { _waitingForLock = false; } + OpDebug& debug() { return _debug; } + int profileLevel() const { return _dbprofile; } + const char * getNS() const { return _ns; } bool shouldDBProfile( int ms ) const { if ( _dbprofile <= 0 ) return false; - + return _dbprofile >= 2 || ms >= cmdLine.slowMS; } - + AtomicUInt opNum() const { return _opNum; } /** if this op is running */ bool active() const { return _active; } - + int getLockType() const { return _lockType; } - bool isWaitingForLock() const { return _waitingForLock; } + bool isWaitingForLock() const { return _waitingForLock; } int getOp() const { return _op; } - - + /** micros */ unsigned long long startTime() { ensureStarted(); @@ -174,75 +211,41 @@ namespace mongo { _active = false; _end = curTimeMicros64(); } - + unsigned long long totalTimeMicros() { massert( 12601 , "CurOp not marked done yet" , ! 
_active ); return _end - startTime(); } - int totalTimeMillis() { - return (int) (totalTimeMicros() / 1000); - } + int totalTimeMillis() { return (int) (totalTimeMicros() / 1000); } int elapsedMillis() { unsigned long long total = curTimeMicros64() - startTime(); return (int) (total / 1000); } - int elapsedSeconds() { - return elapsedMillis() / 1000; - } + int elapsedSeconds() { return elapsedMillis() / 1000; } - void setQuery(const BSONObj& query) { - if( query.objsize() > (int) sizeof(_queryBuf) ) { - resetQuery(1); // flag as too big and return - return; - } - memcpy(_queryBuf, query.objdata(), query.objsize()); - } + void setQuery(const BSONObj& query) { _query.set( query ); } - Client * getClient() const { - return _client; - } + Client * getClient() const { return _client; } - CurOp( Client * client , CurOp * wrapped = 0 ) { - _client = client; - _wrapped = wrapped; - if ( _wrapped ){ - _client->_curOp = this; - } - _start = _checkpoint = 0; - _active = false; - _reset(); - _op = 0; - // These addresses should never be written to again. The zeroes are - // placed here as a precaution because currentOp may be accessed - // without the db mutex. - memset(_ns, 0, sizeof(_ns)); - memset(_queryBuf, 0, sizeof(_queryBuf)); - } - - ~CurOp(); - - BSONObj info() { - if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) { + BSONObj info() { + if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) { BSONObjBuilder b; b.append("err", "unauthorized"); return b.obj(); } return infoNoauth(); } - - BSONObj infoNoauth( int attempt = 0 ); - string getRemoteString( bool includePort = true ){ - return _remote.toString(includePort); - } + BSONObj infoNoauth(); - ProgressMeter& setMessage( const char * msg , long long progressMeterTotal = 0 , int secondsBetween = 3 ){ + string getRemoteString( bool includePort = true ) { return _remote.toString(includePort); } - if ( progressMeterTotal ){ - if ( _progressMeter.isActive() ){ + ProgressMeter& setMessage( const char * msg , unsigned long long progressMeterTotal = 0 , int secondsBetween = 3 ) { + if ( progressMeterTotal ) { + if ( _progressMeter.isActive() ) { cout << "about to assert, old _message: " << _message << " new message:" << msg << endl; assert( ! 
_progressMeter.isActive() ); } @@ -251,38 +254,93 @@ namespace mongo { else { _progressMeter.finished(); } - + _message = msg; - + return _progressMeter; } - + string getMessage() const { return _message.toString(); } ProgressMeter& getProgressMeter() { return _progressMeter; } - + CurOp *parent() const { return _wrapped; } + void kill() { _killed = true; } + bool killed() const { return _killed; } + void setNS(const char *ns) { + strncpy(_ns, ns, Namespace::MaxNsLen); + _ns[Namespace::MaxNsLen] = 0; + } friend class Client; + + private: + static AtomicUInt _nextOpNum; + Client * _client; + CurOp * _wrapped; + unsigned long long _start; + unsigned long long _checkpoint; + unsigned long long _end; + bool _active; + int _op; + bool _command; + int _lockType; // see concurrency.h for values + bool _waitingForLock; + int _dbprofile; // 0=off, 1=slow, 2=all + AtomicUInt _opNum; + char _ns[Namespace::MaxNsLen+2]; + struct SockAddr _remote; + CachedBSONObj _query; + OpDebug _debug; + ThreadSafeString _message; + ProgressMeter _progressMeter; + volatile bool _killed; + + void _reset() { + _command = false; + _lockType = 0; + _dbprofile = 0; + _end = 0; + _waitingForLock = false; + _message = ""; + _progressMeter.finished(); + _killed = false; + } }; - /* 0 = ok - 1 = kill current operation and reset this to 0 - future: maybe use this as a "going away" thing on process termination with a higher flag value + /* _globalKill: we are shutting down + otherwise kill attribute set on specified CurOp + this class does not handle races between interruptJs and the checkForInterrupt functions - those must be + handled by the client of this class */ - extern class KillCurrentOp { - enum { Off, On, All } state; - AtomicUInt toKill; + extern class KillCurrentOp { public: - void killAll() { state = All; } - void kill(AtomicUInt i) { toKill = i; state = On; } - - void checkForInterrupt() { - if( state != Off ) { - if( state == All ) - uasserted(11600,"interrupted at shutdown"); - if( cc().curop()->opNum() == toKill ) { - state = Off; - uasserted(11601,"interrupted"); - } - } + void killAll(); + void kill(AtomicUInt i); + + /** @return true if global interrupt and should terminate the operation */ + bool globalInterruptCheck() const { return _globalKill; } + + void checkForInterrupt( bool heedMutex = true ) { + if ( heedMutex && dbMutex.isWriteLocked() ) + return; + if( _globalKill ) + uasserted(11600,"interrupted at shutdown"); + if( cc().curop()->killed() ) + uasserted(11601,"interrupted"); } + + /** @return "" if not interrupted. otherwise, you should stop. 
*/ + const char *checkForInterruptNoAssert( bool heedMutex = true ) { + if ( heedMutex && dbMutex.isWriteLocked() ) + return ""; + if( _globalKill ) + return "interrupted at shutdown"; + if( cc().curop()->killed() ) + return "interrupted"; + return ""; + } + + private: + void interruptJs( AtomicUInt *op ); + volatile bool _globalKill; } killCurrentOp; + } diff --git a/db/cursor.cpp b/db/cursor.cpp index e98cb7a..ac7afc1 100644 --- a/db/cursor.cpp +++ b/db/cursor.cpp @@ -16,7 +16,7 @@ #include "pch.h" #include "pdfile.h" -#include "curop.h" +#include "curop-inl.h" namespace mongo { @@ -24,14 +24,17 @@ namespace mongo { killCurrentOp.checkForInterrupt(); if ( eof() ) { if ( tailable_ && !last.isNull() ) { - curr = s->next( last ); - } else { + curr = s->next( last ); + } + else { return false; } - } else { + } + else { last = curr; curr = s->next( curr ); } + incNscanned(); return ok(); } @@ -72,7 +75,7 @@ namespace mongo { } ForwardCappedCursor::ForwardCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) : - nsd( _nsd ) { + nsd( _nsd ) { if ( !nsd ) return; DiskLoc start = startLoc; @@ -89,6 +92,7 @@ namespace mongo { } curr = start; s = this; + incNscanned(); } DiskLoc ForwardCappedCursor::next( const DiskLoc &prev ) const { @@ -112,19 +116,21 @@ namespace mongo { } ReverseCappedCursor::ReverseCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) : - nsd( _nsd ) { + nsd( _nsd ) { if ( !nsd ) return; DiskLoc start = startLoc; if ( start.isNull() ) { if ( !nsd->capLooped() ) { start = nsd->lastRecord(); - } else { + } + else { start = nsd->capExtent.ext()->lastRecord; } } curr = start; s = this; + incNscanned(); } DiskLoc ReverseCappedCursor::next( const DiskLoc &prev ) const { @@ -138,7 +144,8 @@ namespace mongo { if ( i == nextLoop( nsd, nsd->capExtent.ext()->lastRecord ) ) { return DiskLoc(); } - } else { + } + else { if ( i == nsd->capExtent.ext()->firstRecord ) { return DiskLoc(); } diff --git a/db/cursor.h b/db/cursor.h index db5d9a3..9797d66 100644 --- a/db/cursor.h +++ b/db/cursor.h @@ -23,14 +23,15 @@ #include "matcher.h" namespace mongo { - + + class NamespaceDetails; class Record; class CoveredIndexMatcher; /* Query cursors, base class. This is for our internal cursors. "ClientCursor" is a separate concept and is for the user's cursor. - WARNING concurrency: the vfunctions below are called back from within a + WARNING concurrency: the vfunctions below are called back from within a ClientCursor::ccmutex. Don't cause a deadlock, you've been warned. */ class Cursor : boost::noncopyable { @@ -49,7 +50,7 @@ namespace mongo { virtual DiskLoc refLoc() = 0; /* Implement these if you want the cursor to be "tailable" */ - + /* Request that the cursor starts tailing after advancing past last record. */ /* The implementation may or may not honor this request. */ virtual void setTailable() {} @@ -76,10 +77,10 @@ namespace mongo { /* called before query getmore block is iterated */ virtual void checkLocation() { } - + virtual bool supportGetMore() = 0; virtual bool supportYields() = 0; - + virtual string toString() { return "abstract?"; } /* used for multikey index traversal to avoid sending back dups. see Matcher::matches(). @@ -87,20 +88,33 @@ namespace mongo { if loc has already been sent, returns true. otherwise, marks loc as sent. @param deep - match was against an array, so we know it is multikey. this is legacy and kept - for backwards datafile compatibility. 'deep' can be eliminated next time we + for backwards datafile compatibility. 
'deep' can be eliminated next time we force a data file conversion. 7Jul09 */ virtual bool getsetdup(DiskLoc loc) = 0; + virtual bool isMultiKey() const = 0; + + /** + * return true if the keys in the index have been modified from the main doc + * if you have { a : 1 , b : [ 1 , 2 ] } + * an index on { a : 1 } would not be modified + * an index on { b : 1 } would be since the values of the array are put in the index + * not the array + */ + virtual bool modifiedKeys() const = 0; + virtual BSONObj prettyIndexBounds() const { return BSONArray(); } virtual bool capped() const { return false; } + virtual long long nscanned() = 0; + // The implementation may return different matchers depending on the // position of the cursor. If matcher() is nonzero at the start, // matcher() should be checked each time advance() is called. virtual CoveredIndexMatcher *matcher() const { return 0; } - + // A convenience function for setting the value of matcher() manually // so it may accessed later. Implementations which must generate // their own matcher() should assert here. @@ -121,20 +135,15 @@ namespace mongo { /* table-scan style cursor */ class BasicCursor : public Cursor { - protected: - DiskLoc curr, last; - const AdvanceStrategy *s; - - private: - bool tailable_; - shared_ptr< CoveredIndexMatcher > _matcher; - void init() { - tailable_ = false; - } public: - bool ok() { - return !curr.isNull(); + BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ), _nscanned() { + incNscanned(); + init(); } + BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ), _nscanned() { + init(); + } + bool ok() { return !curr.isNull(); } Record* _current() { assert( ok() ); return curr.rec(); @@ -144,42 +153,33 @@ namespace mongo { BSONObj j(r); return j; } - virtual DiskLoc currLoc() { - return curr; - } - virtual DiskLoc refLoc() { - return curr.isNull() ? last : curr; - } - + virtual DiskLoc currLoc() { return curr; } + virtual DiskLoc refLoc() { return curr.isNull() ? 
last : curr; } bool advance(); - - BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ) { - init(); - } - BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ) { - init(); - } - virtual string toString() { - return "BasicCursor"; - } + virtual string toString() { return "BasicCursor"; } virtual void setTailable() { if ( !curr.isNull() || !last.isNull() ) tailable_ = true; } - virtual bool tailable() { - return tailable_; - } + virtual bool tailable() { return tailable_; } virtual bool getsetdup(DiskLoc loc) { return false; } - + virtual bool isMultiKey() const { return false; } + virtual bool modifiedKeys() const { return false; } virtual bool supportGetMore() { return true; } virtual bool supportYields() { return true; } - virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); } - - virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { - _matcher = matcher; - } - + virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; } + virtual long long nscanned() { return _nscanned; } + + protected: + DiskLoc curr, last; + const AdvanceStrategy *s; + void incNscanned() { if ( !curr.isNull() ) { ++_nscanned; } } + private: + bool tailable_; + shared_ptr< CoveredIndexMatcher > _matcher; + long long _nscanned; + void init() { tailable_ = false; } }; /* used for order { $natural: -1 } */ @@ -187,13 +187,9 @@ namespace mongo { public: ReverseCursor(DiskLoc dl) : BasicCursor( dl, reverse() ) { } ReverseCursor() : BasicCursor( reverse() ) { } - virtual string toString() { - return "ReverseCursor"; - } + virtual string toString() { return "ReverseCursor"; } }; - class NamespaceDetails; - class ForwardCappedCursor : public BasicCursor, public AdvanceStrategy { public: ForwardCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() ); diff --git a/db/database.cpp b/db/database.cpp index dde117f..d164ba5 100644 --- a/db/database.cpp +++ b/db/database.cpp @@ -20,15 +20,29 @@ #include "pdfile.h" #include "database.h" #include "instance.h" +#include "clientcursor.h" namespace mongo { bool Database::_openAllFiles = false; + Database::~Database() { + magic = 0; + size_t n = files.size(); + for ( size_t i = 0; i < n; i++ ) + delete files[i]; + if( ccByLoc.size() ) { + log() << "\n\n\nWARNING: ccByLoc not empty on database close! " << ccByLoc.size() << ' ' << name << endl; + } + } + Database::Database(const char *nm, bool& newDb, const string& _path ) - : name(nm), path(_path), namespaceIndex( path, name ) { - - { // check db name is valid + : name(nm), path(_path), namespaceIndex( path, name ), + profileName(name + ".system.profile") { + try { + + { + // check db name is valid size_t L = strlen(nm); uassert( 10028 , "db name is empty", L > 0 ); uassert( 10029 , "bad db name [1]", *nm != '.' 
); @@ -36,66 +50,184 @@ namespace mongo { uassert( 10031 , "bad char(s) in db name", strchr(nm, ' ') == 0 ); uassert( 10032 , "db name too long", L < 64 ); } - + newDb = namespaceIndex.exists(); profile = 0; - profileName = name + ".system.profile"; { vector others; getDatabaseNames( others , path ); - - for ( unsigned i=0; i 1 && getFile( n - 1 )->getHeader()->isEmpty() ) { + delete files[ n - 1 ]; + files.pop_back(); + } + } + + MongoDataFile* Database::getFile( int n, int sizeNeeded , bool preallocateOnly) { + assert(this); + + namespaceIndex.init(); + if ( n < 0 || n >= DiskLoc::MaxFiles ) { + out() << "getFile(): n=" << n << endl; + massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false); + } + DEV { + if ( n > 100 ) + out() << "getFile(): n=" << n << "?" << endl; + } + MongoDataFile* p = 0; + if ( !preallocateOnly ) { + while ( n >= (int) files.size() ) + files.push_back(0); + p = files[n]; + } + if ( p == 0 ) { + boost::filesystem::path fullName = fileName( n ); + string fullNameString = fullName.string(); + p = new MongoDataFile(n); + int minSize = 0; + if ( n != 0 && files[ n - 1 ] ) + minSize = files[ n - 1 ]->getHeader()->fileLength; + if ( sizeNeeded + DataFileHeader::HeaderSize > minSize ) + minSize = sizeNeeded + DataFileHeader::HeaderSize; + try { + p->open( fullNameString.c_str(), minSize, preallocateOnly ); + } + catch ( AssertionException& ) { + delete p; + throw; + } + if ( preallocateOnly ) + delete p; + else + files[n] = p; + } + return preallocateOnly ? 0 : p; + } + + MongoDataFile* Database::addAFile( int sizeNeeded, bool preallocateNextFile ) { + int n = (int) files.size(); + MongoDataFile *ret = getFile( n, sizeNeeded ); + if ( preallocateNextFile ) + preallocateAFile(); + return ret; } + MongoDataFile* Database::suitableFile( int sizeNeeded, bool preallocate ) { - bool Database::setProfilingLevel( int newLevel , string& errmsg ){ + // check existing files + for ( int i=numFiles()-1; i>=0; i-- ) { + MongoDataFile* f = getFile( i ); + if ( f->getHeader()->unusedLength >= sizeNeeded ) + return f; + } + + // allocate files until we either get one big enough or hit maxSize + for ( int i = 0; i < 8; i++ ) { + MongoDataFile* f = addAFile( sizeNeeded, preallocate ); + + if ( f->getHeader()->unusedLength >= sizeNeeded ) + return f; + + if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop + return f; + } + + return 0; + } + + MongoDataFile* Database::newestFile() { + int n = numFiles(); + if ( n == 0 ) + return 0; + return getFile(n-1); + } + + + Extent* Database::allocExtent( const char *ns, int size, bool capped ) { + Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped ); + if( e ) + return e; + return suitableFile( size, !capped )->createExtent( ns, size, capped ); + } + + + bool Database::setProfilingLevel( int newLevel , string& errmsg ) { if ( profile == newLevel ) return true; - - if ( newLevel < 0 || newLevel > 2 ){ + + if ( newLevel < 0 || newLevel > 2 ) { errmsg = "profiling level has to be >=0 and <= 2"; return false; } - - if ( newLevel == 0 ){ + + if ( newLevel == 0 ) { profile = 0; return true; } - + assert( cc().database() == this ); - if ( ! namespaceIndex.details( profileName.c_str() ) ){ + if ( ! namespaceIndex.details( profileName.c_str() ) ) { log(1) << "creating profile ns: " << profileName << endl; BSONObjBuilder spec; spec.appendBool( "capped", true ); spec.append( "size", 131072.0 ); - if ( ! 
userCreateNS( profileName.c_str(), spec.done(), errmsg , true ) ){ + if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , true ) ) { return false; } } @@ -103,26 +235,57 @@ namespace mongo { return true; } - void Database::finishInit(){ + void Database::finishInit() { if ( cmdLine.defaultProfile == profile ) return; - + string errmsg; massert( 12506 , errmsg , setProfilingLevel( cmdLine.defaultProfile , errmsg ) ); } - bool Database::validDBName( const string& ns ){ + bool Database::validDBName( const string& ns ) { if ( ns.size() == 0 || ns.size() > 64 ) return false; size_t good = strcspn( ns.c_str() , "/\\. \"" ); return good == ns.size(); } - void Database::flushFiles( bool sync ){ + void Database::flushFiles( bool sync ) const { dbMutex.assertAtLeastReadLocked(); - for ( unsigned i=0; iflush( sync ); } } + long long Database::fileSize() const { + long long size=0; + for (int n=0; exists(n); n++) + size += boost::filesystem::file_size( fileName(n) ); + return size; + } + + Database* DatabaseHolder::getOrCreate( const string& ns , const string& path , bool& justCreated ) { + dbMutex.assertWriteLocked(); + DBs& m = _paths[path]; + + string dbname = _todb( ns ); + + Database* & db = m[dbname]; + if ( db ) { + justCreated = false; + return db; + } + + log(1) << "Accessing: " << dbname << " for the first time" << endl; + try { + db = new Database( dbname.c_str() , justCreated , path ); + } + catch ( ... ) { + m.erase( dbname ); + throw; + } + _size++; + return db; + } + } // namespace mongo diff --git a/db/database.h b/db/database.h index c7d72c5..6e72ba8 100644 --- a/db/database.h +++ b/db/database.h @@ -23,6 +23,8 @@ namespace mongo { class ClientCursor; + struct ByLocKey; + typedef map CCByLoc; /** * Database represents a database database @@ -32,176 +34,90 @@ namespace mongo { class Database { public: static bool _openAllFiles; - - Database(const char *nm, bool& newDb, const string& _path = dbpath); - - ~Database() { - magic = 0; - btreeStore->closeFiles(name, path); - size_t n = files.size(); - for ( size_t i = 0; i < n; i++ ) - delete files[i]; - } - + + Database(const char *nm, /*out*/ bool& newDb, const string& _path = dbpath); + private: + ~Database(); + public: + /* you must use this to close - there is essential code in this method that is not in the ~Database destructor. + thus the destructor is private. this could be cleaned up one day... + */ + static void closeDatabase( const char *db, const string& path ); + + void openAllFiles(); + + void finishInit(); + /** * tries to make sure that this hasn't been deleted */ - bool isOk(){ - return magic == 781231; - } + bool isOk() const { return magic == 781231; } - bool isEmpty(){ - return ! namespaceIndex.allocated(); - } + bool isEmpty() { return ! namespaceIndex.allocated(); } - boost::filesystem::path fileName( int n ) { - stringstream ss; - ss << name << '.' 
<< n; - boost::filesystem::path fullName; - fullName = boost::filesystem::path(path); - if ( directoryperdb ) - fullName /= name; - fullName /= ss.str(); - return fullName; - } - - bool exists(int n) { - return boost::filesystem::exists( fileName( n ) ); - } + /** + * total file size of Database in bytes + */ + long long fileSize() const; - void openAllFiles() { - int n = 0; - while( exists(n) ) { - getFile(n); - n++; - } - // If last file is empty, consider it preallocated and make sure it's not mapped - // until a write is requested - if ( n > 1 && getFile( n - 1 )->getHeader()->isEmpty() ) { - delete files[ n - 1 ]; - files.pop_back(); - } - } + int numFiles() const { return (int)files.size(); } - MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false ) { - assert(this); - - namespaceIndex.init(); - if ( n < 0 || n >= DiskLoc::MaxFiles ) { - out() << "getFile(): n=" << n << endl; -#if 0 - if( n >= RecCache::Base && n <= RecCache::Base+1000 ) - massert( 10294 , "getFile(): bad file number - using recstore db w/nonrecstore db build?", false); -#endif - massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false); - } - DEV { - if ( n > 100 ) - out() << "getFile(): n=" << n << "?" << endl; - } - MongoDataFile* p = 0; - if ( !preallocateOnly ) { - while ( n >= (int) files.size() ) - files.push_back(0); - p = files[n]; - } - if ( p == 0 ) { - boost::filesystem::path fullName = fileName( n ); - string fullNameString = fullName.string(); - p = new MongoDataFile(n); - int minSize = 0; - if ( n != 0 && files[ n - 1 ] ) - minSize = files[ n - 1 ]->getHeader()->fileLength; - if ( sizeNeeded + DataFileHeader::HeaderSize > minSize ) - minSize = sizeNeeded + DataFileHeader::HeaderSize; - try { - p->open( fullNameString.c_str(), minSize, preallocateOnly ); - } - catch ( AssertionException& ) { - delete p; - throw; - } - if ( preallocateOnly ) - delete p; - else - files[n] = p; - } - return preallocateOnly ? 0 : p; - } + /** + * returns file valid for file number n + */ + boost::filesystem::path fileName( int n ) const; - MongoDataFile* addAFile( int sizeNeeded, bool preallocateNextFile ) { - int n = (int) files.size(); - MongoDataFile *ret = getFile( n, sizeNeeded ); - if ( preallocateNextFile ) - preallocateAFile(); - return ret; - } - - // safe to call this multiple times - the implementation will only preallocate one file - void preallocateAFile() { - int n = (int) files.size(); - getFile( n, 0, true ); - } + bool exists(int n) const { return boost::filesystem::exists( fileName( n ) ); } - MongoDataFile* suitableFile( int sizeNeeded, bool preallocate ) { - MongoDataFile* f = newestFile(); - if ( !f ) { - f = addAFile( sizeNeeded, preallocate ); - } - for ( int i = 0; i < 8; i++ ) { - if ( f->getHeader()->unusedLength >= sizeNeeded ) - break; - f = addAFile( sizeNeeded, preallocate ); - if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop - break; - } - return f; - } + /** + * return file n. 
if it doesn't exist, create it + */ + MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false ); + + MongoDataFile* addAFile( int sizeNeeded, bool preallocateNextFile ); + + /** + * makes sure we have an extra file at the end that is empty + * safe to call this multiple times - the implementation will only preallocate one file + */ + void preallocateAFile() { getFile( numFiles() , 0, true ); } + + MongoDataFile* suitableFile( int sizeNeeded, bool preallocate ); + + Extent* allocExtent( const char *ns, int size, bool capped ); + + MongoDataFile* newestFile(); - Extent* allocExtent( const char *ns, int size, bool capped ) { - Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped ); - if( e ) return e; - return suitableFile( size, !capped )->createExtent( ns, size, capped ); - } - - MongoDataFile* newestFile() { - int n = (int) files.size(); - if ( n > 0 ) { - n--; - } else { - return 0; - } - return getFile(n); - } - /** - * @return true if success, false otherwise + * @return true if success. false if bad level or error creating profile ns */ bool setProfilingLevel( int newLevel , string& errmsg ); - void finishInit(); - static bool validDBName( const string& ns ); + void flushFiles( bool sync ) const; - long long fileSize(){ - long long size=0; - for (int n=0; exists(n); n++) - size += boost::filesystem::file_size( fileName(n) ); - return size; + /** + * @return true if ns is part of the database + * ns=foo.bar, db=foo returns true + */ + bool ownsNS( const string& ns ) const { + if ( ! startsWith( ns , name ) ) + return false; + return ns[name.size()] == '.'; } - void flushFiles( bool sync ); - + static bool validDBName( const string& ns ); + + public: // this should be private later + vector files; - string name; // "alleyinsider" - string path; + const string name; // "alleyinsider" + const string path; NamespaceIndex namespaceIndex; int profile; // 0=off. - string profileName; // "alleyinsider.system.profile" - - multimap ccByLoc; - - int magic; // used for making sure the object is still loaded in memory + const string profileName; // "alleyinsider.system.profile" + CCByLoc ccByLoc; + int magic; // used for making sure the object is still loaded in memory }; } // namespace mongo diff --git a/db/db.cpp b/db/db.cpp index d5b9339..548ac14 100644 --- a/db/db.cpp +++ b/db/db.cpp @@ -1,4 +1,4 @@ -// @file db.cpp : Defines the entry point for the mongod application. +// @file db.cpp : Defines main() for the mongod program. /** * Copyright (C) 2008 10gen Inc. 
@@ -37,7 +37,10 @@ #include "../util/concurrency/task.h" #include "../util/version.h" #include "client.h" +#include "restapi.h" #include "dbwebserver.h" +#include "dur.h" +#include "concurrency.h" #if defined(_WIN32) # include "../util/ntservice.h" @@ -55,31 +58,25 @@ namespace mongo { extern char *appsrvPath; extern int diagLogging; - extern int lenForNewNsFiles; + extern unsigned lenForNewNsFiles; extern int lockFile; - extern bool checkNsFilesOnLoad; + extern bool checkNsFilesOnLoad; extern string repairpath; -#if defined(_WIN32) - std::wstring windowsServiceName = L"MongoDB"; - std::wstring windowsServiceUser = L""; - std::wstring windowsServicePassword = L""; -#endif - - void setupSignals(); + void setupSignals( bool inFork ); void startReplSets(ReplSetCmdline*); void startReplication(); void pairWith(const char *remoteEnd, const char *arb); void exitCleanly( ExitCode code ); CmdLine cmdLine; - bool useJNI = true; + static bool scriptingEnabled = true; bool noHttpInterface = false; bool shouldRepairDatabases = 0; - bool forceRepair = 0; + static bool forceRepair = 0; Timer startupSrandTimer; - const char *ourgetns() { + const char *ourgetns() { Client *c = currentClient.get(); if ( ! c ) return ""; @@ -102,7 +99,7 @@ namespace mongo { OurListener(const string &ip, int p) : Listener(ip, p) { } virtual void accepted(MessagingPort *mp) { - if ( ! connTicketHolder.tryAcquire() ){ + if ( ! connTicketHolder.tryAcquire() ) { log() << "connection refused because too many open connections: " << connTicketHolder.used() << " of " << connTicketHolder.outof() << endl; // TODO: would be nice if we notified them... mp->shutdown(); @@ -113,12 +110,12 @@ namespace mongo { try { boost::thread thr(boost::bind(&connThread,mp)); } - catch ( boost::thread_resource_error& ){ + catch ( boost::thread_resource_error& ) { log() << "can't create new thread, closing connection" << endl; mp->shutdown(); delete mp; } - catch ( ... ){ + catch ( ... ) { log() << "unkonwn exception starting connThread" << endl; mp->shutdown(); delete mp; @@ -126,14 +123,14 @@ namespace mongo { } }; -/* todo: make this a real test. the stuff in dbtests/ seem to do all dbdirectclient which exhaust doesn't support yet. */ + /* todo: make this a real test. the stuff in dbtests/ seem to do all dbdirectclient which exhaust doesn't support yet. */ // QueryOption_Exhaust #define TESTEXHAUST 0 #if( TESTEXHAUST ) - void testExhaust() { + void testExhaust() { sleepsecs(1); unsigned n = 0; - auto f = [&n](const BSONObj& o) { + auto f = [&n](const BSONObj& o) { assert( o.valid() ); //cout << o << endl; n++; @@ -145,20 +142,20 @@ namespace mongo { db.connect("localhost"); const char *ns = "local.foo"; if( db.count(ns) < 10000 ) - for( int i = 0; i < 20000; i++ ) + for( int i = 0; i < 20000; i++ ) db.insert(ns, BSON("aaa" << 3 << "b" << "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); try { db.query(f, ns, Query() ); } - catch(...) { + catch(...) { cout << "hmmm" << endl; } try { db.query(f, ns, Query() ); } - catch(...) { + catch(...) { cout << "caught" << endl; } @@ -173,7 +170,7 @@ namespace mongo { l.setAsTimeTracker(); startReplication(); if ( !noHttpInterface ) - boost::thread thr(webServerThread); + boost::thread web( boost::bind(&webServerThread, new RestAdminAccess() /* takes ownership */)); #if(TESTEXHAUST) boost::thread thr(testExhaust); @@ -203,8 +200,7 @@ namespace mongo { app server will open a pool of threads. todo: one day, asio... 
*/ - void connThread( MessagingPort * inPort ) - { + void connThread( MessagingPort * inPort ) { TicketHolderReleaser connTicketReleaser( &connTicketHolder ); /* todo: move to Client object */ @@ -221,11 +217,11 @@ namespace mongo { Message m; while ( 1 ) { - m.reset(); + inPort->clearCounters(); if ( !dbMsgPort->recv(m) ) { if( !cmdLine.quiet ) - log() << "end connection " << dbMsgPort->farEnd.toString() << endl; + log() << "end connection " << dbMsgPort->farEnd.toString() << endl; dbMsgPort->shutdown(); break; } @@ -234,27 +230,15 @@ sendmore: log() << "got request after shutdown()" << endl; break; } - + lastError.startRequest( m , le ); DbResponse dbresponse; - if ( !assembleResponse( m, dbresponse, dbMsgPort->farEnd ) ) { - log() << curTimeMillis() % 10000 << " end msg " << dbMsgPort->farEnd.toString() << endl; - /* todo: we may not wish to allow this, even on localhost: very low priv accounts could stop us. */ - if ( dbMsgPort->farEnd.isLocalHost() ) { - dbMsgPort->shutdown(); - sleepmillis(50); - problem() << "exiting end msg" << endl; - dbexit(EXIT_CLEAN); - } - else { - log() << " (not from localhost, ignoring end msg)" << endl; - } - } + assembleResponse( m, dbresponse, dbMsgPort->farEnd ); if ( dbresponse.response ) { dbMsgPort->reply(m, *dbresponse.response, dbresponse.responseTo); - if( dbresponse.exhaust ) { + if( dbresponse.exhaust ) { MsgData *header = dbresponse.response->header(); QueryResult *qr = (QueryResult *) header; long long cursorid = qr->cursorId; @@ -279,6 +263,10 @@ sendmore: } } } + + networkCounter.hit( inPort->getBytesIn() , inPort->getBytesOut() ); + + m.reset(); } } @@ -293,7 +281,7 @@ sendmore: } catch ( const ClockSkewException & ) { exitCleanly( EXIT_CLOCK_SKEW ); - } + } catch ( std::exception &e ) { problem() << "Uncaught std::exception: " << e.what() << ", terminating" << endl; dbexit( EXIT_UNCAUGHT ); @@ -303,91 +291,48 @@ sendmore: dbexit( EXIT_UNCAUGHT ); } - // any thread cleanup can happen here - - if ( currentClient.get() ) - currentClient->shutdown(); - globalScriptEngine->threadDone(); - } - - void msg(const char *m, const char *address, int port, int extras = 0) { - SockAddr db(address, port); - - // SockAddr db("127.0.0.1", DBPort); - // SockAddr db("192.168.37.1", MessagingPort::DBPort); - // SockAddr db("10.0.21.60", MessagingPort::DBPort); - // SockAddr db("172.16.0.179", MessagingPort::DBPort); - - MessagingPort p; - if ( !p.connect(db) ){ - out() << "msg couldn't connect" << endl; - return; - } - - const int Loops = 1; - for ( int q = 0; q < Loops; q++ ) { - Message send; - Message response; - - send.setData( dbMsg , m); - int len = send.header()->dataLen(); - - for ( int i = 0; i < extras; i++ ) - p.say(/*db, */send); - - Timer t; - bool ok = p.call(send, response); - double tm = ((double) t.micros()) + 1; - out() << " ****ok. response.data:" << ok << " time:" << tm / 1000.0 << "ms " - << "len: " << len << " data: " << response.singleData()->_data << endl; - - if ( q+1 < Loops ) { - out() << "\t\tSLEEP 8 then sending again as a test" << endl; - sleepsecs(8); - } + // thread ending... 
+ { + Client * c = currentClient.get(); + if( c ) c->shutdown(); } - sleepsecs(1); - - p.shutdown(); - } - - void msg(const char *m, int extras = 0) { - msg(m, "127.0.0.1", CmdLine::DefaultDBPort, extras); + globalScriptEngine->threadDone(); } - bool doDBUpgrade( const string& dbName , string errmsg , DataFileHeader * h ){ + bool doDBUpgrade( const string& dbName , string errmsg , DataFileHeader * h ) { static DBDirectClient db; - - if ( h->version == 4 && h->versionMinor == 4 ){ + + if ( h->version == 4 && h->versionMinor == 4 ) { assert( VERSION == 4 ); assert( VERSION_MINOR == 5 ); - + list colls = db.getCollectionNames( dbName ); - for ( list::iterator i=colls.begin(); i!=colls.end(); i++){ + for ( list::iterator i=colls.begin(); i!=colls.end(); i++) { string c = *i; log() << "\t upgrading collection:" << c << endl; BSONObj out; bool ok = db.runCommand( dbName , BSON( "reIndex" << c.substr( dbName.size() + 1 ) ) , out ); - if ( ! ok ){ + if ( ! ok ) { errmsg = "reindex failed"; log() << "\t\t reindex failed: " << out << endl; return false; } } - + h->versionMinor = 5; return true; } - + // do this in the general case return repairDatabase( dbName.c_str(), errmsg ); } - - void repairDatabases() { - // LastError * le = lastError.get( true ); + + // ran at startup. + static void repairDatabasesAndCheckVersion() { + // LastError * le = lastError.get( true ); Client::GodScope gs; log(1) << "enter repairDatabases (to check pdfile version #)" << endl; - + //assert(checkNsFilesOnLoad); checkNsFilesOnLoad = false; // we are mainly just checking the header - don't scan the whole .ns file for every db here. @@ -400,33 +345,39 @@ sendmore: Client::Context ctx( dbName ); MongoDataFile *p = cc().database()->getFile( 0 ); DataFileHeader *h = p->getHeader(); - if ( !h->currentVersion() || forceRepair ) { + if ( !h->isCurrentVersion() || forceRepair ) { + + if( h->version <= 0 ) { + uasserted(10000, str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version << " info: " << h->versionMinor << ' ' << h->fileLength); + } + log() << "****" << endl; log() << "****" << endl; log() << "need to upgrade database " << dbName << " with pdfile version " << h->version << "." << h->versionMinor << ", " << "new version: " << VERSION << "." << VERSION_MINOR << endl; - if ( shouldRepairDatabases ){ + if ( shouldRepairDatabases ) { // QUESTION: Repair even if file format is higher version than code? log() << "\t starting upgrade" << endl; string errmsg; assert( doDBUpgrade( dbName , errmsg , h ) ); } else { - log() << "\t Not upgrading, exiting!" 
<< endl; + log() << "\t Not upgrading, exiting" << endl; log() << "\t run --upgrade to upgrade dbs, then start again" << endl; log() << "****" << endl; dbexit( EXIT_NEED_UPGRADE ); shouldRepairDatabases = 1; return; } - } else { - closeDatabase( dbName.c_str() ); + } + else { + Database::closeDatabase( dbName.c_str(), dbpath ); } } log(1) << "done repairDatabases" << endl; - if ( shouldRepairDatabases ){ + if ( shouldRepairDatabases ) { log() << "finished checking dbs" << endl; cc().shutdown(); dbexit( EXIT_CLEAN ); @@ -441,11 +392,11 @@ sendmore: i != boost::filesystem::directory_iterator(); ++i ) { string fileName = boost::filesystem::path(*i).leaf(); if ( boost::filesystem::is_directory( *i ) && - fileName.length() && fileName[ 0 ] == '$' ) + fileName.length() && fileName[ 0 ] == '$' ) boost::filesystem::remove_all( *i ); } } - + void clearTmpCollections() { Client::GodScope gs; vector< string > toDelete; @@ -460,35 +411,38 @@ sendmore: cli.dropCollection( *i ); } } - + + void flushDiagLog(); + /** * does background async flushes of mmapped files */ class DataFileSync : public BackgroundJob { public: - string name() { return "DataFileSync"; } - void run(){ - if( _sleepsecs == 0 ) + string name() const { return "DataFileSync"; } + void run() { + if( cmdLine.syncdelay == 0 ) log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl; - else if( _sleepsecs == 1 ) + else if( cmdLine.syncdelay == 1 ) log() << "--syncdelay 1" << endl; - else if( _sleepsecs != 60 ) - log(1) << "--syncdelay " << _sleepsecs << endl; + else if( cmdLine.syncdelay != 60 ) + log(1) << "--syncdelay " << cmdLine.syncdelay << endl; int time_flushing = 0; - while ( ! inShutdown() ){ - if ( _sleepsecs == 0 ){ + while ( ! inShutdown() ) { + flushDiagLog(); + if ( cmdLine.syncdelay == 0 ) { // in case at some point we add an option to change at runtime sleepsecs(5); continue; } - sleepmillis( (long long) std::max(0.0, (_sleepsecs * 1000) - time_flushing) ); - - if ( inShutdown() ){ + sleepmillis( (long long) std::max(0.0, (cmdLine.syncdelay * 1000) - time_flushing) ); + + if ( inShutdown() ) { // occasional issue trying to flush during shutdown when sleep interrupted break; } - + Date_t start = jsTime(); int numFiles = MemoryMappedFile::flushAll( true ); time_flushing = (int) (jsTime() - start); @@ -498,12 +452,22 @@ sendmore: log(1) << "flushing mmap took " << time_flushing << "ms " << " for " << numFiles << " files" << endl; } } - - double _sleepsecs; // default value controlled by program options + } dataFileSync; + const char * jsInterruptCallback() { + // should be safe to interrupt in js code, even if we have a write lock + return killCurrentOp.checkForInterruptNoAssert( false ); + } + + unsigned jsGetInterruptSpecCallback() { + return cc().curop()->opNum(); + } + void _initAndListen(int listenPort, const char *appserverLoc = NULL) { + Client::initThread("initandlisten"); + bool is32bit = sizeof(int*) == 4; { @@ -534,38 +498,37 @@ sendmore: ss << "repairpath (" << repairpath << ") does not exist"; uassert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) ); } - + acquirePathLock(); remove_all( dbpath + "/_tmp/" ); - theFileAllocator().start(); + FileAllocator::get()->start(); BOOST_CHECK_EXCEPTION( clearTmpFiles() ); - Client::initThread("initandlisten"); _diaglog.init(); + dur::startup(); + + if( cmdLine.durOptions & CmdLine::DurRecoverOnly ) + return; + + // comes after getDur().startup() because this reads from the database clearTmpCollections(); 
Module::initAll(); -#if 0 - { - stringstream indexpath; - indexpath << dbpath << "/indexes.dat"; - RecCache::tempStore.init(indexpath.str().c_str(), BucketSize); - } -#endif - - if ( useJNI ) { + if ( scriptingEnabled ) { ScriptEngine::setup(); + globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback ); + globalScriptEngine->setGetInterruptSpecCallback( jsGetInterruptSpecCallback ); } - repairDatabases(); + repairDatabasesAndCheckVersion(); /* we didn't want to pre-open all fiels for the repair check above. for regular operation we do for read/write lock concurrency reasons. - */ + */ Database::_openAllFiles = true; if ( shouldRepairDatabases ) @@ -597,7 +560,7 @@ sendmore: log() << "exception in initAndListen std::exception: " << e.what() << ", terminating" << endl; dbexit( EXIT_UNCAUGHT ); } - catch ( int& n ){ + catch ( int& n ) { log() << "exception in initAndListen int: " << n << ", terminating" << endl; dbexit( EXIT_UNCAUGHT ); } @@ -607,13 +570,13 @@ sendmore: } } - #if defined(_WIN32) +#if defined(_WIN32) bool initService() { ServiceController::reportStatus( SERVICE_RUNNING ); initAndListen( cmdLine.port, appsrvPath ); return true; } - #endif +#endif } // namespace mongo @@ -647,16 +610,17 @@ string arg_error_check(int argc, char* argv[]) { return ""; } -int main(int argc, char* argv[], char *envp[] ) -{ +int main(int argc, char* argv[]) { static StaticObserver staticObserver; getcurns = ourgetns; po::options_description general_options("General options"); - #if defined(_WIN32) - po::options_description windows_scm_options("Windows Service Control Manager options"); - #endif +#if defined(_WIN32) + po::options_description windows_scm_options("Windows Service Control Manager options"); +#endif po::options_description replication_options("Replication options"); + po::options_description ms_options("Master/slave options"); + po::options_description rs_options("Replica set options"); po::options_description sharding_options("Sharding options"); po::options_description visible_options("Allowed options"); po::options_description hidden_options("Hidden options"); @@ -666,94 +630,106 @@ int main(int argc, char* argv[], char *envp[] ) CmdLine::addGlobalOptions( general_options , hidden_options ); general_options.add_options() - ("dbpath", po::value() , "directory for datafiles") - ("directoryperdb", "each database will be stored in a separate directory") - ("repairpath", po::value() , "root directory for repair files - defaults to dbpath" ) - ("cpu", "periodically show cpu and iowait utilization") - ("noauth", "run without security") - ("auth", "run with security") - ("objcheck", "inspect client data for validity on receipt") - ("quota", "enable db quota management") - ("quotaFiles", po::value(), "number of files allower per db, requires --quota") - ("appsrvpath", po::value(), "root directory for the babble app server") - ("nocursors", "diagnostic/debugging option") - ("nohints", "ignore query hints") - ("nohttpinterface", "disable http interface") - ("rest","turn on simple rest api") - ("noscripting", "disable scripting engine") - ("noprealloc", "disable data file preallocation") - ("smallfiles", "use a smaller default file size") - ("nssize", po::value()->default_value(16), ".ns file size (in MB) for new databases") - ("diaglog", po::value(), "0=off 1=W 2=R 3=both 7=W+some reads") - ("sysinfo", "print some diagnostic system information") - ("upgrade", "upgrade db if needed") - ("repair", "run repair on all dbs") - ("notablescan", "do not allow table scans") - 
("syncdelay",po::value(&dataFileSync._sleepsecs)->default_value(60), "seconds between disk syncs (0=never, but not recommended)") - ("profile",po::value(), "0=off 1=slow, 2=all") - ("slowms",po::value(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" ) - ("maxConns",po::value(), "max number of simultaneous connections") - #if !defined(_WIN32) - ("nounixsocket", "disable listening on unix sockets") - #endif - ("ipv6", "enable IPv6 support (disabled by default)") - ; - #if defined(_WIN32) - windows_scm_options.add_options() - ("install", "install mongodb service") - ("remove", "remove mongodb service") - ("reinstall", "reinstall mongodb service (equivilant of mongod --remove followed by mongod --install)") - ("service", "start mongodb service") - ("serviceName", po::value(), "windows service name") - ("serviceUser", po::value(), "user name service executes as") - ("servicePassword", po::value(), "password used to authenticate serviceUser") - ; - #endif - - replication_options.add_options() - ("master", "master mode") - ("slave", "slave mode") - ("source", po::value(), "when slave: specify master as ") - ("only", po::value(), "when slave: specify a single database to replicate") - ("pairwith", po::value(), "address of server to pair with") - ("arbiter", po::value(), "address of arbiter server") - ("slavedelay", po::value(), "specify delay (in seconds) to be used when applying master ops to slave") - ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer") - ("autoresync", "automatically resync if slave data is stale") - ("oplogSize", po::value(), "size limit (in MB) for op log") - ("opIdMem", po::value(), "size limit (in bytes) for in memory storage of op ids") - ; - - sharding_options.add_options() - ("configsvr", "declare this is a config db of a cluster") - ("shardsvr", "declare this is a shard db of a cluster") - ("noMoveParanoia" , "turn off paranoid saving of data for moveChunk. 
this is on by default for now, but default will switch" ) - ; + ("auth", "run with security") + ("cpu", "periodically show cpu and iowait utilization") + ("dbpath", po::value() , "directory for datafiles") + ("diaglog", po::value(), "0=off 1=W 2=R 3=both 7=W+some reads") + ("directoryperdb", "each database will be stored in a separate directory") + ("journal", "enable journaling") + ("journalOptions", po::value(), "journal diagnostic options") + ("ipv6", "enable IPv6 support (disabled by default)") + ("jsonp","allow JSONP access via http (has security implications)") + ("maxConns",po::value(), "max number of simultaneous connections") + ("noauth", "run without security") + ("nohttpinterface", "disable http interface") + ("noprealloc", "disable data file preallocation - will often hurt performance") + ("noscripting", "disable scripting engine") + ("notablescan", "do not allow table scans") +#if !defined(_WIN32) + ("nounixsocket", "disable listening on unix sockets") +#endif + ("nssize", po::value()->default_value(16), ".ns file size (in MB) for new databases") + ("objcheck", "inspect client data for validity on receipt") + ("profile",po::value(), "0=off 1=slow, 2=all") + ("quota", "limits each database to a certain number of files (8 default)") + ("quotaFiles", po::value(), "number of files allower per db, requires --quota") + ("rest","turn on simple rest api") + ("repair", "run repair on all dbs") + ("repairpath", po::value() , "root directory for repair files - defaults to dbpath" ) + ("slowms",po::value(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" ) + ("smallfiles", "use a smaller default file size") + ("syncdelay",po::value(&cmdLine.syncdelay)->default_value(60), "seconds between disk syncs (0=never, but not recommended)") + ("sysinfo", "print some diagnostic system information") + ("upgrade", "upgrade db if needed") + ; + +#if defined(_WIN32) + CmdLine::addWindowsOptions( windows_scm_options, hidden_options ); +#endif + + replication_options.add_options() + ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer") + ("autoresync", "automatically resync if slave data is stale") + ("oplogSize", po::value(), "size limit (in MB) for op log") + ; + + ms_options.add_options() + ("master", "master mode") + ("slave", "slave mode") + ("source", po::value(), "when slave: specify master as ") + ("only", po::value(), "when slave: specify a single database to replicate") + ("slavedelay", po::value(), "specify delay (in seconds) to be used when applying master ops to slave") + ; + + rs_options.add_options() + ("replSet", po::value(), "arg is [/]") + ; + + sharding_options.add_options() + ("configsvr", "declare this is a config db of a cluster; default port 27019; default dir /data/configdb") + ("shardsvr", "declare this is a shard db of a cluster; default port 27018") + ("noMoveParanoia" , "turn off paranoid saving of data for moveChunk. 
this is on by default for now, but default will switch" ) + ; hidden_options.add_options() - ("pretouch", po::value(), "n pretouch threads for applying replicationed operations") - ("replSet", po::value(), "specify repl set seed hostnames format /,,etc...") - ("command", po::value< vector >(), "command") - ("cacheSize", po::value(), "cache size (in MB) for rec store") - ; + ("pretouch", po::value(), "n pretouch threads for applying replicationed operations") + ("command", po::value< vector >(), "command") + ("cacheSize", po::value(), "cache size (in MB) for rec store") + // these move to unhidden later: + ("opIdMem", po::value(), "size limit (in bytes) for in memory storage of op ids for replica pairs DEPRECATED") + ("pairwith", po::value(), "address of server to pair with DEPRECATED") + ("arbiter", po::value(), "address of replica pair arbiter server DEPRECATED") + ("nodur", "disable journaling (currently the default)") + ("appsrvpath", po::value(), "root directory for the babble app server") + ("nocursors", "diagnostic/debugging option that turns off cursors DO NOT USE IN PRODUCTION") + ("nohints", "ignore query hints") + ("dur", "enable journaling") // deprecated version + ("durOptions", po::value(), "durability diagnostic options") // deprecated version + ; positional_options.add("command", 3); visible_options.add(general_options); - #if defined(_WIN32) - visible_options.add(windows_scm_options); - #endif +#if defined(_WIN32) + visible_options.add(windows_scm_options); +#endif visible_options.add(replication_options); + visible_options.add(ms_options); + visible_options.add(rs_options); visible_options.add(sharding_options); Module::addOptions( visible_options ); setupCoreSignals(); - setupSignals(); + setupSignals( false ); dbExecCommand = argv[0]; srand(curTimeMicros()); +#if( BOOST_VERSION >= 104500 ) + boost::filesystem::path::default_name_check( boost::filesystem2::no_check ); +#else boost::filesystem::path::default_name_check( boost::filesystem::no_check ); +#endif { unsigned x = 0x12345678; @@ -764,18 +740,12 @@ int main(int argc, char* argv[], char *envp[] ) } } - UnitTest::runTests(); - if( argc == 1 ) cout << dbExecCommand << " --help for help and startup options" << endl; { - bool installService = false; - bool removeService = false; - bool reinstallService = false; - bool startService = false; po::variables_map params; - + string error_message = arg_error_check(argc, argv); if (error_message != "") { cout << error_message << endl << endl; @@ -795,10 +765,19 @@ int main(int argc, char* argv[], char *envp[] ) printGitVersion(); return 0; } - if ( params.count( "dbpath" ) ) + if ( params.count( "dbpath" ) ) { dbpath = params["dbpath"].as(); - else + if ( params.count( "fork" ) && dbpath[0] != '/' ) { + // we need to change dbpath if we fork since we change + // cwd to "/" + // fork only exists on *nix + // so '/' is safe + dbpath = cmdLine.cwd + "/" + dbpath; + } + } + else { dbpath = "/data/db/"; + } if ( params.count("directoryperdb")) { directoryperdb = true; @@ -819,6 +798,18 @@ int main(int argc, char* argv[], char *envp[] ) cmdLine.quota = true; cmdLine.quotaFiles = params["quotaFiles"].as() - 1; } + if( params.count("nodur") ) { + cmdLine.dur = false; + } + if( params.count("dur") || params.count( "journal" ) ) { + cmdLine.dur = true; + } + if (params.count("durOptions")) { + cmdLine.durOptions = params["durOptions"].as(); + } + if (params.count("journalOptions")) { + cmdLine.durOptions = params["durOptions"].as(); + } if (params.count("objcheck")) { objcheck = true; } 
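(Editorial aside, not part of the patch.) As rendered above, the new --journalOptions branch assigns cmdLine.durOptions from params["durOptions"], so a run that passes only --journalOptions would read a key that was never set. A minimal sketch of the presumably intended parsing follows; it is an assumption, not the upstream code, and the int template arguments (stripped from the po::value()/.as() calls in this rendering) are likewise assumed:

    // hypothetical corrected option handling: read whichever spelling the
    // user actually supplied, checking each key before accessing it
    if ( params.count( "durOptions" ) )
        cmdLine.durOptions = params["durOptions"].as<int>();
    if ( params.count( "journalOptions" ) )
        cmdLine.durOptions = params["journalOptions"].as<int>();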
@@ -828,8 +819,12 @@ int main(int argc, char* argv[], char *envp[] ) } if (params.count("repairpath")) { repairpath = params["repairpath"].as(); - uassert( 12589, "repairpath has to be non-zero", repairpath.size() ); - } else { + if (!repairpath.size()) { + out() << "repairpath has to be non-zero" << endl; + dbexit( EXIT_BADOPTIONS ); + } + } + else { repairpath = dbpath; } if (params.count("nocursors")) { @@ -844,11 +839,15 @@ int main(int argc, char* argv[], char *envp[] ) if (params.count("rest")) { cmdLine.rest = true; } + if (params.count("jsonp")) { + cmdLine.jsonp = true; + } if (params.count("noscripting")) { - useJNI = false; + scriptingEnabled = false; } if (params.count("noprealloc")) { cmdLine.prealloc = false; + cout << "note: noprealloc may hurt performance in many applications" << endl; } if (params.count("smallfiles")) { cmdLine.smallfiles = true; @@ -873,29 +872,7 @@ int main(int argc, char* argv[], char *envp[] ) shouldRepairDatabases = 1; } if (params.count("notablescan")) { - cmdLine.notablescan = true; - } - if (params.count("install")) { - if ( ! params.count( "logpath" ) ){ - cout << "--install has to be used with --logpath" << endl; - ::exit(-1); - } - - installService = true; - } - if (params.count("remove")) { - removeService = true; - } - if (params.count("reinstall")) { - if ( ! params.count( "logpath" ) ){ - cout << "--reinstall has to be used with --logpath" << endl; - ::exit(-1); - } - - reinstallService = true; - } - if (params.count("service")) { - startService = true; + cmdLine.noTableScan = true; } if (params.count("master")) { replSettings.master = true; @@ -916,16 +893,17 @@ int main(int argc, char* argv[], char *envp[] ) /* specifies what the source in local.sources should be */ cmdLine.source = params["source"].as().c_str(); } - if( params.count("pretouch") ) { + if( params.count("pretouch") ) { cmdLine.pretouch = params["pretouch"].as(); } if (params.count("replSet")) { if (params.count("slavedelay")) { - cout << "--slavedelay cannot be used with --replSet" << endl; - ::exit(-1); - } else if (params.count("only")) { - cout << "--only cannot be used with --replSet" << endl; - ::exit(-1); + out() << "--slavedelay cannot be used with --replSet" << endl; + dbexit( EXIT_BADOPTIONS ); + } + else if (params.count("only")) { + out() << "--only cannot be used with --replSet" << endl; + dbexit( EXIT_BADOPTIONS ); } /* seed list of hosts for the repl set */ cmdLine._replSet = params["replSet"].as().c_str(); @@ -937,103 +915,108 @@ int main(int argc, char* argv[], char *envp[] ) cout << "***********************************\n" << "WARNING WARNING WARNING\n" << " replica pairs are deprecated\n" - << " see: http://www.mongodb.org/display/DOCS/Replica+Pairs \n" + << " see: http://www.mongodb.org/display/DOCS/Replica+Pairs \n" << "***********************************" << endl; string paired = params["pairwith"].as(); if (params.count("arbiter")) { string arbiter = params["arbiter"].as(); pairWith(paired.c_str(), arbiter.c_str()); - } else { + } + else { pairWith(paired.c_str(), "-"); } - } else if (params.count("arbiter")) { - uasserted(10999,"specifying --arbiter without --pairwith"); + } + else if (params.count("arbiter")) { + out() << "specifying --arbiter without --pairwith" << endl; + dbexit( EXIT_BADOPTIONS ); } if( params.count("nssize") ) { int x = params["nssize"].as(); - uassert( 10034 , "bad --nssize arg", x > 0 && x <= (0x7fffffff/1024/1024)); + if (x <= 0 || x > (0x7fffffff/1024/1024)) { + out() << "bad --nssize arg" << endl; + dbexit( EXIT_BADOPTIONS ); 
+ } lenForNewNsFiles = x * 1024 * 1024; assert(lenForNewNsFiles > 0); } if (params.count("oplogSize")) { - long x = params["oplogSize"].as(); - uassert( 10035 , "bad --oplogSize arg", x > 0); + long long x = params["oplogSize"].as(); + if (x <= 0) { + out() << "bad --oplogSize arg" << endl; + dbexit( EXIT_BADOPTIONS ); + } + // note a small size such as x==1 is ok for an arbiter. + if( x > 1000 && sizeof(void*) == 4 ) { + out() << "--oplogSize of " << x << "MB is too big for 32 bit version. Use 64 bit build instead." << endl; + dbexit( EXIT_BADOPTIONS ); + } cmdLine.oplogSize = x * 1024 * 1024; assert(cmdLine.oplogSize > 0); } if (params.count("opIdMem")) { long x = params["opIdMem"].as(); - uassert( 10036 , "bad --opIdMem arg", x > 0); + if (x <= 0) { + out() << "bad --opIdMem arg" << endl; + dbexit( EXIT_BADOPTIONS ); + } replSettings.opIdMem = x; assert(replSettings.opIdMem > 0); } if (params.count("cacheSize")) { long x = params["cacheSize"].as(); - uassert( 10037 , "bad --cacheSize arg", x > 0); + if (x <= 0) { + out() << "bad --cacheSize arg" << endl; + dbexit( EXIT_BADOPTIONS ); + } log() << "--cacheSize option not currently supported" << endl; - //setRecCacheSize(x); - } - if (params.count("port") == 0 ) { - if( params.count("configsvr") ) { - cmdLine.port = CmdLine::ConfigServerPort; - } - if( params.count("shardsvr") ) - cmdLine.port = CmdLine::ShardServerPort; - } - else { - if ( cmdLine.port <= 0 || cmdLine.port > 65535 ){ + } + if (params.count("port") == 0 ) { + if( params.count("configsvr") ) { + cmdLine.port = CmdLine::ConfigServerPort; + } + if( params.count("shardsvr") ) + cmdLine.port = CmdLine::ShardServerPort; + } + else { + if ( cmdLine.port <= 0 || cmdLine.port > 65535 ) { out() << "bad --port number" << endl; dbexit( EXIT_BADOPTIONS ); } } - if ( params.count("configsvr" ) ){ + if ( params.count("configsvr" ) ) { + if (cmdLine.usingReplSets() || replSettings.master || replSettings.slave) { + log() << "replication should not be enabled on a config server" << endl; + ::exit(-1); + } if ( params.count( "diaglog" ) == 0 ) _diaglog.level = 1; if ( params.count( "dbpath" ) == 0 ) dbpath = "/data/configdb"; } - if ( params.count( "profile" ) ){ + if ( params.count( "profile" ) ) { cmdLine.defaultProfile = params["profile"].as(); } - if ( params.count( "maxConns" ) ){ + if ( params.count( "maxConns" ) ) { int newSize = params["maxConns"].as(); - uassert( 12507 , "maxConns has to be at least 5" , newSize >= 5 ); - uassert( 12508 , "maxConns can't be greater than 10000000" , newSize < 10000000 ); + if ( newSize < 5 ) { + out() << "maxConns has to be at least 5" << endl; + dbexit( EXIT_BADOPTIONS ); + } + else if ( newSize >= 10000000 ) { + out() << "maxConns can't be greater than 10000000" << endl; + dbexit( EXIT_BADOPTIONS ); + } connTicketHolder.resize( newSize ); } - if (params.count("nounixsocket")){ + if (params.count("nounixsocket")) { noUnixSocket = true; } - if (params.count("ipv6")){ + if (params.count("ipv6")) { enableIPv6(); } - if (params.count("noMoveParanoia")){ + if (params.count("noMoveParanoia")) { cmdLine.moveParanoia = false; } -#if defined(_WIN32) - if (params.count("serviceName")){ - string x = params["serviceName"].as(); - windowsServiceName = wstring(x.size(),L' '); - for ( size_t i=0; i(); - windowsServiceUser = wstring(x.size(),L' '); - for ( size_t i=0; i(); - windowsServicePassword = wstring(x.size(),L' '); - for ( size_t i=0; i command = params["command"].as< vector >(); - if (command[0].compare("msg") == 0) { - const char *m; - - if 
(command.size() < 3) { - cout << "Too few parameters to 'msg' command" << endl; - cout << visible_options << endl; - return 0; - } - - m = command[1].c_str(); - - msg(m, "127.0.0.1", atoi(command[2].c_str())); - return 0; - } if (command[0].compare("run") == 0) { if (command.size() > 1) { cout << "Too many parameters to 'run' command" << endl; @@ -1076,31 +1045,17 @@ int main(int argc, char* argv[], char *envp[] ) return 0; } + if( cmdLine.pretouch ) + log() << "--pretouch " << cmdLine.pretouch << endl; + #if defined(_WIN32) - if ( reinstallService ) { - ServiceController::removeService( windowsServiceName ); - } - if ( installService || reinstallService ) { - if ( !ServiceController::installService( windowsServiceName , L"Mongo DB", L"Mongo DB Server", windowsServiceUser, windowsServicePassword, dbpath, argc, argv ) ) - dbexit( EXIT_NTSERVICE_ERROR ); - dbexit( EXIT_CLEAN ); - } - else if ( removeService ) { - if ( !ServiceController::removeService( windowsServiceName ) ) - dbexit( EXIT_NTSERVICE_ERROR ); - dbexit( EXIT_CLEAN ); - } - else if ( startService ) { - if ( !ServiceController::startService( windowsServiceName , mongo::initService ) ) - dbexit( EXIT_NTSERVICE_ERROR ); - dbexit( EXIT_CLEAN ); + if (serviceParamsCheck( params, dbpath, argc, argv )) { + return 0; } #endif } - if( cmdLine.pretouch ) - log() << "--pretouch " << cmdLine.pretouch << endl; - + UnitTest::runTests(); initAndListen(cmdLine.port, appsrvPath); dbexit(EXIT_CLEAN); return 0; @@ -1113,12 +1068,11 @@ namespace mongo { #undef out void exitCleanly( ExitCode code ) { - goingAway = true; killCurrentOp.killAll(); { dblock lk; log() << "now exiting" << endl; - dbexit( code ); + dbexit( code ); } } @@ -1154,7 +1108,12 @@ namespace mongo { oss << "Backtrace:" << endl; printStackTrace( oss ); rawOut( oss.str() ); - dbexit( EXIT_ABRUBT ); + + if( cmdLine.dur ) { + ::exit(EXIT_ABRUPT); + } + + dbexit( EXIT_ABRUPT ); } sigset_t asyncSignals; @@ -1171,12 +1130,14 @@ namespace mongo { // this will be called in certain c++ error cases, for example if there are two active // exceptions void myterminate() { - rawOut( "terminate() called, printing stack:\n" ); + rawOut( "terminate() called, printing stack:" ); printStackTrace(); abort(); } - - void setupSignals() { + + void setupSignals_ignoreHelper( int signal ) {} + + void setupSignals( bool inFork ) { assert( signal(SIGSEGV, abruptQuit) != SIG_ERR ); assert( signal(SIGFPE, abruptQuit) != SIG_ERR ); assert( signal(SIGABRT, abruptQuit) != SIG_ERR ); @@ -1187,55 +1148,58 @@ namespace mongo { setupSIGTRAPforGDB(); sigemptyset( &asyncSignals ); - sigaddset( &asyncSignals, SIGHUP ); + + if ( inFork ) + assert( signal( SIGHUP , setupSignals_ignoreHelper ) != SIG_ERR ); + else + sigaddset( &asyncSignals, SIGHUP ); + sigaddset( &asyncSignals, SIGINT ); sigaddset( &asyncSignals, SIGTERM ); assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 ); boost::thread it( interruptThread ); - + set_terminate( myterminate ); } #else -void ctrlCTerminate() { - log() << "got kill or ctrl-c signal, will terminate after current cmd ends" << endl; - Client::initThread( "ctrlCTerminate" ); - exitCleanly( EXIT_KILL ); -} -BOOL CtrlHandler( DWORD fdwCtrlType ) -{ - switch( fdwCtrlType ) - { - case CTRL_C_EVENT: - rawOut("Ctrl-C signal\n"); - ctrlCTerminate(); - return( TRUE ); - case CTRL_CLOSE_EVENT: - rawOut("CTRL_CLOSE_EVENT signal\n"); - ctrlCTerminate(); - return( TRUE ); - case CTRL_BREAK_EVENT: - rawOut("CTRL_BREAK_EVENT signal\n"); - ctrlCTerminate(); - return TRUE; - case 
CTRL_LOGOFF_EVENT: - rawOut("CTRL_LOGOFF_EVENT signal (ignored)\n"); - return FALSE; - case CTRL_SHUTDOWN_EVENT: - rawOut("CTRL_SHUTDOWN_EVENT signal (ignored)\n"); - return FALSE; - default: - return FALSE; + void ctrlCTerminate() { + log() << "got kill or ctrl-c signal, will terminate after current cmd ends" << endl; + Client::initThread( "ctrlCTerminate" ); + exitCleanly( EXIT_KILL ); + } + BOOL CtrlHandler( DWORD fdwCtrlType ) { + switch( fdwCtrlType ) { + case CTRL_C_EVENT: + rawOut("Ctrl-C signal"); + ctrlCTerminate(); + return( TRUE ); + case CTRL_CLOSE_EVENT: + rawOut("CTRL_CLOSE_EVENT signal"); + ctrlCTerminate(); + return( TRUE ); + case CTRL_BREAK_EVENT: + rawOut("CTRL_BREAK_EVENT signal"); + ctrlCTerminate(); + return TRUE; + case CTRL_LOGOFF_EVENT: + rawOut("CTRL_LOGOFF_EVENT signal (ignored)"); + return FALSE; + case CTRL_SHUTDOWN_EVENT: + rawOut("CTRL_SHUTDOWN_EVENT signal (ignored)"); + return FALSE; + default: + return FALSE; + } } -} void myPurecallHandler() { - rawOut( "pure virtual method called, printing stack:\n" ); + rawOut( "pure virtual method called, printing stack:" ); printStackTrace(); - abort(); + abort(); } - - void setupSignals() { + + void setupSignals( bool inFork ) { if( SetConsoleCtrlHandler( (PHANDLER_ROUTINE) CtrlHandler, TRUE ) ) ; else @@ -1245,6 +1209,3 @@ BOOL CtrlHandler( DWORD fdwCtrlType ) #endif } // namespace mongo - -//#include "recstore.h" -//#include "reccache.h" diff --git a/db/db.h b/db/db.h index a261f58..7ef7d03 100644 --- a/db/db.h +++ b/db/db.h @@ -26,19 +26,6 @@ namespace mongo { // void jniCallback(Message& m, Message& out); - /* Note the limit here is rather arbitrary and is simply a standard. generally the code works - with any object that fits in ram. - - Also note that the server has some basic checks to enforce this limit but those checks are not exhaustive - for example need to check for size too big after - update $push (append) operation - various db.eval() type operations - - Note also we sometimes do work with objects slightly larger - an object in the replication local.oplog - could be slightly larger. 
- */ - const int MaxBSONObjectSize = 4 * 1024 * 1024; - /** * class to hold path + dbname -> Database * might be able to optimizer further @@ -48,8 +35,7 @@ namespace mongo { typedef map DBs; typedef map Paths; - DatabaseHolder() : _size(0){ - } + DatabaseHolder() : _size(0) { } bool isLoaded( const string& ns , const string& path ) const { dbMutex.assertAtLeastReadLocked(); @@ -57,29 +43,29 @@ namespace mongo { if ( x == _paths.end() ) return false; const DBs& m = x->second; - + string db = _todb( ns ); DBs::const_iterator it = m.find(db); return it != m.end(); } - + Database * get( const string& ns , const string& path ) const { dbMutex.assertAtLeastReadLocked(); Paths::const_iterator x = _paths.find( path ); if ( x == _paths.end() ) return 0; const DBs& m = x->second; - + string db = _todb( ns ); DBs::const_iterator it = m.find(db); - if ( it != m.end() ) + if ( it != m.end() ) return it->second; return 0; } - - void put( const string& ns , const string& path , Database * db ){ + + void put( const string& ns , const string& path , Database * db ) { dbMutex.assertWriteLocked(); DBs& m = _paths[path]; Database*& d = m[_todb(ns)]; @@ -87,35 +73,10 @@ namespace mongo { _size++; d = db; } - - Database* getOrCreate( const string& ns , const string& path , bool& justCreated ){ - dbMutex.assertWriteLocked(); - DBs& m = _paths[path]; - - string dbname = _todb( ns ); - - Database* & db = m[dbname]; - if ( db ){ - justCreated = false; - return db; - } - - log(1) << "Accessing: " << dbname << " for the first time" << endl; - try { - db = new Database( dbname.c_str() , justCreated , path ); - } - catch ( ... ){ - m.erase( dbname ); - throw; - } - _size++; - return db; - } - + Database* getOrCreate( const string& ns , const string& path , bool& justCreated ); - - void erase( const string& ns , const string& path ){ + void erase( const string& ns , const string& path ) { dbMutex.assertWriteLocked(); DBs& m = _paths[path]; _size -= (int)m.erase( _todb( ns ) ); @@ -124,71 +85,77 @@ namespace mongo { /* force - force close even if something underway - use at shutdown */ bool closeAll( const string& path , BSONObjBuilder& result, bool force ); - int size(){ + int size() { return _size; } - + + void forEach(boost::function f) const { + dbMutex.assertAtLeastReadLocked(); + for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) { + DBs m = i->second; + for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) { + f(j->second); + } + } + } + /** * gets all unique db names, ignoring paths */ void getAllShortNames( set& all ) const { dbMutex.assertAtLeastReadLocked(); - for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ){ + for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) { DBs m = i->second; - for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ){ + for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) { all.insert( j->first ); } } } private: - + string _todb( const string& ns ) const { string d = __todb( ns ); - uassert( 13280 , (string)"invalid db name: " + ns , Database::validDBName( d ) ); + uassert( 13280 , (string)"invalid db name: " + ns , Database::validDBName( d ) ); return d; } string __todb( const string& ns ) const { size_t i = ns.find( '.' 
); - if ( i == string::npos ){ + if ( i == string::npos ) { uassert( 13074 , "db name can't be empty" , ns.size() ); return ns; } uassert( 13075 , "db name can't be empty" , i > 0 ); return ns.substr( 0 , i ); } - + Paths _paths; int _size; - + }; extern DatabaseHolder dbHolder; - // shared functionality for removing references to a database from this program instance - // does not delete the files on disk - void closeDatabase( const char *cl, const string& path = dbpath ); - struct dbtemprelease { Client::Context * _context; int _locktype; - + dbtemprelease() { _context = cc().getContext(); _locktype = dbMutex.getState(); assert( _locktype ); - + if ( _locktype > 0 ) { - massert( 10298 , "can't temprelease nested write lock", _locktype == 1); + massert( 10298 , "can't temprelease nested write lock", _locktype == 1); if ( _context ) _context->unlocked(); dbMutex.unlock(); - } + } else { - massert( 10299 , "can't temprelease nested read lock", _locktype == -1); + massert( 10299 , "can't temprelease nested read lock", _locktype == -1); if ( _context ) _context->unlocked(); dbMutex.unlock_shared(); - } + } } ~dbtemprelease() { @@ -196,11 +163,11 @@ namespace mongo { dbMutex.lock(); else dbMutex.lock_shared(); - + if ( _context ) _context->relocked(); } }; - + /** only does a temp release if we're not nested and have a lock @@ -208,22 +175,22 @@ namespace mongo { struct dbtempreleasecond { dbtemprelease * real; int locktype; - - dbtempreleasecond(){ + + dbtempreleasecond() { real = 0; locktype = dbMutex.getState(); if ( locktype == 1 || locktype == -1 ) real = new dbtemprelease(); } - - ~dbtempreleasecond(){ - if ( real ){ + + ~dbtempreleasecond() { + if ( real ) { delete real; real = 0; } } - - bool unlocked(){ + + bool unlocked() { return real > 0; } }; diff --git a/db/db.sln b/db/db.sln deleted file mode 100644 index b02b79d..0000000 --- a/db/db.sln +++ /dev/null @@ -1,86 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 10.00 -# Visual Studio 2008 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{4082881B-EB00-486F-906C-843B8EC06E18}" - ProjectSection(SolutionItems) = preProject - driverHelpers.cpp = driverHelpers.cpp - EndProjectSection -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}" - ProjectSection(SolutionItems) = preProject - ..\tools\bridge.cpp = ..\tools\bridge.cpp - ..\tools\dump.cpp = ..\tools\dump.cpp - ..\tools\export.cpp = ..\tools\export.cpp - ..\tools\files.cpp = ..\tools\files.cpp - ..\tools\import.cpp = ..\tools\import.cpp - ..\tools\restore.cpp = ..\tools\restore.cpp - ..\tools\sniffer.cpp = ..\tools\sniffer.cpp - ..\tools\stat.cpp = ..\tools\stat.cpp - ..\tools\tool.cpp = ..\tools\tool.cpp - ..\tools\tool.h = ..\tools\tool.h - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "..\dbtests\test.vcproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}" - ProjectSection(SolutionItems) = preProject - ..\util\mmap_posix.cpp = ..\util\mmap_posix.cpp - ..\util\processinfo_darwin.cpp = ..\util\processinfo_darwin.cpp - 
..\util\processinfo_linux2.cpp = ..\util\processinfo_linux2.cpp - ..\util\processinfo_none.cpp = ..\util\processinfo_none.cpp - EndProjectSection -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "shell", "shell", "{407B4B88-3451-433C-B74F-31B31FEB5791}" - ProjectSection(SolutionItems) = preProject - ..\shell\dbshell.cpp = ..\shell\dbshell.cpp - ..\shell\mongo_vstudio.cpp = ..\shell\mongo_vstudio.cpp - ..\shell\utils.cpp = ..\shell\utils.cpp - ..\shell\utils.h = ..\shell\utils.h - EndProjectSection -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "other source files", "other source files", "{12B11474-2D74-48C3-BB3D-F03249BEA88F}" - ProjectSection(SolutionItems) = preProject - ..\buildscripts\buildboost.bat = ..\buildscripts\buildboost.bat - ..\buildscripts\buildboost64.bat = ..\buildscripts\buildboost64.bat - ..\SConstruct = ..\SConstruct - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bsondemo", "..\bson\bsondemo\bsondemo.vcproj", "{C9DB5EB7-81AA-4185-BAA1-DA035654402F}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Win32 = Debug|Win32 - Release|Win32 = Release|Win32 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32 - {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.Build.0 = Debug|Win32 - {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.ActiveCfg = Release|Win32 - {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.Build.0 = Release|Win32 - {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.ActiveCfg = Debug|Win32 - {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.Build.0 = Debug|Win32 - {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.ActiveCfg = Release|Win32 - {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.Build.0 = Release|Win32 - {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32 - {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32 - {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32 - {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32 - {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.ActiveCfg = Debug|Win32 - {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.Build.0 = Debug|Win32 - {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.ActiveCfg = Release|Win32 - {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.Build.0 = Release|Win32 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(NestedProjects) = preSolution - {2B262D59-9DC7-4BF1-A431-1BD4966899A5} = {12B11474-2D74-48C3-BB3D-F03249BEA88F} - {2F760952-C71B-4865-998F-AABAE96D1373} = {12B11474-2D74-48C3-BB3D-F03249BEA88F} - {407B4B88-3451-433C-B74F-31B31FEB5791} = {12B11474-2D74-48C3-BB3D-F03249BEA88F} - {4082881B-EB00-486F-906C-843B8EC06E18} = {12B11474-2D74-48C3-BB3D-F03249BEA88F} - EndGlobalSection -EndGlobal diff --git a/db/db.vcproj b/db/db.vcproj deleted file mode 100644 index 2eac6eb..0000000 --- a/db/db.vcproj +++ /dev/null @@ -1,1885 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
diff --git a/db/db.vcxproj b/db/db.vcxproj
index 0cabbd0..ad9c6d2 100644
--- a/db/db.vcxproj
+++ b/db/db.vcxproj
@@ -89,6 +89,10 @@
diff --git a/db/db.vcxproj.filters b/db/db.vcxproj.filters
index bf30b4e..a2011df 100755
--- a/db/db.vcxproj.filters
+++ b/db/db.vcxproj.filters
@@ -1,928 +1,329 @@
- - {d7f08f93-36bf-49cd-9e1c-ba1fec3234ce} - - - {e899caa1-9a90-4604-ac2e-68d5ca12425c} - - - {9775f24c-3a29-4e0d-b5de-991c592cf376} - - - {9aea1b83-cdcb-48a8-97e6-47805cacdc29} - - - {aff20a87-2efe-4861-930f-8780c08cbea5} - - - {2a0924a5-9bd9-4c86-a149-0df09dcb5548} - - - {03b0d798-b13d-48f4-930d-ca827e2a3f00} - - - {3b73f786-d352-446f-a5f5-df49384baf7a} - - - {4a1ea357-1077-4ad1-85b4-db48a6e1eb46} - + - - replSets - - - util\mongoutils - - - db - - - Resource Files - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - - - replSets\testing - + + + + + + + + + + + + + + + + + + + + + + - - libs - - - libs - - - libs - - - libs - - - - - Resource Files - + + + + \ No newline at end of file diff --git a/db/db_10.sln b/db/db_10.sln old mode 100644 new mode 100755 index d68d897..f74ac3d --- a/db/db_10.sln +++ b/db/db_10.sln @@ -8,16 +8,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{40 EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}" ProjectSection(SolutionItems) = preProject - ..\tools\bridge.cpp = ..\tools\bridge.cpp - ..\tools\bsondump.cpp = ..\tools\bsondump.cpp - ..\tools\dump.cpp = ..\tools\dump.cpp ..\tools\export.cpp = ..\tools\export.cpp - ..\tools\import.cpp = ..\tools\import.cpp - ..\tools\restore.cpp = ..\tools\restore.cpp ..\tools\sniffer.cpp = ..\tools\sniffer.cpp - ..\tools\stat.cpp = ..\tools\stat.cpp - ..\tools\tool.cpp = ..\tools\tool.cpp - ..\tools\tool.h = ..\tools\tool.h EndProjectSection EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}" diff --git a/db/dbcommands.cpp b/db/dbcommands.cpp index 7bd7203..8974bd3 100644 --- a/db/dbcommands.cpp +++ b/db/dbcommands.cpp @@ -40,11 +40,13 @@ #include "stats/counters.h" #include "background.h" #include "../util/version.h" +#include "../s/d_writeback.h" +#include "dur_stats.h" namespace mongo { extern int otherTraceLevel; - void flushOpLog( stringstream &ss ); + void flushDiagLog(); /* reset any errors so that getlasterror comes back clean. @@ -54,7 +56,7 @@ namespace mongo { */ class CmdResetError : public Command { public: - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } virtual bool logTheOp() { return false; @@ -74,8 +76,8 @@ namespace mongo { } } cmdResetError; - /* set by replica sets if specified in the configuration. - a pointer is used to avoid any possible locking issues with lockless reading (see below locktype() is NONE + /* set by replica sets if specified in the configuration. + a pointer is used to avoid any possible locking issues with lockless reading (see below locktype() is NONE and would like to keep that) (for now, it simply orphans any old copy as config changes should be extremely rare). note: once non-null, never goes to null again. 
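The comment above describes a publish-once pointer: getLastError runs with locktype NONE, so readers take no lock, and a reconfiguration simply swaps in a new object and deliberately orphans the old one. A minimal sketch of that pattern follows; the names are hypothetical and std::atomic is used only for illustration, this is not the upstream code:

    #include <atomic>
    #include "bson/bson.h"   // assumption: BSONObj comes from the bundled bson headers

    // Illustration only: publish-once default object read without locking.
    static std::atomic<mongo::BSONObj*> defaultWriteConcern(nullptr);

    void setDefaultWriteConcern(const mongo::BSONObj& d) {
        // Never delete the previous value: a lock-free reader may still hold
        // the old pointer, and config changes are rare, so the old copy is
        // simply orphaned (the "orphans any old copy" behavior noted above).
        defaultWriteConcern.store(new mongo::BSONObj(d.getOwned()));
    }

    const mongo::BSONObj* currentDefaultWriteConcern() {
        return defaultWriteConcern.load();   // no lock taken on the read path
    }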
@@ -84,33 +86,38 @@ namespace mongo { class CmdGetLastError : public Command { public: - virtual LockType locktype() const { return NONE; } - virtual bool requiresAuth() { return false; } - virtual bool logTheOp() { - return false; - } - virtual bool slaveOk() const { - return true; - } + CmdGetLastError() : Command("getLastError", false, "getlasterror") { } + virtual LockType locktype() const { return NONE; } + virtual bool requiresAuth() { return false; } + virtual bool logTheOp() { return false; } + virtual bool slaveOk() const { return true; } virtual void help( stringstream& help ) const { - help << "return error status of the last operation on this connection"; + help << "return error status of the last operation on this connection\n" + << "options:\n" + << " fsync - fsync before returning, or wait for journal commit if running with --dur\n" + << " w - await replication to w servers (including self) before returning\n" + << " wtimeout - timeout for w in milliseconds"; } - CmdGetLastError() : Command("getLastError", false, "getlasterror") {} - bool run(const string& dbnamne, BSONObj& _cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& _cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { LastError *le = lastError.disableForCommand(); + + bool err = false; + if ( le->nPrev != 1 ) - LastError::noError.appendSelf( result ); + err = LastError::noError.appendSelf( result , false ); else - le->appendSelf( result ); - + err = le->appendSelf( result , false ); + Client& c = cc(); c.appendLastOp( result ); + result.appendNumber( "connectionId" , c.getConnectionId() ); + BSONObj cmdObj = _cmdObj; - { + { BSONObj::iterator i(_cmdObj); i.next(); - if( !i.more() ) { + if( !i.more() ) { /* empty, use default */ BSONObj *def = getLastErrorDefault; if( def ) @@ -118,13 +125,27 @@ namespace mongo { } } - if ( cmdObj["fsync"].trueValue() ){ - log() << "fsync from getlasterror" << endl; - result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) ); + if ( cmdObj["fsync"].trueValue() ) { + Timer t; + if( !getDur().awaitCommit() ) { + // if get here, not running with --dur + log() << "fsync from getlasterror" << endl; + result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) ); + } + else { + // this perhaps is temp. how long we wait for the group commit to occur. + result.append( "waited", t.millis() ); + } } - + + if ( err ) { + // doesn't make sense to wait for replication + // if there was an error + return true; + } + BSONElement e = cmdObj["w"]; - if ( e.isNumber() ){ + if ( e.isNumber() ) { int timeout = cmdObj["wtimeout"].numberInt(); Timer t; @@ -132,15 +153,43 @@ namespace mongo { long long passes = 0; char buf[32]; - while ( 1 ){ - if ( opReplicatedEnough( c.getLastOp() , w ) ) - break; + while ( 1 ) { + OpTime op(c.getLastOp()); - if ( timeout > 0 && t.millis() >= timeout ){ + if ( op.isNull() ) { + if ( anyReplEnabled() ) { + result.append( "wnote" , "no write has been done on this connection" ); + } + else if ( w <= 1 ) { + // don't do anything + // w=1 and no repl, so this is fine + } + else { + // w=2 and no repl + result.append( "wnote" , "no replication has been enabled, so w=2+ won't work" ); + result.append( "err", "norepl" ); + return true; + } + break; + } + + // check this first for w=0 or w=1 + if ( opReplicatedEnough( op, w ) ) + break; + + // if replication isn't enabled (e.g., config servers) + if ( ! 
anyReplEnabled() ) { + result.append( "err", "norepl" ); + return true; + } + + + if ( timeout > 0 && t.millis() >= timeout ) { result.append( "wtimeout" , true ); errmsg = "timed out waiting for slaves"; result.append( "waited" , t.millis() ); - return false; + result.append( "err" , "timeout" ); + return true; } assert( sprintf( buf , "w block pass: %lld" , ++passes ) < 30 ); @@ -150,14 +199,15 @@ namespace mongo { } result.appendNumber( "wtime" , t.millis() ); } - + + result.appendNull( "err" ); return true; } } cmdGetLastError; class CmdGetPrevError : public Command { public: - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } virtual bool logTheOp() { return false; @@ -169,7 +219,7 @@ namespace mongo { return true; } CmdGetPrevError() : Command("getPrevError", false, "getpreverror") {} - bool run(const string& dbnamne, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { LastError *le = lastError.disableForCommand(); le->appendSelf( result ); if ( le->valid ) @@ -191,16 +241,16 @@ namespace mongo { virtual bool slaveOk() const { return false; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } CmdDropDatabase() : Command("dropDatabase") {} - bool run(const string& dbnamne, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { BSONElement e = cmdObj.firstElement(); - log() << "dropDatabase " << dbnamne << endl; + log() << "dropDatabase " << dbname << endl; int p = (int) e.number(); if ( p != 1 ) return false; - dropDatabase(dbnamne); - result.append( "dropped" , dbnamne ); + dropDatabase(dbname); + result.append( "dropped" , dbname ); return true; } } cmdDropDatabase; @@ -216,7 +266,7 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "repair database. also compacts. note: slow."; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } CmdRepairDatabase() : Command("repairDatabase") {} bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { BSONElement e = cmdObj.firstElement(); @@ -231,7 +281,7 @@ namespace mongo { return repairDatabase( dbname, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles ); } } cmdRepairDatabase; - + /* set db profiling level todo: how do we handle profiling information put in the db with replication? sensibly or not? 
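The getLastError rework above adds per-connection write acknowledgement: fsync (or a journal-commit wait under --dur), w to await replication to N members, wtimeout in milliseconds, and a structured err field that is null on success. A hedged usage sketch from the client side; DBClientConnection and the BSON macro come from client/dbclient.h, the collection name is made up, and field semantics follow the help text above:

    #include "client/dbclient.h"
    using namespace mongo;

    // Sketch: perform a write, then wait until it has replicated to two
    // members (including self) or five seconds have elapsed.
    bool insertAcknowledged(DBClientConnection& c) {
        c.insert("test.people", BSON("name" << "eliot"));

        BSONObj res;
        c.runCommand("test",
                     BSON("getLastError" << 1 << "w" << 2 << "wtimeout" << 5000),
                     res);

        // On success "err" is null; per the hunk above, a timeout now returns
        // ok:1 with wtimeout:true and err:"timeout" instead of failing.
        return res["err"].isNull();
    }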
@@ -245,9 +295,10 @@ namespace mongo { help << "enable or disable performance profiling\n"; help << "{ profile : }\n"; help << "0=off 1=log slow ops 2=log all\n"; + help << "-1 to get current values\n"; help << "http://www.mongodb.org/display/DOCS/Database+Profiler"; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } CmdProfile() : Command("profile") {} bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { BSONElement e = cmdObj.firstElement(); @@ -256,7 +307,7 @@ namespace mongo { int p = (int) e.number(); bool ok = false; - + if ( p == -1 ) ok = true; else if ( p >= 0 && p <= 2 ) { @@ -266,7 +317,7 @@ namespace mongo { BSONElement slow = cmdObj["slowms"]; if ( slow.isNumber() ) cmdLine.slowMS = slow.numberInt(); - + return ok; } } cmdProfile; @@ -279,8 +330,8 @@ namespace mongo { CmdServerStatus() : Command("serverStatus", true) { started = time(0); } - - virtual LockType locktype() const { return NONE; } + + virtual LockType locktype() const { return NONE; } virtual void help( stringstream& help ) const { help << "returns lots of administrative server statistics"; @@ -291,9 +342,11 @@ namespace mongo { BSONObjBuilder timeBuilder(128); - bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); + bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); + result.append( "host" , prettyHostName() ); result.append("version", versionString); + result.append("process","mongod"); result.append("uptime",(double) (time(0)-started)); result.append("uptimeEstimate",(double) (start/1000)); result.appendDate( "localTime" , jsTime() ); @@ -309,27 +362,41 @@ namespace mongo { t.append("totalTime", tt); t.append("lockTime", tl); t.append("ratio", (tt ? tl/tt : 0)); - - BSONObjBuilder ttt( t.subobjStart( "currentQueue" ) ); - int w=0, r=0; - Client::recommendedYieldMicros( &w , &r ); - ttt.append( "total" , w + r ); - ttt.append( "readers" , r ); - ttt.append( "writers" , w ); - ttt.done(); + + { + BSONObjBuilder ttt( t.subobjStart( "currentQueue" ) ); + int w=0, r=0; + Client::recommendedYieldMicros( &w , &r ); + ttt.append( "total" , w + r ); + ttt.append( "readers" , r ); + ttt.append( "writers" , w ); + ttt.done(); + } + + { + BSONObjBuilder ttt( t.subobjStart( "activeClients" ) ); + int w=0, r=0; + Client::getActiveClientCount( w , r ); + ttt.append( "total" , w + r ); + ttt.append( "readers" , r ); + ttt.append( "writers" , w ); + ttt.done(); + } + + result.append( "globalLock" , t.obj() ); } timeBuilder.appendNumber( "after basic" , Listener::getElapsedTimeMillis() - start ); - if ( authed ){ - + { + BSONObjBuilder t( result.subobjStart( "mem" ) ); - + t.append("bits", ( sizeof(int*) == 4 ? 
32 : 64 ) ); ProcessInfo p; - if ( p.supported() ){ + if ( p.supported() ) { t.appendNumber( "resident" , p.getResidentSize() ); t.appendNumber( "virtual" , p.getVirtualMemorySize() ); t.appendBool( "supported" , true ); @@ -338,14 +405,16 @@ namespace mongo { result.append( "note" , "not all mem info support on this platform" ); t.appendBool( "supported" , false ); } - + + timeBuilder.appendNumber( "middle of mem" , Listener::getElapsedTimeMillis() - start ); + t.appendNumber( "mapped" , MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ); t.done(); - + } - timeBuilder.appendNumber( "after is authed" , Listener::getElapsedTimeMillis() - start ); - + timeBuilder.appendNumber( "after mem" , Listener::getElapsedTimeMillis() - start ); + { BSONObjBuilder bb( result.subobjStart( "connections" ) ); bb.append( "current" , connTicketHolder.used() ); @@ -353,15 +422,15 @@ namespace mongo { bb.done(); } timeBuilder.appendNumber( "after connections" , Listener::getElapsedTimeMillis() - start ); - - if ( authed ){ + + { BSONObjBuilder bb( result.subobjStart( "extra_info" ) ); bb.append("note", "fields vary by platform"); ProcessInfo p; p.getExtraInfo(bb); bb.done(); timeBuilder.appendNumber( "after extra info" , Listener::getElapsedTimeMillis() - start ); - + } { @@ -369,31 +438,43 @@ namespace mongo { globalIndexCounters.append( bb ); bb.done(); } - + { BSONObjBuilder bb( result.subobjStart( "backgroundFlushing" ) ); globalFlushCounters.append( bb ); bb.done(); } - + { BSONObjBuilder bb( result.subobjStart( "cursors" ) ); ClientCursor::appendStats( bb ); bb.done(); } - - timeBuilder.appendNumber( "after counters" , Listener::getElapsedTimeMillis() - start ); - if ( anyReplEnabled() ){ + { + BSONObjBuilder bb( result.subobjStart( "network" ) ); + networkCounter.append( bb ); + bb.done(); + } + + + timeBuilder.appendNumber( "after counters" , Listener::getElapsedTimeMillis() - start ); + + if ( anyReplEnabled() ) { BSONObjBuilder bb( result.subobjStart( "repl" ) ); appendReplicationInfo( bb , authed , cmdObj["repl"].numberInt() ); bb.done(); + + if ( ! _isMaster() ) { + result.append( "opcountersRepl" , replOpCounters.getObj() ); + } + } - timeBuilder.appendNumber( "after repl" , Listener::getElapsedTimeMillis() - start ); - + timeBuilder.appendNumber( "after repl" , Listener::getElapsedTimeMillis() - start ); + result.append( "opcounters" , globalOpCounters.getObj() ); - + { BSONObjBuilder asserts( result.subobjStart( "asserts" ) ); asserts.append( "regular" , assertionCount.regular ); @@ -404,12 +485,18 @@ namespace mongo { asserts.done(); } - timeBuilder.appendNumber( "after asserts" , Listener::getElapsedTimeMillis() - start ); + timeBuilder.appendNumber( "after asserts" , Listener::getElapsedTimeMillis() - start ); + + result.append( "writeBacksQueued" , ! writeBackManager.queuesEmpty() ); + + if( cmdLine.dur ) { + result.append("dur", dur::stats.asObj()); + } if ( ! 
authed ) result.append( "note" , "run against admin for more info" ); - - if ( Listener::getElapsedTimeMillis() - start > 1000 ){ + + if ( Listener::getElapsedTimeMillis() - start > 1000 ) { BSONObj t = timeBuilder.obj(); log() << "serverStatus was very slow: " << t << endl; result.append( "timing" , t ); @@ -426,7 +513,7 @@ namespace mongo { return true; } virtual void help( stringstream& help ) const { help << "internal"; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } CmdGetOpTime() : Command("getoptime") { } bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { writelock l( "" ); @@ -456,12 +543,10 @@ namespace mongo { return true; } void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Monitoring+and+Diagnostics#MonitoringandDiagnostics-DatabaseRecord%2FReplay"; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() ); - stringstream ss; - flushOpLog( ss ); - out() << ss.str() << endl; + flushDiagLog(); if ( !cmdLine.quiet ) tlog() << "CMD: diagLogging set to " << _diaglog.level << " from: " << was << endl; result.append( "was" , was ); @@ -471,7 +556,7 @@ namespace mongo { /* remove bit from a bit array - actually remove its slot, not a clear note: this function does not work with x == 63 -- that is ok - but keep in mind in the future if max indexes were extended to + but keep in mind in the future if max indexes were extended to exactly 64 it would be a problem */ unsigned long long removeBit(unsigned long long b, int x) { @@ -499,6 +584,7 @@ namespace mongo { BackgroundOperation::assertNoBgOpInProgForNs(ns); + d = d->writingWithExtra(); d->aboutToDeleteAnIndex(); /* there may be pointers pointing at keys in the btree(s). kill them. */ @@ -513,7 +599,8 @@ namespace mongo { for ( int i = 0; i < d->nIndexes; i++ ) { if ( !mayDeleteIdIndex && d->idx(i).isIdIndex() ) { idIndex = &d->idx(i); - } else { + } + else { d->idx(i).kill_idx(); } } @@ -526,9 +613,9 @@ namespace mongo { /* assuming here that id index is not multikey: */ d->multiKeyIndexBits = 0; assureSysIndexesEmptied(ns, idIndex); - anObjBuilder.append("msg", mayDeleteIdIndex ? - "indexes dropped for collection" : - "non-_id indexes dropped for collection"); + anObjBuilder.append("msg", mayDeleteIdIndex ? + "indexes dropped for collection" : + "non-_id indexes dropped for collection"); } else { // delete just one index @@ -551,9 +638,10 @@ namespace mongo { d->nIndexes--; for ( int i = x; i < d->nIndexes; i++ ) d->idx(i) = d->idx(i+1); - } else { + } + else { int n = removeFromSysIndexes(ns, name); // just in case an orphaned listing there - i.e. should have been repaired but wasn't - if( n ) { + if( n ) { log() << "info: removeFromSysIndexes cleaned up " << n << " entries" << endl; } log() << "dropIndexes: " << name << " not found" << endl; @@ -578,7 +666,7 @@ namespace mongo { return false; } virtual void help( stringstream& help ) const { help << "drop a collection\n{drop : }"; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string nsToDrop = dbname + '.' 
+ cmdObj.firstElement().valuestr(); NamespaceDetails *d = nsdetails(nsToDrop.c_str()); @@ -597,7 +685,7 @@ namespace mongo { /* select count(*) */ class CmdCount : public Command { public: - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } CmdCount() : Command("count") { } virtual bool logTheOp() { return false; @@ -619,7 +707,7 @@ namespace mongo { long long n = runCount(ns.c_str(), cmdObj, err); long long nn = n; bool ok = true; - if ( n == -1 ){ + if ( n == -1 ) { nn = 0; result.appendBool( "missing" , true ); } @@ -647,7 +735,7 @@ namespace mongo { virtual bool adminOnly() const { return false; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } virtual void help( stringstream& help ) const { help << "create a collection"; } @@ -670,7 +758,7 @@ namespace mongo { virtual bool slaveOk() const { return false; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } virtual void help( stringstream& help ) const { help << "drop indexes for a collection"; } @@ -686,9 +774,9 @@ namespace mongo { if ( f.type() == String ) { return dropIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false ); } - else if ( f.type() == Object ){ + else if ( f.type() == Object ) { int idxId = d->findIndexByKeyPattern( f.embeddedObject() ); - if ( idxId < 0 ){ + if ( idxId < 0 ) { errmsg = "can't find index with key:"; errmsg += f.embeddedObject().toString(); return false; @@ -715,7 +803,7 @@ namespace mongo { public: virtual bool logTheOp() { return false; } // only reindexes on the one node virtual bool slaveOk() const { return true; } // can reindex on a secondary - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } virtual void help( stringstream& help ) const { help << "re-index a collection"; } @@ -729,7 +817,7 @@ namespace mongo { tlog() << "CMD: reIndex " << toDeleteNs << endl; BackgroundOperation::assertNoBgOpInProgForNs(toDeleteNs.c_str()); - if ( ! d ){ + if ( ! d ) { errmsg = "ns not found"; return false; } @@ -737,7 +825,7 @@ namespace mongo { list all; auto_ptr i = db.getIndexes( toDeleteNs ); BSONObjBuilder b; - while ( i->more() ){ + while ( i->more() ) { BSONObj o = i->next().getOwned(); b.append( BSONObjBuilder::numStr( all.size() ) , o ); all.push_back( o ); @@ -745,12 +833,12 @@ namespace mongo { bool ok = dropIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true ); - if ( ! ok ){ + if ( ! 
ok ) { errmsg = "dropIndexes failed"; return false; } - for ( list::iterator i=all.begin(); i!=all.end(); i++ ){ + for ( list::iterator i=all.begin(); i!=all.end(); i++ ) { BSONObj o = *i; theDataFileMgr.insertWithObjMod( Namespace( toDeleteNs.c_str() ).getSisterNS( "system.indexes" ).c_str() , o , true ); } @@ -773,9 +861,9 @@ namespace mongo { virtual bool adminOnly() const { return true; } - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } virtual void help( stringstream& help ) const { help << "list databases on this server"; } - CmdListDatabases() : Command("listDatabases") {} + CmdListDatabases() : Command("listDatabases" , true ) {} bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { vector< string > dbNames; getDatabaseNames( dbNames ); @@ -795,11 +883,11 @@ namespace mongo { seen.insert( i->c_str() ); } - + // TODO: erh 1/1/2010 I think this is broken where path != dbpath ?? set allShortNames; dbHolder.getAllShortNames( allShortNames ); - for ( set::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ){ + for ( set::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ) { string name = *i; if ( seen.count( name ) ) @@ -819,33 +907,45 @@ namespace mongo { } } cmdListDatabases; - /* note an access to a database right after this will open it back up - so this is mainly - for diagnostic purposes. + /* note an access to a database right after this will open it back up - so this is mainly + for diagnostic purposes. */ class CmdCloseAllDatabases : public Command { public: virtual void help( stringstream& help ) const { help << "Close all database files.\nA new request will cause an immediate reopening; thus, this is mostly for testing purposes."; } virtual bool adminOnly() const { return true; } virtual bool slaveOk() const { return false; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {} bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { - return dbHolder.closeAll( dbpath , result, false ); + bool ok; + try { + ok = dbHolder.closeAll( dbpath , result, false ); + } + catch(DBException&) { + throw; + } + catch(...) { + log() << "ERROR uncaught exception in command closeAllDatabases" << endl; + errmsg = "unexpected uncaught exception"; + return false; + } + return ok; } } cmdCloseAllDatabases; class CmdFileMD5 : public Command { public: - CmdFileMD5() : Command( "filemd5" ){} + CmdFileMD5() : Command( "filemd5" ) {} virtual bool slaveOk() const { return true; } virtual void help( stringstream& help ) const { help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }"; } - virtual LockType locktype() const { return READ; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + virtual LockType locktype() const { return READ; } + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string ns = dbname; ns += "."; { @@ -867,8 +967,8 @@ namespace mongo { scoped_ptr cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns.c_str())); int n = 0; - while ( cursor->ok() ){ - if ( ! cursor->matcher()->matchesCurrent( cursor.get() ) ){ + while ( cursor->ok() ) { + if ( ! 
cursor->matcher()->matchesCurrent( cursor.get() ) ) { log() << "**** NOT MATCHING ****" << endl; PRINT(cursor->current()); cursor->advance(); @@ -884,7 +984,7 @@ namespace mongo { BSONElement ne = obj["n"]; assert(ne.isNumber()); int myn = ne.numberInt(); - if ( n != myn ){ + if ( n != myn ) { log() << "should have chunk: " << n << " have:" << myn << endl; DBDirectClient client; @@ -902,12 +1002,13 @@ namespace mongo { md5_append( &st , (const md5_byte_t*)(data) , len ); n++; - } catch (...) { + } + catch (...) { yield.relock(); // needed before yield goes out of scope throw; } - if ( ! yield.stillOk() ){ + if ( ! yield.stillOk() ) { uasserted(13281, "File deleted during filemd5 command"); } } @@ -932,15 +1033,15 @@ namespace mongo { public: CmdDatasize() : Command( "dataSize", false, "datasize" ) {} virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } virtual void help( stringstream &help ) const { help << - "determine data size for a set of data in a certain range" - "\nexample: { datasize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }" - "\nkeyPattern, min, and max parameters are optional." - "\nnote: This command may take a while to run"; + "determine data size for a set of data in a certain range" + "\nexample: { dataSize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }" + "\nkeyPattern, min, and max parameters are optional." + "\nnote: This command may take a while to run"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { Timer timer; string ns = jsobj.firstElement().String(); @@ -951,39 +1052,39 @@ namespace mongo { Client::Context ctx( ns ); NamespaceDetails *d = nsdetails(ns.c_str()); - - if ( ! d || d->nrecords == 0 ){ + + if ( ! 
d || d->stats.nrecords == 0 ) { result.appendNumber( "size" , 0 ); result.appendNumber( "numObjects" , 0 ); result.append( "millis" , timer.millis() ); return true; } - + result.appendBool( "estimate" , estimate ); shared_ptr c; if ( min.isEmpty() && max.isEmpty() ) { - if ( estimate ){ - result.appendNumber( "size" , d->datasize ); - result.appendNumber( "numObjects" , d->nrecords ); + if ( estimate ) { + result.appendNumber( "size" , d->stats.datasize ); + result.appendNumber( "numObjects" , d->stats.nrecords ); result.append( "millis" , timer.millis() ); return 1; } c = theDataFileMgr.findAll( ns.c_str() ); - } + } else if ( min.isEmpty() || max.isEmpty() ) { errmsg = "only one of min or max specified"; return false; - } + } else { IndexDetails *idx = cmdIndexDetailsForRange( ns.c_str(), errmsg, min, max, keyPattern ); if ( idx == 0 ) return false; - + c.reset( new BtreeCursor( d, d->idxNo(*idx), *idx, min, max, false, 1 ) ); } - - long long avgObjSize = d->datasize / d->nrecords; + + long long avgObjSize = d->stats.datasize / d->stats.nrecords; long long maxSize = jsobj["maxSize"].numberLong(); long long maxObjects = jsobj["maxObjects"].numberLong(); @@ -996,11 +1097,11 @@ namespace mongo { size += avgObjSize; else size += c->currLoc().rec()->netLength(); - + numObjects++; - - if ( ( maxSize && size > maxSize ) || - ( maxObjects && numObjects > maxObjects ) ){ + + if ( ( maxSize && size > maxSize ) || + ( maxObjects && numObjects > maxObjects ) ) { result.appendBool( "maxReached" , true ); break; } @@ -1010,7 +1111,7 @@ namespace mongo { ostringstream os; os << "Finding size for ns: " << ns; - if ( ! min.isEmpty() ){ + if ( ! min.isEmpty() ) { os << " between " << min << " and " << max; } logIfSlow( timer , os.str() ); @@ -1023,27 +1124,27 @@ namespace mongo { } cmdDatasize; namespace { - long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ){ + long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ) { dbMutex.assertAtLeastReadLocked(); NamespaceDetails * nsd = nsdetails( ns.c_str() ); if ( ! nsd ) return 0; - - long long totalSize = 0; + + long long totalSize = 0; NamespaceDetails::IndexIterator ii = nsd->ii(); - while ( ii.more() ){ + while ( ii.more() ) { IndexDetails& d = ii.next(); string collNS = d.indexNamespace(); NamespaceDetails * mine = nsdetails( collNS.c_str() ); - if ( ! mine ){ + if ( ! mine ) { log() << "error: have index [" << collNS << "] but no NamespaceDetails" << endl; continue; } - totalSize += mine->datasize; + totalSize += mine->stats.datasize; if ( details ) - details->appendNumber( d.indexName() , mine->datasize / scale ); + details->appendNumber( d.indexName() , mine->stats.datasize / scale ); } return totalSize; } @@ -1053,42 +1154,48 @@ namespace mongo { public: CollectionStats() : Command( "collStats", false, "collstats" ) {} virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } virtual void help( stringstream &help ) const { help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. for KB use 1024"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string ns = dbname + "." 
+ jsobj.firstElement().valuestr(); Client::Context cx( ns ); - + NamespaceDetails * nsd = nsdetails( ns.c_str() ); - if ( ! nsd ){ + if ( ! nsd ) { errmsg = "ns not found"; return false; } result.append( "ns" , ns.c_str() ); - + int scale = 1; - if ( jsobj["scale"].isNumber() ){ + if ( jsobj["scale"].isNumber() ) { scale = jsobj["scale"].numberInt(); - if ( scale <= 0 ){ + if ( scale <= 0 ) { errmsg = "scale has to be > 0"; return false; } - + } - else if ( jsobj["scale"].trueValue() ){ + else if ( jsobj["scale"].trueValue() ) { errmsg = "scale has to be a number > 0"; return false; } - long long size = nsd->datasize / scale; - result.appendNumber( "count" , nsd->nrecords ); + bool verbose = jsobj["verbose"].trueValue(); + + long long size = nsd->stats.datasize / scale; + result.appendNumber( "count" , nsd->stats.nrecords ); result.appendNumber( "size" , size ); - result.append ( "avgObjSize" , double(size) / double(nsd->nrecords) ); + if( nsd->stats.nrecords ) + result.append ( "avgObjSize" , double(size) / double(nsd->stats.nrecords) ); + int numExtents; - result.appendNumber( "storageSize" , nsd->storageSize( &numExtents ) / scale ); + BSONArrayBuilder extents; + + result.appendNumber( "storageSize" , nsd->storageSize( &numExtents , verbose ? &extents : 0 ) / scale ); result.append( "numExtents" , numExtents ); result.append( "nindexes" , nsd->nIndexes ); result.append( "lastExtentSize" , nsd->lastExtentSize / scale ); @@ -1098,12 +1205,15 @@ namespace mongo { BSONObjBuilder indexSizes; result.appendNumber( "totalIndexSize" , getIndexSizeForCollection(dbname, ns, &indexSizes, scale) / scale ); result.append("indexSizes", indexSizes.obj()); - - if ( nsd->capped ){ + + if ( nsd->capped ) { result.append( "capped" , nsd->capped ); result.append( "max" , nsd->max ); } + if ( verbose ) + result.appendArray( "extents" , extents.arr() ); + return true; } } cmdCollectionStatis; @@ -1112,11 +1222,11 @@ namespace mongo { public: DBStats() : Command( "dbStats", false, "dbstats" ) {} virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } virtual void help( stringstream &help ) const { - help << " example: { dbstats:1 } "; + help << " example: { dbStats:1 } "; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { list collections; Database* d = cc().database(); if ( d ) @@ -1130,19 +1240,19 @@ namespace mongo { long long indexes = 0; long long indexSize = 0; - for (list::const_iterator it = collections.begin(); it != collections.end(); ++it){ + for (list::const_iterator it = collections.begin(); it != collections.end(); ++it) { const string ns = *it; NamespaceDetails * nsd = nsdetails( ns.c_str() ); - if ( ! nsd ){ + if ( ! nsd ) { errmsg = "missing ns: "; errmsg += ns; return false; } ncollections += 1; - objects += nsd->nrecords; - size += nsd->datasize; + objects += nsd->stats.nrecords; + size += nsd->stats.datasize; int temp; storageSize += nsd->storageSize( &temp ); @@ -1151,10 +1261,11 @@ namespace mongo { indexes += nsd->nIndexes; indexSize += getIndexSizeForCollection(dbname, ns); } - + + result.append ( "db" , dbname ); result.appendNumber( "collections" , ncollections ); result.appendNumber( "objects" , objects ); - result.append ( "avgObjSize" , double(size) / double(objects) ); + result.append ( "avgObjSize" , objects == 0 ? 
0 : double(size) / double(objects) ); result.appendNumber( "dataSize" , size ); result.appendNumber( "storageSize" , storageSize); result.appendNumber( "numExtents" , numExtents ); @@ -1162,7 +1273,7 @@ namespace mongo { result.appendNumber( "indexSize" , indexSize ); result.appendNumber( "fileSize" , d->fileSize() ); - return true; + return true; } } cmdDBStats; @@ -1171,11 +1282,11 @@ namespace mongo { public: CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {} virtual bool slaveOk() const { return false; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } virtual void help( stringstream &help ) const { help << "{ cloneCollectionAsCapped:, toCollection:, size: }"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string from = jsobj.getStringField( "cloneCollectionAsCapped" ); string to = jsobj.getStringField( "toCollection" ); long long size = (long long)jsobj.getField( "size" ).number(); @@ -1189,7 +1300,7 @@ namespace mongo { string toNs = dbname + "." + to; NamespaceDetails *nsd = nsdetails( fromNs.c_str() ); massert( 10301 , "source collection " + fromNs + " does not exist", nsd ); - long long excessSize = nsd->datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size' + long long excessSize = nsd->stats.datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size' DiskLoc extent = nsd->firstExtent; for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) { excessSize -= extent.ext()->length; @@ -1202,7 +1313,7 @@ namespace mongo { { shared_ptr c = theDataFileMgr.findAll( fromNs.c_str(), startLoc ); ClientCursor *cc = new ClientCursor(0, c, fromNs.c_str()); - id = cc->cursorid; + id = cc->cursorid(); } DBDirectClient client; @@ -1223,20 +1334,20 @@ namespace mongo { } } cmdCloneCollectionAsCapped; - /* jan2010: - Converts the given collection to a capped collection w/ the specified size. - This command is not highly used, and is not currently supported with sharded - environments. + /* jan2010: + Converts the given collection to a capped collection w/ the specified size. + This command is not highly used, and is not currently supported with sharded + environments. */ class CmdConvertToCapped : public Command { public: CmdConvertToCapped() : Command( "convertToCapped" ) {} virtual bool slaveOk() const { return false; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } virtual void help( stringstream &help ) const { help << "{ convertToCapped:, size: }"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { BackgroundOperation::assertNoBgOpInProgForDb(dbname.c_str()); string from = jsobj.getStringField( "convertToCapped" ); @@ -1247,13 +1358,16 @@ namespace mongo { return false; } + string shortTmpName = str::stream() << ".tmp.convertToCapped." << from; + string longTmpName = str::stream() << dbname << "." << shortTmpName; + DBDirectClient client; - client.dropCollection( dbname + "." 
+ from + ".$temp_convertToCapped" ); + client.dropCollection( longTmpName ); BSONObj info; if ( !client.runCommand( dbname , - BSON( "cloneCollectionAsCapped" << from << "toCollection" << ( from + ".$temp_convertToCapped" ) << "size" << double( size ) ), - info ) ) { + BSON( "cloneCollectionAsCapped" << from << "toCollection" << shortTmpName << "size" << double( size ) ), + info ) ) { errmsg = "cloneCollectionAsCapped failed: " + info.toString(); return false; } @@ -1264,9 +1378,9 @@ namespace mongo { } if ( !client.runCommand( "admin", - BSON( "renameCollection" << ( dbname + "." + from + ".$temp_convertToCapped" ) - << "to" << ( dbname + "." + from ) ), - info ) ) { + BSON( "renameCollection" << longTmpName << + "to" << ( dbname + "." + from ) ), + info ) ) { errmsg = "renameCollection failed: " + info.toString(); return false; } @@ -1275,239 +1389,15 @@ namespace mongo { } } cmdConvertToCapped; - class GroupCommand : public Command { - public: - GroupCommand() : Command("group"){} - virtual LockType locktype() const { return READ; } - virtual bool slaveOk() const { return true; } - virtual bool slaveOverrideOk() { return true; } - virtual void help( stringstream &help ) const { - help << "http://www.mongodb.org/display/DOCS/Aggregation"; - } - - BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ){ - if ( func ){ - BSONObjBuilder b( obj.objsize() + 32 ); - b.append( "0" , obj ); - int res = s->invoke( func , b.obj() ); - uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 ); - int type = s->type("return"); - uassert( 10042 , "return of $key has to be an object" , type == Object ); - return s->getObject( "return" ); - } - return obj.extractFields( keyPattern , true ); - } - - bool group( string realdbname , const string& ns , const BSONObj& query , - BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope , - BSONObj initial , string finalize , - string& errmsg , BSONObjBuilder& result ){ - - - auto_ptr s = globalScriptEngine->getPooledScope( realdbname ); - s->localConnect( realdbname.c_str() ); - - if ( reduceScope ) - s->init( reduceScope ); - - s->setObject( "$initial" , initial , true ); - - s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 ); - s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 ); - ScriptingFunction f = s->createFunction( - "function(){ " - " if ( $arr[n] == null ){ " - " next = {}; " - " Object.extend( next , $key ); " - " Object.extend( next , $initial , true ); " - " $arr[n] = next; " - " next = null; " - " } " - " $reduce( obj , $arr[n] ); " - "}" ); - - ScriptingFunction keyFunction = 0; - if ( keyFunctionCode.size() ){ - keyFunction = s->createFunction( keyFunctionCode.c_str() ); - } - - - double keysize = keyPattern.objsize() * 3; - double keynum = 1; - - map map; - list blah; - - shared_ptr cursor = bestGuessCursor(ns.c_str() , query , BSONObj() ); - - while ( cursor->ok() ){ - if ( cursor->matcher() && ! 
cursor->matcher()->matchesCurrent( cursor.get() ) ){ - cursor->advance(); - continue; - } - - BSONObj obj = cursor->current(); - cursor->advance(); - - BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() ); - keysize += key.objsize(); - keynum++; - - int& n = map[key]; - if ( n == 0 ){ - n = map.size(); - s->setObject( "$key" , key , true ); - - uassert( 10043 , "group() can't handle more than 10000 unique keys" , n <= 10000 ); - } - - s->setObject( "obj" , obj , true ); - s->setNumber( "n" , n - 1 ); - if ( s->invoke( f , BSONObj() , 0 , true ) ){ - throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() ); - } - } - - if (!finalize.empty()){ - s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 ); - ScriptingFunction g = s->createFunction( - "function(){ " - " for(var i=0; i < $arr.length; i++){ " - " var ret = $finalize($arr[i]); " - " if (ret !== undefined) " - " $arr[i] = ret; " - " } " - "}" ); - s->invoke( g , BSONObj() , 0 , true ); - } - - result.appendArray( "retval" , s->getObject( "$arr" ) ); - result.append( "count" , keynum - 1 ); - result.append( "keys" , (int)(map.size()) ); - s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 ); - s->gc(); - - return true; - } - - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - - /* db.$cmd.findOne( { group :
<p>
} ) */ - const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck(); - - BSONObj q; - if ( p["cond"].type() == Object ) - q = p["cond"].embeddedObject(); - else if ( p["condition"].type() == Object ) - q = p["condition"].embeddedObject(); - else - q = getQuery( p ); - - if ( p["ns"].type() != String ){ - errmsg = "ns has to be set"; - return false; - } - - string ns = dbname + "." + p["ns"].String(); - - BSONObj key; - string keyf; - if ( p["key"].type() == Object ){ - key = p["key"].embeddedObjectUserCheck(); - if ( ! p["$keyf"].eoo() ){ - errmsg = "can't have key and $keyf"; - return false; - } - } - else if ( p["$keyf"].type() ){ - keyf = p["$keyf"]._asCode(); - } - else { - // no key specified, will use entire object as key - } - - BSONElement reduce = p["$reduce"]; - if ( reduce.eoo() ){ - errmsg = "$reduce has to be set"; - return false; - } - - BSONElement initial = p["initial"]; - if ( initial.type() != Object ){ - errmsg = "initial has to be an object"; - return false; - } - - - string finalize; - if (p["finalize"].type()) - finalize = p["finalize"]._asCode(); - - return group( dbname , ns , q , - key , keyf , reduce._asCode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() , - initial.embeddedObject() , finalize , - errmsg , result ); - } - - } cmdGroup; - - - class DistinctCommand : public Command { - public: - DistinctCommand() : Command("distinct"){} - virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return READ; } - virtual void help( stringstream &help ) const { - help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }"; - } - - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - string ns = dbname + '.' + cmdObj.firstElement().valuestr(); - - string key = cmdObj["key"].valuestrsafe(); - BSONObj keyPattern = BSON( key << 1 ); - - BSONObj query = getQuery( cmdObj ); - - BSONElementSet values; - shared_ptr cursor = bestGuessCursor(ns.c_str() , query , BSONObj() ); - scoped_ptr cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns)); - - while ( cursor->ok() ){ - if ( !cursor->matcher() || cursor->matcher()->matchesCurrent( cursor.get() ) ){ - BSONObj o = cursor->current(); - o.getFieldsDotted( key, values ); - } - - cursor->advance(); - - if (!cc->yieldSometimes()) - break; - } - - BSONArrayBuilder b( result.subarrayStart( "values" ) ); - for ( BSONElementSet::iterator i = values.begin() ; i != values.end(); i++ ){ - b.append( *i ); - } - BSONObj arr = b.done(); - - uassert(10044, "distinct too big, 4mb cap", - (arr.objsize() + 1024) < (4 * 1024 * 1024)); - - return true; - } - - } distinctCmd; - /* Find and Modify an object returning either the old (default) or new value*/ class CmdFindAndModify : public Command { public: virtual void help( stringstream &help ) const { - help << - "{ findandmodify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n" - "{ findandmodify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n" - "Either update or remove is required, all other fields have default values.\n" - "Output is in the \"value\" field\n"; + help << + "{ findAndModify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n" + "{ findAndModify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n" + "Either update or remove is required, all other fields have default values.\n" + "Output is in the \"value\" field\n"; } 
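        /* Usage sketch for the command above: a C++ client goes through runCommand() rather than a
           dedicated helper. The connection details ("localhost", db "test", collection "jobs") are
           illustrative only and not taken from this patch; field names follow the help text above.

               DBClientConnection conn;
               string errmsg;
               if ( !conn.connect( "localhost", errmsg ) )
                   cout << "connect failed: " << errmsg << endl;

               BSONObj cmd = BSON( "findAndModify" << "jobs"
                                   << "query"  << BSON( "processed" << false )
                                   << "update" << BSON( "$set" << BSON( "processed" << true ) )
                                   << "new"    << true );
               BSONObj info;
               if ( conn.runCommand( "test", cmd, info ) )
                   cout << info["value"] << endl;   // the modified document is returned in "value"
        */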
CmdFindAndModify() : Command("findAndModify", false, "findandmodify") { } @@ -1517,7 +1407,7 @@ namespace mongo { virtual bool slaveOk() const { return false; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return WRITE; } virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { static DBDirectClient db; @@ -1535,8 +1425,8 @@ namespace mongo { const BSONObj* fields = (fieldsHolder.isEmpty() ? NULL : &fieldsHolder); BSONObj out = db.findOne(ns, q, fields); - if (out.isEmpty()){ - if (!upsert){ + if (out.isEmpty()) { + if (!upsert) { errmsg = "No matching object found"; return false; } @@ -1546,9 +1436,13 @@ namespace mongo { uassert(13330, "upsert mode requires query field", !origQuery.isEmpty()); db.update(ns, origQuery, update.embeddedObjectUserCheck(), true); - if (cmdObj["new"].trueValue()){ - BSONObj gle = db.getLastErrorDetailed(); + BSONObj gle = db.getLastErrorDetailed(); + if (gle["err"].type() == String) { + errmsg = gle["err"].String(); + return false; + } + if (cmdObj["new"].trueValue()) { BSONElement _id = gle["upserted"]; if (_id.eoo()) _id = origQuery["_id"]; @@ -1556,33 +1450,46 @@ namespace mongo { out = db.findOne(ns, QUERY("_id" << _id), fields); } - } else { - - Query idQuery = QUERY( "_id" << out["_id"]); + } + else { - if (cmdObj["remove"].trueValue()){ + if (cmdObj["remove"].trueValue()) { uassert(12515, "can't remove and update", cmdObj["update"].eoo()); - db.remove(ns, idQuery, 1); - - } else { // update - - // need to include original query for $ positional operator - BSONObjBuilder b; - b.append(out["_id"]); - BSONObjIterator it(origQuery); - while (it.more()){ - BSONElement e = it.next(); - if (strcmp(e.fieldName(), "_id")) - b.append(e); + db.remove(ns, QUERY("_id" << out["_id"]), 1); + + } + else { // update + + BSONElement queryId = origQuery["_id"]; + if (queryId.eoo() || getGtLtOp(queryId) != BSONObj::Equality) { + // need to include original query for $ positional operator + + BSONObjBuilder b; + b.append(out["_id"]); + BSONObjIterator it(origQuery); + while (it.more()) { + BSONElement e = it.next(); + if (strcmp(e.fieldName(), "_id")) + b.append(e); + } + q = Query(b.obj()); } - q = Query(b.obj()); + + if (q.isComplex()) // update doesn't work with complex queries + q = Query(q.getFilter().getOwned()); BSONElement update = cmdObj["update"]; uassert(12516, "must specify remove or update", !update.eoo()); db.update(ns, q, update.embeddedObjectUserCheck()); + BSONObj gle = db.getLastErrorDetailed(); + if (gle["err"].type() == String) { + errmsg = gle["err"].String(); + return false; + } + if (cmdObj["new"].trueValue()) - out = db.findOne(ns, idQuery, fields); + out = db.findOne(ns, QUERY("_id" << out["_id"]), fields); } } @@ -1591,7 +1498,7 @@ namespace mongo { return true; } } cmdFindAndModify; - + /* Returns client's uri */ class CmdWhatsMyUri : public Command { public: @@ -1599,20 +1506,20 @@ namespace mongo { virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } virtual void help( stringstream &help ) const { help << "{whatsmyuri:1}"; - } + } virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { BSONObj info = cc().curop()->infoNoauth(); result << "you" << info[ "client" ]; return true; } } cmdWhatsMyUri; - + /* For testing only, not for general use */ class GodInsert : 
public Command { public: @@ -1629,7 +1536,7 @@ namespace mongo { } virtual void help( stringstream &help ) const { help << "internal. for testing only."; - } + } virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string coll = cmdObj[ "godinsert" ].valuestrsafe(); uassert( 13049, "godinsert must specify a collection", !coll.empty() ); @@ -1642,31 +1549,32 @@ namespace mongo { class DBHashCmd : public Command { public: - DBHashCmd() : Command( "dbHash", false, "dbhash" ){} + DBHashCmd() : Command( "dbHash", false, "dbhash" ) {} virtual bool slaveOk() const { return true; } virtual LockType locktype() const { return READ; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { list colls; Database* db = cc().database(); if ( db ) db->namespaceIndex.getNamespaces( colls ); colls.sort(); - + result.appendNumber( "numCollections" , (long long)colls.size() ); - + result.append( "host" , prettyHostName() ); + md5_state_t globalState; md5_init(&globalState); BSONObjBuilder bb( result.subobjStart( "collections" ) ); - for ( list::iterator i=colls.begin(); i != colls.end(); i++ ){ + for ( list::iterator i=colls.begin(); i != colls.end(); i++ ) { string c = *i; if ( c.find( ".system.profil" ) != string::npos ) continue; - + shared_ptr cursor; NamespaceDetails * nsd = nsdetails( c.c_str() ); - + // debug SERVER-761 NamespaceDetails::IndexIterator ii = nsd->ii(); while( ii.more() ) { @@ -1678,15 +1586,15 @@ namespace mongo { log() << endl; } } - + int idNum = nsd->findIdIndex(); - if ( idNum >= 0 ){ + if ( idNum >= 0 ) { cursor.reset( new BtreeCursor( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) ); } - else if ( c.find( ".system." ) != string::npos ){ + else if ( c.find( ".system." ) != string::npos ) { continue; } - else if ( nsd->capped ){ + else if ( nsd->capped ) { cursor = findTableScan( c.c_str() , BSONObj() ); } else { @@ -1697,9 +1605,9 @@ namespace mongo { md5_state_t st; md5_init(&st); - + long long n = 0; - while ( cursor->ok() ){ + while ( cursor->ok() ) { BSONObj c = cursor->current(); md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() ); n++; @@ -1708,7 +1616,7 @@ namespace mongo { md5digest d; md5_finish(&st, d); string hash = digestToString( d ); - + bb.append( c.c_str() + ( dbname.size() + 1 ) , hash ); md5_append( &globalState , (const md5_byte_t*)hash.c_str() , hash.size() ); @@ -1727,9 +1635,9 @@ namespace mongo { } dbhashCmd; /* for diagnostic / testing purposes. 
*/ - class CmdSleep : public Command { + class CmdSleep : public Command { public: - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool adminOnly() const { return true; } virtual bool logTheOp() { return false; } virtual bool slaveOk() const { return true; } @@ -1739,46 +1647,43 @@ namespace mongo { } CmdSleep() : Command("sleep") { } bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - if( cmdObj.getBoolField("w") ) { + + + int secs = 100; + if ( cmdObj["secs"].isNumber() ) + secs = cmdObj["secs"].numberInt(); + + if( cmdObj.getBoolField("w") ) { writelock lk(""); - sleepsecs(100); + sleepsecs(secs); } else { readlock lk(""); - sleepsecs(100); + sleepsecs(secs); } + return true; } } cmdSleep; - class AvailableQueryOptions : public Command { - public: - AvailableQueryOptions() : Command( "availablequeryoptions" ){} - virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return NONE; } - virtual bool requiresAuth() { return false; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ - result << "options" << QueryOption_AllSupported; - return true; - } - } availableQueryOptionsCmd; - // just for testing class CapTrunc : public Command { public: - CapTrunc() : Command( "captrunc" ){} + CapTrunc() : Command( "captrunc" ) {} virtual bool slaveOk() const { return false; } virtual LockType locktype() const { return WRITE; } virtual bool requiresAuth() { return true; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string coll = cmdObj[ "captrunc" ].valuestrsafe(); uassert( 13416, "captrunc must specify a collection", !coll.empty() ); string ns = dbname + "." + coll; int n = cmdObj.getIntField( "n" ); + + // inclusive range? bool inc = cmdObj.getBoolField( "inc" ); NamespaceDetails *nsd = nsdetails( ns.c_str() ); ReverseCappedCursor c( nsd ); - massert( 13417, "captrunc invalid collection", c.ok() ); + massert( 13417, "captrunc collection not found or empty", c.ok() ); for( int i = 0; i < n; ++i ) { massert( 13418, "captrunc invalid n", c.advance() ); } @@ -1786,16 +1691,16 @@ namespace mongo { nsd->cappedTruncateAfter( ns.c_str(), end, inc ); return true; } - } capTruncCmd; - + } capTruncCmd; + // just for testing class EmptyCapped : public Command { public: - EmptyCapped() : Command( "emptycapped" ){} + EmptyCapped() : Command( "emptycapped" ) {} virtual bool slaveOk() const { return false; } virtual LockType locktype() const { return WRITE; } virtual bool requiresAuth() { return true; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string coll = cmdObj[ "emptycapped" ].valuestrsafe(); uassert( 13428, "emptycapped must specify a collection", !coll.empty() ); string ns = dbname + "." 
+ coll; @@ -1804,9 +1709,9 @@ namespace mongo { nsd->emptyCappedCollection( ns.c_str() ); return true; } - } emptyCappedCmd; - - /** + } emptyCappedCmd; + + /** * this handles - auth - locking @@ -1814,53 +1719,52 @@ namespace mongo { then calls run() */ bool execCommand( Command * c , - Client& client , int queryOptions , - const char *cmdns, BSONObj& cmdObj , - BSONObjBuilder& result, - bool fromRepl ){ - + Client& client , int queryOptions , + const char *cmdns, BSONObj& cmdObj , + BSONObjBuilder& result, + bool fromRepl ) { + string dbname = nsToDatabase( cmdns ); - - AuthenticationInfo *ai = client.getAuthenticationInfo(); - if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) { - result.append( "errmsg" , + AuthenticationInfo *ai = client.getAuthenticationInfo(); + + if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) { + result.append( "errmsg" , "unauthorized: this command must run from localhost when running db without auth" ); log() << "command denied: " << cmdObj.toString() << endl; return false; } - if ( c->adminOnly() && ! fromRepl && dbname != "admin" ) { result.append( "errmsg" , "access denied; use admin db" ); log() << "command denied: " << cmdObj.toString() << endl; return false; - } + } - if ( cmdObj["help"].trueValue() ){ + if ( cmdObj["help"].trueValue() ) { stringstream ss; ss << "help for: " << c->name << " "; c->help( ss ); result.append( "help" , ss.str() ); result.append( "lockType" , c->locktype() ); return true; - } + } - bool canRunHere = + bool canRunHere = isMaster( dbname.c_str() ) || c->slaveOk() || ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) || fromRepl; - if ( ! canRunHere ){ + if ( ! canRunHere ) { result.append( "errmsg" , "not master" ); return false; } if ( c->adminOnly() ) log( 2 ) << "command: " << cmdObj << endl; - - if ( c->locktype() == Command::NONE ){ + + if ( c->locktype() == Command::NONE ) { // we also trust that this won't crash string errmsg; int ok = c->run( dbname , cmdObj , errmsg , result , fromRepl ); @@ -1868,35 +1772,35 @@ namespace mongo { result.append( "errmsg" , errmsg ); return ok; } - + bool needWriteLock = c->locktype() == Command::WRITE; - - if ( ! needWriteLock ){ + + if ( ! needWriteLock ) { assert( ! c->logTheOp() ); } mongolock lk( needWriteLock ); Client::Context ctx( dbname , dbpath , &lk , c->requiresAuth() ); - + try { string errmsg; - if ( ! c->run(dbname, cmdObj, errmsg, result, fromRepl ) ){ + if ( ! c->run(dbname, cmdObj, errmsg, result, fromRepl ) ) { result.append( "errmsg" , errmsg ); return false; } } - catch ( DBException& e ){ + catch ( DBException& e ) { stringstream ss; ss << "exception: " << e.what(); result.append( "errmsg" , ss.str() ); result.append( "code" , e.getCode() ); return false; } - - if ( c->logTheOp() && ! fromRepl ){ + + if ( c->logTheOp() && ! fromRepl ) { logOp("c", cmdns, cmdObj); } - + return true; } @@ -1912,9 +1816,9 @@ namespace mongo { cc().curop()->ensureStarted(); string dbname = nsToDatabase( ns ); - if( logLevel >= 1 ) + if( logLevel >= 1 ) log() << "run command " << ns << ' ' << _cmdobj << endl; - + const char *p = strchr(ns, '.'); if ( !p ) return false; if ( strcmp(p, ".$cmd") != 0 ) return false; @@ -1934,14 +1838,14 @@ namespace mongo { bool ok = false; BSONElement e = jsobj.firstElement(); - + Command * c = e.type() ? 
Command::findCommand( e.fieldName() ) : 0; - if ( c ){ + if ( c ) { ok = execCommand( c , client , queryOptions , ns , jsobj , anObjBuilder , fromRepl ); } else { - anObjBuilder.append("errmsg", "no such cmd"); + anObjBuilder.append("errmsg", str::stream() << "no such cmd: " << e.fieldName() ); anObjBuilder.append("bad cmd" , _cmdobj ); } @@ -1953,5 +1857,5 @@ namespace mongo { return true; } - + } // namespace mongo diff --git a/db/dbcommands_admin.cpp b/db/dbcommands_admin.cpp index 2d08ac8..82a9c91 100644 --- a/db/dbcommands_admin.cpp +++ b/db/dbcommands_admin.cpp @@ -25,34 +25,36 @@ #include "pch.h" #include "jsobj.h" #include "pdfile.h" -#include "namespace.h" +#include "namespace-inl.h" #include "commands.h" #include "cmdline.h" #include "btree.h" -#include "curop.h" +#include "curop-inl.h" #include "../util/background.h" +#include "../util/logfile.h" +#include "../util/alignedbuilder.h" #include "../scripting/engine.h" namespace mongo { class CleanCmd : public Command { public: - CleanCmd() : Command( "clean" ){} + CleanCmd() : Command( "clean" ) {} virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return WRITE; } - + virtual LockType locktype() const { return WRITE; } + virtual void help(stringstream& h) const { h << "internal"; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string dropns = dbname + "." + cmdObj.firstElement().valuestrsafe(); - + if ( !cmdLine.quiet ) tlog() << "CMD: clean " << dropns << endl; - + NamespaceDetails *d = nsdetails(dropns.c_str()); - - if ( ! d ){ + + if ( ! d ) { errmsg = "ns not found"; return 0; } @@ -63,39 +65,108 @@ namespace mongo { result.append("ns", dropns.c_str()); return 1; } - + } cleanCmd; - + + namespace dur { + filesystem::path getJournalDir(); + } + + class JournalLatencyTestCmd : public Command { + public: + JournalLatencyTestCmd() : Command( "journalLatencyTest" ) {} + + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual bool adminOnly() const { return true; } + virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; } + + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + filesystem::path p = dur::getJournalDir(); + p /= "journalLatencyTest"; + + // remove file if already present + try { + remove(p); + } + catch(...) { } + + BSONObjBuilder bb[2]; + for( int pass = 0; pass < 2; pass++ ) { + LogFile f(p.string()); + AlignedBuilder b(1024 * 1024); + { + Timer t; + for( int i = 0 ; i < 100; i++ ) { + f.synchronousAppend(b.buf(), 8192); + } + bb[pass].append("8KB", t.millis() / 100.0); + } + { + const int N = 50; + Timer t2; + long long x = 0; + for( int i = 0 ; i < N; i++ ) { + Timer t; + f.synchronousAppend(b.buf(), 8192); + x += t.micros(); + sleepmillis(4); + } + long long y = t2.micros() - 4*N*1000; + // not really trusting the timer granularity on all platforms so whichever is higher of x and y + bb[pass].append("8KBWithPauses", max(x,y) / (N*1000.0)); + } + { + Timer t; + for( int i = 0 ; i < 20; i++ ) { + f.synchronousAppend(b.buf(), 1024 * 1024); + } + bb[pass].append("1MB", t.millis() / 20.0); + } + // second time around, we are prealloced. 
+ } + result.append("timeMillis", bb[0].obj()); + result.append("timeMillisWithPrealloc", bb[1].obj()); + + try { + remove(p); + } + catch(...) { } + + return 1; + } + } journalLatencyTestCmd; + class ValidateCmd : public Command { public: - ValidateCmd() : Command( "validate" ){} + ValidateCmd() : Command( "validate" ) {} virtual bool slaveOk() const { return true; } - + virtual void help(stringstream& h) const { h << "Validate contents of a namespace by scanning its data structures for correctness. Slow."; } - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } //{ validate: "collectionnamewithoutthedbpart" [, scandata: ] } */ - - bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + + bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string ns = dbname + "." + cmdObj.firstElement().valuestrsafe(); NamespaceDetails * d = nsdetails( ns.c_str() ); if ( !cmdLine.quiet ) tlog() << "CMD: validate " << ns << endl; - if ( ! d ){ + if ( ! d ) { errmsg = "ns not found"; return 0; } - + result.append( "ns", ns ); result.append( "result" , validateNS( ns.c_str() , d, &cmdObj ) ); return 1; } - - + + string validateNS(const char *ns, NamespaceDetails *d, BSONObj *cmdObj) { bool scanData = true; if( cmdObj && cmdObj->hasElement("scandata") && !cmdObj->getBoolField("scandata") ) @@ -106,13 +177,13 @@ namespace mongo { //ss << " details: " << hex << d << " ofs:" << nsindex(ns)->detailsOffset(d) << dec << endl; if ( d->capped ) ss << " capped:" << d->capped << " max:" << d->max << '\n'; - - ss << " firstExtent:" << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.buf << '\n'; - ss << " lastExtent:" << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.buf << '\n'; + + ss << " firstExtent:" << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString()<< '\n'; + ss << " lastExtent:" << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString() << '\n'; try { d->firstExtent.ext()->assertOk(); d->lastExtent.ext()->assertOk(); - + DiskLoc el = d->firstExtent; int ne = 0; while( !el.isNull() ) { @@ -123,12 +194,13 @@ namespace mongo { killCurrentOp.checkForInterrupt(); } ss << " # extents:" << ne << '\n'; - } catch (...) { + } + catch (...) { valid=false; ss << " extent asserted "; } - ss << " datasize?:" << d->datasize << " nrecords?:" << d->nrecords << " lastExtentSize:" << d->lastExtentSize << '\n'; + ss << " datasize?:" << d->stats.datasize << " nrecords?:" << d->stats.nrecords << " lastExtentSize:" << d->lastExtentSize << '\n'; ss << " padding:" << d->paddingFactor << '\n'; try { @@ -175,7 +247,7 @@ namespace mongo { else ss << " (OK)"; ss << '\n'; } - ss << " " << n << " objects found, nobj:" << d->nrecords << '\n'; + ss << " " << n << " objects found, nobj:" << d->stats.nrecords << '\n'; ss << " " << len << " bytes data w/headers\n"; ss << " " << nlen << " bytes data wout/headers\n"; } @@ -198,7 +270,7 @@ namespace mongo { ndel++; if ( loc.questionable() ) { - if( d->capped && !loc.isValid() && i == 1 ) { + if( d->capped && !loc.isValid() && i == 1 ) { /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid see comments in namespace.h */ @@ -218,7 +290,8 @@ namespace mongo { k++; killCurrentOp.checkForInterrupt(); } - } catch (...) { + } + catch (...) 
{ ss <<" ?exception in deleted chain for bucket " << i << endl; valid = false; } @@ -236,7 +309,7 @@ namespace mongo { while( i.more() ) { IndexDetails& id = i.next(); ss << " " << id.indexNamespace() << " keys:" << - id.head.btree()->fullValidate(id.head, id.keyPattern()) << endl; + id.head.btree()->fullValidate(id.head, id.keyPattern()) << endl; } } catch (...) { @@ -261,36 +334,36 @@ namespace mongo { extern unsigned lockedForWriting; extern mongo::mutex lockedForWritingMutex; -/* - class UnlockCommand : public Command { - public: - UnlockCommand() : Command( "unlock" ) { } - virtual bool readOnly() { return true; } - virtual bool slaveOk() const { return true; } - virtual bool adminOnly() const { return true; } - virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - if( lockedForWriting ) { - log() << "command: unlock requested" << endl; - errmsg = "unlock requested"; - unlockRequested = true; - } - else { - errmsg = "not locked, so cannot unlock"; - return 0; + /* + class UnlockCommand : public Command { + public: + UnlockCommand() : Command( "unlock" ) { } + virtual bool readOnly() { return true; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if( lockedForWriting ) { + log() << "command: unlock requested" << endl; + errmsg = "unlock requested"; + unlockRequested = true; + } + else { + errmsg = "not locked, so cannot unlock"; + return 0; + } + return 1; } - return 1; - } - - } unlockCommand; -*/ + + } unlockCommand; + */ /* see unlockFsync() for unlocking: db.$cmd.sys.unlock.findOne() */ class FSyncCommand : public Command { - class LockDBJob : public BackgroundJob { + class LockDBJob : public BackgroundJob { protected: - string name() { return "lockdbjob"; } - void run() { + virtual string name() const { return "lockdbjob"; } + void run() { Client::initThread("fsyncjob"); Client& c = cc(); { @@ -301,8 +374,8 @@ namespace mongo { MemoryMappedFile::flushAll(true); log() << "db is now locked for snapshotting, no writes allowed. use db.$cmd.sys.unlock.findOne() to unlock" << endl; _ready = true; - while( 1 ) { - if( unlockRequested ) { + while( 1 ) { + if( unlockRequested ) { unlockRequested = false; break; } @@ -316,54 +389,70 @@ namespace mongo { } public: bool& _ready; - LockDBJob(bool& ready) : _ready(ready) { - deleteSelf = true; + LockDBJob(bool& ready) : BackgroundJob( true /* delete self */ ), _ready(ready) { _ready = false; } }; public: - FSyncCommand() : Command( "fsync" ){} - virtual LockType locktype() const { return WRITE; } + FSyncCommand() : Command( "fsync" ) {} + virtual LockType locktype() const { return WRITE; } virtual bool slaveOk() const { return true; } virtual bool adminOnly() const { return true; } - /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { + /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { string x = cmdObj["exec"].valuestrsafe(); return !x.empty(); }*/ virtual void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/fsync+Command"; } virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - /* async means do an fsync, but return immediately */ - bool sync = ! 
cmdObj["async"].trueValue(); + bool sync = !cmdObj["async"].trueValue(); // async means do an fsync, but return immediately bool lock = cmdObj["lock"].trueValue(); log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl; - if( lock ) { + if( lock ) { + // fsync and lock variation + uassert(12034, "fsync: can't lock while an unlock is pending", !unlockRequested); uassert(12032, "fsync: sync option must be true when using lock", sync); - /* With releaseEarly(), we must be extremely careful we don't do anything - where we would have assumed we were locked. profiling is one of those things. - Perhaps at profile time we could check if we released early -- however, + /* With releaseEarly(), we must be extremely careful we don't do anything + where we would have assumed we were locked. profiling is one of those things. + Perhaps at profile time we could check if we released early -- however, we need to be careful to keep that code very fast it's a very common code path when on. */ uassert(12033, "fsync: profiling must be off to enter locked mode", cc().database()->profile == 0); + + // todo future: Perhaps we could do this in the background thread. As is now, writes may interleave between + // the releaseEarly below and the acquisition of the readlock in the background thread. + // However the real problem is that it seems complex to unlock here and then have a window for + // writes before the bg job -- can be done correctly but harder to reason about correctness. + // If this command ran within a read lock in the first place, would it work, and then that + // would be quite easy? + // Or, could we downgrade the write lock to a read lock, wait for ready, then release? + getDur().syncDataAndTruncateJournal(); + bool ready = false; LockDBJob *l = new LockDBJob(ready); + dbMutex.releaseEarly(); + l->go(); - // don't return until background thread has acquired the write lock - while( !ready ) { + // don't return until background thread has acquired the read lock + while( !ready ) { sleepmillis(10); } result.append("info", "now locked against writes, use db.$cmd.sys.unlock.findOne() to unlock"); } else { + // the simple fsync command case + + if (sync) + getDur().commitNow(); result.append( "numFiles" , MemoryMappedFile::flushAll( sync ) ); } return 1; } - + } fsyncCmd; - + } diff --git a/db/dbcommands_generic.cpp b/db/dbcommands_generic.cpp index 25c6a93..a555b6c 100644 --- a/db/dbcommands_generic.cpp +++ b/db/dbcommands_generic.cpp @@ -52,114 +52,192 @@ namespace mongo { CmdBuildInfo() : Command( "buildInfo", true, "buildinfo" ) {} virtual bool slaveOk() const { return true; } virtual bool adminOnly() const { return true; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual void help( stringstream &help ) const { help << "get version #, etc.\n"; help << "{ buildinfo:1 }"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo(); result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 ); - result.appendBool( "debug" , -#ifdef _DEBUG - true -#else - false -#endif - ); + result.appendBool( "debug" , debug ); + result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize); return true; } } cmdBuildInfo; + /** experimental. either remove or add support in repl sets also. 
in a repl set, getting this setting from the + repl set config could make sense. + */ + unsigned replApplyBatchSize = 1; - /* just to check if the db has asserted */ - class CmdAssertInfo : public Command { + class CmdGet : public Command { public: - virtual bool slaveOk() const { + CmdGet() : Command( "getParameter" ) { } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream &help ) const { + help << "get administrative option(s)\nexample:\n"; + help << "{ getParameter:1, notablescan:1 }\n"; + help << "supported so far:\n"; + help << " quiet\n"; + help << " notablescan\n"; + help << " logLevel\n"; + help << " syncdelay\n"; + help << "{ getParameter:'*' } to get everything\n"; + } + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool all = *cmdObj.firstElement().valuestrsafe() == '*'; + + int before = result.len(); + + if( all || cmdObj.hasElement("quiet") ) { + result.append("quiet", cmdLine.quiet ); + } + if( all || cmdObj.hasElement("notablescan") ) { + result.append("notablescan", cmdLine.noTableScan); + } + if( all || cmdObj.hasElement("logLevel") ) { + result.append("logLevel", logLevel); + } + if( all || cmdObj.hasElement("syncdelay") ) { + result.append("syncdelay", cmdLine.syncdelay); + } + if( all || cmdObj.hasElement("replApplyBatchSize") ) { + result.append("replApplyBatchSize", replApplyBatchSize); + } + + if ( before == result.len() ) { + errmsg = "no option found to get"; + return false; + } return true; } - virtual void help( stringstream& help ) const { - help << "check if any asserts have occurred on the server"; + } cmdGet; + + class CmdSet : public Command { + public: + CmdSet() : Command( "setParameter" ) { } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream &help ) const { + help << "set administrative option(s)\nexample:\n"; + help << "{ setParameter:1, notablescan:true }\n"; + help << "supported so far:\n"; + help << " notablescan\n"; + help << " logLevel\n"; + help << " quiet\n"; } - virtual LockType locktype() const { return WRITE; } - CmdAssertInfo() : Command("assertInfo",true,"assertinfo") {} - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - result.appendBool("dbasserted", lastAssert[0].isSet() || lastAssert[1].isSet() || lastAssert[2].isSet()); - result.appendBool("asserted", lastAssert[0].isSet() || lastAssert[1].isSet() || lastAssert[2].isSet() || lastAssert[3].isSet()); - result.append("assert", lastAssert[AssertRegular].toString()); - result.append("assertw", lastAssert[AssertW].toString()); - result.append("assertmsg", lastAssert[AssertMsg].toString()); - result.append("assertuser", lastAssert[AssertUser].toString()); + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + int s = 0; + if( cmdObj.hasElement("notablescan") ) { + result.append("was", cmdLine.noTableScan); + cmdLine.noTableScan = cmdObj["notablescan"].Bool(); + s++; + } + if( cmdObj.hasElement("quiet") ) { + result.append("was", cmdLine.quiet ); + cmdLine.quiet = cmdObj["quiet"].Bool(); + s++; + } + if( cmdObj.hasElement("syncdelay") ) { + result.append("was", cmdLine.syncdelay ); + cmdLine.syncdelay = cmdObj["syncdelay"].Number(); + s++; + } + if( 
cmdObj.hasElement( "logLevel" ) ) { + result.append("was", logLevel ); + logLevel = cmdObj["logLevel"].numberInt(); + s++; + } + if( cmdObj.hasElement( "replApplyBatchSize" ) ) { + result.append("was", replApplyBatchSize ); + BSONElement e = cmdObj["replApplyBatchSize"]; + ParameterValidator * v = ParameterValidator::get( e.fieldName() ); + assert( v ); + if ( ! v->isValid( e , errmsg ) ) + return false; + replApplyBatchSize = e.numberInt(); + s++; + } + + if( s == 0 ) { + errmsg = "no option found to set, use '*' to get all "; + return false; + } + return true; } - } cmdAsserts; + } cmdSet; class PingCommand : public Command { public: - PingCommand() : Command( "ping" ){} + PingCommand() : Command( "ping" ) {} virtual bool slaveOk() const { return true; } virtual void help( stringstream &help ) const { help << "a way to check that the server is alive. responds immediately even if server is in a db lock."; } virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } - virtual bool run(const string& badns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + virtual bool run(const string& badns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { // IMPORTANT: Don't put anything in here that might lock db - including authentication return true; } } pingCmd; - + class FeaturesCmd : public Command { public: - FeaturesCmd() : Command( "features", true ){} - void help(stringstream& h) const { h << "return on build level feature settings"; } + FeaturesCmd() : Command( "features", true ) {} + void help(stringstream& h) const { h << "return build level feature settings"; } virtual bool slaveOk() const { return true; } - virtual bool readOnly(){ return true; } - virtual LockType locktype() const { return READ; } - virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ - if ( globalScriptEngine ){ + virtual bool readOnly() { return true; } + virtual LockType locktype() const { return NONE; } + virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if ( globalScriptEngine ) { BSONObjBuilder bb( result.subobjStart( "js" ) ); result.append( "utf8" , globalScriptEngine->utf8Ok() ); bb.done(); } - if ( cmdObj["oidReset"].trueValue() ){ - result.append( "oidMachineOld" , OID::staticMachine() ); - OID::newState(); + if ( cmdObj["oidReset"].trueValue() ) { + result.append( "oidMachineOld" , OID::getMachineId() ); + OID::regenMachineId(); } - result.append( "oidMachine" , OID::staticMachine() ); + result.append( "oidMachine" , OID::getMachineId() ); return true; } - + } featuresCmd; class LogRotateCmd : public Command { public: - LogRotateCmd() : Command( "logRotate" ){} - virtual LockType locktype() const { return NONE; } + LogRotateCmd() : Command( "logRotate" ) {} + virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } virtual bool adminOnly() const { return true; } virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { rotateLogs(); return 1; - } - + } + } logRotateCmd; - + class ListCommandsCmd : public Command { public: virtual void help( stringstream &help ) const { help << "get a list of all db commands"; } - ListCommandsCmd() : Command( "listCommands", false ){} - virtual LockType locktype() const { return NONE; } + ListCommandsCmd() : Command( "listCommands", false ) {} + virtual LockType locktype() const { return NONE; } virtual 
bool slaveOk() const { return true; } virtual bool adminOnly() const { return false; } virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { BSONObjBuilder b( result.subobjStart( "commands" ) ); - for ( map::iterator i=_commands->begin(); i!=_commands->end(); ++i ){ + for ( map::iterator i=_commands->begin(); i!=_commands->end(); ++i ) { Command * c = i->second; // don't show oldnames if (i->first != c->name) continue; - BSONObjBuilder temp( b.subobjStart( c->name.c_str() ) ); + BSONObjBuilder temp( b.subobjStart( c->name ) ); { stringstream help; @@ -174,10 +252,10 @@ namespace mongo { b.done(); return 1; - } + } } listCommandsCmd; - + class CmdShutdown : public Command { public: virtual bool requiresAuth() { return true; } @@ -189,7 +267,7 @@ namespace mongo { virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return NONE; } virtual void help( stringstream& help ) const { help << "shutdown the database. must be ran against admin db and either (1) ran from localhost or (2) authenticated.\n"; } @@ -199,8 +277,11 @@ namespace mongo { if ( c ) { c->shutdown(); } + log() << "terminating, shutdown command received" << endl; - dbexit( EXIT_CLEAN ); // this never returns + + dbexit( EXIT_CLEAN , "shutdown called" , true ); // this never returns + assert(0); return true; } } cmdShutdown; @@ -217,7 +298,7 @@ namespace mongo { virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } CmdForceError() : Command("forceerror") {} bool run(const string& dbnamne, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { uassert( 10038 , "forced error", false); @@ -225,6 +306,17 @@ namespace mongo { } } cmdForceError; - + class AvailableQueryOptions : public Command { + public: + AvailableQueryOptions() : Command( "availableQueryOptions" , false , "availablequeryoptions" ) {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual bool requiresAuth() { return false; } + virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + result << "options" << QueryOption_AllSupported; + return true; + } + } availableQueryOptionsCmd; + } diff --git a/db/dbeval.cpp b/db/dbeval.cpp index e8a42b2..31d5260 100644 --- a/db/dbeval.cpp +++ b/db/dbeval.cpp @@ -37,7 +37,7 @@ namespace mongo { const int edebug=0; - bool dbEval(const char *ns, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) { + bool dbEval(const string& dbName, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) { BSONElement e = cmd.firstElement(); uassert( 10046 , "eval needs Code" , e.type() == Code || e.type() == CodeWScope || e.type() == String ); @@ -60,16 +60,16 @@ namespace mongo { return false; } - auto_ptr s = globalScriptEngine->getPooledScope( ns ); + auto_ptr s = globalScriptEngine->getPooledScope( dbName ); ScriptingFunction f = s->createFunction(code); if ( f == 0 ) { errmsg = (string)"compile failed: " + s->getError(); return false; } - + if ( e.type() == CodeWScope ) s->init( e.codeWScopeScopeData() ); - s->localConnect( cc().database()->name.c_str() ); + s->localConnect( dbName.c_str() ); BSONObj args; { @@ -89,7 +89,7 @@ namespace mongo { res = s->invoke(f,args, cmdLine.quota ? 
10 * 60 * 1000 : 0 ); int m = t.millis(); if ( m > cmdLine.slowMS ) { - out() << "dbeval slow, time: " << dec << m << "ms " << ns << endl; + out() << "dbeval slow, time: " << dec << m << "ms " << dbName << endl; if ( m >= 1000 ) log() << code << endl; else OCCASIONALLY log() << code << endl; } @@ -100,7 +100,7 @@ namespace mongo { errmsg += s->getError(); return false; } - + s->append( result , "retval" , "return" ); return true; @@ -122,16 +122,19 @@ namespace mongo { virtual LockType locktype() const { return NONE; } CmdEval() : Command("eval", false, "$eval") { } bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - + AuthenticationInfo *ai = cc().getAuthenticationInfo(); uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(dbname.c_str()) ); - + + if ( cmdObj["nolock"].trueValue() ) { + return dbEval(dbname, cmdObj, result, errmsg); + } + // write security will be enforced in DBDirectClient mongolock lk( ai->isAuthorized( dbname.c_str() ) ); Client::Context ctx( dbname ); - - return dbEval(dbname.c_str(), cmdObj, result, errmsg); + return dbEval(dbname, cmdObj, result, errmsg); } } cmdeval; diff --git a/db/dbhelpers.cpp b/db/dbhelpers.cpp index 205787e..75db430 100644 --- a/db/dbhelpers.cpp +++ b/db/dbhelpers.cpp @@ -28,39 +28,6 @@ namespace mongo { - CursorIterator::CursorIterator( shared_ptr c , BSONObj filter ) - : _cursor( c ){ - if ( ! filter.isEmpty() ) - _matcher.reset( new CoveredIndexMatcher( filter , BSONObj() ) ); - _advance(); - } - - BSONObj CursorIterator::next(){ - BSONObj o = _o; - _advance(); - return o; - } - - bool CursorIterator::hasNext(){ - return ! _o.isEmpty(); - } - - void CursorIterator::_advance(){ - if ( ! _cursor->ok() ){ - _o = BSONObj(); - return; - } - - while ( _cursor->ok() ){ - _o = _cursor->current(); - _cursor->advance(); - if ( _matcher.get() == 0 || _matcher->matches( _o ) ) - return; - } - - _o = BSONObj(); - } - void Helpers::ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name) { NamespaceDetails *d = nsdetails(ns); if( d == 0 ) @@ -74,7 +41,7 @@ namespace mongo { } } - if( d->nIndexes >= NamespaceDetails::NIndexesMax ) { + if( d->nIndexes >= NamespaceDetails::NIndexesMax ) { problem() << "Helper::ensureIndex fails, MaxIndexes exceeded " << ns << '\n'; return; } @@ -91,6 +58,7 @@ namespace mongo { theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize()); } + /** Simple QueryOp implementation to return first match. Does not support yielding. 
*/ class FindOne : public QueryOp { public: FindOne( bool requireIndex ) : requireIndex_( requireIndex ) {} @@ -111,10 +79,15 @@ namespace mongo { one_ = c_->current(); loc_ = c_->currLoc(); setStop(); - } else { + } + else { c_->advance(); } } + virtual long long nscanned() { + assert( c_.get() ); + return c_->nscanned(); + } virtual bool mayRecordPlan() const { return false; } virtual QueryOp *_createChild() const { return new FindOne( requireIndex_ ); } BSONObj one() const { return one_; } @@ -125,11 +98,11 @@ namespace mongo { BSONObj one_; DiskLoc loc_; }; - - /* fetch a single object from collection ns that matches query + + /* fetch a single object from collection ns that matches query set your db SavedContext first */ - bool Helpers::findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex) { + bool Helpers::findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex) { MultiPlanScanner s( ns, query, BSONObj(), 0, !requireIndex ); FindOne original( requireIndex ); shared_ptr< FindOne > res = s.runOp( original ); @@ -141,10 +114,10 @@ namespace mongo { return true; } - /* fetch a single object from collection ns that matches query + /* fetch a single object from collection ns that matches query set your db SavedContext first */ - DiskLoc Helpers::findOne(const char *ns, const BSONObj &query, bool requireIndex) { + DiskLoc Helpers::findOne(const char *ns, const BSONObj &query, bool requireIndex) { MultiPlanScanner s( ns, query, BSONObj(), 0, !requireIndex ); FindOne original( requireIndex ); shared_ptr< FindOne > res = s.runOp( original ); @@ -153,15 +126,8 @@ namespace mongo { return res->loc(); } - auto_ptr Helpers::find( const char *ns , BSONObj query , bool requireIndex ){ - uassert( 10047 , "requireIndex not supported in Helpers::find yet" , ! 
requireIndex ); - auto_ptr i; - i.reset( new CursorIterator( DataFileMgr::findAll( ns ) , query ) ); - return i; - } - bool Helpers::findById(Client& c, const char *ns, BSONObj query, BSONObj& result , - bool * nsFound , bool * indexFound ){ + bool * nsFound , bool * indexFound ) { dbMutex.assertAtLeastReadLocked(); Database *database = c.database(); assert( database ); @@ -170,7 +136,7 @@ namespace mongo { return false; if ( nsFound ) *nsFound = 1; - + int idxNo = d->findIdIndex(); if ( idxNo < 0 ) return false; @@ -178,9 +144,9 @@ namespace mongo { *indexFound = 1; IndexDetails& i = d->idx( idxNo ); - + BSONObj key = i.getKeyFromQuery( query ); - + DiskLoc loc = i.head.btree()->findSingle( i , i.head , key ); if ( loc.isNull() ) return false; @@ -188,16 +154,16 @@ namespace mongo { return true; } - DiskLoc Helpers::findById(NamespaceDetails *d, BSONObj idquery) { - int idxNo = d->findIdIndex(); - uassert(13430, "no _id index", idxNo>=0); - IndexDetails& i = d->idx( idxNo ); - BSONObj key = i.getKeyFromQuery( idquery ); - return i.head.btree()->findSingle( i , i.head , key ); + DiskLoc Helpers::findById(NamespaceDetails *d, BSONObj idquery) { + int idxNo = d->findIdIndex(); + uassert(13430, "no _id index", idxNo>=0); + IndexDetails& i = d->idx( idxNo ); + BSONObj key = i.getKeyFromQuery( idquery ); + return i.head.btree()->findSingle( i , i.head , key ); } - bool Helpers::isEmpty(const char *ns) { - Client::Context context(ns); + bool Helpers::isEmpty(const char *ns, bool doAuth) { + Client::Context context(ns, dbpath, NULL, doAuth); shared_ptr c = DataFileMgr::findAll(ns); return !c->ok(); } @@ -221,17 +187,17 @@ namespace mongo { bool Helpers::getLast(const char *ns, BSONObj& result) { Client::Context ctx(ns); shared_ptr c = findTableScan(ns, reverseNaturalObj); - if( !c->ok() ) + if( !c->ok() ) return false; result = c->current(); return true; } - void Helpers::upsert( const string& ns , const BSONObj& o ){ + void Helpers::upsert( const string& ns , const BSONObj& o ) { BSONElement e = o["_id"]; assert( e.type() ); BSONObj id = e.wrap(); - + OpDebug debug; Client::Context context(ns); updateObjects(ns.c_str(), o, /*pattern=*/id, /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug ); @@ -249,12 +215,12 @@ namespace mongo { _updateObjects(/*god=*/true, ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , logTheOp , debug ); } - BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ){ + BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ) { BSONObjBuilder me; BSONObjBuilder k; BSONObjIterator i( o ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); k.append( e.fieldName() , 1 ); me.appendAs( e , "" ); @@ -262,8 +228,8 @@ namespace mongo { key = k.obj(); return me.obj(); } - - long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ){ + + long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ) { BSONObj keya , keyb; BSONObj minClean = toKeyFormat( min , keya ); BSONObj maxClean = toKeyFormat( max , keyb ); @@ -276,33 +242,35 @@ namespace mongo { int ii = nsd->findIndexByKeyPattern( keya ); assert( ii >= 0 ); - + long long num = 0; - + IndexDetails& i = nsd->idx( ii ); shared_ptr c( new BtreeCursor( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) ); auto_ptr cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); 
cc->setDoingDeletes( true ); - - while ( c->ok() ){ + + while ( c->ok() ) { DiskLoc rloc = c->currLoc(); - BSONObj key = c->currKey(); if ( callback ) callback->goingToDelete( c->current() ); - + c->advance(); c->noteLocation(); - + logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() ); theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc); num++; c->checkLocation(); - if ( yield && ! cc->yieldSometimes() ){ + getDur().commitIfNeeded(); + + if ( yield && ! cc->yieldSometimes() ) { // cursor got finished by someone else, so we're done + cc.release(); // if the collection/db is dropped, cc may be deleted break; } } @@ -325,11 +293,12 @@ namespace mongo { BSONObjBuilder result; dropCollection( name_, errmsg, result ); } - } catch ( ... ) { + } + catch ( ... ) { problem() << "exception cleaning up DbSet" << endl; } } - + void DbSet::reset( const string &name, const BSONObj &key ) { if ( !name.empty() ) name_ = name; @@ -338,74 +307,77 @@ namespace mongo { Client::Context c( name_.c_str() ); if ( nsdetails( name_.c_str() ) ) { Helpers::emptyCollection( name_.c_str() ); - } else { + } + else { string err; massert( 10303 , err, userCreateNS( name_.c_str(), fromjson( "{autoIndexId:false}" ), err, false ) ); } - Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" ); + Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" ); } - + bool DbSet::get( const BSONObj &obj ) const { Client::Context c( name_.c_str() ); BSONObj temp; return Helpers::findOne( name_.c_str(), obj, temp, true ); } - + void DbSet::set( const BSONObj &obj, bool val ) { Client::Context c( name_.c_str() ); if ( val ) { try { BSONObj k = obj; theDataFileMgr.insertWithObjMod( name_.c_str(), k, false ); - } catch ( DBException& ) { + } + catch ( DBException& ) { // dup key - already in set } - } else { + } + else { deleteObjects( name_.c_str(), obj, true, false, false ); - } + } } - RemoveSaver::RemoveSaver( const string& a , const string& b , const string& why) : _out(0){ + RemoveSaver::RemoveSaver( const string& a , const string& b , const string& why) : _out(0) { static int NUM = 0; - + _root = dbpath; if ( a.size() ) _root /= a; if ( b.size() ) _root /= b; assert( a.size() || b.size() ); - + _file = _root; - + stringstream ss; ss << why << "." << terseCurrentTime(false) << "." << NUM++ << ".bson"; _file /= ss.str(); } - - RemoveSaver::~RemoveSaver(){ - if ( _out ){ + + RemoveSaver::~RemoveSaver() { + if ( _out ) { _out->close(); delete _out; _out = 0; } } - - void RemoveSaver::goingToDelete( const BSONObj& o ){ - if ( ! _out ){ + + void RemoveSaver::goingToDelete( const BSONObj& o ) { + if ( ! _out ) { create_directories( _root ); _out = new ofstream(); _out->open( _file.string().c_str() , ios_base::out | ios_base::binary ); - if ( ! _out->good() ){ + if ( ! 
_out->good() ) { log( LL_WARNING ) << "couldn't create file: " << _file.string() << " for remove saving" << endl; delete _out; _out = 0; return; } - + } _out->write( o.objdata() , o.objsize() ); } - - + + } // namespace mongo diff --git a/db/dbhelpers.h b/db/dbhelpers.h index ee9a59c..e793d3f 100644 --- a/db/dbhelpers.h +++ b/db/dbhelpers.h @@ -33,24 +33,10 @@ namespace mongo { class Cursor; class CoveredIndexMatcher; - class CursorIterator { - public: - CursorIterator( shared_ptr c , BSONObj filter = BSONObj() ); - BSONObj next(); - bool hasNext(); - - private: - void _advance(); - - shared_ptr _cursor; - auto_ptr _matcher; - BSONObj _o; - }; - /** all helpers assume locking is handled above them */ - struct Helpers { + struct Helpers { /* ensure the specified index exists. @@ -68,7 +54,7 @@ namespace mongo { /* fetch a single object from collection ns that matches query. set your db SavedContext first. - @param query - the query to perform. note this is the low level portion of query so "orderby : ..." + @param query - the query to perform. note this is the low level portion of query so "orderby : ..." won't work. @param requireIndex if true, complain if no index for the query. a way to guard against @@ -77,21 +63,19 @@ namespace mongo { @return true if object found */ static bool findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex = false); - static DiskLoc findOne(const char *ns, const BSONObj &query, bool requireIndex); + static DiskLoc findOne(const char *ns, const BSONObj &query, bool requireIndex); /** * @param foundIndex if passed in will be set to 1 if ns and index found * @return true if object found */ - static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result , + static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result , bool * nsFound = 0 , bool * indexFound = 0 ); - /* uasserts if no _id index. + /* uasserts if no _id index. @return null loc if not found */ static DiskLoc findById(NamespaceDetails *d, BSONObj query); - static auto_ptr find( const char *ns , BSONObj query = BSONObj() , bool requireIndex = false ); - /** Get/put the first (or last) object from a collection. Generally only useful if the collection only ever has a single object -- which is a "singleton collection". @@ -103,7 +87,7 @@ namespace mongo { static void putSingleton(const char *ns, BSONObj obj); static void putSingletonGod(const char *ns, BSONObj obj, bool logTheOp); static bool getFirst(const char *ns, BSONObj& result) { return getSingleton(ns, result); } - static bool getLast(const char *ns, BSONObj& result); // get last object int he collection; e.g. {$natural : -1} + static bool getLast(const char *ns, BSONObj& result); // get last object int he collection; e.g. {$natural : -1} /** * you have to lock @@ -115,14 +99,14 @@ namespace mongo { /** You do not need to set the database before calling. @return true if collection is empty. 
*/ - static bool isEmpty(const char *ns); + static bool isEmpty(const char *ns, bool doAuth=true); // TODO: this should be somewhere else probably static BSONObj toKeyFormat( const BSONObj& o , BSONObj& key ); class RemoveCallback { public: - virtual ~RemoveCallback(){} + virtual ~RemoveCallback() {} virtual void goingToDelete( const BSONObj& o ) = 0; }; /* removeRange: operation is oplog'd */ @@ -163,13 +147,13 @@ namespace mongo { ~RemoveSaver(); void goingToDelete( const BSONObj& o ); - + private: path _root; path _file; ofstream* _out; - + }; - + } // namespace mongo diff --git a/db/dbmessage.h b/db/dbmessage.h index 2849de8..cc1d1d8 100644 --- a/db/dbmessage.h +++ b/db/dbmessage.h @@ -18,7 +18,7 @@ #include "diskloc.h" #include "jsobj.h" -#include "namespace.h" +#include "namespace-inl.h" #include "../util/message.h" #include "../client/constants.h" @@ -35,7 +35,7 @@ namespace mongo { */ extern bool objcheck; - + #pragma pack(1) struct QueryResult : public MsgData { long long cursorId; @@ -50,7 +50,7 @@ namespace mongo { int& _resultFlags() { return dataAsInt(); } - void setResultFlagsToOk() { + void setResultFlagsToOk() { _resultFlags() = ResultFlag_AwaitCapable; } }; @@ -63,8 +63,7 @@ namespace mongo { */ class DbMessage { public: - DbMessage(const Message& _m) : m(_m) - { + DbMessage(const Message& _m) : m(_m) , mark(0) { // for received messages, Message has only one buffer theEnd = _m.singleData()->_data + _m.header()->dataLen(); char *r = _m.singleData()->_data; @@ -86,7 +85,7 @@ namespace mongo { const char * afterNS() const { return data + strlen( data ) + 1; } - + int getInt( int num ) const { const int * foo = (const int*)afterNS(); return foo[num]; @@ -96,7 +95,17 @@ namespace mongo { return getInt( 1 ); } - void resetPull(){ nextjsobj = data; } + /** + * get an int64 at specified offsetBytes after ns + */ + long long getInt64( int offsetBytes ) const { + const char * x = afterNS(); + x += offsetBytes; + const long long * ll = (const long long*)x; + return ll[0]; + } + + void resetPull() { nextjsobj = data; } int pullInt() const { return pullInt(); } int& pullInt() { if ( nextjsobj == data ) @@ -140,10 +149,10 @@ namespace mongo { BSONObj js(nextjsobj); massert( 10305 , "Client Error: Invalid object size", js.objsize() > 3 ); massert( 10306 , "Client Error: Next object larger than space left in message", - js.objsize() < ( theEnd - data ) ); + js.objsize() < ( theEnd - data ) ); if ( objcheck && !js.valid() ) { massert( 10307 , "Client Error: bad object in message", false); - } + } nextjsobj += js.objsize(); if ( nextjsobj >= theEnd ) nextjsobj = 0; @@ -152,11 +161,12 @@ namespace mongo { const Message& msg() const { return m; } - void markSet(){ + void markSet() { mark = nextjsobj; } - - void markReset(){ + + void markReset() { + assert( mark ); nextjsobj = mark; } @@ -180,7 +190,7 @@ namespace mongo { int queryOptions; BSONObj query; BSONObj fields; - + /* parses the message into the above fields */ QueryMessage(DbMessage& d) { ns = d.getns(); @@ -232,8 +242,7 @@ namespace mongo { /* object reply helper. 
*/ inline void replyToQuery(int queryResultFlags, AbstractMessagingPort* p, Message& requestMsg, - BSONObj& responseObj) - { + BSONObj& responseObj) { replyToQuery(queryResultFlags, p, requestMsg, (void *) responseObj.objdata(), responseObj.objsize(), 1); diff --git a/db/dbwebserver.cpp b/db/dbwebserver.cpp index f17a283..7aa6148 100644 --- a/db/dbwebserver.cpp +++ b/db/dbwebserver.cpp @@ -32,6 +32,7 @@ #include "../util/version.h" #include "../util/ramlog.h" #include +#include "../util/admin_access.h" #include "dbwebserver.h" #include #undef assert @@ -52,18 +53,20 @@ namespace mongo { }; bool execCommand( Command * c , - Client& client , int queryOptions , - const char *ns, BSONObj& cmdObj , - BSONObjBuilder& result, + Client& client , int queryOptions , + const char *ns, BSONObj& cmdObj , + BSONObjBuilder& result, bool fromRepl ); class DbWebServer : public MiniWebServer { public: - DbWebServer(const string& ip, int port) : MiniWebServer(ip, port) { + DbWebServer(const string& ip, int port, const AdminAccess* webUsers) + : MiniWebServer(ip, port), _webUsers(webUsers) { WebStatusPlugin::initAll(); } private: + const AdminAccess* _webUsers; // not owned here void doUnlockedStuff(stringstream& ss) { /* this is in the header already ss << "port: " << port << '\n'; */ @@ -75,37 +78,35 @@ namespace mongo { ss << ""; } - private: - bool allowed( const char * rq , vector& headers, const SockAddr &from ) { if ( from.isLocalHost() ) return true; - if ( ! webHaveAdminUsers() ) + if ( ! _webUsers->haveAdminUsers() ) return true; string auth = getHeader( rq , "Authorization" ); - if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ){ + if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ) { auth = auth.substr( 7 ) + ", "; map parms; pcrecpp::StringPiece input( auth ); - + string name, val; pcrecpp::RE re("(\\w+)=\"?(.*?)\"?, "); - while ( re.Consume( &input, &name, &val) ){ + while ( re.Consume( &input, &name, &val) ) { parms[name] = val; } - BSONObj user = webGetAdminUser( parms["username"] ); - if ( ! user.isEmpty() ){ + BSONObj user = _webUsers->getAdminUser( parms["username"] ); + if ( ! user.isEmpty() ) { string ha1 = user["pwd"].str(); string ha2 = md5simpledigest( (string)"GET" + ":" + parms["uri"] ); - + stringstream r; r << ha1 << ':' << parms["nonce"]; - if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ){ + if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ) { r << ':'; r << parms["nc"]; r << ':'; @@ -116,22 +117,20 @@ namespace mongo { r << ':'; r << ha2; string r1 = md5simpledigest( r.str() ); - + if ( r1 == parms["response"] ) return true; } - - } - + stringstream authHeader; - authHeader - << "WWW-Authenticate: " - << "Digest realm=\"mongo\", " - << "nonce=\"abc\", " - << "algorithm=MD5, qop=\"auth\" " - ; - + authHeader + << "WWW-Authenticate: " + << "Digest realm=\"mongo\", " + << "nonce=\"abc\", " + << "algorithm=MD5, qop=\"auth\" " + ; + headers.push_back( authHeader.str() ); return 0; } @@ -144,24 +143,39 @@ namespace mongo { int& responseCode, vector& headers, // if completely empty, content-type: text/html will be added const SockAddr &from - ) - { + ) { if ( url.size() > 1 ) { - + if ( ! allowed( rq , headers, from ) ) { responseCode = 401; headers.push_back( "Content-Type: text/plain" ); responseMsg = "not allowed\n"; return; - } + } { + BSONObj params; + const size_t pos = url.find( "?" 
); + if ( pos != string::npos ) { + MiniWebServer::parseParams( params , url.substr( pos + 1 ) ); + url = url.substr(0, pos); + } + DbWebHandler * handler = DbWebHandler::findHandler( url ); - if ( handler ){ - if ( handler->requiresREST( url ) && ! cmdLine.rest ) + if ( handler ) { + if ( handler->requiresREST( url ) && ! cmdLine.rest ) { _rejectREST( responseMsg , responseCode , headers ); - else - handler->handle( rq , url , responseMsg , responseCode , headers , from ); + } + else { + string callback = params.getStringField("jsonp"); + uassert(13453, "server not started with --jsonp", callback.empty() || cmdLine.jsonp); + + handler->handle( rq , url , params , responseMsg , responseCode , headers , from ); + + if (responseCode == 200 && !callback.empty()) { + responseMsg = callback + '(' + responseMsg + ')'; + } + } return; } } @@ -171,27 +185,27 @@ namespace mongo { _rejectREST( responseMsg , responseCode , headers ); return; } - + responseCode = 404; headers.push_back( "Content-Type: text/html" ); responseMsg = "unknown url\n"; return; } - + // generate home page - if ( ! allowed( rq , headers, from ) ){ + if ( ! allowed( rq , headers, from ) ) { responseCode = 401; responseMsg = "not allowed\n"; return; - } + } responseCode = 200; stringstream ss; string dbname; { stringstream z; - z << "mongod " << prettyHostName(); + z << cmdLine.binaryName << ' ' << prettyHostName(); dbname = z.str(); } ss << start(dbname) << h2(dbname); @@ -202,12 +216,18 @@ namespace mongo { { const map *m = Command::webCommands(); if( m ) { - ss << a("", "These read-only context-less commands can be executed from the web interface. Results are json format, unless ?text is appended in which case the result is output as text for easier human viewing", "Commands") << ": "; - for( map::const_iterator i = m->begin(); i != m->end(); i++ ) { + ss << + a("", + "These read-only context-less commands can be executed from the web interface. " + "Results are json format, unless ?text=1 is appended in which case the result is output as text " + "for easier human viewing", + "Commands") + << ": "; + for( map::const_iterator i = m->begin(); i != m->end(); i++ ) { stringstream h; i->second->help(h); string help = h.str(); - ss << "first << "?text\""; + ss << "first << "?text=1\""; if( help != "no help defined" ) ss << " title=\"" << help << '"'; ss << ">" << i->first << " "; @@ -216,69 +236,67 @@ namespace mongo { } } ss << '\n'; - /* - ss << "HTTP admin port:" << _port << "

\n"; - */ + /* + ss << "HTTP admin port:" << _port << "

\n"; + */ doUnlockedStuff(ss); WebStatusPlugin::runAll( ss ); - + ss << "\n"; responseMsg = ss.str(); - - } - void _rejectREST( string& responseMsg , int& responseCode, vector& headers ){ - responseCode = 403; - stringstream ss; - ss << "REST is not enabled. use --rest to turn on.\n"; - ss << "check that port " << _port << " is secured for the network too.\n"; - responseMsg = ss.str(); - headers.push_back( "Content-Type: text/plain" ); + void _rejectREST( string& responseMsg , int& responseCode, vector& headers ) { + responseCode = 403; + stringstream ss; + ss << "REST is not enabled. use --rest to turn on.\n"; + ss << "check that port " << _port << " is secured for the network too.\n"; + responseMsg = ss.str(); + headers.push_back( "Content-Type: text/plain" ); } }; // --- - - bool prisort( const Prioritizable * a , const Prioritizable * b ){ + + bool prisort( const Prioritizable * a , const Prioritizable * b ) { return a->priority() < b->priority(); } // -- status framework --- - WebStatusPlugin::WebStatusPlugin( const string& secionName , double priority , const string& subheader ) + WebStatusPlugin::WebStatusPlugin( const string& secionName , double priority , const string& subheader ) : Prioritizable(priority), _name( secionName ) , _subHeading( subheader ) { if ( ! _plugins ) _plugins = new vector(); _plugins->push_back( this ); } - void WebStatusPlugin::initAll(){ + void WebStatusPlugin::initAll() { if ( ! _plugins ) return; - + sort( _plugins->begin(), _plugins->end() , prisort ); - + for ( unsigned i=0; i<_plugins->size(); i++ ) (*_plugins)[i]->init(); } - void WebStatusPlugin::runAll( stringstream& ss ){ + void WebStatusPlugin::runAll( stringstream& ss ) { if ( ! _plugins ) return; - - for ( unsigned i=0; i<_plugins->size(); i++ ){ + + for ( unsigned i=0; i<_plugins->size(); i++ ) { WebStatusPlugin * p = (*_plugins)[i]; - ss << "


\n" + ss << "
\n" << "" << p->_name << ""; - + ss << " " << p->_subHeading; ss << "
\n"; - + p->run(ss); } @@ -290,29 +308,30 @@ namespace mongo { class LogPlugin : public WebStatusPlugin { public: - LogPlugin() : WebStatusPlugin( "Log" , 100 ), _log(0){ + LogPlugin() : WebStatusPlugin( "Log" , 100 ), _log(0) { } - - virtual void init(){ + + virtual void init() { assert( ! _log ); _log = new RamLog(); Logstream::get().addGlobalTee( _log ); } - virtual void run( stringstream& ss ){ + virtual void run( stringstream& ss ) { _log->toHTML( ss ); } RamLog * _log; }; - + LogPlugin * logPlugin = new LogPlugin(); // -- handler framework --- DbWebHandler::DbWebHandler( const string& name , double priority , bool requiresREST ) - : Prioritizable(priority), _name(name) , _requiresREST(requiresREST){ + : Prioritizable(priority), _name(name) , _requiresREST(requiresREST) { - { // setup strings + { + // setup strings _defaultUrl = "/"; _defaultUrl += name; @@ -320,8 +339,9 @@ namespace mongo { ss << name << " priority: " << priority << " rest: " << requiresREST; _toString = ss.str(); } - - { // add to handler list + + { + // add to handler list if ( ! _handlers ) _handlers = new vector(); _handlers->push_back( this ); @@ -329,11 +349,11 @@ namespace mongo { } } - DbWebHandler * DbWebHandler::findHandler( const string& url ){ + DbWebHandler * DbWebHandler::findHandler( const string& url ) { if ( ! _handlers ) return 0; - - for ( unsigned i=0; i<_handlers->size(); i++ ){ + + for ( unsigned i=0; i<_handlers->size(); i++ ) { DbWebHandler * h = (*_handlers)[i]; if ( h->handles( url ) ) return h; @@ -341,76 +361,71 @@ namespace mongo { return 0; } - + vector * DbWebHandler::_handlers = 0; // --- basic handlers --- class FavIconHandler : public DbWebHandler { public: - FavIconHandler() : DbWebHandler( "favicon.ico" , 0 , false ){} + FavIconHandler() : DbWebHandler( "favicon.ico" , 0 , false ) {} - virtual void handle( const char *rq, string url, + virtual void handle( const char *rq, string url, BSONObj params, string& responseMsg, int& responseCode, - vector& headers, const SockAddr &from ){ + vector& headers, const SockAddr &from ) { responseCode = 404; headers.push_back( "Content-Type: text/plain" ); responseMsg = "no favicon\n"; } } faviconHandler; - + class StatusHandler : public DbWebHandler { public: - StatusHandler() : DbWebHandler( "_status" , 1 , false ){} - - virtual void handle( const char *rq, string url, + StatusHandler() : DbWebHandler( "_status" , 1 , false ) {} + + virtual void handle( const char *rq, string url, BSONObj params, string& responseMsg, int& responseCode, - vector& headers, const SockAddr &from ){ + vector& headers, const SockAddr &from ) { headers.push_back( "Content-Type: application/json" ); responseCode = 200; - + static vector commands; - if ( commands.size() == 0 ){ + if ( commands.size() == 0 ) { commands.push_back( "serverStatus" ); commands.push_back( "buildinfo" ); } - - BSONObj params; - if ( url.find( "?" ) != string::npos ) { - MiniWebServer::parseParams( params , url.substr( url.find( "?" ) + 1 ) ); - } - + BSONObjBuilder buf(1024); - - for ( unsigned i=0; ilocktype() == 0 ); - + BSONObj co; { BSONObjBuilder b; b.append( cmd , 1 ); - - if ( cmd == "serverStatus" && params["repl"].type() ){ + + if ( cmd == "serverStatus" && params["repl"].type() ) { b.append( "repl" , atoi( params["repl"].valuestr() ) ); } - + co = b.obj(); } - + string errmsg; - + BSONObjBuilder sub; if ( ! 
c->run( "admin.$cmd" , co , errmsg , sub , false ) ) buf.append( cmd , errmsg ); else buf.append( cmd , sub.obj() ); } - + responseMsg = buf.obj().jsonString(); } @@ -419,14 +434,14 @@ namespace mongo { class CommandListHandler : public DbWebHandler { public: - CommandListHandler() : DbWebHandler( "_commands" , 1 , true ){} - - virtual void handle( const char *rq, string url, + CommandListHandler() : DbWebHandler( "_commands" , 1 , true ) {} + + virtual void handle( const char *rq, string url, BSONObj params, string& responseMsg, int& responseCode, - vector& headers, const SockAddr &from ){ + vector& headers, const SockAddr &from ) { headers.push_back( "Content-Type: text/html" ); responseCode = 200; - + stringstream ss; ss << start("Commands List"); ss << p( a("/", "back", "Home") ); @@ -435,41 +450,21 @@ namespace mongo { ss << "S:slave-ok R:read-lock W:write-lock A:admin-only
\n"; ss << table(); ss << "CommandAttributesHelp\n"; - for( map::const_iterator i = m->begin(); i != m->end(); i++ ) + for( map::const_iterator i = m->begin(); i != m->end(); i++ ) i->second->htmlHelp(ss); ss << _table() << _end(); - + responseMsg = ss.str(); } } commandListHandler; class CommandsHandler : public DbWebHandler { public: - CommandsHandler() : DbWebHandler( "DUMMY COMMANDS" , 2 , true ){} - - bool _cmd( const string& url , string& cmd , bool& text ) const { - const char * x = url.c_str(); - - if ( x[0] != '/' ){ - // this should never happen - return false; - } - - if ( strchr( x + 1 , '/' ) ) - return false; - - x++; + CommandsHandler() : DbWebHandler( "DUMMY COMMANDS" , 2 , true ) {} - const char * end = strstr( x , "?text" ); - if ( end ){ - text = true; - cmd = string( x , end - x ); - } - else { - text = false; - cmd = string(x); - } - + bool _cmd( const string& url , string& cmd , bool& text, bo params ) const { + cmd = str::after(url, '/'); + text = params["text"].boolean(); return true; } @@ -477,45 +472,43 @@ namespace mongo { const map *m = Command::webCommands(); if( ! m ) return 0; - + map::const_iterator i = m->find(cmd); if ( i == m->end() ) return 0; - + return i->second; } - virtual bool handles( const string& url ) const { + virtual bool handles( const string& url ) const { string cmd; bool text; - if ( ! _cmd( url , cmd , text ) ) + if ( ! _cmd( url , cmd , text, bo() ) ) return false; - - return _cmd( cmd ); + return _cmd(cmd) != 0; } - - virtual void handle( const char *rq, string url, + + virtual void handle( const char *rq, string url, BSONObj params, string& responseMsg, int& responseCode, - vector& headers, const SockAddr &from ){ - + vector& headers, const SockAddr &from ) { string cmd; bool text = false; - assert( _cmd( url , cmd , text ) ); + assert( _cmd( url , cmd , text, params ) ); Command * c = _cmd( cmd ); assert( c ); BSONObj cmdObj = BSON( cmd << 1 ); Client& client = cc(); - + BSONObjBuilder result; execCommand(c, client, 0, "admin.", cmdObj , result, false); - + responseCode = 200; - - string j = result.done().jsonString(JS, text ); + + string j = result.done().jsonString(Strict, text ); responseMsg = j; - - if( text ){ + + if( text ) { headers.push_back( "Content-Type: text/plain" ); responseMsg += '\n'; } @@ -524,23 +517,16 @@ namespace mongo { } } - + } commandsHandler; // --- external ---- - string prettyHostName() { - stringstream s; - s << getHostName(); - if( mongo::cmdLine.port != CmdLine::DefaultDBPort ) - s << ':' << mongo::cmdLine.port; - return s.str(); - } - - void webServerThread() { + void webServerThread(const AdminAccess* adminAccess) { + boost::scoped_ptr adminAccessPtr(adminAccess); // adminAccess is owned here Client::initThread("websvr"); const int p = cmdLine.port + 1000; - DbWebServer mini(cmdLine.bind_ip, p); + DbWebServer mini(cmdLine.bind_ip, p, adminAccessPtr.get()); log() << "web admin interface listening on port " << p << endl; mini.initAndListen(); cc().shutdown(); diff --git a/db/dbwebserver.h b/db/dbwebserver.h index d1a2f0d..bdbcba2 100644 --- a/db/dbwebserver.h +++ b/db/dbwebserver.h @@ -17,20 +17,22 @@ * along with this program. If not, see . 
*/ +#include "../util/admin_access.h" + namespace mongo { class Prioritizable { public: - Prioritizable( double p ) : _priority(p){} + Prioritizable( double p ) : _priority(p) {} double priority() const { return _priority; } private: double _priority; }; - + class DbWebHandler : public Prioritizable { public: DbWebHandler( const string& name , double priority , bool requiresREST ); - virtual ~DbWebHandler(){} + virtual ~DbWebHandler() {} virtual bool handles( const string& url ) const { return url == _defaultUrl; } @@ -38,20 +40,21 @@ namespace mongo { virtual void handle( const char *rq, // the full request string url, + BSONObj params, // set these and return them: string& responseMsg, int& responseCode, vector& headers, // if completely empty, content-type: text/html will be added const SockAddr &from - ) = 0; - + ) = 0; + string toString() const { return _toString; } static DbWebHandler * findHandler( const string& url ); private: string _name; bool _requiresREST; - + string _defaultUrl; string _toString; @@ -61,8 +64,8 @@ namespace mongo { class WebStatusPlugin : public Prioritizable { public: WebStatusPlugin( const string& secionName , double priority , const string& subheader = "" ); - virtual ~WebStatusPlugin(){} - + virtual ~WebStatusPlugin() {} + virtual void run( stringstream& ss ) = 0; /** called when web server stats up */ virtual void init() = 0; @@ -73,18 +76,10 @@ namespace mongo { string _name; string _subHeading; static vector * _plugins; - + }; - void webServerThread(); + void webServerThread( const AdminAccess* admins ); string prettyHostName(); - - /** @return if there are any admin users. this should not block for long and throw if can't get a lock if needed */ - bool webHaveAdminUsers(); - - /** @return admin user with this name. this should not block for long and throw if can't get a lock if needed */ - BSONObj webGetAdminUser( const string& username ); }; - - diff --git a/db/diskloc.h b/db/diskloc.h index 2747abd..f356c73 100644 --- a/db/diskloc.h +++ b/db/diskloc.h @@ -14,7 +14,7 @@ * along with this program. If not, see . */ -/* storage.h +/* @file diskloc.h Storage subsystem management. Lays out our datafiles on disk, manages disk space. @@ -26,7 +26,6 @@ namespace mongo { - class Record; class DeletedRecord; class Extent; @@ -34,77 +33,64 @@ namespace mongo { class MongoDataFile; #pragma pack(1) + /** represents a disk location/offset on disk in a database. 64 bits. + it is assumed these will be passed around by value a lot so don't do anything to make them large + (such as adding a virtual function) + */ class DiskLoc { - int fileNo; /* this will be volume, file #, etc. */ + int _a; // this will be volume, file #, etc. 
but is a logical value could be anything depending on storage engine int ofs; + public: - // Note: MaxFiles imposes a limit of about 32TB of data per process - enum SentinelValues { MaxFiles=16000, NullOfs = -1 }; - int a() const { - return fileNo; - } + enum SentinelValues { + NullOfs = -1, + MaxFiles=16000 // thus a limit of about 32TB of data per db + }; - DiskLoc(int a, int b) : fileNo(a), ofs(b) { - //assert(ofs!=0); - } + DiskLoc(int a, int b) : _a(a), ofs(b) { } DiskLoc() { Null(); } DiskLoc(const DiskLoc& l) { - fileNo=l.fileNo; + _a=l._a; ofs=l.ofs; } - bool questionable() { + bool questionable() const { return ofs < -1 || - fileNo < -1 || - fileNo > 524288; + _a < -1 || + _a > 524288; } - bool isNull() const { - return fileNo == -1; - // return ofs == NullOfs; - } + bool isNull() const { return _a == -1; } void Null() { - fileNo = -1; - ofs = 0; - } - void assertOk() { - assert(!isNull()); + _a = -1; + ofs = 0; /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */ } + void assertOk() { assert(!isNull()); } void setInvalid() { - fileNo = -2; + _a = -2; ofs = 0; } - bool isValid() const { - return fileNo != -2; - } + bool isValid() const { return _a != -2; } string toString() const { if ( isNull() ) return "null"; stringstream ss; - ss << hex << fileNo << ':' << ofs; + ss << hex << _a << ':' << ofs; return ss.str(); } - BSONObj toBSONObj() const { - return BSON( "file" << fileNo << "offset" << ofs ); - } + BSONObj toBSONObj() const { return BSON( "file" << _a << "offset" << ofs ); } - int& GETOFS() { - return ofs; - } - int getOfs() const { - return ofs; - } + int a() const { return _a; } + + int& GETOFS() { return ofs; } + int getOfs() const { return ofs; } void set(int a, int b) { - fileNo=a; + _a=a; ofs=b; } - void setOfs(int _fileNo, int _ofs) { - fileNo = _fileNo; - ofs = _ofs; - } void inc(int amt) { assert( !isNull() ); @@ -112,23 +98,23 @@ namespace mongo { } bool sameFile(DiskLoc b) { - return fileNo == b.fileNo; + return _a== b._a; } bool operator==(const DiskLoc& b) const { - return fileNo==b.fileNo && ofs == b.ofs; + return _a==b._a&& ofs == b.ofs; } bool operator!=(const DiskLoc& b) const { return !(*this==b); } const DiskLoc& operator=(const DiskLoc& b) { - fileNo=b.fileNo; + _a=b._a; ofs = b.ofs; //assert(ofs!=0); return *this; } int compare(const DiskLoc& b) const { - int x = fileNo - b.fileNo; + int x = _a - b._a; if ( x ) return x; return ofs - b.ofs; @@ -137,18 +123,27 @@ namespace mongo { return compare(b) < 0; } - /* get the "thing" associated with this disk location. - it is assumed the object is what it is -- you must asure that: - think of this as an unchecked type cast. + /** + * Marks this disk loc for writing + * @returns a non const reference to this disk loc + * This function explicitly signals we are writing and casts away const + */ + DiskLoc& writing() const; // see dur.h + + /* Get the "thing" associated with this disk location. + it is assumed the object is what you say it is -- you must assure that + (think of this as an unchecked type cast) + Note: set your Context first so that the database to which the diskloc applies is known. 
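        // Illustrative sketch only, not from the patch: basic DiskLoc semantics as defined
        // above -- a (file number, offset) pair, with _a == -1 meaning null and -2 invalid.
        static void diskLocSketch() {
            DiskLoc a( 0 , 4096 );                 // file ._0, offset 0x1000
            DiskLoc b( 1 , 0 );                    // anything in file ._1 orders after ._0
            assert( a.a() == 0 && a.getOfs() == 4096 );
            assert( a.compare( b ) < 0 && a != b );
            DiskLoc n;                             // default construction yields the null loc
            assert( n.isNull() && ! a.isNull() );
            n.setInvalid();
            assert( ! n.isValid() );
            log() << a.toString() << endl;         // "0:1000" (offset printed in hex)
        }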
*/ BSONObj obj() const; Record* rec() const; DeletedRecord* drec() const; Extent* ext() const; - BtreeBucket* btree() const; - BtreeBucket* btreemod() const; // marks modified / dirty + const BtreeBucket* btree() const; + // Explicitly signals we are writing and casts away const + BtreeBucket* btreemod() const; - MongoDataFile& pdf() const; + /*MongoDataFile& pdf() const;*/ }; #pragma pack() diff --git a/db/driverHelpers.cpp b/db/driverHelpers.cpp index d8971ad..d98a33b 100644 --- a/db/driverHelpers.cpp +++ b/db/driverHelpers.cpp @@ -24,11 +24,11 @@ #include "pch.h" #include "jsobj.h" #include "pdfile.h" -#include "namespace.h" +#include "namespace-inl.h" #include "commands.h" #include "cmdline.h" #include "btree.h" -#include "curop.h" +#include "curop-inl.h" #include "../util/background.h" #include "../scripting/engine.h" @@ -36,18 +36,18 @@ namespace mongo { class BasicDriverHelper : public Command { public: - BasicDriverHelper( const char * name ) : Command( name ){} - + BasicDriverHelper( const char * name ) : Command( name ) {} + virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } - virtual bool slaveOverrideOk(){ return true; } + virtual bool slaveOverrideOk() { return true; } }; class ObjectIdTest : public BasicDriverHelper { public: - ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ){} - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ - if ( cmdObj.firstElement().type() != jstOID ){ + ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ) {} + virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if ( cmdObj.firstElement().type() != jstOID ) { errmsg = "not oid"; return false; } diff --git a/db/dur.cpp b/db/dur.cpp new file mode 100644 index 0000000..15b4565 --- /dev/null +++ b/db/dur.cpp @@ -0,0 +1,635 @@ +// @file dur.cpp durability in the storage engine (crash-safeness / journaling) + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +/* + phases + + PREPLOGBUFFER + we will build an output buffer ourself and then use O_DIRECT + we could be in read lock for this + for very large objects write directly to redo log in situ? + WRITETOJOURNAL + we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity + have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that). + for now (1.7.5/1.8.0) we are in read lock which is not ideal. + WRITETODATAFILES + apply the writes back to the non-private MMF after they are for certain in redo log + REMAPPRIVATEVIEW + we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real + remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want + to be too frequent. 
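    in code form, the cycle above is roughly the following (locking elided; the real
    sequencing lives in _groupCommit() and REMAPPRIVATEVIEW() later in this file):

        PREPLOGBUFFER();                    // serialize the declared write intents into the buffer
        WRITETOJOURNAL(commitJob._ab);      // append + fsync the journal -- writes are now recoverable
        commitJob.notifyCommitted();        // safe to acknowledge getLastError waiters here
        WRITETODATAFILES();                 // apply the same bytes to the shared (non-private) views
        commitJob.reset();
        REMAPPRIVATEVIEW();                 // done fractionally, under the write lock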
+ there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will + be required. so doing these remaps fractionally is helpful. + + @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc +*/ + +#include "pch.h" +#include "cmdline.h" +#include "client.h" +#include "dur.h" +#include "dur_journal.h" +#include "dur_commitjob.h" +#include "dur_recover.h" +#include "../util/concurrency/race.h" +#include "../util/mongoutils/hash.h" +#include "../util/mongoutils/str.h" +#include "../util/timer.h" +#include "dur_stats.h" + +using namespace mongoutils; + +namespace mongo { + + namespace dur { + + void WRITETODATAFILES(); + void PREPLOGBUFFER(); + + /** declared later in this file + only used in this file -- use DurableInterface::commitNow() outside + */ + static void groupCommit(); + + CommitJob commitJob; + + Stats stats; + + void Stats::S::reset() { + memset(this, 0, sizeof(*this)); + } + + Stats::Stats() { + _a.reset(); + _b.reset(); + curr = &_a; + _intervalMicros = 3000000; + } + + Stats::S * Stats::other() { + return curr == &_a ? &_b : &_a; + } + + BSONObj Stats::S::_asObj() { + return BSON( + "commits" << _commits << + "journaledMB" << _journaledBytes / 1000000.0 << + "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 << + "commitsInWriteLock" << _commitsInWriteLock << + "earlyCommits" << _earlyCommits << + "timeMs" << + BSON( "dt" << _dtMillis << + "prepLogBuffer" << (unsigned) (_prepLogBufferMicros/1000) << + "writeToJournal" << (unsigned) (_writeToJournalMicros/1000) << + "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) << + "remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000) + ) + ); + } + + BSONObj Stats::asObj() { + return other()->_asObj(); + } + + void Stats::rotate() { + unsigned long long now = curTimeMicros64(); + unsigned long long dt = now - _lastRotate; + if( dt >= _intervalMicros && _intervalMicros ) { + // rotate + curr->_dtMillis = (unsigned) (dt/1000); + _lastRotate = now; + curr = other(); + curr->reset(); + } + } + + void NonDurableImpl::setNoJournal(void *dst, void *src, unsigned len) { + memcpy(dst, src, len); + } + + void DurableImpl::setNoJournal(void *dst, void *src, unsigned len) { + MemoryMappedFile::makeWritable(dst, len); + + // we stay in this mutex for everything to work with DurParanoid/validateSingleMapMatches + // + // this also makes setNoJournal threadsafe, which is good as we call it from a read (not a write) lock + // in class SlaveTracking + // + scoped_lock lk( privateViews._mutex() ); + size_t ofs; + MongoMMF *f = privateViews.find_inlock(dst, ofs); + assert(f); + void *w = (((char *)f->view_write())+ofs); + // first write it to the writable (file) view + memcpy(w, src, len); + if( memcmp(w, dst, len) ) { + // if we get here, a copy-on-write had previously occurred. so write it to the private view too + // to keep them in sync. we do this as we do not want to cause a copy on write unnecessarily. + memcpy(dst, src, len); + } + } + + /** base declare write intent function that all the helpers call. 
*/ + void DurableImpl::declareWriteIntent(void *p, unsigned len) { + commitJob.note(p, len); + } + + static DurableImpl* durableImpl = new DurableImpl(); + static NonDurableImpl* nonDurableImpl = new NonDurableImpl(); + DurableInterface* DurableInterface::_impl = nonDurableImpl; + + void DurableInterface::enableDurability() { + assert(_impl == nonDurableImpl); + _impl = durableImpl; + } + + void DurableInterface::disableDurability() { + assert(_impl == durableImpl); + massert(13616, "can't disable durability with pending writes", !commitJob.hasWritten()); + _impl = nonDurableImpl; + } + + bool DurableImpl::commitNow() { + stats.curr->_earlyCommits++; + groupCommit(); + return true; + } + + bool DurableImpl::awaitCommit() { + commitJob.awaitNextCommit(); + return true; + } + + /** Declare that a file has been created + Normally writes are applied only after journaling, for safety. But here the file + is created first, and the journal will just replay the creation if the create didn't + happen because of crashing. + */ + void DurableImpl::createdFile(string filename, unsigned long long len) { + shared_ptr op( new FileCreatedOp(filename, len) ); + commitJob.noteOp(op); + } + + void* DurableImpl::writingPtr(void *x, unsigned len) { + void *p = x; + declareWriteIntent(p, len); + return p; + } + + /** declare intent to write + @param ofs offset within buf at which we will write + @param len the length at ofs we will write + @return new buffer pointer. + */ + void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) { + char *p = (char *) buf; + declareWriteIntent(p+ofs, len); + return p; + } + + void* DurableImpl::writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) { + char *p = (char *) buf; + for( vector< pair< long long, unsigned > >::const_iterator i = ranges.begin(); + i != ranges.end(); ++i ) { + declareWriteIntent( p + i->first, i->second ); + } + return p; + } + + bool DurableImpl::commitIfNeeded() { + DEV commitJob._nSinceCommitIfNeededCall = 0; + if (commitJob.bytes() > UncommittedBytesLimit) { // should this also fire if CmdLine::DurAlwaysCommit? + stats.curr->_earlyCommits++; + groupCommit(); + return true; + } + return false; + } + + /** Used in _DEBUG builds to check that we didn't overwrite the last intent + that was declared. called just before writelock release. we check a few + bytes after the declared region to see if they changed. 
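        // Illustrative sketch only, not from the patch: the calling pattern the write-intent
        // helpers above exist for. Mapped data is never modified directly; the pointer (or a
        // field reference) is routed through the durability interface first, so the changed
        // bytes are captured for the journal. NamespaceDetails/nIndexes is just an example field.
        static void writeIntentSketch( NamespaceDetails *d , DiskLoc &head , DiskLoc newHead ) {
            getDur().writingInt( d->nIndexes ) = 0;         // intent declared, then the store
            getDur().writingDiskLoc( head ) = newHead;      // same idea for a DiskLoc field
            // a plain "d->nIndexes = 0;" would bypass the journal and is unsafe with --dur
        }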
+ + @see MongoMutex::_releasedWriteLock + + SLOW + */ +#if 0 + void DurableImpl::debugCheckLastDeclaredWrite() { + static int n; + ++n; + + assert(debug && cmdLine.dur); + if (commitJob.writes().empty()) + return; + const WriteIntent &i = commitJob.lastWrite(); + size_t ofs; + MongoMMF *mmf = privateViews.find(i.start(), ofs); + if( mmf == 0 ) + return; + size_t past = ofs + i.length(); + if( mmf->length() < past + 8 ) + return; // too close to end of view + char *priv = (char *) mmf->getView(); + char *writ = (char *) mmf->view_write(); + unsigned long long *a = (unsigned long long *) (priv+past); + unsigned long long *b = (unsigned long long *) (writ+past); + if( *a != *b ) { + for( set::iterator it(commitJob.writes().begin()), end((commitJob.writes().begin())); it != end; ++it ) { + const WriteIntent& wi = *it; + char *r1 = (char*) wi.start(); + char *r2 = (char*) wi.end(); + if( r1 <= (((char*)a)+8) && r2 > (char*)a ) { + //log() << "it's ok " << wi.p << ' ' << wi.len << endl; + return; + } + } + log() << "dur data after write area " << i.start() << " does not agree" << endl; + log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl; + log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl; + log() << " n: " << n << endl; + log() << endl; + } + } +#endif + + /** write the buffer we have built to the journal and fsync it. + outside of lock as that could be slow. + */ + static void WRITETOJOURNAL(AlignedBuilder& ab) { + Timer t; + journal(ab); + stats.curr->_writeToJournalMicros += t.micros(); + } + + // Functor to be called over all MongoFiles + + class validateSingleMapMatches { + public: + validateSingleMapMatches(unsigned long long& bytes) :_bytes(bytes) {} + void operator () (MongoFile *mf) { + if( mf->isMongoMMF() ) { + MongoMMF *mmf = (MongoMMF*) mf; + const char *p = (const char *) mmf->getView(); + const char *w = (const char *) mmf->view_write(); + + if (!p || !w) return; // File not fully opened yet + + _bytes += mmf->length(); + + assert( mmf->length() == (unsigned) mmf->length() ); + { + scoped_lock lk( privateViews._mutex() ); // see setNoJournal + if (memcmp(p, w, (unsigned) mmf->length()) == 0) + return; // next file + } + + unsigned low = 0xffffffff; + unsigned high = 0; + log() << "DurParanoid mismatch in " << mmf->filename() << endl; + int logged = 0; + unsigned lastMismatch = 0xffffffff; + for( unsigned i = 0; i < mmf->length(); i++ ) { + if( p[i] != w[i] ) { + if( lastMismatch != 0xffffffff && lastMismatch+1 != i ) + log() << endl; // separate blocks of mismatches + lastMismatch= i; + if( ++logged < 60 ) { + stringstream ss; + ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i]; + if( p[i] > 32 && p[i] <= 126 ) + ss << '\t' << p[i]; + log() << ss.str() << endl; + } + if( logged == 60 ) + log() << "..." << endl; + if( i < low ) low = i; + if( i > high ) high = i; + } + } + if( low != 0xffffffff ) { + std::stringstream ss; + ss << "dur error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1; + log() << ss.str() << endl; + log() << "priv loc: " << (void*)(p+low) << ' ' << endl; + set& b = commitJob.writes(); + (void)b; // mark as unused. Useful for inspection in debugger + + // should we abort() here so this isn't unnoticed in some circumstances? + massert(13599, "Written data does not match in-memory view. 
Missing WriteIntent?", false); + } + } + } + private: + unsigned long long& _bytes; + }; + + /** (SLOW) diagnostic to check that the private view and the non-private view are in sync. + */ + void debugValidateAllMapsMatch() { + if( ! (cmdLine.durOptions & CmdLine::DurParanoid) ) + return; + + unsigned long long bytes = 0; + Timer t; + MongoFile::forEach(validateSingleMapMatches(bytes)); + OCCASIONALLY log() << "DurParanoid map check " << t.millis() << "ms for " << (bytes / (1024*1024)) << "MB" << endl; + } + + extern size_t privateMapBytes; + + /** We need to remap the private views periodically. otherwise they would become very large. + Call within write lock. + */ + void _REMAPPRIVATEVIEW() { + static unsigned startAt; + static unsigned long long lastRemap; + + dbMutex.assertWriteLocked(); + dbMutex._remapPrivateViewRequested = false; + assert( !commitJob.hasWritten() ); + + // we want to remap all private views about every 2 seconds. there could be ~1000 views so + // we do a little each pass; beyond the remap time, more significantly, there will be copy on write + // faults after remapping, so doing a little bit at a time will avoid big load spikes on + // remapping. + unsigned long long now = curTimeMicros64(); + double fraction = (now-lastRemap)/2000000.0; + lastRemap = now; + + rwlock lk(MongoFile::mmmutex, false); + set& files = MongoFile::getAllFiles(); + unsigned sz = files.size(); + if( sz == 0 ) + return; + + { + // be careful not to use too much memory if the write rate is + // extremely high + double f = privateMapBytes / ((double)UncommittedBytesLimit); + if( f > fraction ) { + fraction = f; + } + privateMapBytes = 0; + } + + unsigned ntodo = (unsigned) (sz * fraction); + if( ntodo < 1 ) ntodo = 1; + if( ntodo > sz ) ntodo = sz; + + const set::iterator b = files.begin(); + const set::iterator e = files.end(); + set::iterator i = b; + // skip to our starting position + for( unsigned x = 0; x < startAt; x++ ) { + i++; + if( i == e ) i = b; + } + startAt = (startAt + ntodo) % sz; // mark where to start next time + + for( unsigned x = 0; x < ntodo; x++ ) { + dassert( i != e ); + if( (*i)->isMongoMMF() ) { + MongoMMF *mmf = (MongoMMF*) *i; + assert(mmf); + if( mmf->willNeedRemap() ) { + mmf->willNeedRemap() = false; + mmf->remapThePrivateView(); + } + i++; + if( i == e ) i = b; + } + } + } + void REMAPPRIVATEVIEW() { + Timer t; + _REMAPPRIVATEVIEW(); + stats.curr->_remapPrivateViewMicros += t.micros(); + } + + mutex groupCommitMutex("groupCommit"); + + /** locking: in read lock when called. */ + static void _groupCommit() { + stats.curr->_commits++; + + if( !commitJob.hasWritten() ) { + // getlasterror request could have came after the data was already committed + commitJob.notifyCommitted(); + return; + } + + // we need to make sure two group commits aren't running at the same time + // (and we are only read locked in the dbMutex, so it could happen) + scoped_lock lk(groupCommitMutex); + + PREPLOGBUFFER(); + + // todo : write to the journal outside locks, as this write can be slow. + // however, be careful then about remapprivateview as that cannot be done + // if new writes are then pending in the private maps. + WRITETOJOURNAL(commitJob._ab); + + // data is now in the journal, which is sufficient for acknowledging getLastError. 
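        // Worked example (illustrative only) of the fractional-remap sizing in
        // _REMAPPRIVATEVIEW() above: with ~1000 mapped views and a pass every ~100ms,
        // fraction = 0.1 / 2.0 = 0.05, so about 50 views are remapped per pass and the
        // full set is cycled roughly every 2 seconds instead of in one large spike.
        static unsigned remapBatchSizeSketch( unsigned nViews , double secsSinceLastPass ) {
            double fraction = secsSinceLastPass / 2.0;      // target: cover everything ~every 2s
            unsigned ntodo = (unsigned)( nViews * fraction );
            if( ntodo < 1 ) ntodo = 1;
            if( ntodo > nViews ) ntodo = nViews;
            return ntodo;                                   // e.g. 1000 views, 0.1s -> 50
        }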
+ // (ok to crash after that) + commitJob.notifyCommitted(); + + WRITETODATAFILES(); + + commitJob.reset(); + + // REMAPPRIVATEVIEW + // + // remapping private views must occur after WRITETODATAFILES otherwise + // we wouldn't see newly written data on reads. + // + DEV assert( !commitJob.hasWritten() ); + if( !dbMutex.isWriteLocked() ) { + // this needs done in a write lock (as there is a short window during remapping when each view + // might not exist) thus we do it on the next acquisition of that instead of here (there is no + // rush if you aren't writing anyway -- but it must happen, if it is done, before any uncommitted + // writes occur). If desired, perhpas this can be eliminated on posix as it may be that the remap + // is race-free there. + // + dbMutex._remapPrivateViewRequested = true; + } + else { + stats.curr->_commitsInWriteLock++; + // however, if we are already write locked, we must do it now -- up the call tree someone + // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls + // this method when a file (and its views) is about to go away. + // + REMAPPRIVATEVIEW(); + } + } + + /** locking in read lock when called + @see MongoMMF::close() + */ + static void groupCommit() { + // we need to be at least read locked on the dbMutex so that we know the write intent data + // structures are not changing while we work + dbMutex.assertAtLeastReadLocked(); + + try { + _groupCommit(); + } + catch(DBException& e ) { + log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl; + abort(); + } + catch(std::ios_base::failure& e) { + log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl; + abort(); + } + catch(std::bad_alloc& e) { + log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl; + abort(); + } + catch(std::exception& e) { + log() << "exception in dur::groupCommit causing immediate shutdown: " << e.what() << endl; + abort(); // based on myTerminate() + } + } + + static void go() { + if( !commitJob.hasWritten() ){ + commitJob.notifyCommitted(); + return; + } + + { + readlocktry lk("", 1000); + if( lk.got() ) { + groupCommit(); + return; + } + } + + // starvation on read locks could occur. so if read lock acquisition is slow, try to get a + // write lock instead. 
otherwise journaling could be delayed too long (too much data will + // not accumulate though, as commitIfNeeded logic will have executed in the meantime if there + // has been writes) + writelock lk; + groupCommit(); + } + + /** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its + views disappear + */ + void closingFileNotification() { + if (!cmdLine.dur) + return; + + if( dbMutex.atLeastReadLocked() ) { + groupCommit(); + } + else { + assert( inShutdown() ); + if( commitJob.hasWritten() ) { + log() << "dur warning files are closing outside locks with writes pending" << endl; + } + } + } + + CodeBlock durThreadMain; + + void durThread() { + Client::initThread("dur"); + const int HowOftenToGroupCommitMs = 90; + while( !inShutdown() ) { + sleepmillis(10); + CodeBlock::Within w(durThreadMain); + try { + int millis = HowOftenToGroupCommitMs; + { + stats.rotate(); + { + Timer t; + journalRotate(); // note we do this part outside of mongomutex + millis -= t.millis(); + assert( millis <= HowOftenToGroupCommitMs ); + if( millis < 5 ) + millis = 5; + } + + // we do this in a couple blocks, which makes it a tiny bit faster (only a little) on throughput, + // but is likely also less spiky on our cpu usage, which is good: + sleepmillis(millis/2); + commitJob.wi()._deferred.invoke(); + sleepmillis(millis/2); + commitJob.wi()._deferred.invoke(); + } + + go(); + } + catch(std::exception& e) { + log() << "exception in durThread causing immediate shutdown: " << e.what() << endl; + abort(); // based on myTerminate() + } + } + cc().shutdown(); + } + + void recover(); + + void releasingWriteLock() { + // implicit commitIfNeeded check on each write unlock + DEV commitJob._nSinceCommitIfNeededCall = 0; // implicit commit if needed + if( commitJob.bytes() > UncommittedBytesLimit || cmdLine.durOptions & CmdLine::DurAlwaysCommit ) { + stats.curr->_earlyCommits++; + groupCommit(); + } + } + + void preallocateFiles(); + + /** at startup, recover, and then start the journal threads */ + void startup() { + if( !cmdLine.dur ) + return; + + DurableInterface::enableDurability(); + + journalMakeDir(); + try { + recover(); + } + catch(...) { + log() << "exception during recovery" << endl; + throw; + } + + preallocateFiles(); + + boost::thread t(durThread); + } + + void DurableImpl::syncDataAndTruncateJournal() { + dbMutex.assertWriteLocked(); + + groupCommit(); + MongoFile::flushAll(true); + journalCleanup(); + + assert(!haveJournalFiles()); // Double check post-conditions + } + + } // namespace dur + +} // namespace mongo diff --git a/db/dur.h b/db/dur.h new file mode 100644 index 0000000..a8035e4 --- /dev/null +++ b/db/dur.h @@ -0,0 +1,201 @@ +// @file dur.h durability support + +#pragma once + +#include "diskloc.h" +#include "mongommf.h" + +namespace mongo { + + class NamespaceDetails; + + namespace dur { + + // a smaller limit is likely better on 32 bit +#if defined(__i386__) || defined(_M_IX86) + const unsigned UncommittedBytesLimit = 50 * 1024 * 1024; +#else + const unsigned UncommittedBytesLimit = 100 * 1024 * 1024; +#endif + + /** Call during startup so durability module can initialize + Throws if fatal error + Does nothing if cmdLine.dur is false + */ + void startup(); + + class DurableInterface : boost::noncopyable { + public: + virtual ~DurableInterface() { log() << "ERROR warning ~DurableInterface not intended to be called" << endl; } + + /** Declare that a file has been created + Normally writes are applied only after journaling, for safety. 
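        // Illustrative sketch only, not from the patch: how a long-running, non-yielding write
        // (an index build, an $atomic update, ...) is expected to use commitIfNeeded(), declared
        // below -- it group-commits early once roughly UncommittedBytesLimit of changes is
        // pending, keeping the private views and the journal backlog bounded.
        static void longWriteLoopSketch( const vector<BSONObj>& docs ) {    // hypothetical caller
            for ( unsigned i = 0; i < docs.size(); i++ ) {
                // ... declare write intents and modify the data files for docs[i] ...
                getDur().commitIfNeeded();      // cheap no-op until enough bytes are pending
            }
        }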
But here the file + is created first, and the journal will just replay the creation if the create didn't + happen because of crashing. + */ + virtual void createdFile(string filename, unsigned long long len) = 0; + + /** Declarations of write intent. + + Use these methods to declare "i'm about to write to x and it should be logged for redo." + + Failure to call writing...() is checked in _DEBUG mode by using a read only mapped view + (i.e., you'll segfault if the code is covered in that situation). The _DEBUG check doesn't + verify that your length is correct though. + */ + + /** declare intent to write to x for up to len + @return pointer where to write. this is modified when testIntent is true. + */ + virtual void* writingPtr(void *x, unsigned len) = 0; + + /** declare write intent; should already be in the write view to work correctly when testIntent is true. + if you aren't, use writingPtr() instead. + */ + virtual void declareWriteIntent(void *x, unsigned len) = 0; + + /** declare intent to write + @param ofs offset within buf at which we will write + @param len the length at ofs we will write + @return new buffer pointer. this is modified when testIntent is true. + */ + virtual void* writingAtOffset(void *buf, unsigned ofs, unsigned len) = 0; + + /** declare intent to write + @param ranges vector of pairs representing ranges. Each pair + comprises an offset from buf where a range begins, then the + range length. + @return new buffer pointer. this is modified when testIntent is true. + */ + virtual void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) = 0; + + /** Wait for acknowledgement of the next group commit. + @return true if --dur is on. There will be delay. + @return false if --dur is off. + */ + virtual bool awaitCommit() = 0; + + /** Commit immediately. + + Generally, you do not want to do this often, as highly granular committing may affect + performance. + + Does not return until the commit is complete. + + You must be at least read locked when you call this. Ideally, you are not write locked + and then read operations can occur concurrently. + + @return true if --dur is on. + @return false if --dur is off. (in which case there is action) + */ + virtual bool commitNow() = 0; + + /** Commit if enough bytes have been modified. Current threshold is 50MB + + The idea is that long running write operations that dont yield + (like creating an index or update with $atomic) can call this + whenever the db is in a sane state and it will prevent commits + from growing too large. + @return true if commited + */ + virtual bool commitIfNeeded() = 0; + + /** Declare write intent for a DiskLoc. @see DiskLoc::writing() */ + inline DiskLoc& writingDiskLoc(DiskLoc& d) { return *((DiskLoc*) writingPtr(&d, sizeof(d))); } + + /** Declare write intent for an int */ + inline int& writingInt(const int& d) { return *((int*) writingPtr((int*) &d, sizeof(d))); } + + /** "assume i've already indicated write intent, let me write" + redeclaration is fine too, but this is faster. + */ + template + inline + T* alreadyDeclared(T *x) { +#if defined(_TESTINTENT) + return (T*) MongoMMF::switchToPrivateView(x); +#else + return x; +#endif + } + + /** declare intent to write to x for sizeof(*x) */ + template + inline + T* writing(T *x) { + return (T*) writingPtr(x, sizeof(T)); + } + + /** write something that doesn't have to be journaled, as this write is "unimportant". + a good example is paddingFactor. 
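        // Illustrative sketch only, not from the patch: the paddingFactor case mentioned above.
        // An "unimportant" statistic is written with setNoJournal(), so it reaches the datafile
        // without being journaled -- losing it in a crash is acceptable, and journaling every
        // such update would be wasted I/O. (NamespaceDetails::paddingFactor is just the example.)
        static void paddingFactorSketch( NamespaceDetails *d , double newPadding ) {
            getDur().setNoJournal( &d->paddingFactor , &newPadding , sizeof(newPadding) );
        }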
+ can be thought of as memcpy(dst,src,len) + the dur implementation acquires a mutex in this method, so do not assume it is faster + without measuring! + */ + virtual void setNoJournal(void *dst, void *src, unsigned len) = 0; + + /** Commits pending changes, flushes all changes to main data + files, then removes the journal. + + This is useful as a "barrier" to ensure that writes before this + call will never go through recovery and be applied to files + that have had changes made after this call applied. + */ + virtual void syncDataAndTruncateJournal() = 0; + + static DurableInterface& getDur() { return *_impl; } + + private: + /** Intentionally unimplemented method. + It's very easy to manipulate Record::data open ended. Thus a call to writing(Record*) is suspect. + This will override the templated version and yield an unresolved external. + */ + Record* writing(Record* r); + /** Intentionally unimplemented method. BtreeBuckets are allocated in buffers larger than sizeof( BtreeBucket ). */ + BtreeBucket* writing( BtreeBucket* ); + /** Intentionally unimplemented method. NamespaceDetails may be based on references to 'Extra' objects. */ + NamespaceDetails* writing( NamespaceDetails* ); + + static DurableInterface* _impl; // NonDurableImpl at startup() + static void enableDurability(); // makes _impl a DurableImpl + static void disableDurability(); // makes _impl a NonDurableImpl + + // these need to be able to enable/disable Durability + friend void startup(); + friend class TempDisableDurability; + }; // class DurableInterface + + class NonDurableImpl : public DurableInterface { + void* writingPtr(void *x, unsigned len) { return x; } + void* writingAtOffset(void *buf, unsigned ofs, unsigned len) { return buf; } + void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges) { return buf; } + void declareWriteIntent(void *, unsigned) { } + void createdFile(string filename, unsigned long long len) { } + bool awaitCommit() { return false; } + bool commitNow() { return false; } + bool commitIfNeeded() { return false; } + void setNoJournal(void *dst, void *src, unsigned len); + void syncDataAndTruncateJournal() {} + }; + + class DurableImpl : public DurableInterface { + void* writingPtr(void *x, unsigned len); + void* writingAtOffset(void *buf, unsigned ofs, unsigned len); + void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges); + void declareWriteIntent(void *, unsigned); + void createdFile(string filename, unsigned long long len); + bool awaitCommit(); + bool commitNow(); + bool commitIfNeeded(); + void setNoJournal(void *dst, void *src, unsigned len); + void syncDataAndTruncateJournal(); + }; + + } // namespace dur + + inline dur::DurableInterface& getDur() { return dur::DurableInterface::getDur(); } + + /** declare that we are modifying a diskloc and this is a datafile write. */ + inline DiskLoc& DiskLoc::writing() const { return getDur().writingDiskLoc(*const_cast< DiskLoc * >( this )); } + +} diff --git a/db/dur_commitjob.cpp b/db/dur_commitjob.cpp new file mode 100644 index 0000000..aed38e8 --- /dev/null +++ b/db/dur_commitjob.cpp @@ -0,0 +1,210 @@ +/* @file dur_commitjob.cpp */ + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#include "pch.h" +#include "dur_commitjob.h" +#include "taskqueue.h" + +namespace mongo { + + namespace dur { + + BOOST_STATIC_ASSERT( UncommittedBytesLimit > BSONObjMaxInternalSize * 3 ); + BOOST_STATIC_ASSERT( sizeof(void*)==4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6 ); + + void Writes::D::go(const Writes::D& d) { + commitJob.wi()._insertWriteIntent(d.p, d.len); + } + + void WriteIntent::absorb(const WriteIntent& other) { + dassert(overlaps(other)); + + void* newStart = min(start(), other.start()); + p = max(p, other.p); + len = (char*)p - (char*)newStart; + + dassert(contains(other)); + } + + void Writes::clear() { + dbMutex.assertAtLeastReadLocked(); + + _alreadyNoted.clear(); + _writes.clear(); + _ops.clear(); + _drained = false; +#if defined(DEBUG_WRITE_INTENT) + cout << "_debug clear\n"; + _debug.clear(); +#endif + } + +#if defined(DEBUG_WRITE_INTENT) + void assertAlreadyDeclared(void *p, int len) { + if( commitJob.wi()._debug[p] >= len ) + return; + log() << "assertAlreadyDeclared fails " << (void*)p << " len:" << len << ' ' << commitJob.wi()._debug[p] << endl; + printStackTrace(); + abort(); + } +#endif + + void Writes::_insertWriteIntent(void* p, int len) { + WriteIntent wi(p, len); + + if (_writes.empty()) { + _writes.insert(wi); + return; + } + + typedef set::const_iterator iterator; // shorter + + iterator closest = _writes.lower_bound(wi); + // closest.end() >= wi.end() + + if ((closest != _writes.end() && closest->overlaps(wi)) || // high end + (closest != _writes.begin() && (--closest)->overlaps(wi))) { // low end + if (closest->contains(wi)) + return; // nothing to do + + // find overlapping range and merge into wi + iterator end(closest); + iterator begin(closest); + while ( end->overlaps(wi)) { wi.absorb(*end); ++end; if (end == _writes.end()) break; } // look forwards + while (begin->overlaps(wi)) { wi.absorb(*begin); if (begin == _writes.begin()) break; --begin; } // look backwards + if (!begin->overlaps(wi)) ++begin; // make inclusive + + DEV { // ensure we're not deleting anything we shouldn't + for (iterator it(begin); it != end; ++it) { + assert(wi.contains(*it)); + } + } + + _writes.erase(begin, end); + _writes.insert(wi); + + DEV { // ensure there are no overlaps + // this can be very slow - n^2 - so make it RARELY + RARELY { + for (iterator it(_writes.begin()), end(boost::prior(_writes.end())); it != end; ++it) { + assert(!it->overlaps(*boost::next(it))); + } + } + } + } + else { // no entries overlapping wi + _writes.insert(closest, wi); + } + } + + + /** note an operation other than a "basic write" */ + void CommitJob::noteOp(shared_ptr p) { + DEV dbMutex.assertWriteLocked(); + dassert( cmdLine.dur ); + if( !_hasWritten ) { + assert( !dbMutex._remapPrivateViewRequested ); + _hasWritten = true; + } + _wi._ops.push_back(p); + } + + size_t privateMapBytes = 0; // used by _REMAPPRIVATEVIEW to track how much / how fast to remap + + void CommitJob::reset() { + _hasWritten = false; + _wi.clear(); + _ab.reset(); + privateMapBytes += _bytes; + _bytes = 0; + _nSinceCommitIfNeededCall = 0; + } + + CommitJob::CommitJob() : _ab(4 * 1024 * 1024) , _hasWritten(false), + _bytes(0), 
_nSinceCommitIfNeededCall(0) { } + + void CommitJob::note(void* p, int len) { + // from the point of view of the dur module, it would be fine (i think) to only + // be read locked here. but must be at least read locked to avoid race with + // remapprivateview + DEV dbMutex.assertWriteLocked(); + dassert( cmdLine.dur ); + if( !_wi._alreadyNoted.checkAndSet(p, len) ) { + MemoryMappedFile::makeWritable(p, len); + + if( !_hasWritten ) { + // you can't be writing if one of these is pending, so this is a verification. + assert( !dbMutex._remapPrivateViewRequested ); + + // we don't bother doing a group commit when nothing is written, so we have a var to track that + _hasWritten = true; + } + + /** tips for debugging: + if you have an incorrect diff between data files in different folders + (see jstests/dur/quick.js for example), + turn this on and see what is logged. if you have a copy of its output from before the + regression, a simple diff of these lines would tell you a lot likely. + */ +#if 0 && defined(_DEBUG) + { + static int n; + if( ++n < 10000 ) { + size_t ofs; + MongoMMF *mmf = privateViews._find(w.p, ofs); + if( mmf ) { + log() << "DEBUG note write intent " << w.p << ' ' << mmf->filename() << " ofs:" << hex << ofs << " len:" << w.len << endl; + } + else { + log() << "DEBUG note write intent " << w.p << ' ' << w.len << " NOT FOUND IN privateViews" << endl; + } + } + else if( n == 10000 ) { + log() << "DEBUG stopping write intent logging, too much to log" << endl; + } + } +#endif + + // remember intent. we will journal it in a bit + _wi.insertWriteIntent(p, len); + wassert( _wi._writes.size() < 2000000 ); + assert( _wi._writes.size() < 20000000 ); + + { + // a bit over conservative in counting pagebytes used + static size_t lastPos; // note this doesn't reset with each commit, but that is ok we aren't being that precise + size_t x = ((size_t) p) & ~0xfff; // round off to page address (4KB) + if( x != lastPos ) { + lastPos = x; + unsigned b = (len+4095) & ~0xfff; + _bytes += b; +#if defined(_DEBUG) + _nSinceCommitIfNeededCall++; + if( _nSinceCommitIfNeededCall >= 80 ) { + if( _nSinceCommitIfNeededCall % 40 == 0 ) + log() << "debug nsincecommitifneeded:" << _nSinceCommitIfNeededCall << " bytes:" << _bytes << endl; + } +#endif + uassert(13623, "DR102 too much data written uncommitted", _bytes < UncommittedBytesLimit * 3); + } + } + } + } + + } +} diff --git a/db/dur_commitjob.h b/db/dur_commitjob.h new file mode 100644 index 0000000..104d054 --- /dev/null +++ b/db/dur_commitjob.h @@ -0,0 +1,221 @@ +/* @file dur_commitjob.h used by dur.cpp +*/ + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . 
+*/ + +#pragma once + +#include "../util/alignedbuilder.h" +#include "../util/mongoutils/hash.h" +#include "../util/concurrency/synchronization.h" +#include "cmdline.h" +#include "durop.h" +#include "dur.h" +#include "taskqueue.h" + +//#define DEBUG_WRITE_INTENT 1 + +namespace mongo { + namespace dur { + + /** declaration of an intent to write to a region of a memory mapped view + * + * We store the end rather than the start pointer to make operator< faster + * since that is heavily used in set lookup. + */ + struct WriteIntent { /* copyable */ + WriteIntent() : w_ptr(0), p(0) { } + WriteIntent(void *a, unsigned b) : w_ptr(0), p((char*)a+b), len(b) { } + + void* start() const { return (char*)p - len; } + void* end() const { return p; } + unsigned length() const { return len; } + + bool operator < (const WriteIntent& rhs) const { return end() < rhs.end(); } + + // can they be merged? + bool overlaps(const WriteIntent& rhs) const { + return (start() <= rhs.end() && end() >= rhs.start()); + } + + // is merging necessary? + bool contains(const WriteIntent& rhs) const { + return (start() <= rhs.start() && end() >= rhs.end()); + } + + // merge into me + void absorb(const WriteIntent& other); + + friend ostream& operator << (ostream& out, const WriteIntent& wi) { + return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len); + } + + mutable void *w_ptr; // writable mapping of p. + // mutable because set::iterator is const but this isn't used in op< +#if defined(_EXPERIMENTAL) + mutable unsigned ofsInJournalBuffer; +#endif + private: + void *p; // intent to write up to p + unsigned len; // up to this len + }; + + /** try to remember things we have already marked for journaling. false negatives are ok if infrequent - + we will just log them twice. + */ + template + class Already : boost::noncopyable { + public: + Already() { clear(); } + void clear() { memset(this, 0, sizeof(*this)); } + + /* see if we have Already recorded/indicated our write intent for this region of memory. + automatically upgrades the length if the length was shorter previously. + @return true if already indicated. + */ + bool checkAndSet(void* p, int len) { + unsigned x = mongoutils::hashPointer(p); + pair nd = nodes[x % N]; + if( nd.first == p ) { + if( nd.second < len ) { + nd.second = len; + return false; // haven't indicated this len yet + } + return true; // already indicated + } + nd.first = p; + nd.second = len; + return false; // a new set + } + + private: + enum { N = Prime }; // this should be small the idea is that it fits in the cpu cache easily + pair nodes[N]; + }; + + /** our record of pending/uncommitted write intents */ + class Writes : boost::noncopyable { + struct D { + void *p; + unsigned len; + static void go(const D& d); + }; + public: + TaskQueue _deferred; + Already<127> _alreadyNoted; + set _writes; + vector< shared_ptr > _ops; // all the ops other than basic writes + bool _drained; // _deferred is drained? 
for asserting/testing + + /** reset the Writes structure (empties all the above) */ + void clear(); + + /** merges into set (ie non-deferred version) */ + void _insertWriteIntent(void* p, int len); + + void insertWriteIntent(void* p, int len) { +#if defined(DEBUG_WRITE_INTENT) + if( _debug[p] < len ) + _debug[p] = len; +#endif + D d; + d.p = p; + d.len = len; + _deferred.defer(d); + } + +#ifdef _DEBUG + WriteIntent _last; +#endif +#if defined(DEBUG_WRITE_INTENT) + map _debug; +#endif + }; + +#if defined(DEBUG_WRITE_INTENT) + void assertAlreadyDeclared(void *, int len); +#else + inline void assertAlreadyDeclared(void *, int len) { } +#endif + + /** A commit job object for a group commit. Currently there is one instance of this object. + + concurrency: assumption is caller is appropriately locking. + for example note() invocations are from the write lock. + other uses are in a read lock from a single thread (durThread) + */ + class CommitJob : boost::noncopyable { + public: + AlignedBuilder _ab; // for direct i/o writes to journal + + CommitJob(); + + /** record/note an intent to write */ + void note(void* p, int len); + + /** note an operation other than a "basic write" */ + void noteOp(shared_ptr p); + + set& writes() { + if( !_wi._drained ) { + // generally, you don't want to use the set until it is prepared (after deferred ops are applied) + // thus this assert here. + assert(false); + } + return _wi._writes; + } + + vector< shared_ptr >& ops() { return _wi._ops; } + + /** this method is safe to call outside of locks. when haswritten is false we don't do any group commit and avoid even + trying to acquire a lock, which might be helpful at times. + */ + bool hasWritten() const { return _hasWritten; } + + /** we use the commitjob object over and over, calling reset() rather than reconstructing */ + void reset(); + + /** the commit code calls this when data reaches the journal (on disk) */ + void notifyCommitted() { _notify.notifyAll(); } + + /** Wait until the next group commit occurs. That is, wait until someone calls notifyCommitted. */ + void awaitNextCommit() { + if( hasWritten() ) + _notify.wait(); + } + + /** we check how much written and if it is getting to be a lot, we commit sooner. */ + size_t bytes() const { return _bytes; } + +#if defined(_DEBUG) + const WriteIntent& lastWrite() const { return _wi._last; } +#endif + + Writes& wi() { return _wi; } + private: + bool _hasWritten; + Writes _wi; // todo: fix name + size_t _bytes; + NotifyAll _notify; // for getlasterror fsync:true acknowledgements + public: + unsigned _nSinceCommitIfNeededCall; + }; + + extern CommitJob commitJob; + + } +} diff --git a/db/dur_journal.cpp b/db/dur_journal.cpp new file mode 100644 index 0000000..946f94c --- /dev/null +++ b/db/dur_journal.cpp @@ -0,0 +1,576 @@ +// @file dur_journal.cpp writing to the writeahead logging journal + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . 
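awaitNextCommit()/notifyCommitted() above let a client thread that needs durability block until the group-commit thread reports that the buffer reached the journal. A standalone sketch of that handshake using the standard library instead of MongoDB's NotifyAll primitive (a simplified analog, not the actual implementation):

#include <condition_variable>
#include <mutex>

// Each group commit bumps a generation counter; waiters block until the counter
// advances past the value they observed when they started waiting.
class CommitNotifier {
    std::mutex _m;
    std::condition_variable _cv;
    unsigned long long _commits = 0;
public:
    // called by the durability thread once the section is on disk
    void notifyCommitted() {
        { std::lock_guard<std::mutex> lk(_m); ++_commits; }
        _cv.notify_all();
    }
    // called by a client thread that wants its writes journaled before returning
    void awaitNextCommit() {
        std::unique_lock<std::mutex> lk(_m);
        unsigned long long seen = _commits;
        _cv.wait(lk, [&] { return _commits != seen; });
    }
};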
+*/ + +#include "pch.h" +#include "client.h" +#include "namespace.h" +#include "dur_journal.h" +#include "dur_journalformat.h" +#include "dur_stats.h" +#include "../util/logfile.h" +#include "../util/timer.h" +#include "../util/alignedbuilder.h" +#include "../util/message.h" // getelapsedtimemillis +#include "../util/concurrency/race.h" +#include +#undef assert +#define assert MONGO_assert +#include "../util/mongoutils/str.h" +#include "dur_journalimpl.h" +#include "../util/file.h" + +using namespace mongoutils; + +namespace mongo { + + class AlignedBuilder; + + namespace dur { + BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 ); + BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 ); + BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 ); + BOOST_STATIC_ASSERT( sizeof(JEntry) == 12 ); + BOOST_STATIC_ASSERT( sizeof(LSNFile) == 88 ); + + bool usingPreallocate = false; + + void removeOldJournalFile(path p); + + filesystem::path getJournalDir() { + filesystem::path p(dbpath); + p /= "journal"; + return p; + } + + path lsnPath() { + return getJournalDir()/"lsn"; + } + + extern CodeBlock durThreadMain; + + /** this should be called when something really bad happens so that we can flag appropriately + */ + void journalingFailure(const char *msg) { + /** todo: + (1) don't log too much + (2) make an indicator in the journal dir that something bad happened. + (2b) refuse to do a recovery startup if that is there without manual override. + */ + log() << "journaling error " << msg << endl; + assert(false); + } + + JHeader::JHeader(string fname) { + magic[0] = 'j'; magic[1] = '\n'; + _version = CurrentVersion; + memset(ts, 0, sizeof(ts)); + time_t t = time(0); + strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts)-1); + memset(dbpath, 0, sizeof(dbpath)); + strncpy(dbpath, fname.c_str(), sizeof(dbpath)-1); + { + fileId = t&0xffffffff; + fileId |= ((unsigned long long)getRandomNumber()) << 32; + } + memset(reserved3, 0, sizeof(reserved3)); + txt2[0] = txt2[1] = '\n'; + n1 = n2 = n3 = n4 = '\n'; + } + + // class Journal + + Journal j; + + const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0); + + Journal::Journal() : + _curLogFileMutex("JournalLfMutex") { + _written = 0; + _nextFileNumber = 0; + _curLogFile = 0; + _curFileId = 0; + _preFlushTime = 0; + _lastFlushTime = 0; + _writeToLSNNeeded = false; + } + + path Journal::getFilePathFor(int filenumber) const { + filesystem::path p(dir); + p /= string(str::stream() << "j._" << filenumber); + return p; + } + + /** never throws + @return true if journal dir is not empty + */ + bool haveJournalFiles() { + try { + for ( boost::filesystem::directory_iterator i( getJournalDir() ); + i != boost::filesystem::directory_iterator(); + ++i ) { + string fileName = boost::filesystem::path(*i).leaf(); + if( str::startsWith(fileName, "j._") ) + return true; + } + } + catch(...) { } + return false; + } + + /** throws */ + void removeJournalFiles() { + log() << "removeJournalFiles" << endl; + try { + for ( boost::filesystem::directory_iterator i( getJournalDir() ); + i != boost::filesystem::directory_iterator(); + ++i ) { + string fileName = boost::filesystem::path(*i).leaf(); + if( str::startsWith(fileName, "j._") ) { + try { + removeOldJournalFile(*i); + } + catch(std::exception& e) { + log() << "couldn't remove " << fileName << ' ' << e.what() << endl; + throw; + } + } + } + try { + boost::filesystem::remove(lsnPath()); + } + catch(...) 
{ + log() << "couldn't remove " << lsnPath().string() << endl; + throw; + } + } + catch( std::exception& e ) { + log() << "error removing journal files " << e.what() << endl; + throw; + } + assert(!haveJournalFiles()); + log(1) << "removeJournalFiles end" << endl; + } + + /** at clean shutdown */ + bool okToCleanUp = false; // successful recovery would set this to true + void Journal::cleanup() { + if( !okToCleanUp ) + return; + + try { + scoped_lock lk(_curLogFileMutex); + closeCurrentJournalFile(); + removeJournalFiles(); + } + catch(std::exception& e) { + log() << "error couldn't remove journal file during shutdown " << e.what() << endl; + throw; + } + } + void journalCleanup() { j.cleanup(); } + + bool _preallocateIsFaster() { + bool faster = false; + filesystem::path p = getJournalDir() / "tempLatencyTest"; + try { remove(p); } catch(...) { } + try { + AlignedBuilder b(8192); + int millis[2]; + const int N = 50; + for( int pass = 0; pass < 2; pass++ ) { + LogFile f(p.string()); + Timer t; + for( int i = 0 ; i < N; i++ ) { + f.synchronousAppend(b.buf(), 8192); + } + millis[pass] = t.millis(); + // second time through, file exists and is prealloc case + } + int diff = millis[0] - millis[1]; + if( diff > 2 * N ) { + // at least 2ms faster for prealloc case? + faster = true; + log() << "preallocateIsFaster=true " << diff / (1.0*N) << endl; + } + } + catch(...) { + log() << "info preallocateIsFaster couldn't run; returning false" << endl; + } + try { remove(p); } catch(...) { } + return faster; + } + bool preallocateIsFaster() { + return _preallocateIsFaster() && _preallocateIsFaster() && _preallocateIsFaster(); + } + + // throws + void preallocateFile(filesystem::path p, unsigned long long len) { + if( exists(p) ) + return; + + const unsigned BLKSZ = 1024 * 1024; + log() << "preallocating a journal file " << p.string() << endl; + LogFile f(p.string()); + AlignedBuilder b(BLKSZ); + for( unsigned long long x = 0; x < len; x += BLKSZ ) { + f.synchronousAppend(b.buf(), BLKSZ); + } + } + + // throws + void _preallocateFiles() { + for( int i = 0; i <= 2; i++ ) { + string fn = str::stream() << "prealloc." << i; + filesystem::path filepath = getJournalDir() / fn; + + unsigned long long limit = Journal::DataLimit; + if( debug && i == 1 ) { + // moving 32->64, the prealloc files would be short. that is "ok", but we want to exercise that + // case, so we force exercising here when _DEBUG is set by arbitrarily stopping prealloc at a low + // limit for a file. also we want to be able to change in the future the constant without a lot of + // work anyway. + limit = 16 * 1024 * 1024; + } + preallocateFile(filepath, limit); + } + } + + void preallocateFiles() { + if( preallocateIsFaster() || + exists(getJournalDir()/"prealloc.0") || // if enabled previously, keep using + exists(getJournalDir()/"prealloc.1") ) { + usingPreallocate = true; + try { + _preallocateFiles(); + } + catch(...) { + log() << "warning caught exception in preallocateFiles, continuing" << endl; + } + } + j.open(); + } + + void removeOldJournalFile(path p) { + if( usingPreallocate ) { + try { + for( int i = 0; i <= 2; i++ ) { + string fn = str::stream() << "prealloc." << i; + filesystem::path filepath = getJournalDir() / fn; + if( !filesystem::exists(filepath) ) { + // we can recycle this file into this prealloc file location + boost::filesystem::rename(p, filepath); + return; + } + } + } catch(...) 
{ + log() << "warning exception in dur::removeOldJournalFile " << p.string() << endl; + // fall through and try to delete the file + } + } + + // already have 3 prealloc files, so delete this file + try { + boost::filesystem::remove(p); + } + catch(...) { + log() << "warning exception removing " << p.string() << endl; + } + } + + // find a prealloc. file, presumably to take and use + path findPrealloced() { + try { + for( int i = 0; i <= 2; i++ ) { + string fn = str::stream() << "prealloc." << i; + filesystem::path filepath = getJournalDir() / fn; + if( filesystem::exists(filepath) ) + return filepath; + } + } catch(...) { + log() << "warning exception in dur::findPrealloced()" << endl; + } + return path(); + } + + /** assure journal/ dir exists. throws. call during startup. */ + void journalMakeDir() { + j.init(); + + filesystem::path p = getJournalDir(); + j.dir = p.string(); + log() << "journal dir=" << j.dir << endl; + if( !exists(j.dir) ) { + try { + create_directory(j.dir); + } + catch(std::exception& e) { + log() << "error creating directory " << j.dir << ' ' << e.what() << endl; + throw; + } + } + } + + void Journal::_open() { + _curFileId = 0; + assert( _curLogFile == 0 ); + path fname = getFilePathFor(_nextFileNumber); + + // if we have a prealloced file, use it + { + path p = findPrealloced(); + if( !p.empty() ) { + try { + { + // JHeader::fileId must be updated before renaming to be race-safe + LogFile f(p.string()); + JHeader h(p.string()); + AlignedBuilder b(8192); + b.appendStruct(h); + f.synchronousAppend(b.buf(), b.len()); + } + boost::filesystem::rename(p, fname); + } + catch(...) { + log() << "warning couldn't write to / rename file " << p.string() << endl; + } + } + } + + _curLogFile = new LogFile(fname.string()); + _nextFileNumber++; + { + JHeader h(fname.string()); + _curFileId = h.fileId; + assert(_curFileId); + AlignedBuilder b(8192); + b.appendStruct(h); + _curLogFile->synchronousAppend(b.buf(), b.len()); + } + } + + void Journal::init() { + assert( _curLogFile == 0 ); + MongoFile::notifyPreFlush = preFlush; + MongoFile::notifyPostFlush = postFlush; + } + + void Journal::open() { + assert( MongoFile::notifyPreFlush == preFlush ); + mutex::scoped_lock lk(_curLogFileMutex); + _open(); + } + + void LSNFile::set(unsigned long long x) { + memset(this, 0, sizeof(*this)); + lsn = x; + checkbytes = ~x; + } + + /** logs details of the situation, and returns 0, if anything surprising in the LSNFile + if something highly surprising, throws to abort + */ + unsigned long long LSNFile::get() { + uassert(13614, "unexpected version number of lsn file in journal/ directory", ver == 0); + if( ~lsn != checkbytes ) { + log() << "lsnfile not valid. recovery will be from log start. lsn: " << hex << lsn << " checkbytes: " << hex << checkbytes << endl; + return 0; + } + return lsn; + } + + /** called during recovery (the error message text below assumes that) + */ + unsigned long long journalReadLSN() { + if( !debug ) { + // in nondebug build, for now, be conservative until more tests written, and apply the whole journal. + // however we will still write the lsn file to exercise that code, and use in _DEBUG build. + return 0; + } + + if( !MemoryMappedFile::exists(lsnPath()) ) { + log() << "info no lsn file in journal/ directory" << endl; + return 0; + } + + try { + // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery. + // however, given we actually close the file when writing, that seems unlikely. 
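LSNFile::set()/get() above guard the stored sequence number with its bitwise complement, so a torn or zeroed lsn file is detected and simply ignored rather than trusted (recovery then starts from the beginning of the journal). A self-contained sketch of that complement check, with illustrative names:

#include <cstdint>
#include <iostream>

struct LsnRecord {
    std::uint64_t lsn;
    std::uint64_t checkbytes;   // ~lsn; a mismatch means the record is not trustworthy

    void set(std::uint64_t x) {
        lsn = x;
        checkbytes = ~x;
    }
    // returns 0 when the record looks corrupt, mirroring LSNFile::get()
    std::uint64_t get() const {
        if (~lsn != checkbytes) {
            std::cerr << "lsn record not valid; recovery will start from the log beginning\n";
            return 0;
        }
        return lsn;
    }
};

int main() {
    LsnRecord r;
    r.set(123456);
    r.checkbytes = 0;                    // simulate a partially written record
    return r.get() == 0 ? 0 : 1;         // the corrupt record is ignored, not trusted
}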
+ LSNFile L; + File f; + f.open(lsnPath().string().c_str()); + assert(f.is_open()); + f.read(0,(char*)&L, sizeof(L)); + unsigned long long lsn = L.get(); + return lsn; + } + catch(std::exception& e) { + uasserted(13611, str::stream() << "can't read lsn file in journal directory : " << e.what()); + } + return 0; + } + + unsigned long long getLastDataFileFlushTime() { + return j.lastFlushTime(); + } + + /** remember "last sequence number" to speed recoveries + concurrency: called by durThread only. + */ + void Journal::updateLSNFile() { + if( !_writeToLSNNeeded ) + return; + durThreadMain.assertWithin(); + _writeToLSNNeeded = false; + try { + // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery. + // however, given we actually close the file, that seems unlikely. + File f; + f.open(lsnPath().string().c_str()); + if( !f.is_open() ) { + // can get 0 if an i/o error + log() << "warning: open of lsn file failed" << endl; + return; + } + log() << "lsn set " << _lastFlushTime << endl; + LSNFile lsnf; + lsnf.set(_lastFlushTime); + f.write(0, (char*)&lsnf, sizeof(lsnf)); + } + catch(std::exception& e) { + log() << "warning: write to lsn file failed " << e.what() << endl; + // keep running (ignore the error). recovery will be slow. + } + } + + void Journal::preFlush() { + j._preFlushTime = Listener::getElapsedTimeMillis(); + } + + void Journal::postFlush() { + j._lastFlushTime = j._preFlushTime; + j._writeToLSNNeeded = true; + } + + // call from within _curLogFileMutex + void Journal::closeCurrentJournalFile() { + if (!_curLogFile) + return; + + JFile jf; + jf.filename = _curLogFile->_name; + jf.lastEventTimeMs = Listener::getElapsedTimeMillis(); + _oldJournalFiles.push_back(jf); + + delete _curLogFile; // close + _curLogFile = 0; + _written = 0; + } + + /** remove older journal files. + be in _curLogFileMutex but not dbMutex when calling + */ + void Journal::removeUnneededJournalFiles() { + while( !_oldJournalFiles.empty() ) { + JFile f = _oldJournalFiles.front(); + + if( f.lastEventTimeMs < _lastFlushTime + ExtraKeepTimeMs ) { + // eligible for deletion + path p( f.filename ); + log() << "old journal file will be removed: " << f.filename << endl; + removeOldJournalFile(p); + } + else { + break; + } + + _oldJournalFiles.pop_front(); + } + } + + /** check if time to rotate files. assure a file is open. + done separately from the journal() call as we can do this part + outside of lock. 
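removeUnneededJournalFiles() above walks the rotated-out files oldest first and stops at the first file that must still be kept; a file is deleted only once the flush threshold has moved past its newest entry. A compact standalone sketch of that pruning loop, with the file removal passed in as a callback (a stand-in for removeOldJournalFile):

#include <deque>
#include <string>

struct RotatedFile {
    std::string filename;
    unsigned long long lastEventTimeMs;   // timestamp of the newest entry in the file
};

// Files are ordered oldest to newest, so we can stop at the first one still needed.
void pruneOldJournalFiles(std::deque<RotatedFile>& oldFiles,
                          unsigned long long lastFlushTimeMs,
                          unsigned long long extraKeepTimeMs,
                          void (*removeFile)(const std::string&)) {
    while (!oldFiles.empty()) {
        const RotatedFile& f = oldFiles.front();
        if (f.lastEventTimeMs < lastFlushTimeMs + extraKeepTimeMs) {
            removeFile(f.filename);   // treated as no longer needed, mirroring the condition above
        } else {
            break;                    // this and all newer files are kept for crash recovery
        }
        oldFiles.pop_front();
    }
}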
+ thread: durThread() + */ + void journalRotate() { + j.rotate(); + } + void Journal::rotate() { + assert( !dbMutex.atLeastReadLocked() ); + durThreadMain.assertWithin(); + + scoped_lock lk(_curLogFileMutex); + + if ( inShutdown() || !_curLogFile ) + return; + + j.updateLSNFile(); + + if( _curLogFile && _written < DataLimit ) + return; + + if( _curLogFile ) { + + closeCurrentJournalFile(); + + removeUnneededJournalFiles(); + } + + try { + Timer t; + _open(); + int ms = t.millis(); + if( ms >= 200 ) { + log() << "DR101 latency warning on journal file open " << ms << "ms" << endl; + } + } + catch(std::exception& e) { + log() << "warning exception opening journal file " << e.what() << endl; + throw; + } + } + + /** write to journal + */ + void journal(const AlignedBuilder& b) { + j.journal(b); + } + void Journal::journal(const AlignedBuilder& b) { + try { + mutex::scoped_lock lk(_curLogFileMutex); + + // must already be open -- so that _curFileId is correct for previous buffer building + assert( _curLogFile ); + + stats.curr->_journaledBytes += b.len(); + _written += b.len(); + _curLogFile->synchronousAppend((void *) b.buf(), b.len()); + } + catch(std::exception& e) { + log() << "warning exception in dur::journal " << e.what() << endl; + throw; + } + } + + } +} + +/* todo + test (and handle) disk full on journal append. best quick thing to do is to terminate. + if we roll back operations, there are nuances such as is ReplSetImpl::lastOpTimeWritten too new in ram then? +*/ diff --git a/db/dur_journal.h b/db/dur_journal.h new file mode 100644 index 0000000..81957b5 --- /dev/null +++ b/db/dur_journal.h @@ -0,0 +1,68 @@ +// @file dur_journal.h + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +namespace mongo { + class AlignedBuilder; + + namespace dur { + + /** true if ok to cleanup journal files at termination. otherwise, files journal will be retained. + */ + extern bool okToCleanUp; + + /** at termination after db files closed & fsynced */ + void journalCleanup(); + + /** assure journal/ dir exists. throws */ + void journalMakeDir(); + + /** check if time to rotate files; assure a file is open. + done separately from the journal() call as we can do this part + outside of lock. + only called by durThread. + */ + void journalRotate(); + + /** write/append to journal file * + @param buf - a buffer that will be written to the journal. + will not return until on disk + */ + void journal(const AlignedBuilder& buf); + + /** flag that something has gone wrong during writing to the journal + (not for recovery mode) + */ + void journalingFailure(const char *msg); + + /** read lsn from disk from the last run before doing recovery */ + unsigned long long journalReadLSN(); + + unsigned long long getLastDataFileFlushTime(); + + /** never throws. + @return true if there are any journal files in the journal dir. 
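haveJournalFiles() simply looks for any "j._" file in the journal directory. The same test written against C++17 std::filesystem instead of the boost::filesystem calls used in the patch (an illustrative equivalent, not the code above):

#include <filesystem>
#include <string>
#include <system_error>

bool haveJournalFilesIn(const std::filesystem::path& journalDir) {
    std::error_code ec;   // never throw, as the original promises
    for (const auto& entry : std::filesystem::directory_iterator(journalDir, ec)) {
        const std::string name = entry.path().filename().string();
        if (name.rfind("j._", 0) == 0)   // filename starts with "j._"
            return true;
    }
    return false;
}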
+ */ + bool haveJournalFiles(); + + // in case disk controller buffers writes + const long long ExtraKeepTimeMs = 10000; + + } +} diff --git a/db/dur_journalformat.h b/db/dur_journalformat.h new file mode 100644 index 0000000..d29f94d --- /dev/null +++ b/db/dur_journalformat.h @@ -0,0 +1,166 @@ +// @file dur_journalformat.h The format of our journal files. + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "../util/md5.hpp" + +namespace mongo { + + namespace dur { + +#pragma pack(1) + /** beginning header for a journal/j._ file + there is nothing important int this header at this time. except perhaps version #. + */ + struct JHeader { + JHeader() { } + JHeader(string fname); + + char magic[2]; // "j\n". j means journal, then a linefeed, fwiw if you were to run "less" on the file or something... + + // x4142 is asci--readable if you look at the file with head/less -- thus the starting values were near + // that. simply incrementing the version # is safe on a fwd basis. + enum { CurrentVersion = 0x4147 }; + unsigned short _version; + + // these are just for diagnostic ease (make header more useful as plain text) + char n1; // '\n' + char ts[20]; // ascii timestamp of file generation. for user reading, not used by code. + char n2; // '\n' + char dbpath[128]; // path/filename of this file for human reading and diagnostics. not used by code. + char n3, n4; // '\n', '\n' + + unsigned long long fileId; // unique identifier that will be in each JSectHeader. important as we recycle prealloced files + + char reserved3[8026]; // 8KB total for the file header + char txt2[2]; // "\n\n" at the end + + bool versionOk() const { return _version == CurrentVersion; } + bool valid() const { return magic[0] == 'j' && txt2[1] == '\n' && fileId; } + }; + + /** "Section" header. A section corresponds to a group commit. + len is length of the entire section including header and footer. + */ + struct JSectHeader { + unsigned len; // length in bytes of the whole section + unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work + unsigned long long fileId; // matches JHeader::fileId + }; + + /** an individual write operation within a group commit section. Either the entire section should + be applied, or nothing. (We check the md5 for the whole section before doing anything on recovery.) + */ + struct JEntry { + enum OpCodes { + OpCode_Footer = 0xffffffff, + OpCode_DbContext = 0xfffffffe, + OpCode_FileCreated = 0xfffffffd, + OpCode_DropDb = 0xfffffffc, + OpCode_Min = 0xfffff000 + }; + union { + unsigned len; // length in bytes of the data of the JEntry. 
does not include the JEntry header + OpCodes opcode; + }; + + unsigned ofs; // offset in file + + // sentinel and masks for _fileNo + enum { + DotNsSuffix = 0x7fffffff, // ".ns" file + LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext + }; + int _fileNo; // high bit is set to indicate it should be the /local database + // char data[len] follows + + const char * srcData() const { + const int *i = &_fileNo; + return (const char *) (i+1); + } + + int getFileNo() const { return _fileNo & (~LocalDbBit); } + void setFileNo(int f) { _fileNo = f; } + bool isNsSuffix() const { return getFileNo() == DotNsSuffix; } + + void setLocalDbContextBit() { _fileNo |= LocalDbBit; } + bool isLocalDbContext() const { return _fileNo & LocalDbBit; } + void clearLocalDbContextBit() { _fileNo = getFileNo(); } + + static string suffix(int fileno) { + if( fileno == DotNsSuffix ) return "ns"; + stringstream ss; + ss << fileno; + return ss.str(); + } + }; + + /** group commit section footer. md5 is a key field. */ + struct JSectFooter { + JSectFooter(const void* begin, int len) { // needs buffer to compute hash + sentinel = JEntry::OpCode_Footer; + reserved = 0; + magic[0] = magic[1] = magic[2] = magic[3] = '\n'; + + // skip section header since size modified after hashing + (const char*&)begin += sizeof(JSectHeader); + len -= sizeof(JSectHeader); + + md5(begin, len, hash); + } + unsigned sentinel; + md5digest hash; // unsigned char[16] + unsigned long long reserved; + char magic[4]; // "\n\n\n\n" + + bool checkHash(const void* begin, int len) const { + // skip section header since size modified after hashing + (const char*&)begin += sizeof(JSectHeader); + len -= sizeof(JSectHeader); + md5digest current; + md5(begin, len, current); + DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(current, 16) << endl; + return (memcmp(hash, current, sizeof(hash)) == 0); + } + }; + + /** declares "the next entry(s) are for this database / file path prefix" */ + struct JDbContext { + JDbContext() : sentinel(JEntry::OpCode_DbContext) { } + const unsigned sentinel; // compare to JEntry::len -- zero is our sentinel + //char dbname[]; + }; + + /** "last sequence number" */ + struct LSNFile { + unsigned ver; + unsigned reserved2; + unsigned long long lsn; + unsigned long long checkbytes; + unsigned long long reserved[8]; + + void set(unsigned long long lsn); + unsigned long long get(); + }; + +#pragma pack() + + } + +} diff --git a/db/dur_journalimpl.h b/db/dur_journalimpl.h new file mode 100644 index 0000000..9566dff --- /dev/null +++ b/db/dur_journalimpl.h @@ -0,0 +1,101 @@ +// @file dur_journal.h + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . 
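JEntry packs the target file number and a "local database" flag into a single field: the high bit selects the local db context and the special value 0x7fffffff addresses the .ns file. A standalone sketch of those accessors following the masks shown above (the patch stores the field as an int; unsigned is used here to keep the bit arithmetic well defined):

#include <cassert>
#include <sstream>
#include <string>

struct EntryFileNo {
    enum : unsigned {
        DotNsSuffix = 0x7fffffffu,   // addresses the ".ns" file instead of a numbered datafile
        LocalDbBit  = 0x80000000u    // high bit: entry belongs to the "local" database
    };
    unsigned fileNo = 0;

    unsigned getFileNo() const        { return fileNo & ~LocalDbBit; }
    void     setFileNo(unsigned f)    { fileNo = f; }
    bool     isNsSuffix() const       { return getFileNo() == DotNsSuffix; }
    void     setLocalDbContextBit()   { fileNo |= LocalDbBit; }
    bool     isLocalDbContext() const { return (fileNo & LocalDbBit) != 0; }

    // suffix used when rebuilding the datafile name during recovery: "ns" or "<n>"
    std::string suffix() const {
        if (isNsSuffix()) return "ns";
        std::ostringstream ss;
        ss << getFileNo();
        return ss.str();
    }
};

int main() {
    EntryFileNo e;
    e.setFileNo(3);
    e.setLocalDbContextBit();
    assert(e.isLocalDbContext() && e.getFileNo() == 3 && e.suffix() == "3");
    return 0;
}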
+*/ + +#pragma once + +#include "../util/logfile.h" + +namespace mongo { + namespace dur { + + /** the writeahead journal for durability */ + class Journal { + public: + string dir; // set by journalMakeDir() during initialization + + Journal(); + + /** call during startup by journalMakeDir() */ + void init(); + + /** check if time to rotate files. assure a file is open. + done separately from the journal() call as we can do this part + outside of lock. + thread: durThread() + */ + void rotate(); + + /** write to journal + */ + void journal(const AlignedBuilder& b); + + boost::filesystem::path getFilePathFor(int filenumber) const; + + unsigned long long lastFlushTime() const { return _lastFlushTime; } + void cleanup(); + + // Rotate after reaching this data size in a journal (j._) file + // We use a smaller size for 32 bit as the journal is mmapped during recovery (only) + // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must + // work. (and should as-is) + static const unsigned long long DataLimit = (sizeof(void*)==4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024; + + unsigned long long curFileId() const { return _curFileId; } + + void assureLogFileOpen() { + mutex::scoped_lock lk(_curLogFileMutex); + if( _curLogFile == 0 ) + _open(); + } + + /** open a journal file to journal operations to. */ + void open(); + + private: + void _open(); + void closeCurrentJournalFile(); + void removeUnneededJournalFiles(); + + unsigned long long _written; // bytes written so far to the current journal (log) file + unsigned _nextFileNumber; + + mutex _curLogFileMutex; + + LogFile *_curLogFile; // use _curLogFileMutex + unsigned long long _curFileId; // current file id see JHeader::fileId + + struct JFile { + string filename; + unsigned long long lastEventTimeMs; + }; + + // files which have been closed but not unlinked (rotated out) yet + // ordered oldest to newest + list _oldJournalFiles; // use _curLogFileMutex + + // lsn related + static void preFlush(); + static void postFlush(); + unsigned long long _preFlushTime; + unsigned long long _lastFlushTime; // data < this time is fsynced in the datafiles (unless hard drive controller is caching) + bool _writeToLSNNeeded; + void updateLSNFile(); + }; + + } +} diff --git a/db/dur_preplogbuffer.cpp b/db/dur_preplogbuffer.cpp new file mode 100644 index 0000000..1648e89 --- /dev/null +++ b/db/dur_preplogbuffer.cpp @@ -0,0 +1,192 @@ +// @file dur_preplogbuffer.cpp + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +/* + PREPLOGBUFFER + we will build an output buffer ourself and then use O_DIRECT + we could be in read lock for this + for very large objects write directly to redo log in situ? 
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc +*/ + +#include "pch.h" +#include "cmdline.h" +#include "dur.h" +#include "dur_journal.h" +#include "dur_journalimpl.h" +#include "dur_commitjob.h" +#include "../util/mongoutils/hash.h" +#include "../util/mongoutils/str.h" +#include "../util/alignedbuilder.h" +#include "../util/timer.h" +#include "dur_stats.h" + +using namespace mongoutils; + +namespace mongo { + namespace dur { + + extern Journal j; + + RelativePath local = RelativePath::fromRelativePath("local"); + + MongoMMF* findMMF_inlock(void *ptr, size_t &ofs) { + MongoMMF *f = privateViews.find_inlock(ptr, ofs); + if( f == 0 ) { + string s = str::stream() << "view pointer cannot be resolved " << (size_t) ptr; + journalingFailure(s.c_str()); // asserts + } + return f; + } + + /** put the basic write operation into the buffer (bb) to be journaled */ + void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) { + size_t ofs = 1; + MongoMMF *mmf = findMMF_inlock(i->start(), /*out*/ofs); + dassert( i->w_ptr == 0 ); + + if( !mmf->willNeedRemap() ) { + // tag this mmf as needed a remap of its private view later. + // usually it will already be dirty/already set, so we do the if above first + // to avoid possibility of cpu cache line contention + mmf->willNeedRemap() = true; + } + + // since we have already looked up the mmf, we go ahead and remember the write view location + // so we don't have to find the MongoMMF again later in WRITETODATAFILES() + dassert( i->w_ptr == 0 ); + i->w_ptr = ((char*)mmf->view_write()) + ofs; + + JEntry e; + e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); //dont write past end of file + assert( ofs <= 0x80000000 ); + e.ofs = (unsigned) ofs; + e.setFileNo( mmf->fileSuffixNo() ); + if( mmf->relativePath() == local ) { + e.setLocalDbContextBit(); + } + else if( mmf->relativePath() != lastDbPath ) { + lastDbPath = mmf->relativePath(); + JDbContext c; + bb.appendStruct(c); + bb.appendStr(lastDbPath.toString()); + } + bb.appendStruct(e); +#if defined(_EXPERIMENTAL) + i->ofsInJournalBuffer = bb.len(); +#endif + bb.appendBuf(i->start(), e.len); + + if (e.len != (unsigned)i->length()) { + log() << "dur info splitting prepBasicWrite at boundary" << endl; + + // This only happens if we write to the last byte in a file and + // the fist byte in another file that is mapped adjacently. I + // think most OSs leave at least a one page gap between + // mappings, but better to be safe. + + WriteIntent next ((char*)i->start() + e.len, i->length() - e.len); + prepBasicWrite_inlock(bb, &next, lastDbPath); + } + } + + /** basic write ops / write intents. note there is no particular order to these : if we have + two writes to the same location during the group commit interval, it is likely + (although not assured) that it is journaled here once. 
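prepBasicWrite_inlock() above never lets a single journal entry run past the end of the mapped file: the length is clamped, and in the rare case a declared write straddles two adjacent mappings the remainder is journaled as a second entry. A simplified standalone sketch of that clamp-and-split step (the names here are stand-ins for the MongoMMF lookup and AlignedBuilder append):

#include <algorithm>
#include <cassert>
#include <cstddef>

struct Split {
    unsigned firstLen;       // bytes that fit in the current mapped file
    unsigned remainderLen;   // bytes that must be re-resolved against the next mapping
};

// A JEntry may not extend past the end of its file, so anything left over becomes a
// separate basic write whose target mapping is looked up again from the spill-over address.
inline Split clampToFile(std::size_t offsetInFile, unsigned intentLen, std::size_t fileLength) {
    assert(offsetInFile <= fileLength);   // assumed by the caller, as in the patch
    unsigned fits = (unsigned)std::min<std::size_t>(intentLen, fileLength - offsetInFile);
    return Split{fits, intentLen - fits};
}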
+ */ + void prepBasicWrites(AlignedBuilder& bb) { + scoped_lock lk(privateViews._mutex()); + + // each time events switch to a different database we journal a JDbContext + RelativePath lastDbPath; + + for( set::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) { + prepBasicWrite_inlock(bb, &(*i), lastDbPath); + } + } + + void resetLogBuffer(AlignedBuilder& bb) { + bb.reset(); + + // JSectHeader + JSectHeader h; + h.len = (unsigned) 0xffffffff; // total length, will fill in later + h.seqNumber = getLastDataFileFlushTime(); + h.fileId = j.curFileId(); + + bb.appendStruct(h); + } + + /** we will build an output buffer ourself and then use O_DIRECT + we could be in read lock for this + caller handles locking + */ + void _PREPLOGBUFFER() { + assert( cmdLine.dur ); + + { + // now that we are locked, fully drain deferred notes of write intents + DEV dbMutex.assertAtLeastReadLocked(); + Writes& writes = commitJob.wi(); + writes._deferred.invoke(); + writes._drained = true; + } + + AlignedBuilder& bb = commitJob._ab; + resetLogBuffer(bb); + + // ops other than basic writes (DurOp's) + { + for( vector< shared_ptr >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) { + (*i)->serialize(bb); + } + } + + { + prepBasicWrites(bb); + } + + { + JSectFooter f(bb.buf(), bb.len()); + bb.appendStruct(f); + } + + { + // pad to alignment, and set the total section length in the JSectHeader + assert( 0xffffe000 == (~(Alignment-1)) ); + unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1)); + dassert( L >= (unsigned) bb.len() ); + + *((unsigned*)bb.atOfs(0)) = L; + + unsigned padding = L - bb.len(); + bb.skip(padding); + dassert( bb.len() % Alignment == 0 ); + } + + return; + } + void PREPLOGBUFFER() { + Timer t; + j.assureLogFileOpen(); // so fileId is set + _PREPLOGBUFFER(); + stats.curr->_prepLogBufferMicros += t.micros(); + } + + } +} diff --git a/db/dur_recover.cpp b/db/dur_recover.cpp new file mode 100644 index 0000000..1480a59 --- /dev/null +++ b/db/dur_recover.cpp @@ -0,0 +1,457 @@ +// @file dur_recover.cpp crash recovery via the journal + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#include "pch.h" + +#include "dur.h" +#include "dur_recover.h" +#include "dur_journal.h" +#include "dur_journalformat.h" +#include "durop.h" +#include "namespace.h" +#include "../util/mongoutils/str.h" +#include "../util/bufreader.h" +#include "pdfile.h" +#include "database.h" +#include "db.h" +#include "../util/unittest.h" +#include "cmdline.h" +#include "curop.h" +#include "mongommf.h" + +#include +#include + +using namespace mongoutils; + +namespace mongo { + + namespace dur { + + struct ParsedJournalEntry { /*copyable*/ + ParsedJournalEntry() : e(0) { } + + // relative path of database for the operation. 
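_PREPLOGBUFFER() above writes a placeholder length into the section header, then rounds the finished section up to the 8KB Alignment and patches the real length back in at offset 0. A standalone sketch of that finish step over a plain byte vector (the real code does this on an AlignedBuilder, and the constant here mirrors the Alignment value from durop.h):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

const unsigned kAlignment = 8192;

// Round the section buffer up to the alignment boundary and store the final
// length in the first 4 bytes, where the JSectHeader::len field lives.
void finishSection(std::vector<char>& buf) {
    assert(buf.size() >= sizeof(std::uint32_t));
    std::uint32_t total =
        (std::uint32_t)((buf.size() + kAlignment - 1) & ~(std::size_t)(kAlignment - 1));
    buf.resize(total, 0);                            // zero padding up to the boundary
    std::memcpy(buf.data(), &total, sizeof total);   // patch the length placeholder
}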
+ // might be a pointer into mmaped Journal file + const char *dbName; + + // thse are pointers into the memory mapped journal file + const JEntry *e; // local db sentinel is already parsed out here into dbName + + // if not one of the two simple JEntry's above, this is the operation: + shared_ptr op; + }; + + void removeJournalFiles(); + path getJournalDir(); + + /** get journal filenames, in order. throws if unexpected content found */ + static void getFiles(path dir, vector& files) { + map m; + for ( filesystem::directory_iterator i( dir ); + i != filesystem::directory_iterator(); + ++i ) { + filesystem::path filepath = *i; + string fileName = filesystem::path(*i).leaf(); + if( str::startsWith(fileName, "j._") ) { + unsigned u = str::toUnsigned( str::after(fileName, '_') ); + if( m.count(u) ) { + uasserted(13531, str::stream() << "unexpected files in journal directory " << dir.string() << " : " << fileName); + } + m.insert( pair(u,filepath) ); + } + } + for( map::iterator i = m.begin(); i != m.end(); ++i ) { + if( i != m.begin() && m.count(i->first - 1) == 0 ) { + uasserted(13532, + str::stream() << "unexpected file in journal directory " << dir.string() + << " : " << filesystem::path(i->second).leaf() << " : can't find its preceeding file"); + } + files.push_back(i->second); + } + } + + /** read through the memory mapped data of a journal file (journal/j._ file) + throws + */ + class JournalSectionIterator : boost::noncopyable { + public: + JournalSectionIterator(const void *p, unsigned len, bool doDurOps) + : _br(p, len) + , _sectHead(static_cast(_br.skip(sizeof(JSectHeader)))) + , _lastDbName(NULL) + , _doDurOps(doDurOps) + {} + + bool atEof() const { return _br.atEof(); } + + unsigned long long seqNumber() const { return _sectHead->seqNumber; } + + /** get the next entry from the log. this function parses and combines JDbContext and JEntry's. + * @return true if got an entry. false at successful end of section (and no entry returned). + * throws on premature end of section. + */ + bool next(ParsedJournalEntry& e) { + unsigned lenOrOpCode; + _br.read(lenOrOpCode); + + if (lenOrOpCode > JEntry::OpCode_Min) { + switch( lenOrOpCode ) { + + case JEntry::OpCode_Footer: { + if (_doDurOps) { + const char* pos = (const char*) _br.pos(); + pos -= sizeof(lenOrOpCode); // rewind to include OpCode + const JSectFooter& footer = *(const JSectFooter*)pos; + int len = pos - (char*)_sectHead; + if (!footer.checkHash(_sectHead, len)) { + massert(13594, str::stream() << "Journal checksum doesn't match. 
recorded: " + << toHex(footer.hash, sizeof(footer.hash)) + << " actual: " << md5simpledigest(_sectHead, len) + , false); + } + } + return false; // false return value denotes end of section + } + + case JEntry::OpCode_FileCreated: + case JEntry::OpCode_DropDb: { + e.dbName = 0; + boost::shared_ptr op = DurOp::read(lenOrOpCode, _br); + if (_doDurOps) { + e.op = op; + } + return true; + } + + case JEntry::OpCode_DbContext: { + _lastDbName = (const char*) _br.pos(); + const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _br.remaining()); + const unsigned len = strnlen(_lastDbName, limit); + massert(13533, "problem processing journal file during recovery", _lastDbName[len] == '\0'); + _br.skip(len+1); // skip '\0' too + _br.read(lenOrOpCode); + } + // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet + + default: + // fall through + ; + } + } + + // JEntry - a basic write + assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min ); + _br.rewind(4); + e.e = (JEntry *) _br.skip(sizeof(JEntry)); + e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName; + assert( e.e->len == lenOrOpCode ); + _br.skip(e.e->len); + return true; + } + private: + BufReader _br; + const JSectHeader* _sectHead; + const char *_lastDbName; // pointer into mmaped journal file + const bool _doDurOps; + }; + + static string fileName(const char* dbName, int fileNo) { + stringstream ss; + ss << dbName << '.'; + assert( fileNo >= 0 ); + if( fileNo == JEntry::DotNsSuffix ) + ss << "ns"; + else + ss << fileNo; + + // relative name -> full path name + path full(dbpath); + full /= ss.str(); + return full.string(); + } + + RecoveryJob::~RecoveryJob() { + DESTRUCTOR_GUARD( + if( !_mmfs.empty() ) + close(); + ) + } + + void RecoveryJob::close() { + scoped_lock lk(_mx); + _close(); + } + + void RecoveryJob::_close() { + MongoFile::flushAll(true); + _mmfs.clear(); + } + + void RecoveryJob::write(const ParsedJournalEntry& entry) { + const string fn = fileName(entry.dbName, entry.e->getFileNo()); + MongoFile* file; + { + MongoFileFinder finder; // must release lock before creating new MongoMMF + file = finder.findByPath(fn); + } + + MongoMMF* mmf; + if (file) { + assert(file->isMongoMMF()); + mmf = (MongoMMF*)file; + } + else { + assert(_recovering); + boost::shared_ptr sp (new MongoMMF); + assert(sp->open(fn, false)); + _mmfs.push_back(sp); + mmf = sp.get(); + } + + if ((entry.e->ofs + entry.e->len) <= mmf->length()) { + void* dest = (char*)mmf->view_write() + entry.e->ofs; + memcpy(dest, entry.e->srcData(), entry.e->len); + } + else { + massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering); + } + } + + void RecoveryJob::applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump) { + if( entry.e ) { + if( dump ) { + stringstream ss; + ss << " BASICWRITE " << setw(20) << entry.dbName << '.'; + if( entry.e->isNsSuffix() ) + ss << "ns"; + else + ss << setw(2) << entry.e->getFileNo(); + ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/ + " " << hexdump(entry.e->srcData(), entry.e->len); + log() << ss.str() << endl; + } + if( apply ) { + write(entry); + } + } + else if(entry.op) { + // a DurOp subclass operation + if( dump ) { + log() << " OP " << entry.op->toString() << endl; + } + if( apply ) { + if( entry.op->needFilesClosed() ) { + _close(); // locked in processSection + } + entry.op->replay(); + } + } + } + + void RecoveryJob::applyEntries(const vector &entries) { + bool apply = 
(cmdLine.durOptions & CmdLine::DurScanOnly) == 0; + bool dump = cmdLine.durOptions & CmdLine::DurDumpJournal; + if( dump ) + log() << "BEGIN section" << endl; + + for( vector::const_iterator i = entries.begin(); i != entries.end(); ++i ) { + applyEntry(*i, apply, dump); + } + + if( dump ) + log() << "END section" << endl; + } + + void RecoveryJob::processSection(const void *p, unsigned len) { + scoped_lock lk(_mx); + + vector entries; + JournalSectionIterator i(p, len, _recovering); + + //DEV log() << "recovery processSection seq:" << i.seqNumber() << endl; + if( _recovering && _lastDataSyncedFromLastRun > i.seqNumber() + ExtraKeepTimeMs ) { + if( i.seqNumber() != _lastSeqMentionedInConsoleLog ) { + log() << "recover skipping application of section seq:" << i.seqNumber() << " < lsn:" << _lastDataSyncedFromLastRun << endl; + _lastSeqMentionedInConsoleLog = i.seqNumber(); + } + return; + } + + // first read all entries to make sure this section is valid + ParsedJournalEntry e; + while( i.next(e) ) { + entries.push_back(e); + } + + // got all the entries for one group commit. apply them: + applyEntries(entries); + } + + /** apply a specific journal file, that is already mmap'd + @param p start of the memory mapped file + @return true if this is detected to be the last file (ends abruptly) + */ + bool RecoveryJob::processFileBuffer(const void *p, unsigned len) { + try { + unsigned long long fileId; + BufReader br(p,len); + + { + // read file header + JHeader h; + br.read(h); + if( !h.versionOk() ) { + log() << "journal file version number mismatch. recover with old version of mongod, terminate cleanly, then upgrade." << endl; + uasserted(13536, str::stream() << "journal version number mismatch " << h._version); + } + uassert(13537, "journal header invalid", h.valid()); + fileId = h.fileId; + if(cmdLine.durOptions & CmdLine::DurDumpJournal) { + log() << "JHeader::fileId=" << fileId << endl; + } + } + + // read sections + while ( !br.atEof() ) { + JSectHeader h; + br.peek(h); + if( h.fileId != fileId ) { + if( debug || (cmdLine.durOptions & CmdLine::DurDumpJournal) ) { + log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl; + log() << " sect len:" << h.len << " seqnum:" << h.seqNumber << endl; + } + return true; + } + processSection(br.skip(h.len), h.len); + + // ctrl c check + killCurrentOp.checkForInterrupt(false); + } + } + catch( BufReader::eof& ) { + if( cmdLine.durOptions & CmdLine::DurDumpJournal ) + log() << "ABRUPT END" << endl; + return true; // abrupt end + } + + return false; // non-abrupt end + } + + /** apply a specific journal file */ + bool RecoveryJob::processFile(path journalfile) { + log() << "recover " << journalfile.string() << endl; + MemoryMappedFile f; + void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL); + massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p); + return processFileBuffer(p, (unsigned) f.length()); + } + + /** @param files all the j._0 style files we need to apply for recovery */ + void RecoveryJob::go(vector& files) { + log() << "recover begin" << endl; + _recovering = true; + + // load the last sequence number synced to the datafiles on disk before the last crash + _lastDataSyncedFromLastRun = journalReadLSN(); + log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl; + + for( unsigned i = 0; i != files.size(); ++i ) { + /*bool abruptEnd = */processFile(files[i]); + /*if( abruptEnd && i+1 < files.size() ) { + 
log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl; + close(); + uasserted(13535, "recover abrupt journal file end"); + }*/ + } + + close(); + + if( cmdLine.durOptions & CmdLine::DurScanOnly ) { + uasserted(13545, str::stream() << "--durOptions " << (int) CmdLine::DurScanOnly << " (scan only) specified"); + } + + log() << "recover cleaning up" << endl; + removeJournalFiles(); + log() << "recover done" << endl; + okToCleanUp = true; + _recovering = false; + } + + void _recover() { + assert( cmdLine.dur ); + + filesystem::path p = getJournalDir(); + if( !exists(p) ) { + log() << "directory " << p.string() << " does not exist, there will be no recovery startup step" << endl; + okToCleanUp = true; + return; + } + + vector journalFiles; + getFiles(p, journalFiles); + + if( journalFiles.empty() ) { + log() << "recover : no journal files present, no recovery needed" << endl; + okToCleanUp = true; + return; + } + + RecoveryJob::get().go(journalFiles); + } + + extern mutex groupCommitMutex; + + /** recover from a crash + called during startup + throws on error + */ + void recover() { + // we use a lock so that exitCleanly will wait for us + // to finish (or at least to notice what is up and stop) + writelock lk; + + // this is so the mutexdebugger doesn't get confused. we are actually single threaded + // at this point in the program so it wouldn't have been a true problem (I think) + scoped_lock lk2(groupCommitMutex); + + _recover(); // throws on interruption + } + + struct BufReaderY { int a,b; }; + class BufReaderUnitTest : public UnitTest { + public: + void run() { + BufReader r((void*) "abcdabcdabcd", 12); + char x; + BufReaderY y; + r.read(x); //cout << x; // a + assert( x == 'a' ); + r.read(y); + r.read(x); + assert( x == 'b' ); + } + } brunittest; + + // can't free at termination because order of destruction of global vars is arbitrary + RecoveryJob &RecoveryJob::_instance = *(new RecoveryJob()); + + } // namespace dur + +} // namespace mongo + diff --git a/db/dur_recover.h b/db/dur_recover.h new file mode 100644 index 0000000..1022fdc --- /dev/null +++ b/db/dur_recover.h @@ -0,0 +1,45 @@ +// @file dur.h durability support + +#pragma once + +#include "../util/concurrency/mutex.h" +#include "../util/file.h" + +namespace mongo { + class MongoMMF; + + namespace dur { + struct ParsedJournalEntry; + + /** call go() to execute a recovery from existing journal files. 
+ */ + class RecoveryJob : boost::noncopyable { + public: + RecoveryJob() :_lastDataSyncedFromLastRun(0), _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; } + void go(vector& files); + ~RecoveryJob(); + void processSection(const void *, unsigned len); + void close(); // locks and calls _close() + + static RecoveryJob & get() { return _instance; } + private: + void write(const ParsedJournalEntry& entry); // actually writes to the file + void applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump); + void applyEntries(const vector &entries); + bool processFileBuffer(const void *, unsigned len); + bool processFile(path journalfile); + void _close(); // doesn't lock + + list > _mmfs; + + unsigned long long _lastDataSyncedFromLastRun; + unsigned long long _lastSeqMentionedInConsoleLog; + + mongo::mutex _mx; // protects _mmfs + + bool _recovering; // are we in recovery or WRITETODATAFILES + + static RecoveryJob &_instance; + }; + } +} diff --git a/db/dur_stats.h b/db/dur_stats.h new file mode 100644 index 0000000..5f5a188 --- /dev/null +++ b/db/dur_stats.h @@ -0,0 +1,46 @@ +// @file dur_stats.h + +namespace mongo { + namespace dur { + + /** journaling stats. the model here is that the commit thread is the only writer, and that reads are + uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter overhead. + */ + struct Stats { + Stats(); + void rotate(); + BSONObj asObj(); + unsigned _intervalMicros; + struct S { + BSONObj _asObj(); + void reset(); + + unsigned _commits; + unsigned _earlyCommits; // count of early commits from commitIfNeeded() or from getDur().commitNow() + unsigned long long _journaledBytes; + unsigned long long _writeToDataFilesBytes; + + unsigned long long _prepLogBufferMicros; + unsigned long long _writeToJournalMicros; + unsigned long long _writeToDataFilesMicros; + unsigned long long _remapPrivateViewMicros; + + // undesirable to be in write lock for the group commit (it can be done in a read lock), so good if we + // have visibility when this happens. can happen for a couple reasons + // - read lock starvation + // - file being closed + // - data being written faster than the normal group commit interval + unsigned _commitsInWriteLock; + + unsigned _dtMillis; + }; + S *curr; + private: + S _a,_b; + unsigned long long _lastRotate; + S* other(); + }; + extern Stats stats; + + } +} diff --git a/db/dur_writetodatafiles.cpp b/db/dur_writetodatafiles.cpp new file mode 100644 index 0000000..50797ea --- /dev/null +++ b/db/dur_writetodatafiles.cpp @@ -0,0 +1,99 @@ +// @file dur_writetodatafiles.cpp apply the writes back to the non-private MMF after they are for certain in redo log + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . 
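The Stats object above keeps two S buckets (_a and _b) and flips curr between them, so the commit thread keeps incrementing counters while readers report the interval that just finished. A standalone sketch of that double-buffer rotation; the counter names and the exact swap behavior are illustrative, since only the declaration is shown above:

struct IntervalCounters {
    unsigned commits = 0;
    unsigned long long journaledBytes = 0;
    unsigned long long writeToDataFilesBytes = 0;
    void reset() { *this = IntervalCounters(); }
};

// Two buckets: the commit thread writes into *curr(), readers see the other,
// completed interval. rotate() swaps them, in the spirit of dur::Stats::rotate().
class DoubleBufferedStats {
    IntervalCounters _a, _b;
    IntervalCounters* _curr = &_a;
public:
    IntervalCounters* curr() { return _curr; }
    const IntervalCounters& lastInterval() const { return _curr == &_a ? _b : _a; }
    void rotate() {
        IntervalCounters* other = (_curr == &_a) ? &_b : &_a;
        other->reset();   // the bucket we switch into starts clean; the finished one stays readable
        _curr = other;
    }
};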
+*/ + +#include "pch.h" +#include "dur_commitjob.h" +#include "dur_stats.h" +#include "dur_recover.h" +#include "../util/timer.h" + +namespace mongo { + namespace dur { + + void debugValidateAllMapsMatch(); + + /** apply the writes back to the non-private MMF after they are for certain in redo log + + (1) todo we don't need to write back everything every group commit. we MUST write back + that which is going to be a remapped on its private view - but that might not be all + views. + + (2) todo should we do this using N threads? would be quite easy + see Hackenberg paper table 5 and 6. 2 threads might be a good balance. + + (3) with enough work, we could do this outside the read lock. it's a bit tricky though. + - we couldn't do it from the private views then as they may be changing. would have to then + be from the journal alignedbuffer. + - we need to be careful the file isn't unmapped on us -- perhaps a mutex or something + with MongoMMF on closes or something to coordinate that. + + locking: in read lock when called + + @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc&hl=en + */ + + void WRITETODATAFILES_Impl1() { + RecoveryJob::get().processSection(commitJob._ab.buf(), commitJob._ab.len()); + } + + // the old implementation + void WRITETODATAFILES_Impl2() { + /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */ + for( set::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) { + const WriteIntent& intent = *it; + stats.curr->_writeToDataFilesBytes += intent.length(); + dassert(intent.w_ptr); + memcpy(intent.w_ptr, intent.start(), intent.length()); + } + } + +#if defined(_EXPERIMENTAL) + void WRITETODATAFILES_Impl3() { + /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */ + for( set::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) { + const WriteIntent& intent = *it; + stats.curr->_writeToDataFilesBytes += intent.length(); + dassert(intent.w_ptr); + memcpy(intent.w_ptr, + commitJob._ab.atOfs(intent.ofsInJournalBuffer), + intent.length()); + } + } +#endif + + void WRITETODATAFILES() { + dbMutex.assertAtLeastReadLocked(); + + MongoFile::markAllWritable(); // for _DEBUG. normally we don't write in a read lock + + Timer t; +#if defined(_EXPERIMENTAL) + WRITETODATAFILES_Impl3(); +#else + WRITETODATAFILES_Impl1(); +#endif + stats.curr->_writeToDataFilesMicros += t.micros(); + + if (!dbMutex.isWriteLocked()) + MongoFile::unmarkAllWritable(); + + debugValidateAllMapsMatch(); + } + + } +} diff --git a/db/durop.cpp b/db/durop.cpp new file mode 100644 index 0000000..344b21e --- /dev/null +++ b/db/durop.cpp @@ -0,0 +1,160 @@ +// @file durop.cpp + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . 
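WRITETODATAFILES_Impl2() above is the direct form of the copy-back step: once the journal write is durable, each intent's bytes are copied from the private copy-on-write view to the shared writable mapping recorded earlier in w_ptr. A simplified standalone sketch with the two views represented as plain buffers:

#include <cstring>
#include <vector>

struct PendingWrite {
    const char* privateSrc;   // where the change currently lives (private view)
    char*       sharedDst;    // the corresponding spot in the shared, file-backed view
    unsigned    len;
};

// After the journal entry for these writes is on disk, push the changes through to
// the real datafile mappings (the equivalent of memcpy(w_ptr, start(), length())).
inline unsigned long long writeBackToDataFiles(const std::vector<PendingWrite>& writes) {
    unsigned long long bytes = 0;
    for (const PendingWrite& w : writes) {
        std::memcpy(w.sharedDst, w.privateSrc, w.len);
        bytes += w.len;   // the stats counter _writeToDataFilesBytes tracks this total
    }
    return bytes;
}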
+*/ + +#include "pch.h" +#include "concurrency.h" +#include "../util/alignedbuilder.h" +#include "../util/mongoutils/str.h" +#include "../util/file.h" +#include "mongommf.h" +#include "durop.h" +#include "../util/file_allocator.h" + +using namespace mongoutils; + +namespace mongo { + + extern string dbpath; // --dbpath parm + + void _deleteDataFiles(const char *); + + namespace dur { + + /** read a durop from journal file referenced by br. + @param opcode the opcode which has already been written from the bufreader + */ + shared_ptr DurOp::read(unsigned opcode, BufReader& br) { + shared_ptr op; + switch( opcode ) { + case JEntry::OpCode_FileCreated: + op = shared_ptr( new FileCreatedOp(br) ); + break; + case JEntry::OpCode_DropDb: + op = shared_ptr( new DropDbOp(br) ); + break; + default: + massert(13546, (str::stream() << "dur recover unrecognized opcode in journal " << opcode), false); + } + return op; + } + + void DurOp::serialize(AlignedBuilder& ab) { + ab.appendNum(_opcode); + _serialize(ab); + } + + DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) { + unsigned long long reserved; + log.read(reserved); + log.read(reserved); + log.readStr(_db); + string reservedStr; + log.readStr(reservedStr); + } + + void DropDbOp::_serialize(AlignedBuilder& ab) { + ab.appendNum((unsigned long long) 0); // reserved for future use + ab.appendNum((unsigned long long) 0); // reserved for future use + ab.appendStr(_db); + ab.appendStr(""); // reserved + } + + /** throws */ + void DropDbOp::replay() { + log() << "recover replay drop db " << _db << endl; + _deleteDataFiles(_db.c_str()); + } + + FileCreatedOp::FileCreatedOp(string f, unsigned long long l) : + DurOp(JEntry::OpCode_FileCreated) { + _p = RelativePath::fromFullPath(f); + _len = l; + } + + FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) { + unsigned long long reserved; + log.read(reserved); + log.read(reserved); + log.read(_len); // size of file, not length of name + string s; + log.readStr(s); + _p._p = s; + } + + void FileCreatedOp::_serialize(AlignedBuilder& ab) { + ab.appendNum((unsigned long long) 0); // reserved for future use + ab.appendNum((unsigned long long) 0); // reserved for future use + ab.appendNum(_len); + ab.appendStr(_p.toString()); + } + + string FileCreatedOp::toString() { + return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len/1024.0/1024.0 << "MB"; + } + + // if an operation deletes or creates a file (or moves etc.), it may need files closed. + bool FileCreatedOp::needFilesClosed() { + return exists( _p.asFullPath() ); + } + + void FileCreatedOp::replay() { + // i believe the code assumes new files are filled with zeros. thus we have to recreate the file, + // or rewrite at least, even if it were the right length. perhaps one day we should change that + // although easier to avoid defects if we assume it is zeros perhaps. + string full = _p.asFullPath(); + if( exists(full) ) { + try { + remove(full); + } + catch(std::exception& e) { + log(1) << "recover info FileCreateOp::replay unlink " << e.what() << endl; + } + } + + log() << "recover create file " << full << ' ' << _len/1024.0/1024.0 << "MB" << endl; + if( MemoryMappedFile::exists(full) ) { + // first delete if exists. + try { + remove(full); + } + catch(...) 
{ + log() << "warning could not delete file " << full << endl; + } + } + ensureParentDirCreated(full); + File f; + f.open(full.c_str()); + massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open()); + unsigned long long left = _len; + const unsigned blksz = 64 * 1024; + scoped_array v( new char[blksz] ); + memset( v.get(), 0, blksz ); + fileofs ofs = 0; + while( left ) { + unsigned long long w = left < blksz ? left : blksz; + f.write(ofs, v.get(), (unsigned) w); + left -= w; + ofs += w; + } + f.fsync(); + massert(13628, str::stream() << "recover failure writing file " << full, !f.bad() ); + } + + } + +} diff --git a/db/durop.h b/db/durop.h new file mode 100644 index 0000000..c4574c2 --- /dev/null +++ b/db/durop.h @@ -0,0 +1,111 @@ +// @file durop.h class DurOp and descendants + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "dur_journalformat.h" +#include "../util/bufreader.h" +#include "../util/paths.h" + +namespace mongo { + + class AlignedBuilder; + + namespace dur { + + const unsigned Alignment = 8192; + + /** DurOp - Operations we journal that aren't just basic writes. + * + * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent. + * We don't make WriteIntent inherit from DurOp to keep it as lean as possible as there will be millions of + * them (we don't want a vtable for example there). + * + * For each op we want to journal, we define a subclass. + */ + class DurOp { /* copyable */ + public: + // @param opcode a sentinel value near max unsigned which uniquely identifies the operation. + // @see dur::JEntry + DurOp(unsigned opcode) : _opcode(opcode) { } + + virtual ~DurOp() { } + + /** serialize the op out to a builder which will then be written (presumably) to the journal */ + void serialize(AlignedBuilder& ab); + + /** read a durop from journal file referenced by br. + @param opcode the opcode which has already been written from the bufreader + */ + static shared_ptr read(unsigned opcode, BufReader& br); + + /** replay the operation (during recovery) + throws + + For now, these are not replayed during the normal WRITETODATAFILES phase, since these + operations are handled in other parts of the code. At some point this may change. + */ + virtual void replay() = 0; + + virtual string toString() = 0; + + /** if the op requires all file to be closed before doing its work, returns true. 
*/ + virtual bool needFilesClosed() { return false; } + + protected: + /** DurOp will have already written the opcode for you */ + virtual void _serialize(AlignedBuilder& ab) = 0; + + private: + const unsigned _opcode; + }; + + /** indicates creation of a new file */ + class FileCreatedOp : public DurOp { + public: + FileCreatedOp(BufReader& log); + /** param f filename to create with path */ + FileCreatedOp(string f, unsigned long long l); + virtual void replay(); + virtual string toString(); + virtual bool needFilesClosed(); + protected: + virtual void _serialize(AlignedBuilder& ab); + private: + RelativePath _p; + unsigned long long _len; // size of file, not length of name + }; + + /** record drop of a database */ + class DropDbOp : public DurOp { + public: + DropDbOp(BufReader& log); + DropDbOp(string db) : + DurOp(JEntry::OpCode_DropDb), _db(db) { } + virtual void replay(); + virtual string toString() { return string("DropDbOp ") + _db; } + virtual bool needFilesClosed() { return true; } + protected: + virtual void _serialize(AlignedBuilder& ab); + private: + string _db; + }; + + } + +} diff --git a/db/extsort.cpp b/db/extsort.cpp index 68e6b52..2e6d8d8 100644 --- a/db/extsort.cpp +++ b/db/extsort.cpp @@ -19,160 +19,160 @@ #include "pch.h" #include "extsort.h" -#include "namespace.h" +#include "namespace-inl.h" #include "../util/file.h" #include #include #include namespace mongo { - + BSONObj BSONObjExternalSorter::extSortOrder; unsigned long long BSONObjExternalSorter::_compares = 0; - + BSONObjExternalSorter::BSONObjExternalSorter( const BSONObj & order , long maxFileSize ) - : _order( order.getOwned() ) , _maxFilesize( maxFileSize ) , - _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0){ - + : _order( order.getOwned() ) , _maxFilesize( maxFileSize ) , + _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0) { + stringstream rootpath; rootpath << dbpath; if ( dbpath[dbpath.size()-1] != '/' ) rootpath << "/"; rootpath << "_tmp/esort." << time(0) << "." << rand() << "/"; _root = rootpath.str(); - + log(1) << "external sort root: " << _root.string() << endl; create_directories( _root ); _compares = 0; } - - BSONObjExternalSorter::~BSONObjExternalSorter(){ - if ( _cur ){ + + BSONObjExternalSorter::~BSONObjExternalSorter() { + if ( _cur ) { delete _cur; _cur = 0; } - + unsigned long removed = remove_all( _root ); wassert( removed == 1 + _files.size() ); } - void BSONObjExternalSorter::_sortInMem(){ + void BSONObjExternalSorter::_sortInMem() { // extSortComp needs to use glbals // qsort_r only seems available on bsd, which is what i really want to use dblock l; extSortOrder = _order; _cur->sort( BSONObjExternalSorter::extSortComp ); } - - void BSONObjExternalSorter::sort(){ + + void BSONObjExternalSorter::sort() { uassert( 10048 , "already sorted" , ! _sorted ); - + _sorted = true; - if ( _cur && _files.size() == 0 ){ + if ( _cur && _files.size() == 0 ) { _sortInMem(); log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl; return; } - - if ( _cur ){ + + if ( _cur ) { finishMap(); } - - if ( _cur ){ + + if ( _cur ) { delete _cur; _cur = 0; } - + if ( _files.size() == 0 ) return; - + } - void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ){ + void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ) { uassert( 10049 , "sorted already" , ! _sorted ); - - if ( ! _cur ){ + + if ( ! 
_cur ) { _cur = new InMemory( _arraySize ); } - + Data& d = _cur->getNext(); d.first = o.getOwned(); d.second = loc; - + long size = o.objsize(); _curSizeSoFar += size + sizeof( DiskLoc ) + sizeof( BSONObj ); - - if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ){ + + if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ) { finishMap(); log(1) << "finishing map" << endl; } } - - void BSONObjExternalSorter::finishMap(){ + + void BSONObjExternalSorter::finishMap() { uassert( 10050 , "bad" , _cur ); - + _curSizeSoFar = 0; if ( _cur->size() == 0 ) return; - + _sortInMem(); - + stringstream ss; ss << _root.string() << "/file." << _files.size(); string file = ss.str(); - + ofstream out; out.open( file.c_str() , ios_base::out | ios_base::binary ); assertStreamGood( 10051 , (string)"couldn't open file: " + file , out ); - + int num = 0; - for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ){ + for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ) { Data p = *i; out.write( p.first.objdata() , p.first.objsize() ); out.write( (char*)(&p.second) , sizeof( DiskLoc ) ); num++; } - + _cur->clear(); - + _files.push_back( file ); out.close(); log(2) << "Added file: " << file << " with " << num << "objects for external sort" << endl; } - + // --------------------------------- BSONObjExternalSorter::Iterator::Iterator( BSONObjExternalSorter * sorter ) : - _cmp( sorter->_order ) , _in( 0 ){ - - for ( list::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ){ + _cmp( sorter->_order ) , _in( 0 ) { + + for ( list::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ) { _files.push_back( new FileIterator( *i ) ); _stash.push_back( pair( Data( BSONObj() , DiskLoc() ) , false ) ); } - - if ( _files.size() == 0 && sorter->_cur ){ + + if ( _files.size() == 0 && sorter->_cur ) { _in = sorter->_cur; _it = sorter->_cur->begin(); } - + } - - BSONObjExternalSorter::Iterator::~Iterator(){ + + BSONObjExternalSorter::Iterator::~Iterator() { for ( vector::iterator i=_files.begin(); i!=_files.end(); i++ ) delete *i; _files.clear(); } - - bool BSONObjExternalSorter::Iterator::more(){ + + bool BSONObjExternalSorter::Iterator::more() { if ( _in ) return _it != _in->end(); - + for ( vector::iterator i=_files.begin(); i!=_files.end(); i++ ) if ( (*i)->more() ) return true; @@ -181,34 +181,34 @@ namespace mongo { return true; return false; } - - BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next(){ - - if ( _in ){ + + BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next() { + + if ( _in ) { Data& d = *_it; ++_it; return d; } - + Data best; int slot = -1; - - for ( unsigned i=0; i<_stash.size(); i++ ){ - if ( ! _stash[i].second ){ + for ( unsigned i=0; i<_stash.size(); i++ ) { + + if ( ! 
_stash[i].second ) { if ( _files[i]->more() ) _stash[i] = pair( _files[i]->next() , true ); else continue; } - - if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ){ + + if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ) { best = _stash[i].first; slot = i; } - + } - + assert( slot >= 0 ); _stash[slot].second = false; @@ -216,27 +216,26 @@ namespace mongo { } // ----------------------------------- - - BSONObjExternalSorter::FileIterator::FileIterator( string file ){ - long length; + + BSONObjExternalSorter::FileIterator::FileIterator( string file ) { + unsigned long long length; _buf = (char*)_file.map( file.c_str() , length , MemoryMappedFile::SEQUENTIAL ); massert( 10308 , "mmap failed" , _buf ); - assert( (unsigned long long)length == (unsigned long long)file_size( file ) ); + assert( length == (unsigned long long) file_size( file ) ); _end = _buf + length; } - BSONObjExternalSorter::FileIterator::~FileIterator(){ - } - - bool BSONObjExternalSorter::FileIterator::more(){ + BSONObjExternalSorter::FileIterator::~FileIterator() {} + + bool BSONObjExternalSorter::FileIterator::more() { return _buf < _end; } - - BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next(){ + + BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next() { BSONObj o( _buf ); _buf += o.objsize(); DiskLoc * l = (DiskLoc*)_buf; _buf += 8; return Data( o , *l ); } - + } diff --git a/db/extsort.h b/db/extsort.h index fa0eca4..c0791db 100644 --- a/db/extsort.h +++ b/db/extsort.h @@ -20,8 +20,8 @@ #include "../pch.h" #include "jsobj.h" -#include "namespace.h" -#include "curop.h" +#include "namespace-inl.h" +#include "curop-inl.h" #include "../util/array.h" namespace mongo { @@ -32,13 +32,13 @@ namespace mongo { */ class BSONObjExternalSorter : boost::noncopyable { public: - + typedef pair Data; private: static BSONObj extSortOrder; - static int extSortComp( const void *lv, const void *rv ){ + static int extSortComp( const void *lv, const void *rv ) { RARELY killCurrentOp.checkForInterrupt(); _compares++; Data * l = (Data*)lv; @@ -54,7 +54,7 @@ namespace mongo { FileIterator( string file ); ~FileIterator(); bool more(); - Data next(); + Data next(); private: MemoryMappedFile _file; char * _buf; @@ -63,7 +63,7 @@ namespace mongo { class MyCmp { public: - MyCmp( const BSONObj & order = BSONObj() ) : _order( order ){} + MyCmp( const BSONObj & order = BSONObj() ) : _order( order ) {} bool operator()( const Data &l, const Data &r ) const { RARELY killCurrentOp.checkForInterrupt(); _compares++; @@ -78,50 +78,50 @@ namespace mongo { }; public: - + typedef FastArray InMemory; class Iterator : boost::noncopyable { public: - + Iterator( BSONObjExternalSorter * sorter ); ~Iterator(); bool more(); Data next(); - + private: MyCmp _cmp; vector _files; vector< pair > _stash; - + InMemory * _in; InMemory::iterator _it; - + }; - + BSONObjExternalSorter( const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 ); ~BSONObjExternalSorter(); - + void add( const BSONObj& o , const DiskLoc & loc ); - void add( const BSONObj& o , int a , int b ){ + void add( const BSONObj& o , int a , int b ) { add( o , DiskLoc( a , b ) ); } /* call after adding values, and before fetching the iterator */ void sort(); - - auto_ptr iterator(){ + + auto_ptr iterator() { uassert( 10052 , "not sorted" , _sorted ); return auto_ptr( new Iterator( this ) ); } - - int numFiles(){ + + int numFiles() { return _files.size(); } - - long getCurSizeSoFar(){ return _curSizeSoFar; } - void hintNumObjects( long long numObjects 
){ + long getCurSizeSoFar() { return _curSizeSoFar; } + + void hintNumObjects( long long numObjects ) { if ( numObjects < _arraySize ) _arraySize = (int)(numObjects + 100); } @@ -129,18 +129,18 @@ namespace mongo { private: void _sortInMem(); - + void sort( string file ); void finishMap(); - + BSONObj _order; long _maxFilesize; path _root; - + int _arraySize; InMemory * _cur; long _curSizeSoFar; - + list _files; bool _sorted; diff --git a/db/filever.h b/db/filever.h index 4aa18d4..e89a824 100644 --- a/db/filever.h +++ b/db/filever.h @@ -20,11 +20,11 @@ namespace mongo { -inline void checkDataFileVersion(NamespaceDetails& d) { -} + inline void checkDataFileVersion(NamespaceDetails& d) { + } -inline void checkIndexFileVersion(NamespaceDetails& d) { -} + inline void checkIndexFileVersion(NamespaceDetails& d) { + } } diff --git a/db/geo/2d.cpp b/db/geo/2d.cpp index 60818fc..934ee80 100644 --- a/db/geo/2d.cpp +++ b/db/geo/2d.cpp @@ -17,14 +17,14 @@ */ #include "pch.h" -#include "../namespace.h" +#include "../namespace-inl.h" #include "../jsobj.h" #include "../index.h" #include "../../util/unittest.h" #include "../commands.h" #include "../pdfile.h" #include "../btree.h" -#include "../curop.h" +#include "../curop-inl.h" #include "../matcher.h" #include "core.h" @@ -33,7 +33,8 @@ namespace mongo { #if 0 # define GEODEBUG(x) cout << x << endl; - inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g){ +# define GEODEBUGPRINT(x) PRINT(x) + inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g) { if (!prefix.constrains()) { cout << "\t empty prefix" << endl; return ; @@ -46,18 +47,29 @@ namespace mongo { Point center ( (ll._x+tr._x)/2, (ll._y+tr._y)/2 ); double radius = fabs(ll._x - tr._x) / 2; - cout << "\t ll: " << ll.toString() << " tr: " << tr.toString() + cout << "\t ll: " << ll.toString() << " tr: " << tr.toString() << " center: " << center.toString() << " radius: " << radius << endl; } #else -# define GEODEBUG(x) -# define PREFIXDEBUG(x, y) +# define GEODEBUG(x) +# define GEODEBUGPRINT(x) +# define PREFIXDEBUG(x, y) #endif - double EARTH_RADIUS_KM = 6371; - double EARTH_RADIUS_MILES = EARTH_RADIUS_KM * 0.621371192; + const double EARTH_RADIUS_KM = 6371; + const double EARTH_RADIUS_MILES = EARTH_RADIUS_KM * 0.621371192; + enum GeoDistType { + GEO_PLAIN, + GEO_SPHERE + }; + + inline double computeXScanDistance(double y, double maxDistDegrees) { + // TODO: this overestimates for large madDistDegrees far from the equator + return maxDistDegrees / min(cos(deg2rad(min(+89.0, y + maxDistDegrees))), + cos(deg2rad(max(-89.0, y - maxDistDegrees)))); + } GeoBitSets geoBitSets; @@ -66,14 +78,14 @@ namespace mongo { class Geo2dType : public IndexType , public GeoConvert { public: Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec ) - : IndexType( plugin , spec ){ - + : IndexType( plugin , spec ) { + BSONObjBuilder orderBuilder; BSONObjIterator i( spec->keyPattern ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); - if ( e.type() == String && GEO2DNAME == e.valuestr() ){ + if ( e.type() == String && GEO2DNAME == e.valuestr() ) { uassert( 13022 , "can't have 2 geo field" , _geo.size() == 0 ); uassert( 13023 , "2d has to be first in index" , _other.size() == 0 ); _geo = e.fieldName(); @@ -83,16 +95,16 @@ namespace mongo { } orderBuilder.append( "" , 1 ); } - + uassert( 13024 , "no geo field specified" , _geo.size() ); - + _bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft uassert( 13028 , "can't have more than 32 bits in geo index" , _bits <= 32 ); 
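            // Illustrative arithmetic for the scaling set up just below (assuming the default
            // min/max of -180/180, not values taken from this particular index spec): _scaling is
            // 2^32 / (_max - _min) = 4294967296 / 360, roughly 11.93 million units per degree,
            // and _convert(x) = (x - _min) * _scaling, so _convert(0) lands near 2^31 before
            // hash() interleaves the converted x and y bits into the geohash.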
_max = _configval( spec , "max" , 180 ); _min = _configval( spec , "min" , -180 ); - + _scaling = (1024*1024*1024*4.0)/(_max-_min); _order = orderBuilder.obj(); @@ -103,30 +115,30 @@ namespace mongo { _error = distance(a, b); } - int _configval( const IndexSpec* spec , const string& name , int def ){ + int _configval( const IndexSpec* spec , const string& name , int def ) { BSONElement e = spec->info[name]; if ( e.isNumber() ) return e.numberInt(); return def; } - ~Geo2dType(){ - + ~Geo2dType() { + } - virtual BSONObj fixKey( const BSONObj& in ) { + virtual BSONObj fixKey( const BSONObj& in ) { if ( in.firstElement().type() == BinData ) return in; BSONObjBuilder b(in.objsize()+16); - + if ( in.firstElement().isABSONObj() ) _hash( in.firstElement().embeddedObject() ).append( b , "" ); else if ( in.firstElement().type() == String ) GeoHash( in.firstElement().valuestr() ).append( b , "" ); else if ( in.firstElement().type() == RegEx ) GeoHash( in.firstElement().regex() ).append( b , "" ); - else + else return in; BSONObjIterator i(in); @@ -152,19 +164,44 @@ namespace mongo { _hash( embed ).append( b , "" ); - for ( size_t i=0; i<_other.size(); i++ ){ - BSONElement e = obj[_other[i]]; - if ( e.eoo() ) - e = _spec->missingField(); - b.appendAs( e , "" ); - } + // Go through all the other index keys + for ( vector::const_iterator i = _other.begin(); i != _other.end(); ++i ){ + + // Get *all* fields for the index key + BSONElementSet eSet; + obj.getFieldsDotted( *i, eSet ); + + + if ( eSet.size() == 0 ) + b.appendAs( _spec->missingField(), "" ); + else if ( eSet.size() == 1 ) + b.appendAs( *(eSet.begin()), "" ); + else{ + + // If we have more than one key, store as an array of the objects + // TODO: Store multiple keys? + + BSONArrayBuilder aBuilder; + + for( BSONElementSet::iterator ei = eSet.begin(); ei != eSet.end(); ++ei ){ + aBuilder.append( *ei ); + } + + BSONArray arr = aBuilder.arr(); + + b.append( "", arr ); + + } + + } + keys.insert( b.obj() ); } - + GeoHash _tohash( const BSONElement& e ) const { if ( e.isABSONObj() ) return _hash( e.embeddedObject() ); - + return GeoHash( e , _bits ); } @@ -174,7 +211,7 @@ namespace mongo { BSONElement x = i.next(); uassert( 13068 , "geo field only has 1 element" , i.more() ); BSONElement y = i.next(); - + uassert( 13026 , "geo values have to be numbers: " + o.toString() , x.isNumber() && y.isNumber() ); return hash( x.number() , y.number() ); @@ -192,33 +229,33 @@ namespace mongo { b.append( "y" , _unconvert( y ) ); return b.obj(); } - + unsigned _convert( double in ) const { uassert( 13027 , "point not in range" , in <= (_max + _error) && in >= (_min - _error) ); in -= _min; assert( in > 0 ); return (unsigned)(in * _scaling); } - + double _unconvert( unsigned in ) const { double x = in; x /= _scaling; x += _min; return x; } - + void unhash( const GeoHash& h , double& x , double& y ) const { unsigned a,b; h.unhash(a,b); x = _unconvert( a ); y = _unconvert( b ); } - + double distance( const GeoHash& a , const GeoHash& b ) const { double ax,ay,bx,by; unhash( a , ax , ay ); unhash( b , bx , by ); - + double dx = bx - ax; double dy = by - ay; @@ -237,6 +274,11 @@ namespace mongo { b.move( 1 , 1 ); unhash( a, ax, ay ); unhash( b, bx, by ); + + // _min and _max are a singularity + if (bx == _min) + bx = _max; + return (fabs(ax-bx)); } @@ -248,10 +290,10 @@ namespace mongo { virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const { BSONElement e = query.getFieldDotted(_geo.c_str()); - switch ( e.type() ){ + switch ( 
e.type() ) { case Object: { BSONObj sub = e.embeddedObject(); - switch ( sub.firstElement().getGtLtOp() ){ + switch ( sub.firstElement().getGtLtOp() ) { case BSONObj::opNEAR: case BSONObj::opWITHIN: return OPTIMAL; @@ -259,6 +301,9 @@ namespace mongo { } } case Array: + // Non-geo index data is stored in a non-standard way, cannot use for exact lookups with + // additional criteria + if ( query.nFields() > 1 ) return USELESS; return HELPFUL; default: return USELESS; @@ -267,7 +312,7 @@ namespace mongo { string _geo; vector _other; - + unsigned _bits; int _max; int _min; @@ -279,38 +324,38 @@ namespace mongo { class Box { public: - + Box( const Geo2dType * g , const GeoHash& hash ) - : _min( g , hash ) , - _max( _min._x + g->sizeEdge( hash ) , _min._y + g->sizeEdge( hash ) ){ + : _min( g , hash ) , + _max( _min._x + g->sizeEdge( hash ) , _min._y + g->sizeEdge( hash ) ) { } - + Box( double x , double y , double size ) - : _min( x , y ) , - _max( x + size , y + size ){ + : _min( x , y ) , + _max( x + size , y + size ) { } Box( Point min , Point max ) - : _min( min ) , _max( max ){ + : _min( min ) , _max( max ) { } - Box(){} + Box() {} string toString() const { StringBuilder buf(64); buf << _min.toString() << " -->> " << _max.toString(); return buf.str(); } - + bool between( double min , double max , double val , double fudge=0) const { return val + fudge >= min && val <= max + fudge; } - + bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const { assert( amin <= amax ); assert( bmin <= bmax ); - if ( amin < bmin ){ + if ( amin < bmin ) { if ( amax < bmin ) return false; res = min ? bmin : amax; @@ -323,16 +368,16 @@ namespace mongo { } double intersects( const Box& other ) const { - + Point boundMin(0,0); Point boundMax(0,0); - + if ( mid( _min._x , _max._x , other._min._x , other._max._x , true , boundMin._x ) == false || - mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false || - mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false || - mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false ) + mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false || + mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false || + mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false ) return 0; - + Box intersection( boundMin , boundMax ); return intersection.area() / ( ( area() + other.area() ) / 2 ); @@ -347,45 +392,49 @@ namespace mongo { ( _min._y + _max._y ) / 2 ); } - bool inside( Point p , double fudge = 0 ){ + bool inside( Point p , double fudge = 0 ) { bool res = inside( p._x , p._y , fudge ); //cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl; return res; } - - bool inside( double x , double y , double fudge = 0 ){ - return + + bool inside( double x , double y , double fudge = 0 ) { + return between( _min._x , _max._x , x , fudge ) && between( _min._y , _max._y , y , fudge ); } - + + bool contains(const Box& other, double fudge=0) { + return inside(other._min, fudge) && inside(other._max, fudge); + } + Point _min; Point _max; }; - + class Geo2dPlugin : public IndexPlugin { public: - Geo2dPlugin() : IndexPlugin( GEO2DNAME ){ + Geo2dPlugin() : IndexPlugin( GEO2DNAME ) { } - + virtual IndexType* generate( const IndexSpec* spec ) const { return new Geo2dType( this , spec ); } } geo2dplugin; - + struct GeoUnitTest : public UnitTest { - 
- int round( double d ){ + + int round( double d ) { return (int)(.5+(d*1000)); } - + #define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); } - void run(){ + void run() { assert( ! GeoHash::isBitSet( 0 , 0 ) ); assert( ! GeoHash::isBitSet( 0 , 31 ) ); assert( GeoHash::isBitSet( 1 , 31 ) ); - + IndexSpec i( BSON( "loc" << "2d" ) ); Geo2dType g( &geo2dplugin , &i ); { @@ -411,7 +460,7 @@ namespace mongo { assert( round( in["x"].number() ) == round( out["x"].number() ) ); assert( round( in["y"].number() ) == round( out["y"].number() ) ); } - + { GeoHash h( "0000" ); h.move( 0 , 1 ); @@ -424,13 +473,13 @@ namespace mongo { GEOHEQ( h , "0100" ); h.move( 0 , -1 ); GEOHEQ( h , "0001" ); - + h.init( "0000" ); h.move( 1 , 0 ); GEOHEQ( h , "0010" ); } - + { Box b( 5 , 5 , 2 ); assert( "(5,5) -->> (7,7)" == b.toString() ); @@ -444,7 +493,7 @@ namespace mongo { b = g.hash( 42 , 44 ); assert( round(10) == round(g.distance( a , b )) ); } - + { GeoHash x("0000"); assert( 0 == x.getHash() ); @@ -454,7 +503,7 @@ namespace mongo { assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) ); assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) ); } - + { GeoHash x("1010"); GEOHEQ( x , "1010" ); @@ -462,8 +511,8 @@ namespace mongo { GEOHEQ( y , "101001" ); } - { - + { + GeoHash a = g.hash( 5 , 5 ); GeoHash b = g.hash( 5 , 7 ); GeoHash c = g.hash( 100 , 100 ); @@ -509,13 +558,13 @@ namespace mongo { assert( entry.hasPrefix( GeoHash( "1100" ) ) ); assert( entry.hasPrefix( prefix ) ); } - + { GeoHash a = g.hash( 50 , 50 ); GeoHash b = g.hash( 48 , 54 ); assert( round( 4.47214 ) == round( g.distance( a , b ) ) ); } - + { Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) ); @@ -534,7 +583,7 @@ namespace mongo { int N = 10000; { Timer t; - for ( int i=0; i 2469 && dist < 2470 ); } + { + Point BNA (-86.67, 36.12); + Point LAX (-118.40, 33.94); + Point JFK (-73.77694444, 40.63861111 ); + assert( spheredist_deg(BNA, BNA) < 1e-6); + assert( spheredist_deg(LAX, LAX) < 1e-6); + assert( spheredist_deg(JFK, JFK) < 1e-6); + + Point zero (0, 0); + Point antizero (0,-180); + + // these were known to cause NaN + assert( spheredist_deg(zero, zero) < 1e-6); + assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6); + assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6); + } } } } geoUnitTest; - + class GeoPoint { public: - GeoPoint(){ + GeoPoint() { } GeoPoint( const KeyNode& node , double distance ) - : _key( node.key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ) , _distance( distance ){ + : _key( node.key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ) , _distance( distance ) { } GeoPoint( const BSONObj& key , DiskLoc loc , double distance ) - : _key(key) , _loc(loc) , _o( loc.obj() ) , _distance( distance ){ + : _key(key) , _loc(loc) , _o( loc.obj() ) , _distance( distance ) { } bool operator<( const GeoPoint& other ) const { @@ -630,44 +695,44 @@ namespace mongo { public: GeoAccumulator( const Geo2dType * g , const BSONObj& filter ) : _g(g) , _lookedAt(0) , _objectsLoaded(0) , _found(0) { - if ( ! filter.isEmpty() ){ + if ( ! 
filter.isEmpty() ) { _matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) ); } } - virtual ~GeoAccumulator(){ + virtual ~GeoAccumulator() { } - virtual void add( const KeyNode& node ){ + virtual void add( const KeyNode& node ) { // when looking at other boxes, don't want to look at some object twice pair::iterator,bool> seenBefore = _seen.insert( node.recordLoc ); - if ( ! seenBefore.second ){ + if ( ! seenBefore.second ) { GEODEBUG( "\t\t\t\t already seen : " << node.recordLoc.obj()["_id"] ); return; } _lookedAt++; - + // distance check double d = 0; - if ( ! checkDistance( GeoHash( node.key.firstElement() ) , d ) ){ + if ( ! checkDistance( GeoHash( node.key.firstElement() ) , d ) ) { GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << d ); return; - } + } GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << d ); - + // matcher MatchDetails details; - if ( _matcher.get() ){ + if ( _matcher.get() ) { bool good = _matcher->matches( node.key , node.recordLoc , &details ); if ( details.loadedObject ) _objectsLoaded++; - - if ( ! good ){ + + if ( ! good ) { GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] ); return; } } - + if ( ! details.loadedObject ) // dont double count _objectsLoaded++; @@ -681,7 +746,7 @@ namespace mongo { long long found() const { return _found; } - + const Geo2dType * _g; set _seen; auto_ptr _matcher; @@ -690,82 +755,96 @@ namespace mongo { long long _objectsLoaded; long long _found; }; - + class GeoHopper : public GeoAccumulator { public: typedef multiset Holder; - GeoHopper( const Geo2dType * g , unsigned max , const GeoHash& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits::max() ) - : GeoAccumulator( g , filter ) , _max( max ) , _near( n ), _maxDistance( maxDistance ) { - _farthest = -1; - } + GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits::max() , GeoDistType type=GEO_PLAIN) + : GeoAccumulator( g , filter ) , _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _farthest(-1) + {} - virtual bool checkDistance( const GeoHash& h , double& d ){ - d = _g->distance( _near , h ); + virtual bool checkDistance( const GeoHash& h , double& d ) { + switch (_type) { + case GEO_PLAIN: + d = _near.distance( Point(_g, h) ); + break; + case GEO_SPHERE: + d = spheredist_deg(_near, Point(_g, h)); + break; + default: + assert(0); + } bool good = d < _maxDistance && ( _points.size() < _max || d < farthest() ); - GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near << "\t" << h << "\t" << d + GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString() << "\t" << h << "\t" << d << " ok: " << good << " farthest: " << farthest() ); return good; } - - virtual void addSpecific( const KeyNode& node , double d ){ + + virtual void addSpecific( const KeyNode& node , double d ) { GEODEBUG( "\t\t" << GeoHash( node.key.firstElement() ) << "\t" << node.recordLoc.obj() << "\t" << d ); _points.insert( GeoPoint( node.key , node.recordLoc , d ) ); - if ( _points.size() > _max ){ + if ( _points.size() > _max ) { _points.erase( --_points.end() ); - } - Holder::iterator i = _points.end(); - i--; - _farthest = i->_distance; + Holder::iterator i = _points.end(); + i--; + _farthest = i->_distance; + } + else { + if (d > _farthest) + _farthest = d; + } } double farthest() const { return _farthest; } + unsigned _max; - GeoHash _near; + Point _near; Holder _points; double _maxDistance; + GeoDistType _type; 
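        // Orientation note on the members above and below (a summary of addSpecific/checkDistance,
        // not additional behavior): _points is a multiset ordered by distance to _near; addSpecific
        // inserts every accepted candidate and, once more than _max are held, erases the current
        // farthest entry and refreshes _farthest from the new last element, so checkDistance can
        // prune any key whose distance already exceeds farthest() once the set is full.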
double _farthest; }; - + struct BtreeLocation { int pos; bool found; DiskLoc bucket; - - BSONObj key(){ + + BSONObj key() { if ( bucket.isNull() ) return BSONObj(); return bucket.btree()->keyNode( pos ).key; } - - bool hasPrefix( const GeoHash& hash ){ + + bool hasPrefix( const GeoHash& hash ) { BSONElement e = key().firstElement(); if ( e.eoo() ) return false; return GeoHash( e ).hasPrefix( hash ); } - - bool advance( int direction , int& totalFound , GeoAccumulator* all ){ + + bool advance( int direction , int& totalFound , GeoAccumulator* all ) { if ( bucket.isNull() ) return false; bucket = bucket.btree()->advance( bucket , pos , direction , "btreelocation" ); - + if ( all ) return checkCur( totalFound , all ); - + return ! bucket.isNull(); } - bool checkCur( int& totalFound , GeoAccumulator* all ){ + bool checkCur( int& totalFound , GeoAccumulator* all ) { if ( bucket.isNull() ) return false; - if ( bucket.btree()->isUsed(pos) ){ + if ( bucket.btree()->isUsed(pos) ) { totalFound++; all->add( bucket.btree()->keyNode( pos ) ); } @@ -776,51 +855,65 @@ namespace mongo { return true; } - string toString(){ + string toString() { stringstream ss; ss << "bucket: " << bucket.toString() << " pos: " << pos << " found: " << found; return ss.str(); } - static bool initial( const IndexDetails& id , const Geo2dType * spec , - BtreeLocation& min , BtreeLocation& max , + static bool initial( const IndexDetails& id , const Geo2dType * spec , + BtreeLocation& min , BtreeLocation& max , GeoHash start , - int & found , GeoAccumulator * hopper ) - { - + int & found , GeoAccumulator * hopper ) { + Ordering ordering = Ordering::make(spec->_order); - min.bucket = id.head.btree()->locate( id , id.head , start.wrap() , + min.bucket = id.head.btree()->locate( id , id.head , start.wrap() , ordering , min.pos , min.found , minDiskLoc ); - min.checkCur( found , hopper ); + if (hopper) min.checkCur( found , hopper ); max = min; - - if ( min.bucket.isNull() || ( !(hopper->found()) ) ){ - min.bucket = id.head.btree()->locate( id , id.head , start.wrap() , + + if ( min.bucket.isNull() || ( hopper && !(hopper->found()) ) ) { + min.bucket = id.head.btree()->locate( id , id.head , start.wrap() , ordering , min.pos , min.found , minDiskLoc , -1 ); - min.checkCur( found , hopper ); + if (hopper) min.checkCur( found , hopper ); } - + return ! min.bucket.isNull() || ! 
max.bucket.isNull(); } }; class GeoSearch { public: - GeoSearch( const Geo2dType * g , const GeoHash& n , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits::max() ) - : _spec( g ) , _n( n ) , _start( n ) , + GeoSearch( const Geo2dType * g , const GeoHash& n , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits::max() , GeoDistType type=GEO_PLAIN) + : _spec( g ) ,_startPt(g,n), _start( n ) , _numWanted( numWanted ) , _filter( filter ) , _maxDistance( maxDistance ) , - _hopper( new GeoHopper( g , numWanted , n , filter , maxDistance ) ) - { + _hopper( new GeoHopper( g , numWanted , _startPt , filter , maxDistance, type ) ), _type(type) { assert( g->getDetails() ); _nscanned = 0; _found = 0; + + if (type == GEO_PLAIN) { + _scanDistance = maxDistance; + } + else if (type == GEO_SPHERE) { + if (maxDistance == numeric_limits::max()) { + _scanDistance = maxDistance; + } + else { + //TODO: consider splitting into x and y scan distances + _scanDistance = computeXScanDistance(_startPt._y, rad2deg(maxDistance)); + } + } + else { + assert(0); + } } - - void exec(){ + + void exec() { const IndexDetails& id = *_spec->getDetails(); - - BtreeBucket * head = id.head.btree(); + + const BtreeBucket * head = id.head.btree(); assert( head ); /* * Search algorithm @@ -829,144 +922,185 @@ namespace mongo { * 3) find optimal set of boxes that complete circle * 4) use regular btree cursors to scan those boxes */ - + GeoHopper * hopper = _hopper.get(); _prefix = _start; - { // 1 regular geo hash algorithm - + BtreeLocation min,max; + { + // 1 regular geo hash algorithm + - BtreeLocation min,max; - if ( ! BtreeLocation::initial( id , _spec , min , max , _n , _found , hopper ) ) + if ( ! BtreeLocation::initial( id , _spec , min , max , _start , _found , NULL ) ) return; - - while ( _hopper->found() < _numWanted ){ + + while ( !_prefix.constrains() || // if next pass would cover universe, just keep going + ( _hopper->found() < _numWanted && _spec->sizeEdge( _prefix ) <= _scanDistance)) { GEODEBUG( _prefix << "\t" << _found << "\t DESC" ); - while ( min.hasPrefix( _prefix ) && min.advance( -1 , _found , hopper ) ) + while ( min.hasPrefix(_prefix) && min.checkCur(_found, hopper) && min.advance(-1, _found, NULL) ) _nscanned++; GEODEBUG( _prefix << "\t" << _found << "\t ASC" ); - while ( max.hasPrefix( _prefix ) && max.advance( 1 , _found , hopper ) ) + while ( max.hasPrefix(_prefix) && max.checkCur(_found, hopper) && max.advance(+1, _found, NULL) ) _nscanned++; - if ( ! _prefix.constrains() ) - break; + + if ( ! _prefix.constrains() ) { + GEODEBUG( "done search w/o part 2" ) + return; + } + + _alreadyScanned = Box(_spec, _prefix); _prefix = _prefix.up(); - - double temp = _spec->distance( _prefix , _start ); - if ( temp > ( _maxDistance * 2 ) ) - break; } } GEODEBUG( "done part 1" ); - if ( _found && _prefix.constrains() ){ + { // 2 - Point center( _spec , _n ); double farthest = hopper->farthest(); - // Phase 1 might not have found any points. 
- if (farthest == -1) - farthest = _spec->sizeDiag( _prefix ); - Box want( center._x - farthest , center._y - farthest , farthest * 2 ); - _prefix = _n; - while ( _spec->sizeEdge( _prefix ) < ( farthest / 2 ) ){ + GEODEBUGPRINT(hopper->farthest()); + if (hopper->found() < _numWanted) { + // Not enough found in Phase 1 + farthest = _scanDistance; + } + else if (_type == GEO_SPHERE) { + farthest = std::min(_scanDistance, computeXScanDistance(_startPt._y, rad2deg(farthest))); + } + GEODEBUGPRINT(farthest); + + Box want( _startPt._x - farthest , _startPt._y - farthest , farthest * 2 ); + GEODEBUGPRINT(want.toString()); + + _prefix = _start; + while (_prefix.constrains() && _spec->sizeEdge( _prefix ) < farthest ) { _prefix = _prefix.up(); } - - if ( logLevel > 0 ){ - log(1) << "want: " << want << " found:" << _found << " nscanned: " << _nscanned << " hash size:" << _spec->sizeEdge( _prefix ) + + PREFIXDEBUG(_prefix, _spec); + + if (_prefix.getBits() <= 1) { + // TODO consider walking in $natural order + + while ( min.checkCur(_found, hopper) && min.advance(-1, _found, NULL) ) + _nscanned++; + while ( max.checkCur(_found, hopper) && max.advance(+1, _found, NULL) ) + _nscanned++; + + GEODEBUG( "done search after scanning whole collection" ) + return; + } + + if ( logLevel > 0 ) { + log(1) << "want: " << want << " found:" << _found << " nscanned: " << _nscanned << " hash size:" << _spec->sizeEdge( _prefix ) << " farthest: " << farthest << " using box: " << Box( _spec , _prefix ).toString() << endl; } - - for ( int x=-1; x<=1; x++ ){ - for ( int y=-1; y<=1; y++ ){ + + for ( int x=-1; x<=1; x++ ) { + for ( int y=-1; y<=1; y++ ) { GeoHash toscan = _prefix; toscan.move( x , y ); - + // 3 & 4 doBox( id , want , toscan ); } } } GEODEBUG( "done search" ) - + } - void doBox( const IndexDetails& id , const Box& want , const GeoHash& toscan , int depth = 0 ){ + void doBox( const IndexDetails& id , const Box& want , const GeoHash& toscan , int depth = 0 ) { Box testBox( _spec , toscan ); - if ( logLevel > 2 ){ + if ( logLevel > 2 ) { cout << "\t"; for ( int i=0; i_error)) { + GEODEBUG("skipping box: already scanned"); + return; // been here, done this + } double intPer = testBox.intersects( want ); - - if ( intPer <= 0 ) + + if ( intPer <= 0 ) { + GEODEBUG("skipping box: not in want"); return; - + } + bool goDeeper = intPer < .5 && depth < 2; long long myscanned = 0; - + BtreeLocation loc; - loc.bucket = id.head.btree()->locate( id , id.head , toscan.wrap() , Ordering::make(_spec->_order) , - loc.pos , loc.found , minDiskLoc ); + loc.bucket = id.head.btree()->locate( id , id.head , toscan.wrap() , Ordering::make(_spec->_order) , + loc.pos , loc.found , minDiskLoc ); loc.checkCur( _found , _hopper.get() ); - while ( loc.hasPrefix( toscan ) && loc.advance( 1 , _found , _hopper.get() ) ){ + while ( loc.hasPrefix( toscan ) && loc.advance( 1 , _found , _hopper.get() ) ) { _nscanned++; - if ( ++myscanned > 100 && goDeeper ){ + if ( ++myscanned > 100 && goDeeper ) { doBox( id , want , toscan + "00" , depth + 1); doBox( id , want , toscan + "01" , depth + 1); doBox( id , want , toscan + "10" , depth + 1); doBox( id , want , toscan + "11" , depth + 1); - return; + return; } } - + } const Geo2dType * _spec; - GeoHash _n; + Point _startPt; GeoHash _start; GeoHash _prefix; int _numWanted; BSONObj _filter; double _maxDistance; + double _scanDistance; shared_ptr _hopper; long long _nscanned; int _found; + GeoDistType _type; + + Box _alreadyScanned; }; class GeoCursorBase : public Cursor { public: GeoCursorBase( const 
Geo2dType * spec ) - : _spec( spec ), _id( _spec->getDetails() ){ + : _spec( spec ), _id( _spec->getDetails() ) { } - virtual DiskLoc refLoc(){ return DiskLoc(); } + virtual DiskLoc refLoc() { return DiskLoc(); } virtual BSONObj indexKeyPattern() { return _spec->keyPattern(); } - virtual void noteLocation() { - assert(0); + virtual void noteLocation() { + // no-op since these are meant to be safe } /* called before query getmore block is iterated */ virtual void checkLocation() { - assert(0); + // no-op since these are meant to be safe } virtual bool supportGetMore() { return false; } virtual bool supportYields() { return false; } - virtual bool getsetdup(DiskLoc loc){ - return false; - } + virtual bool getsetdup(DiskLoc loc) { return false; } + virtual bool modifiedKeys() const { return true; } + virtual bool isMultiKey() const { return false; } + + const Geo2dType * _spec; const IndexDetails * _id; @@ -975,20 +1109,23 @@ namespace mongo { class GeoSearchCursor : public GeoCursorBase { public: GeoSearchCursor( shared_ptr s ) - : GeoCursorBase( s->_spec ) , - _s( s ) , _cur( s->_hopper->_points.begin() ) , _end( s->_hopper->_points.end() ) { + : GeoCursorBase( s->_spec ) , + _s( s ) , _cur( s->_hopper->_points.begin() ) , _end( s->_hopper->_points.end() ), _nscanned() { + if ( _cur != _end ) { + ++_nscanned; + } } - + virtual ~GeoSearchCursor() {} - - virtual bool ok(){ + + virtual bool ok() { return _cur != _end; } - - virtual Record* _current(){ assert(ok()); return _cur->_loc.rec(); } - virtual BSONObj current(){ assert(ok()); return _cur->_o; } - virtual DiskLoc currLoc(){ assert(ok()); return _cur->_loc; } - virtual bool advance(){ _cur++; return ok(); } + + virtual Record* _current() { assert(ok()); return _cur->_loc.rec(); } + virtual BSONObj current() { assert(ok()); return _cur->_o; } + virtual DiskLoc currLoc() { assert(ok()); return _cur->_loc; } + virtual bool advance() { _cur++; incNscanned(); return ok(); } virtual BSONObj currKey() const { return _cur->_key; } virtual string toString() { @@ -996,82 +1133,103 @@ namespace mongo { } - virtual BSONObj prettyStartKey() const { - return BSON( _s->_spec->_geo << _s->_prefix.toString() ); + virtual BSONObj prettyStartKey() const { + return BSON( _s->_spec->_geo << _s->_prefix.toString() ); } - virtual BSONObj prettyEndKey() const { + virtual BSONObj prettyEndKey() const { GeoHash temp = _s->_prefix; temp.move( 1 , 1 ); - return BSON( _s->_spec->_geo << temp.toString() ); + return BSON( _s->_spec->_geo << temp.toString() ); } + virtual long long nscanned() { return _nscanned; } shared_ptr _s; GeoHopper::Holder::iterator _cur; GeoHopper::Holder::iterator _end; + + void incNscanned() { if ( ok() ) { ++_nscanned; } } + long long _nscanned; }; class GeoBrowse : public GeoCursorBase , public GeoAccumulator { public: GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj() ) : GeoCursorBase( g ) ,GeoAccumulator( g , filter ) , - _type( type ) , _filter( filter ) , _firstCall(true) { + _type( type ) , _filter( filter ) , _firstCall(true), _nscanned() { } - + virtual string toString() { return (string)"GeoBrowse-" + _type; } - virtual bool ok(){ - if ( _firstCall ){ + virtual bool ok() { + bool first = _firstCall; + if ( _firstCall ) { fillStack(); _firstCall = false; } - if ( ! _cur.isEmpty() || _stack.size() ) + if ( ! _cur.isEmpty() || _stack.size() ) { + if ( first ) { + ++_nscanned; + } return true; + } - while ( moreToDo() ){ + while ( moreToDo() ) { fillStack(); - if ( ! _cur.isEmpty() ) + if ( ! 
_cur.isEmpty() ) { + if ( first ) { + ++_nscanned; + } return true; + } } - + return false; } - - virtual bool advance(){ + + virtual bool advance() { _cur._o = BSONObj(); - - if ( _stack.size() ){ + + if ( _stack.size() ) { _cur = _stack.front(); _stack.pop_front(); + ++_nscanned; return true; } - + if ( ! moreToDo() ) return false; - + while ( _cur.isEmpty() && moreToDo() ) fillStack(); - return ! _cur.isEmpty(); + return ! _cur.isEmpty() && ++_nscanned; } - - virtual Record* _current(){ assert(ok()); return _cur._loc.rec(); } - virtual BSONObj current(){ assert(ok()); return _cur._o; } - virtual DiskLoc currLoc(){ assert(ok()); return _cur._loc; } + + virtual Record* _current() { assert(ok()); return _cur._loc.rec(); } + virtual BSONObj current() { assert(ok()); return _cur._o; } + virtual DiskLoc currLoc() { assert(ok()); return _cur._loc; } virtual BSONObj currKey() const { return _cur._key; } virtual bool moreToDo() = 0; virtual void fillStack() = 0; - virtual void addSpecific( const KeyNode& node , double d ){ + virtual void addSpecific( const KeyNode& node , double d ) { if ( _cur.isEmpty() ) _cur = GeoPoint( node , d ); else _stack.push_back( GeoPoint( node , d ) ); } + virtual long long nscanned() { + if ( _firstCall ) { + ok(); + } + return _nscanned; + } + string _type; BSONObj _filter; list _stack; @@ -1079,25 +1237,28 @@ namespace mongo { GeoPoint _cur; bool _firstCall; + long long _nscanned; + }; class GeoCircleBrowse : public GeoBrowse { public: - + enum State { - START , + START , DOING_EXPAND , DOING_AROUND , DONE } _state; - GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() ) - : GeoBrowse( g , "circle" , filter ){ - + GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center") + : GeoBrowse( g , "circle" , filter ) { + uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 ); BSONObjIterator i(circle); - _startPt = Point(i.next()); - _start = _startPt.hash(g); + BSONElement center = i.next(); + _start = g->_tohash(center); + _startPt = Point(center); _prefix = _start; _maxDistance = i.next().numberDouble(); uassert( 13061 , "need a max distance > 0 " , _maxDistance > 0 ); @@ -1106,17 +1267,42 @@ namespace mongo { _state = START; _found = 0; + if (type == "$center") { + _type = GEO_PLAIN; + _xScanDistance = _maxDistance; + _yScanDistance = _maxDistance; + } + else if (type == "$centerSphere") { + uassert(13461, "Spherical MaxDistance > PI. Are you sure you are using radians?", _maxDistance < M_PI); + + _type = GEO_SPHERE; + _yScanDistance = rad2deg(_maxDistance); + _xScanDistance = computeXScanDistance(_startPt._y, _yScanDistance); + + uassert(13462, "Spherical distance would require wrapping, which isn't implemented yet", + (_startPt._x + _xScanDistance < 180) && (_startPt._x - _xScanDistance > -180) && + (_startPt._y + _yScanDistance < 90) && (_startPt._y - _yScanDistance > -90)); + + GEODEBUGPRINT(_maxDistance); + GEODEBUGPRINT(_xScanDistance); + GEODEBUGPRINT(_yScanDistance); + } + else { + uassert(13460, "invalid $center query type: " + type, false); + } + ok(); } - virtual bool moreToDo(){ + virtual bool moreToDo() { return _state != DONE; } - - virtual void fillStack(){ - if ( _state == START ){ - if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , - _prefix , _found , this ) ){ + + virtual void fillStack() { + + if ( _state == START ) { + if ( ! 
BtreeLocation::initial( *_id , _spec , _min , _max , + _prefix , _found , this ) ) { _state = DONE; return; } @@ -1124,10 +1310,10 @@ namespace mongo { } - if ( _state == DOING_AROUND ){ + if ( _state == DOING_AROUND ) { // TODO could rework and return rather than looping - for (int i=-1; i<=1; i++){ - for (int j=-1; j<=1; j++){ + for (int i=-1; i<=1; i++) { + for (int j=-1; j<=1; j++) { if (i == 0 && j == 0) continue; // main box @@ -1135,10 +1321,11 @@ namespace mongo { newBox.move(i, j); PREFIXDEBUG(newBox, _g); - if (needToCheckBox(newBox)){ + if (needToCheckBox(newBox)) { // TODO consider splitting into quadrants getPointsForPrefix(newBox); - } else { + } + else { GEODEBUG("skipping box"); } } @@ -1147,20 +1334,19 @@ namespace mongo { _state = DONE; return; } - - if (_state == DOING_EXPAND){ + + if (_state == DOING_EXPAND) { GEODEBUG( "circle prefix [" << _prefix << "]" ); PREFIXDEBUG(_prefix, _g); while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) ); while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) ); - if ( ! _prefix.constrains() ){ + if ( ! _prefix.constrains() ) { GEODEBUG( "\t exhausted the btree" ); _state = DONE; return; } - Point ll (_g, _prefix); GeoHash trHash = _prefix; @@ -1168,50 +1354,52 @@ namespace mongo { Point tr (_g, trHash); double sideLen = fabs(tr._x - ll._x); - if (sideLen > _maxDistance){ // circle must be contained by surrounding squares - if ( (ll._x + _maxDistance < _startPt._x && ll._y + _maxDistance < _startPt._y) && - (tr._x - _maxDistance > _startPt._x && tr._y - _maxDistance > _startPt._y) ) - { + if (sideLen > std::max(_xScanDistance, _yScanDistance)) { // circle must be contained by surrounding squares + if ( (ll._x + _xScanDistance < _startPt._x && ll._y + _yScanDistance < _startPt._y) && + (tr._x - _xScanDistance > _startPt._x && tr._y - _yScanDistance > _startPt._y) ) { GEODEBUG("square fully contains circle"); _state = DONE; - } else if (_prefix.getBits() > 1){ + } + else if (_prefix.getBits() > 1) { GEODEBUG("checking surrounding squares"); _state = DOING_AROUND; - } else { + } + else { GEODEBUG("using simple search"); _prefix = _prefix.up(); } - } else { + } + else { _prefix = _prefix.up(); } return; } - + /* Clients are expected to use moreToDo before calling * fillStack, so DONE is checked for there. If any more * State values are defined, you should handle them - * here. */ + * here. */ assert(0); } - bool needToCheckBox(const GeoHash& prefix){ + bool needToCheckBox(const GeoHash& prefix) { Point ll (_g, prefix); - if (fabs(ll._x - _startPt._x) <= _maxDistance) return true; - if (fabs(ll._y - _startPt._y) <= _maxDistance) return true; + if (fabs(ll._x - _startPt._x) <= _xScanDistance) return true; + if (fabs(ll._y - _startPt._y) <= _yScanDistance) return true; - GeoHash trHash = _prefix; + GeoHash trHash = prefix; trHash.move( 1 , 1 ); Point tr (_g, trHash); - if (fabs(tr._x - _startPt._x) <= _maxDistance) return true; - if (fabs(tr._y - _startPt._y) <= _maxDistance) return true; + if (fabs(tr._x - _startPt._x) <= _xScanDistance) return true; + if (fabs(tr._y - _startPt._y) <= _yScanDistance) return true; return false; } - void getPointsForPrefix(const GeoHash& prefix){ - if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ){ + void getPointsForPrefix(const GeoHash& prefix) { + if ( ! 
BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ) { return; } @@ -1219,37 +1407,50 @@ namespace mongo { while ( _max.hasPrefix( prefix ) && _max.advance( 1 , _found , this ) ); } - - virtual bool checkDistance( const GeoHash& h , double& d ){ - d = _g->distance( _start , h ); + + virtual bool checkDistance( const GeoHash& h , double& d ) { + switch (_type) { + case GEO_PLAIN: + d = _g->distance( _start , h ); + break; + case GEO_SPHERE: + d = spheredist_deg(_startPt, Point(_g, h)); + break; + default: + assert(0); + } + GEODEBUG( "\t " << h << "\t" << d ); return d <= _maxDistance; } + GeoDistType _type; GeoHash _start; Point _startPt; - double _maxDistance; - + double _maxDistance; // user input + double _xScanDistance; // effected by GeoDistType + double _yScanDistance; // effected by GeoDistType + int _found; - - GeoHash _prefix; + + GeoHash _prefix; BtreeLocation _min; BtreeLocation _max; - }; + }; class GeoBoxBrowse : public GeoBrowse { public: - + enum State { - START , + START , DOING_EXPAND , DONE } _state; - GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj() ) - : GeoBrowse( g , "box" , filter ){ - + GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj() ) + : GeoBrowse( g , "box" , filter ) { + uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 ); BSONObjIterator i(box); _bl = g->_tohash( i.next() ); @@ -1265,7 +1466,7 @@ namespace mongo { Point center = _want.center(); _prefix = _g->hash( center._x , center._y ); - + GEODEBUG( "center : " << center.toString() << "\t" << _prefix ); { @@ -1280,42 +1481,43 @@ namespace mongo { ok(); } - virtual bool moreToDo(){ + virtual bool moreToDo() { return _state != DONE; } - - virtual void fillStack(){ - if ( _state == START ){ - if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , - _prefix , _found , this ) ){ + virtual void fillStack() { + if ( _state == START ) { + + if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , + _prefix , _found , this ) ) { _state = DONE; return; } _state = DOING_EXPAND; } - - if ( _state == DOING_EXPAND ){ + + if ( _state == DOING_EXPAND ) { int started = _found; - while ( started == _found || _state == DONE ){ + while ( started == _found || _state == DONE ) { GEODEBUG( "box prefix [" << _prefix << "]" ); while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) ); while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) ); - + if ( _state == DONE ) return; - if ( ! _prefix.constrains() ){ + if ( ! _prefix.constrains() ) { GEODEBUG( "box exhausted" ); _state = DONE; return; } - if (_g->sizeEdge(_prefix) < _wantLen){ + if (_g->sizeEdge(_prefix) < _wantLen) { _prefix = _prefix.up(); - } else { - for (int i=-1; i<=1; i++){ - for (int j=-1; j<=1; j++){ + } + else { + for (int i=-1; i<=1; i++) { + for (int j=-1; j<=1; j++) { if (i == 0 && j == 0) continue; // main box @@ -1326,36 +1528,37 @@ namespace mongo { PREFIXDEBUG(newBox, _g); Box cur( _g , newBox ); - if (_want.intersects(cur)){ + if (_want.intersects(cur)) { // TODO consider splitting into quadrants getPointsForPrefix(newBox); - } else { + } + else { GEODEBUG("skipping box"); } } } _state = DONE; } - + } return; } } - void getPointsForPrefix(const GeoHash& prefix){ - if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ){ + void getPointsForPrefix(const GeoHash& prefix) { + if ( ! 
BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ) { return; } while ( _min.hasPrefix( prefix ) && _min.advance( -1 , _found , this ) ); while ( _max.hasPrefix( prefix ) && _max.advance( 1 , _found , this ) ); } - - virtual bool checkDistance( const GeoHash& h , double& d ){ + + virtual bool checkDistance( const GeoHash& h , double& d ) { bool res = _want.inside( Point( _g , h ) , _fudge ); - GEODEBUG( "\t want : " << _want.toString() - << " point: " << Point( _g , h ).toString() + GEODEBUG( "\t want : " << _want.toString() + << " point: " << Point( _g , h ).toString() << " in : " << res ); return res; } @@ -1366,23 +1569,23 @@ namespace mongo { double _wantLen; int _found; - - GeoHash _prefix; + + GeoHash _prefix; BtreeLocation _min; BtreeLocation _max; double _fudge; - }; + }; shared_ptr Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const { if ( numWanted < 0 ) numWanted = numWanted * -1; else if ( numWanted == 0 ) - numWanted = 100; - + numWanted = 100; + BSONObjIterator i(query); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); if ( _geo != e.fieldName() ) @@ -1390,13 +1593,27 @@ namespace mongo { if ( e.type() != Object ) continue; - - switch ( e.embeddedObject().firstElement().getGtLtOp() ){ + + switch ( e.embeddedObject().firstElement().getGtLtOp() ) { case BSONObj::opNEAR: { BSONObj n = e.embeddedObject(); e = n.firstElement(); + + const char* suffix = e.fieldName() + 5; // strlen("$near") == 5; + GeoDistType type; + if (suffix[0] == '\0') { + type = GEO_PLAIN; + } + else if (strcmp(suffix, "Sphere") == 0) { + type = GEO_SPHERE; + } + else { + uassert(13464, string("invalid $near search type: ") + e.fieldName(), false); + type = GEO_PLAIN; // prevents uninitialized warning + } + double maxDistance = numeric_limits::max(); - if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ){ + if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ) { BSONObjIterator i(e.embeddedObject()); i.next(); i.next(); @@ -1409,32 +1626,30 @@ namespace mongo { if ( e.isNumber() ) maxDistance = e.numberDouble(); } - shared_ptr s( new GeoSearch( this , _tohash(e) , numWanted , query , maxDistance ) ); + shared_ptr s( new GeoSearch( this , _tohash(e) , numWanted , query , maxDistance, type ) ); s->exec(); shared_ptr c; c.reset( new GeoSearchCursor( s ) ); - return c; + return c; } case BSONObj::opWITHIN: { e = e.embeddedObject().firstElement(); uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() ); e = e.embeddedObject().firstElement(); string type = e.fieldName(); - if ( type == "$center" ){ + if ( startsWith(type, "$center") ) { uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() ); - shared_ptr c; - c.reset( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query ) ); - return c; + shared_ptr c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type) ); + return c; } - else if ( type == "$box" ){ + else if ( type == "$box" ) { uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() ); - shared_ptr c; - c.reset( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query ) ); - return c; + shared_ptr c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query ) ); + return c; } throw UserException( 13058 , (string)"unknown $with type: " + type ); } - default: + default: break; } } @@ -1448,41 +1663,41 @@ namespace mongo { class Geo2dFindNearCmd : public Command { public: - Geo2dFindNearCmd() : Command( "geoNear" ){} - 
virtual LockType locktype() const { return READ; } + Geo2dFindNearCmd() : Command( "geoNear" ) {} + virtual LockType locktype() const { return READ; } bool slaveOk() const { return true; } void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; } bool slaveOverrideOk() { return true; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string ns = dbname + "." + cmdObj.firstElement().valuestr(); NamespaceDetails * d = nsdetails( ns.c_str() ); - if ( ! d ){ + if ( ! d ) { errmsg = "can't find ns"; return false; } vector idxs; d->findIndexByType( GEO2DNAME , idxs ); - - if ( idxs.size() > 1 ){ + + if ( idxs.size() > 1 ) { errmsg = "more than 1 geo indexes :("; return false; } - - if ( idxs.size() == 0 ){ + + if ( idxs.size() == 0 ) { errmsg = "no geo index :("; return false; } int geoIdx = idxs[0]; - + result.append( "ns" , ns ); IndexDetails& id = d->idx( geoIdx ); Geo2dType * g = (Geo2dType*)id.getSpec().getType(); assert( &id == g->getDetails() ); - + int numWanted = 100; if ( cmdObj["num"].isNumber() ) numWanted = cmdObj["num"].numberInt(); @@ -1499,37 +1714,41 @@ namespace mongo { if ( cmdObj["maxDistance"].isNumber() ) maxDistance = cmdObj["maxDistance"].number(); - GeoSearch gs( g , n , numWanted , filter , maxDistance ); + GeoDistType type = GEO_PLAIN; + if ( cmdObj["spherical"].trueValue() ) + type = GEO_SPHERE; + + GeoSearch gs( g , n , numWanted , filter , maxDistance , type); - if ( cmdObj["start"].type() == String){ + if ( cmdObj["start"].type() == String) { GeoHash start ((string) cmdObj["start"].valuestr()); gs._start = start; } - + gs.exec(); double distanceMultiplier = 1; if ( cmdObj["distanceMultiplier"].isNumber() ) distanceMultiplier = cmdObj["distanceMultiplier"].number(); - + double totalDistance = 0; BSONObjBuilder arr( result.subarrayStart( "results" ) ); int x = 0; - for ( GeoHopper::Holder::iterator i=gs._hopper->_points.begin(); i!=gs._hopper->_points.end(); i++ ){ + for ( GeoHopper::Holder::iterator i=gs._hopper->_points.begin(); i!=gs._hopper->_points.end(); i++ ) { const GeoPoint& p = *i; - + double dis = distanceMultiplier * p._distance; totalDistance += dis; - - BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ).c_str() ) ); + + BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ) ) ); bb.append( "dis" , dis ); bb.append( "obj" , p._o ); bb.done(); } arr.done(); - + BSONObjBuilder stats( result.subobjStart( "stats" ) ); stats.append( "time" , cc().curop()->elapsedMillis() ); stats.appendNumber( "btreelocs" , gs._nscanned ); @@ -1538,23 +1757,23 @@ namespace mongo { stats.append( "avgDistance" , totalDistance / x ); stats.append( "maxDistance" , gs._hopper->farthest() ); stats.done(); - + return true; } - + } geo2dFindNearCmd; class GeoWalkCmd : public Command { public: - GeoWalkCmd() : Command( "geoWalk" ){} - virtual LockType locktype() const { return READ; } + GeoWalkCmd() : Command( "geoWalk" ) {} + virtual LockType locktype() const { return READ; } bool slaveOk() const { return true; } bool slaveOverrideOk() { return true; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string ns = dbname + "." 
+ cmdObj.firstElement().valuestr(); NamespaceDetails * d = nsdetails( ns.c_str() ); - if ( ! d ){ + if ( ! d ) { errmsg = "can't find ns"; return false; } @@ -1562,10 +1781,10 @@ namespace mongo { int geoIdx = -1; { NamespaceDetails::IndexIterator ii = d->ii(); - while ( ii.more() ){ + while ( ii.more() ) { IndexDetails& id = ii.next(); - if ( id.getSpec().getTypeName() == GEO2DNAME ){ - if ( geoIdx >= 0 ){ + if ( id.getSpec().getTypeName() == GEO2DNAME ) { + if ( geoIdx >= 0 ) { errmsg = "2 geo indexes :("; return false; } @@ -1573,12 +1792,12 @@ namespace mongo { } } } - - if ( geoIdx < 0 ){ + + if ( geoIdx < 0 ) { errmsg = "no geo index :("; return false; } - + IndexDetails& id = d->idx( geoIdx ); Geo2dType * g = (Geo2dType*)id.getSpec().getType(); @@ -1587,12 +1806,12 @@ namespace mongo { int max = 100000; BtreeCursor c( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 ); - while ( c.ok() && max-- ){ + while ( c.ok() && max-- ) { GeoHash h( c.currKey().firstElement() ); int len; cout << "\t" << h.toString() - << "\t" << c.current()[g->_geo] - << "\t" << hex << h.getHash() + << "\t" << c.current()[g->_geo] + << "\t" << hex << h.getHash() << "\t" << hex << ((long long*)c.currKey().firstElement().binData(len))[0] << "\t" << c.current()["_id"] << endl; @@ -1601,7 +1820,7 @@ namespace mongo { return true; } - + } geoWalkCmd; } diff --git a/db/geo/core.h b/db/geo/core.h index 13f3636..602b513 100644 --- a/db/geo/core.h +++ b/db/geo/core.h @@ -31,23 +31,23 @@ namespace mongo { class GeoBitSets { public: - GeoBitSets(){ - for ( int i=0; i<32; i++ ){ + GeoBitSets() { + for ( int i=0; i<32; i++ ) { masks32[i] = ( 1 << ( 31 - i ) ); } - for ( int i=0; i<64; i++ ){ + for ( int i=0; i<64; i++ ) { masks64[i] = ( 1LL << ( 63 - i ) ); } - - for ( unsigned i=0; i<16; i++ ){ + + for ( unsigned i=0; i<16; i++ ) { unsigned fixed = 0; - for ( int j=0; j<4; j++ ){ + for ( int j=0; j<4; j++ ) { if ( i & ( 1 << j ) ) fixed |= ( 1 << ( j * 2 ) ); } hashedToNormal[fixed] = i; } - + } int masks32[32]; long long masks64[64]; @@ -56,24 +56,24 @@ namespace mongo { }; extern GeoBitSets geoBitSets; - + class GeoHash { public: GeoHash() - : _hash(0),_bits(0){ + : _hash(0),_bits(0) { } - explicit GeoHash( const char * hash ){ + explicit GeoHash( const char * hash ) { init( hash ); } - explicit GeoHash( const string& hash ){ + explicit GeoHash( const string& hash ) { init( hash ); } - explicit GeoHash( const BSONElement& e , unsigned bits=32 ){ + explicit GeoHash( const BSONElement& e , unsigned bits=32 ) { _bits = bits; - if ( e.type() == BinData ){ + if ( e.type() == BinData ) { int len = 0; _copy( (char*)&_hash , e.binData( len ) ); assert( len == 8 ); @@ -85,26 +85,26 @@ namespace mongo { } _fix(); } - - GeoHash( unsigned x , unsigned y , unsigned bits=32){ + + GeoHash( unsigned x , unsigned y , unsigned bits=32) { init( x , y , bits ); } - GeoHash( const GeoHash& old ){ + GeoHash( const GeoHash& old ) { _hash = old._hash; _bits = old._bits; } GeoHash( long long hash , unsigned bits ) - : _hash( hash ) , _bits( bits ){ + : _hash( hash ) , _bits( bits ) { _fix(); } - void init( unsigned x , unsigned y , unsigned bits ){ + void init( unsigned x , unsigned y , unsigned bits ) { assert( bits <= 32 ); _hash = 0; _bits = bits; - for ( unsigned i=0; i> (64-(other._bits*2)); return x == 0; } - - string toString() const { + + string toString() const { StringBuilder buf( _bits * 2 ); for ( unsigned x=0; x<_bits*2; x++ ) buf.append( _hash & geoBitSets.masks64[x] ? 
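/* Illustrative sketch, not part of the patch: judging from the masks32/masks64 tables
   above, GeoHash::init(x, y, bits) interleaves the top `bits` bits of x and y into a
   64-bit value, x on the even bit positions from the top (masks64[2*i]) and y on the
   odd ones (masks64[2*i + 1]). A simplified standalone version, without the trailing
   _fix() masking. */

#include <cstdint>

uint64_t interleaveTopBits(uint32_t x, uint32_t y, unsigned bits /* <= 32 */) {
    uint64_t hash = 0;
    for (unsigned i = 0; i < bits; i++) {
        if (x & (1u << (31 - i)))           // masks32[i]
            hash |= 1ULL << (63 - 2 * i);   // masks64[2*i]
        if (y & (1u << (31 - i)))
            hash |= 1ULL << (62 - 2 * i);   // masks64[2*i + 1]
    }
    return hash;
}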
"1" : "0" ); @@ -172,7 +172,7 @@ namespace mongo { return ss.str(); } - void init( const string& s ){ + void init( const string& s ) { _hash = 0; _bits = s.size() / 2; for ( unsigned pos=0; pos 0; } - - void move( int x , int y ){ + + void move( int x , int y ) { assert( _bits ); _move( 0 , x ); _move( 1 , y ); } - void _move( unsigned offset , int d ){ + void _move( unsigned offset , int d ) { if ( d == 0 ) return; assert( d <= 1 && d>= -1 ); // TEMP - + bool from, to; - if ( d > 0 ){ + if ( d > 0 ) { from = 0; to = 1; } @@ -238,34 +238,34 @@ namespace mongo { unsigned pos = ( _bits * 2 ) - 1; if ( offset == 0 ) pos--; - while ( true ){ - if ( getBit(pos) == from ){ + while ( true ) { + if ( getBit(pos) == from ) { setBit( pos , to ); return; } - if ( pos < 2 ){ + if ( pos < 2 ) { // overflow - for ( ; pos < ( _bits * 2 ) ; pos += 2 ){ + for ( ; pos < ( _bits * 2 ) ; pos += 2 ) { setBit( pos , from ); } return; } - + setBit( pos , from ); pos -= 2; } - + assert(0); } - GeoHash& operator=(const GeoHash& h) { + GeoHash& operator=(const GeoHash& h) { _hash = h._hash; _bits = h._bits; return *this; } - - bool operator==(const GeoHash& h ){ + + bool operator==(const GeoHash& h ) { return _hash == h._hash && _bits == h._bits; } @@ -273,7 +273,7 @@ namespace mongo { unsigned pos = _bits * 2; _bits += strlen(s) / 2; assert( _bits <= 32 ); - while ( s[0] ){ + while ( s[0] ) { if ( s[0] == '1' ) setBit( pos , 1 ); pos++; @@ -288,19 +288,19 @@ namespace mongo { n+=s; return n; } - - void _fix(){ + + void _fix() { static long long FULL = 0xFFFFFFFFFFFFFFFFLL; long long mask = FULL << ( 64 - ( _bits * 2 ) ); _hash &= mask; } - + void append( BSONObjBuilder& b , const char * name ) const { char buf[8]; _copy( buf , (char*)&_hash ); b.appendBinData( name , 8 , bdtCustom , buf ); } - + long long getHash() const { return _hash; } @@ -311,9 +311,9 @@ namespace mongo { GeoHash commonPrefix( const GeoHash& other ) const { unsigned i=0; - for ( ; i<_bits && iunhash( hash , _x , _y ); } - - explicit Point( const BSONElement& e ){ + + explicit Point( const BSONElement& e ) { BSONObjIterator i(e.Obj()); _x = i.next().number(); _y = i.next().number(); } - explicit Point( const BSONObj& o ){ + explicit Point( const BSONObj& o ) { BSONObjIterator i(o); _x = i.next().number(); _y = i.next().number(); } Point( double x , double y ) - : _x( x ) , _y( y ){ + : _x( x ) , _y( y ) { } - - Point() : _x(0),_y(0){ + + Point() : _x(0),_y(0) { } - GeoHash hash( const GeoConvert * g ){ + GeoHash hash( const GeoConvert * g ) { return g->hash( _x , _y ); } @@ -380,12 +380,12 @@ namespace mongo { double b = _y - p._y; return sqrt( ( a * a ) + ( b * b ) ); } - + string toString() const { StringBuilder buf(32); buf << "(" << _x << "," << _y << ")"; return buf.str(); - + } double _x; @@ -393,8 +393,11 @@ namespace mongo { }; - extern double EARTH_RADIUS_KM; - extern double EARTH_RADIUS_MILES; + extern const double EARTH_RADIUS_KM; + extern const double EARTH_RADIUS_MILES; + + inline double deg2rad(double deg) { return deg * (M_PI/180); } + inline double rad2deg(double rad) { return rad * (180/M_PI); } // WARNING: _x and _y MUST be longitude and latitude in that order // note: multiply by earth radius for distance @@ -407,20 +410,26 @@ namespace mongo { double sin_y1(sin(p1._y)), cos_y1(cos(p1._y)); double sin_x2(sin(p2._x)), cos_x2(cos(p2._x)); double sin_y2(sin(p2._y)), cos_y2(cos(p2._y)); - - double cross_prod = + + double cross_prod = (cos_y1*cos_x1 * cos_y2*cos_x2) + (cos_y1*sin_x1 * cos_y2*sin_x2) + (sin_y1 * sin_y2); + if 
(cross_prod >= 1 || cross_prod <= -1) { + // fun with floats + assert( fabs(cross_prod)-1 < 1e-6 ); + return cross_prod > 0 ? 0 : M_PI; + } + return acos(cross_prod); } // note: return is still in radians as that can be multiplied by radius to get arc length inline double spheredist_deg( const Point& p1, const Point& p2 ) { return spheredist_rad( - Point( p1._x * (M_PI/180), p1._y * (M_PI/180)), - Point( p2._x * (M_PI/180), p2._y * (M_PI/180)) + Point( deg2rad(p1._x), deg2rad(p1._y) ), + Point( deg2rad(p2._x), deg2rad(p2._y) ) ); } diff --git a/db/geo/haystack.cpp b/db/geo/haystack.cpp index 4a1d4a7..7f278ca 100644 --- a/db/geo/haystack.cpp +++ b/db/geo/haystack.cpp @@ -17,14 +17,14 @@ */ #include "pch.h" -#include "../namespace.h" +#include "../namespace-inl.h" #include "../jsobj.h" #include "../index.h" #include "../../util/unittest.h" #include "../commands.h" #include "../pdfile.h" #include "../btree.h" -#include "../curop.h" +#include "../curop-inl.h" #include "../matcher.h" #include "core.h" @@ -38,29 +38,29 @@ * should not be used for finding the closest restaurants that are open */ namespace mongo { - + string GEOSEARCHNAME = "geoHaystack"; - + class GeoHaystackSearchHopper { public: GeoHaystackSearchHopper( const BSONObj& n , double maxDistance , unsigned limit , const string& geoField ) - : _near( n ) , _maxDistance( maxDistance ) , _limit( limit ) , _geoField(geoField){ - + : _near( n ) , _maxDistance( maxDistance ) , _limit( limit ) , _geoField(geoField) { + } - - void got( const DiskLoc& loc ){ + + void got( const DiskLoc& loc ) { Point p( loc.obj().getFieldDotted( _geoField ) ); if ( _near.distance( p ) > _maxDistance ) return; _locs.push_back( loc ); } - int append( BSONArrayBuilder& b ){ + int append( BSONArrayBuilder& b ) { for ( unsigned i=0; i<_locs.size() && i<_limit; i++ ) b.append( _locs[i].obj() ); return _locs.size(); } - + Point _near; double _maxDistance; unsigned _limit; @@ -70,22 +70,22 @@ namespace mongo { }; class GeoHaystackSearchIndex : public IndexType { - + public: - + GeoHaystackSearchIndex( const IndexPlugin* plugin , const IndexSpec* spec ) - : IndexType( plugin , spec ){ - + : IndexType( plugin , spec ) { + BSONElement e = spec->info["bucketSize"]; uassert( 13321 , "need bucketSize" , e.isNumber() ); _bucketSize = e.numberDouble(); - + BSONObjBuilder orderBuilder; - + BSONObjIterator i( spec->keyPattern ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); - if ( e.type() == String && GEOSEARCHNAME == e.valuestr() ){ + if ( e.type() == String && GEOSEARCHNAME == e.valuestr() ) { uassert( 13314 , "can't have 2 geo fields" , _geo.size() == 0 ); uassert( 13315 , "2d has to be first in index" , _other.size() == 0 ); _geo = e.fieldName(); @@ -95,13 +95,13 @@ namespace mongo { } orderBuilder.append( "" , 1 ); } - + uassert( 13316 , "no geo field specified" , _geo.size() ); uassert( 13317 , "no other fields specified" , _other.size() ); uassert( 13326 , "quadrant search can only have 1 other field for now" , _other.size() == 1 ); _order = orderBuilder.obj(); } - + int hash( const BSONElement& e ) const { uassert( 13322 , "not a number" , e.isNumber() ); return hash( e.numberDouble() ); @@ -126,18 +126,18 @@ namespace mongo { buf.appendNull( "" ); else buf.appendAs( e , "" ); - + BSONObj key = buf.obj(); GEOQUADDEBUG( obj << "\n\t" << root << "\n\t" << key ); keys.insert( key ); } void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const { - + BSONElement loc = obj.getFieldDotted( _geo ); if ( loc.eoo() ) return; - + uassert( 
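/* Illustrative sketch, not part of the patch: the clamp added above protects acos()
   from cross products that drift marginally outside [-1, 1] through floating-point
   error. A standalone version of the same spherical-law-of-cosines distance; the
   result is in radians, and multiplying by an earth radius (for example the
   EARTH_RADIUS_KM constant declared above) gives arc length. */

#include <cassert>
#include <cmath>

struct PointRad { double x, y; };   // longitude, latitude, already in radians

double sphereDistRad(const PointRad& p1, const PointRad& p2) {
    double cross_prod =
        cos(p1.y) * cos(p1.x) * cos(p2.y) * cos(p2.x) +
        cos(p1.y) * sin(p1.x) * cos(p2.y) * sin(p2.x) +
        sin(p1.y) * sin(p2.y);
    if (cross_prod >= 1 || cross_prod <= -1) {
        assert(fabs(cross_prod) - 1 < 1e-6);   // should only be out of range by rounding noise
        return cross_prod > 0 ? 0 : M_PI;
    }
    return acos(cross_prod);
}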
13323 , "latlng not an array" , loc.isABSONObj() ); string root; { @@ -146,34 +146,34 @@ namespace mongo { BSONElement y = i.next(); root = makeString( hash(x) , hash(y) ); } - - + + assert( _other.size() == 1 ); - + BSONElementSet all; obj.getFieldsDotted( _other[0] , all ); - - if ( all.size() == 0 ){ + + if ( all.size() == 0 ) { _add( obj , root , BSONElement() , keys ); } else { - for ( BSONElementSet::iterator i=all.begin(); i!=all.end(); ++i ){ + for ( BSONElementSet::iterator i=all.begin(); i!=all.end(); ++i ) { _add( obj , root , *i , keys ); } } - + } - + shared_ptr newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const { shared_ptr c; assert(0); return c; } - - void searchCommand( NamespaceDetails* nsd , int idxNo , - const BSONObj& n /*near*/ , double maxDistance , const BSONObj& search , - BSONObjBuilder& result , unsigned limit ){ - + + void searchCommand( NamespaceDetails* nsd , int idxNo , + const BSONObj& n /*near*/ , double maxDistance , const BSONObj& search , + BSONObjBuilder& result , unsigned limit ) { + Timer t; log(1) << "SEARCH near:" << n << " maxDistance:" << maxDistance << " search: " << search << endl; @@ -184,33 +184,33 @@ namespace mongo { y = hash( i.next() ); } int scale = (int)ceil( maxDistance / _bucketSize ); - + GeoHaystackSearchHopper hopper(n,maxDistance,limit,_geo); - + long long btreeMatches = 0; - for ( int a=-scale; a<=scale; a++ ){ - for ( int b=-scale; b<=scale; b++ ){ + for ( int a=-scale; a<=scale; a++ ) { + for ( int b=-scale; b<=scale; b++ ) { BSONObjBuilder bb; bb.append( "" , makeString( x + a , y + b ) ); - for ( unsigned i=0; i<_other.size(); i++ ){ + for ( unsigned i=0; i<_other.size(); i++ ) { BSONElement e = search.getFieldDotted( _other[i] ); if ( e.eoo() ) bb.appendNull( "" ); else bb.appendAs( e , "" ); } - + BSONObj key = bb.obj(); - + GEOQUADDEBUG( "KEY: " << key ); - + set thisPass; BtreeCursor cursor( nsd , idxNo , *getDetails() , key , key , true , 1 ); - while ( cursor.ok() ){ + while ( cursor.ok() ) { pair::iterator, bool> p = thisPass.insert( cursor.currLoc() ); - if ( p.second ){ + if ( p.second ) { hopper.got( cursor.currLoc() ); GEOQUADDEBUG( "\t" << cursor.current() ); btreeMatches++; @@ -221,10 +221,10 @@ namespace mongo { } - BSONArrayBuilder arr( result.subarrayStart( "results" ) ); + BSONArrayBuilder arr( result.subarrayStart( "results" ) ); int num = hopper.append( arr ); arr.done(); - + { BSONObjBuilder b( result.subobjStart( "stats" ) ); b.append( "time" , t.millis() ); @@ -237,20 +237,20 @@ namespace mongo { const IndexDetails* getDetails() const { return _spec->getDetails(); } - + string _geo; vector _other; - + BSONObj _order; double _bucketSize; }; - + class GeoHaystackSearchIndexPlugin : public IndexPlugin { public: - GeoHaystackSearchIndexPlugin() : IndexPlugin( GEOSEARCHNAME ){ + GeoHaystackSearchIndexPlugin() : IndexPlugin( GEOSEARCHNAME ) { } - + virtual IndexType* generate( const IndexSpec* spec ) const { return new GeoHaystackSearchIndex( this , spec ); } @@ -259,38 +259,38 @@ namespace mongo { class GeoHaystackSearchCommand : public Command { - public: - GeoHaystackSearchCommand() : Command( "geoSearch" ){} - virtual LockType locktype() const { return READ; } + public: + GeoHaystackSearchCommand() : Command( "geoSearch" ) {} + virtual LockType locktype() const { return READ; } bool slaveOk() const { return true; } bool slaveOverrideOk() const { return true; } - bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ - + bool 
run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string ns = dbname + "." + cmdObj.firstElement().valuestr(); - + NamespaceDetails * d = nsdetails( ns.c_str() ); - if ( ! d ){ + if ( ! d ) { errmsg = "can't find ns"; return false; } - + vector idxs; d->findIndexByType( GEOSEARCHNAME , idxs ); - if ( idxs.size() == 0 ){ + if ( idxs.size() == 0 ) { errmsg = "no geoSearch index"; return false; } - if ( idxs.size() > 1 ){ + if ( idxs.size() > 1 ) { errmsg = "more than 1 geosearch index"; return false; } - + int idxNum = idxs[0]; - + IndexDetails& id = d->idx( idxNum ); GeoHaystackSearchIndex * si = (GeoHaystackSearchIndex*)id.getSpec().getType(); - assert( &id == si->getDetails() ); - + assert( &id == si->getDetails() ); + BSONElement n = cmdObj["near"]; BSONElement maxDistance = cmdObj["maxDistance"]; BSONElement search = cmdObj["search"]; @@ -298,20 +298,20 @@ namespace mongo { uassert( 13318 , "near needs to be an array" , n.isABSONObj() ); uassert( 13319 , "maxDistance needs a number" , maxDistance.isNumber() ); uassert( 13320 , "search needs to be an object" , search.type() == Object ); - + unsigned limit = 50; if ( cmdObj["limit"].isNumber() ) limit = (unsigned)cmdObj["limit"].numberInt(); si->searchCommand( d , idxNum , n.Obj() , maxDistance.numberDouble() , search.Obj() , result , limit ); - + return 1; } - - } nameSearchCommand; + + } nameSearchCommand; + + - - } diff --git a/db/helpers/dblogger.h b/db/helpers/dblogger.h index 572169b..4d6ee6d 100644 --- a/db/helpers/dblogger.h +++ b/db/helpers/dblogger.h @@ -18,14 +18,14 @@ #pragma once -namespace mongo { +namespace mongo { /** helper to log (and read log) of a capped collection in the database */ class DBLogger { bool _inited; public: const string _ns; - DBLogger(string ns) : _inited(false), _ns(ns){ } + DBLogger(string ns) : _inited(false), _ns(ns) { } }; } diff --git a/db/index.cpp b/db/index.cpp index 04eca73..c696e27 100644 --- a/db/index.cpp +++ b/db/index.cpp @@ -17,15 +17,16 @@ */ #include "pch.h" -#include "namespace.h" +#include "namespace-inl.h" #include "index.h" #include "btree.h" #include "query.h" #include "background.h" +#include "repl/rs.h" namespace mongo { - int removeFromSysIndexes(const char *ns, const char *idxName) { + int removeFromSysIndexes(const char *ns, const char *idxName) { string system_indexes = cc().database()->name + ".system.indexes"; BSONObjBuilder b; b.append("ns", ns); @@ -34,24 +35,36 @@ namespace mongo { return (int) deleteObjects(system_indexes.c_str(), cond, false, false, true); } - /* this is just an attempt to clean up old orphaned stuff on a delete all indexes - call. repair database is the clean solution, but this gives one a lighter weight + /* this is just an attempt to clean up old orphaned stuff on a delete all indexes + call. repair database is the clean solution, but this gives one a lighter weight partial option. 
see dropIndexes() */ - void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) { + void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) { string system_indexes = cc().database()->name + ".system.indexes"; BSONObjBuilder b; b.append("ns", ns); - if( idIndex ) { + if( idIndex ) { b.append("name", BSON( "$ne" << idIndex->indexName().c_str() )); } BSONObj cond = b.done(); int n = (int) deleteObjects(system_indexes.c_str(), cond, false, false, true); - if( n ) { + if( n ) { log() << "info: assureSysIndexesEmptied cleaned up " << n << " entries" << endl; } } + int IndexDetails::keyPatternOffset( const string& key ) const { + BSONObjIterator i( keyPattern() ); + int n = 0; + while ( i.more() ) { + BSONElement e = i.next(); + if ( key == e.fieldName() ) + return n; + n++; + } + return -1; + } + const IndexSpec& IndexDetails::getSpec() const { scoped_lock lk(NamespaceDetailsTransient::_qcMutex); return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this ); @@ -62,29 +75,35 @@ namespace mongo { */ void IndexDetails::kill_idx() { string ns = indexNamespace(); // e.g. foo.coll.$ts_1 + try { - string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below - - // clean up parent namespace index cache - NamespaceDetailsTransient::get_w( pns.c_str() ).deletedIndex(); + string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below - string name = indexName(); + // clean up parent namespace index cache + NamespaceDetailsTransient::get_w( pns.c_str() ).deletedIndex(); + + string name = indexName(); + + /* important to catch exception here so we can finish cleanup below. */ + try { + dropNS(ns.c_str()); + } + catch(DBException& ) { + log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl; + } + head.setInvalid(); + info.setInvalid(); + + // clean up in system.indexes. we do this last on purpose. + int n = removeFromSysIndexes(pns.c_str(), name.c_str()); + wassert( n == 1 ); - /* important to catch exception here so we can finish cleanup below. */ - try { - btreeStore->drop(ns.c_str()); } - catch(DBException& ) { - log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl; + catch ( DBException &e ) { + log() << "exception in kill_idx: " << e << ", ns: " << ns << endl; } - head.setInvalid(); - info.setInvalid(); - - // clean up in system.indexes. we do this last on purpose. - int n = removeFromSysIndexes(pns.c_str(), name.c_str()); - wassert( n == 1 ); } - + void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSetDefaultOrder& keys) const { getSpec().getKeys( obj, keys ); } @@ -105,7 +124,7 @@ namespace mongo { } } - void getIndexChanges(vector& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId) { + void getIndexChanges(vector& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId) { int z = d.nIndexesBeingBuilt(); v.resize(z); NamespaceDetails::IndexIterator i = d.ii(); @@ -115,7 +134,7 @@ namespace mongo { IndexChanges& ch = v[i]; idx.getKeysFromObject(oldObj, ch.oldkeys); idx.getKeysFromObject(newObj, ch.newkeys); - if( ch.newkeys.size() > 1 ) + if( ch.newkeys.size() > 1 ) d.setIndexIsMultikey(i); setDifference(ch.oldkeys, ch.newkeys, ch.removed); setDifference(ch.newkeys, ch.oldkeys, ch.added); @@ -133,12 +152,12 @@ namespace mongo { } } - // should be { : , .keyp.. } - static bool validKeyPattern(BSONObj kp) { + // should be { : , .keyp.. 
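/* Illustrative sketch, not part of the patch: keyPatternOffset, added above, walks the
   key pattern in declaration order and returns the position of a field name, or -1 if
   absent; inKeyPattern is just offset >= 0. The same logic over a plain vector of
   field names standing in for the BSON key pattern. */

#include <string>
#include <vector>

int keyPatternOffset(const std::vector<std::string>& keyPattern, const std::string& key) {
    int n = 0;
    for (const std::string& field : keyPattern) {
        if (key == field)
            return n;
        n++;
    }
    return -1;
}

// For an index on { a : 1, b : 1 }:
//   keyPatternOffset({"a", "b"}, "b") == 1
//   keyPatternOffset({"a", "b"}, "c") == -1, so inKeyPattern("c") is false.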
} + static bool validKeyPattern(BSONObj kp) { BSONObjIterator i(kp); - while( i.moreWithEOO() ) { + while( i.moreWithEOO() ) { BSONElement e = i.next(); - if( e.type() == Object || e.type() == Array ) + if( e.type() == Object || e.type() == Array ) return false; } return true; @@ -154,29 +173,23 @@ namespace mongo { throws DBException - @return - true if ok to continue. when false we stop/fail silently (index already exists) - sourceNS - source NS we are indexing - sourceCollection - its details ptr + @param sourceNS - source NS we are indexing + @param sourceCollection - its details ptr + @return true if ok to continue. when false we stop/fail silently (index already exists) */ - bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection) { + bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) { sourceCollection = 0; // logical name of the index. todo: get rid of the name, we don't need it! - const char *name = io.getStringField("name"); + const char *name = io.getStringField("name"); uassert(12523, "no index name specified", *name); // the collection for which we are building an index - sourceNS = io.getStringField("ns"); + sourceNS = io.getStringField("ns"); uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos); - uassert(10097, "bad table to index name on add index attempt", - cc().database()->name == nsToDatabase(sourceNS.c_str())); + uassert(10097, "bad table to index name on add index attempt", + cc().database()->name == nsToDatabase(sourceNS.c_str())); - /* we can't build a new index for the ns if a build is already in progress in the background - - EVEN IF this is a foreground build. - */ - uassert(12588, "cannot add index with a background operation in progress", - !BackgroundOperation::inProgForNs(sourceNS.c_str())); BSONObj key = io.getObjectField("key"); uassert(12524, "index key pattern too large", key.objsize() <= 2048); @@ -187,7 +200,7 @@ namespace mongo { if ( sourceNS.empty() || key.isEmpty() ) { log(2) << "bad add index attempt name:" << (name?name:"") << "\n ns:" << - sourceNS << "\n idxobj:" << io.toString() << endl; + sourceNS << "\n idxobj:" << io.toString() << endl; string s = "bad add index attempt " + sourceNS + " key:" + key.toString(); uasserted(12504, s); } @@ -201,7 +214,7 @@ namespace mongo { return false; } sourceCollection = nsdetails(sourceNS.c_str()); - tlog() << "info: creating collection " << sourceNS << " on add index\n"; + tlog() << "info: creating collection " << sourceNS << " on add index" << endl; assert( sourceCollection ); } @@ -222,24 +235,55 @@ namespace mongo { uasserted(12505,s); } - /* this is because we want key patterns like { _id : 1 } and { _id : } to + /* we can't build a new index for the ns if a build is already in progress in the background - + EVEN IF this is a foreground build. + */ + uassert(12588, "cannot add index with a background operation in progress", + !BackgroundOperation::inProgForNs(sourceNS.c_str())); + + /* this is because we want key patterns like { _id : 1 } and { _id : } to all be treated as the same pattern. */ - if ( !god && IndexDetails::isIdIndexPattern(key) ) { - ensureHaveIdIndex( sourceNS.c_str() ); - return false; + if ( IndexDetails::isIdIndexPattern(key) ) { + if( !god ) { + ensureHaveIdIndex( sourceNS.c_str() ); + return false; + } + } + else { + /* is buildIndexes:false set for this replica set member? 
+ if so we don't build any indexes except _id + */ + if( theReplSet && !theReplSet->buildIndexes() ) + return false; + } + + string pluginName = IndexPlugin::findPluginName( key ); + IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0; + + if ( plugin ) { + fixedIndexObject = plugin->adjustIndexSpec( io ); + } + else if ( io["v"].eoo() ) { + // add "v" if it doesn't exist + // if it does - leave whatever value was there + // this is for testing and replication + BSONObjBuilder b( io.objsize() + 32 ); + b.appendElements( io ); + b.append( "v" , 0 ); + fixedIndexObject = b.obj(); } return true; } - void IndexSpec::reset( const IndexDetails * details ){ + void IndexSpec::reset( const IndexDetails * details ) { _details = details; reset( details->info ); } - void IndexSpec::reset( const DiskLoc& loc ){ + void IndexSpec::reset( const DiskLoc& loc ) { info = loc.obj(); keyPattern = info["key"].embeddedObjectUserCheck(); if ( keyPattern.objsize() == 0 ) { diff --git a/db/index.h b/db/index.h index a2d7e7e..8578ed3 100644 --- a/db/index.h +++ b/db/index.h @@ -25,20 +25,27 @@ namespace mongo { - /* Details about a particular index. There is one of these effectively for each object in - system.namespaces (although this also includes the head pointer, which is not in that - collection). + /* Details about a particular index. There is one of these effectively for each object in + system.namespaces (although this also includes the head pointer, which is not in that + collection). ** MemoryMapped Record ** (i.e., this is on disk data) - */ + */ class IndexDetails { public: - DiskLoc head; /* btree head disk location */ + /** + * btree head disk location + * TODO We should make this variable private, since btree operations + * may change its value and we don't want clients to rely on an old + * value. If we create a btree class, we can provide a btree object + * to clients instead of 'head'. + */ + DiskLoc head; /* Location of index info object. Format: { name:"nameofindex", ns:"parentnsname", key: {keypattobject} - [, unique: , background: ] + [, unique: , background: ] } This object is in the system.indexes collection. Note that since we @@ -70,6 +77,13 @@ namespace mongo { return info.obj().getObjectField("key"); } + /** + * @return offset into keyPattern for key + -1 if doesn't exist + */ + int keyPatternOffset( const string& key ) const; + bool inKeyPattern( const string& key ) const { return keyPatternOffset( key ) >= 0; } + /* true if the specified key is in the index */ bool hasKey(const BSONObj& key); bool wouldCreateDup(const BSONObj& key, DiskLoc self); @@ -96,11 +110,11 @@ namespace mongo { BSONObjIterator i(pattern); BSONElement e = i.next(); if( strcmp(e.fieldName(), "_id") != 0 ) return false; - return i.next().eoo(); + return i.next().eoo(); } - + /* returns true if this is the _id index. */ - bool isIdIndex() const { + bool isIdIndex() const { return isIdIndexPattern( keyPattern() ); } @@ -112,11 +126,11 @@ namespace mongo { return io.getStringField("ns"); } - bool unique() const { + bool unique() const { BSONObj io = info.obj(); - return io["unique"].trueValue() || - /* temp: can we juse make unique:true always be there for _id and get rid of this? */ - isIdIndex(); + return io["unique"].trueValue() || + /* temp: can we juse make unique:true always be there for _id and get rid of this? 
*/ + isIdIndex(); } /* if set, when building index, if any duplicates, drop the duplicating object */ @@ -128,7 +142,7 @@ namespace mongo { (system.indexes or system.namespaces) -- only NamespaceIndex. */ void kill_idx(); - + const IndexSpec& getSpec() const; string toString() const { @@ -136,13 +150,13 @@ namespace mongo { } }; - struct IndexChanges/*on an update*/ { + struct IndexChanges { /*on an update*/ BSONObjSetDefaultOrder oldkeys; BSONObjSetDefaultOrder newkeys; vector removed; // these keys were removed as part of the change vector added; // these keys were added as part of the change - /** @curObjLoc - the object we want to add's location. if it is already in the + /** @curObjLoc - the object we want to add's location. if it is already in the index, that is allowed here (for bg indexing case). */ void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) { diff --git a/db/indexkey.cpp b/db/indexkey.cpp index 70dd770..34f30fa 100644 --- a/db/indexkey.cpp +++ b/db/indexkey.cpp @@ -17,7 +17,7 @@ */ #include "pch.h" -#include "namespace.h" +#include "namespace-inl.h" #include "index.h" #include "btree.h" #include "query.h" @@ -28,98 +28,136 @@ namespace mongo { map * IndexPlugin::_plugins; IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec ) - : _plugin( plugin ) , _spec( spec ){ - + : _plugin( plugin ) , _spec( spec ) { + } - IndexType::~IndexType(){ + IndexType::~IndexType() { } - - const BSONObj& IndexType::keyPattern() const { - return _spec->keyPattern; + + const BSONObj& IndexType::keyPattern() const { + return _spec->keyPattern; } IndexPlugin::IndexPlugin( const string& name ) - : _name( name ){ + : _name( name ) { if ( ! _plugins ) _plugins = new map(); (*_plugins)[name] = this; } - - int IndexType::compare( const BSONObj& l , const BSONObj& r ) const { - return l.woCompare( r , _spec->keyPattern ); - } - void IndexSpec::_init(){ - assert( keyPattern.objsize() ); - + string IndexPlugin::findPluginName( const BSONObj& keyPattern ) { string pluginName = ""; BSONObjIterator i( keyPattern ); - BSONObjBuilder nullKeyB; + while( i.more() ) { BSONElement e = i.next(); - _fieldNames.push_back( e.fieldName() ); - _fixed.push_back( BSONElement() ); - nullKeyB.appendNull( "" ); - if ( e.type() == String ){ - uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 ); - pluginName = e.valuestr(); - } - + if ( e.type() != String ) + continue; + + uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 || pluginName == e.String() ); + pluginName = e.String(); } - - _nullKey = nullKeyB.obj(); - - BSONObjBuilder b; - b.appendNull( "" ); - _nullObj = b.obj(); - _nullElt = _nullObj.firstElement(); - - if ( pluginName.size() ){ - IndexPlugin * plugin = IndexPlugin::get( pluginName ); - if ( ! plugin ){ - log() << "warning: can't find plugin [" << pluginName << "]" << endl; + + return pluginName; + } + + int IndexType::compare( const BSONObj& l , const BSONObj& r ) const { + return l.woCompare( r , _spec->keyPattern ); + } + + void IndexSpec::_init() { + assert( keyPattern.objsize() ); + + // some basics + _nFields = keyPattern.nFields(); + _sparse = info["sparse"].trueValue(); + uassert( 13529 , "sparse only works for single field keys" , ! 
_sparse || _nFields ); + + + { + // build _nullKey + + BSONObjBuilder b; + BSONObjIterator i( keyPattern ); + + while( i.more() ) { + BSONElement e = i.next(); + _fieldNames.push_back( e.fieldName() ); + _fixed.push_back( BSONElement() ); + b.appendNull( "" ); } - else { - _indexType.reset( plugin->generate( this ) ); + _nullKey = b.obj(); + } + + { + // _nullElt + BSONObjBuilder b; + b.appendNull( "" ); + _nullObj = b.obj(); + _nullElt = _nullObj.firstElement(); + } + + { + // handle plugins + string pluginName = IndexPlugin::findPluginName( keyPattern ); + if ( pluginName.size() ) { + IndexPlugin * plugin = IndexPlugin::get( pluginName ); + if ( ! plugin ) { + log() << "warning: can't find plugin [" << pluginName << "]" << endl; + } + else { + _indexType.reset( plugin->generate( this ) ); + } } } + _finishedInit = true; } - + void IndexSpec::getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const { - if ( _indexType.get() ){ + if ( _indexType.get() ) { _indexType->getKeys( obj , keys ); return; } vector fieldNames( _fieldNames ); vector fixed( _fixed ); _getKeys( fieldNames , fixed , obj, keys ); - if ( keys.empty() ) + if ( keys.empty() && ! _sparse ) keys.insert( _nullKey ); } void IndexSpec::_getKeys( vector fieldNames , vector fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const { BSONElement arrElt; unsigned arrIdx = ~0; + int numNotFound = 0; + for( unsigned i = 0; i < fieldNames.size(); ++i ) { if ( *fieldNames[ i ] == '\0' ) continue; + BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] ); - if ( e.eoo() ) + + if ( e.eoo() ) { e = _nullElt; // no matching field + numNotFound++; + } + if ( e.type() != Array ) fieldNames[ i ] = ""; // no matching field or non-array match + if ( *fieldNames[ i ] == '\0' ) fixed[ i ] = e; // no need for further object expansion (though array expansion still possible) + if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here arrIdx = i; arrElt = e; } + // enforce single array path here - if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ){ + if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) { stringstream ss; ss << "cannot index parallel arrays [" << e.fieldName() << "] [" << arrElt.fieldName() << "]"; uasserted( 10088 , ss.str() ); @@ -127,13 +165,19 @@ namespace mongo { } bool allFound = true; // have we found elements for all field names in the key spec? 
- for( vector::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ){ - if ( **i != '\0' ){ + for( vector::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) { + if ( **i != '\0' ) { allFound = false; break; } } + if ( _sparse && numNotFound == _nFields ) { + // we didn't find any fields + // so we're not going to index this document + return; + } + bool insertArrayNull = false; if ( allFound ) { @@ -143,11 +187,11 @@ namespace mongo { for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) b.appendAs( *i, "" ); keys.insert( b.obj() ); - } + } else { // terminal array element to expand, so generate all keys BSONObjIterator i( arrElt.embeddedObject() ); - if ( i.more() ){ + if ( i.more() ) { while( i.more() ) { BSONObjBuilder b(_sizeTracker); for( unsigned j = 0; j < fixed.size(); ++j ) { @@ -159,18 +203,19 @@ namespace mongo { keys.insert( b.obj() ); } } - else if ( fixed.size() > 1 ){ + else if ( fixed.size() > 1 ) { insertArrayNull = true; } } - } else { + } + else { // nonterminal array element to expand, so recurse assert( !arrElt.eoo() ); BSONObjIterator i( arrElt.embeddedObject() ); - if ( i.more() ){ + if ( i.more() ) { while( i.more() ) { BSONElement e = i.next(); - if ( e.type() == Object ){ + if ( e.type() == Object ) { _getKeys( fieldNames, fixed, e.embeddedObject(), keys ); } } @@ -179,12 +224,12 @@ namespace mongo { insertArrayNull = true; } } - + if ( insertArrayNull ) { // x : [] - need to insert undefined BSONObjBuilder b(_sizeTracker); for( unsigned j = 0; j < fixed.size(); ++j ) { - if ( j == arrIdx ){ + if ( j == arrIdx ) { b.appendUndefined( "" ); } else { @@ -199,12 +244,12 @@ namespace mongo { } } - bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ){ + bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ) { BSONObjIterator x(a); - while ( x.more() ){ + while ( x.more() ) { BSONElement e = x.next(); BSONObjIterator y(b); - while ( y.more() ){ + while ( y.more() ) { BSONElement f = y.next(); FieldCompareResult res = compareDottedFieldNames( e.fieldName() , f.fieldName() ); if ( res == SAME || res == LEFT_SUBFIELD || res == RIGHT_SUBFIELD ) @@ -213,13 +258,13 @@ namespace mongo { } return false; } - + IndexSuitability IndexSpec::suitability( const BSONObj& query , const BSONObj& order ) const { if ( _indexType.get() ) return _indexType->suitability( query , order ); return _suitability( query , order ); } - + IndexSuitability IndexSpec::_suitability( const BSONObj& query , const BSONObj& order ) const { // TODO: optimize if ( anyElementNamesMatch( keyPattern , query ) == 0 && anyElementNamesMatch( keyPattern , order ) == 0 ) diff --git a/db/indexkey.h b/db/indexkey.h index e73d9de..be73171 100644 --- a/db/indexkey.h +++ b/db/indexkey.h @@ -46,16 +46,16 @@ namespace mongo { virtual void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const = 0; virtual shared_ptr newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0; - + /** optional op : changes query to match what's in the index */ virtual BSONObj fixKey( const BSONObj& in ) { return in; } /** optional op : compare 2 objects with regards to this index */ - virtual int compare( const BSONObj& l , const BSONObj& r ) const; + virtual int compare( const BSONObj& l , const BSONObj& r ) const; /** @return plugin */ const IndexPlugin * getPlugin() const { return _plugin; } - + const BSONObj& keyPattern() const; virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ; @@ 
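/* Illustrative sketch, not part of the patch: with the sparse support added above, a
   document containing none of the indexed fields generates no key at all (the
   numNotFound == _nFields case), while a document missing only some fields still gets
   a key with nulls in the missing positions. A single-level simplification with no
   array expansion; std::map and std::optional stand in for BSON and jstNULL. */

#include <map>
#include <optional>
#include <string>
#include <vector>

typedef std::vector<std::optional<std::string>> IndexKey;

std::vector<IndexKey> getKeysSimplified(const std::vector<std::string>& fieldNames,
                                        const std::map<std::string, std::string>& doc,
                                        bool sparse) {
    IndexKey key;
    size_t numNotFound = 0;
    for (const std::string& f : fieldNames) {
        auto it = doc.find(f);
        if (it == doc.end()) {
            key.push_back(std::nullopt);   // missing field indexes as null
            numNotFound++;
        }
        else {
            key.push_back(it->second);
        }
    }
    if (sparse && numNotFound == fieldNames.size())
        return {};                         // sparse index: skip the document entirely
    return { key };
}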
-66,7 +66,7 @@ namespace mongo { const IndexPlugin * _plugin; const IndexSpec * _spec; }; - + /** * this represents a plugin * a plugin could be something like full text search, sparse index, etc... @@ -76,11 +76,21 @@ namespace mongo { class IndexPlugin : boost::noncopyable { public: IndexPlugin( const string& name ); - virtual ~IndexPlugin(){} - + virtual ~IndexPlugin() {} + virtual IndexType* generate( const IndexSpec * spec ) const = 0; - static IndexPlugin* get( const string& name ){ + string getName() const { return _name; } + + /** + * @return new keyPattern + * if nothing changes, should return keyPattern + */ + virtual BSONObj adjustIndexSpec( const BSONObj& spec ) const { return spec; } + + // ------- static below ------- + + static IndexPlugin* get( const string& name ) { if ( ! _plugins ) return 0; map::iterator i = _plugins->find( name ); @@ -89,7 +99,12 @@ namespace mongo { return i->second; } - string getName() const { return _name; } + /** + * @param keyPattern { x : "fts" } + * @return "" or the name + */ + static string findPluginName( const BSONObj& keyPattern ); + private: string _name; static map * _plugins; @@ -102,31 +117,31 @@ namespace mongo { public: BSONObj keyPattern; // e.g., { name : 1 } BSONObj info; // this is the same as IndexDetails::info.obj() - + IndexSpec() - : _details(0) , _finishedInit(false){ + : _details(0) , _finishedInit(false) { } IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() ) - : keyPattern(k) , info(m) , _details(0) , _finishedInit(false){ + : keyPattern(k) , info(m) , _details(0) , _finishedInit(false) { _init(); } - + /** this is a DiscLoc of an IndexDetails info - should have a key field + should have a key field */ - IndexSpec( const DiskLoc& loc ){ + IndexSpec( const DiskLoc& loc ) { reset( loc ); } - + void reset( const DiskLoc& loc ); void reset( const IndexDetails * details ); - + void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const; BSONElement missingField() const { return _nullElt; } - + string getTypeName() const { if ( _indexType.get() ) return _indexType->getPlugin()->getName(); @@ -148,20 +163,24 @@ namespace mongo { IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ; void _getKeys( vector fieldNames , vector fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const; - + BSONSizeTracker _sizeTracker; vector _fieldNames; vector _fixed; - BSONObj _nullKey; - - BSONObj _nullObj; - BSONElement _nullElt; - + + BSONObj _nullKey; // a full key with all fields null + + BSONObj _nullObj; // only used for _nullElt + BSONElement _nullElt; // jstNull + + int _nFields; // number of fields in the index + bool _sparse; // if the index is sparse + shared_ptr _indexType; const IndexDetails * _details; - + void _init(); public: diff --git a/db/instance.cpp b/db/instance.cpp index a6873f2..3b668ee 100644 --- a/db/instance.cpp +++ b/db/instance.cpp @@ -27,7 +27,6 @@ #include "lasterror.h" #include "security.h" #include "json.h" -//#include "reccache.h" #include "replpair.h" #include "../s/d_logic.h" #include "../util/file_allocator.h" @@ -38,6 +37,8 @@ #endif #include "stats/counters.h" #include "background.h" +#include "dur_journal.h" +#include "dur_recover.h" namespace mongo { @@ -61,29 +62,30 @@ namespace mongo { bool useCursors = true; bool useHints = true; - - void flushOpLog( stringstream &ss ) { + + void flushDiagLog() { if( _diaglog.f && _diaglog.f->is_open() ) { - ss << "flushing op log and files\n"; + log() << "flushing diag log" << endl; _diaglog.flush(); } } - int 
ctr = 0; - KillCurrentOp killCurrentOp; - + int lockFile = 0; +#ifdef WIN32 + HANDLE lockFileHandle; +#endif // see FSyncCommand: - unsigned lockedForWriting; + unsigned lockedForWriting; mongo::mutex lockedForWritingMutex("lockedForWriting"); bool unlockRequested = false; void inProgCmd( Message &m, DbResponse &dbresponse ) { BSONObjBuilder b; - if( ! cc().isAdmin() ){ + if( ! cc().isAdmin() ) { BSONObjBuilder b; b.append("err", "unauthorized"); } @@ -95,12 +97,13 @@ namespace mongo { { Client& me = cc(); scoped_lock bl(Client::clientsMutex); - for( set::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) { + for( set::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) { Client *c = *i; assert( c ); - if ( c == &me ) - continue; CurOp* co = c->curop(); + if ( c == &me && !co ) { + continue; + } assert( co ); if( all || co->active() ) vals.push_back( co->infoNoauth() ); @@ -113,26 +116,26 @@ namespace mongo { b.append("info", "use db.$cmd.sys.unlock.findOne() to terminate the fsync write/snapshot lock"); } } - + replyToQuery(0, m, dbresponse, b.obj()); } - + void killOp( Message &m, DbResponse &dbresponse ) { BSONObj obj; - if( ! cc().isAdmin() ){ + if( ! cc().isAdmin() ) { obj = fromjson("{\"err\":\"unauthorized\"}"); } - /*else if( !dbMutexInfo.isLocked() ) + /*else if( !dbMutexInfo.isLocked() ) obj = fromjson("{\"info\":\"no op in progress/not locked\"}"); */ else { DbMessage d(m); QueryMessage q(d); BSONElement e = q.query.getField("op"); - if( !e.isNumber() ) { + if( !e.isNumber() ) { obj = fromjson("{\"err\":\"no op number field specified?\"}"); } - else { + else { log() << "going to kill op: " << e << endl; obj = fromjson("{\"info\":\"attempting to kill op\"}"); killCurrentOp.kill( (unsigned) e.number() ); @@ -143,23 +146,23 @@ namespace mongo { void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) { BSONObj obj; - if( ! cc().isAdmin() || strncmp(ns, "admin.", 6) != 0 ) { + if( ! cc().isAdmin() || strncmp(ns, "admin.", 6) != 0 ) { obj = fromjson("{\"err\":\"unauthorized\"}"); } else { - if( lockedForWriting ) { - log() << "command: unlock requested" << endl; + if( lockedForWriting ) { + log() << "command: unlock requested" << endl; obj = fromjson("{ok:1,\"info\":\"unlock requested\"}"); unlockRequested = true; } - else { + else { obj = fromjson("{ok:0,\"errmsg\":\"not locked\"}"); } } replyToQuery(0, m, dbresponse, obj); } - static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ){ + static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) { bool ok = true; MSGID responseTo = m.header()->id; @@ -168,7 +171,7 @@ namespace mongo { auto_ptr< Message > resp( new Message() ); CurOp& op = *(c.curop()); - + try { dbresponse.exhaust = runQuery(m, q, op, *resp); assert( !resp->empty() ); @@ -176,9 +179,9 @@ namespace mongo { catch ( AssertionException& e ) { ok = false; op.debug().str << " exception "; - LOGSOME { + LOGSOME { log() << "assertion " << e.toString() << " ns:" << q.ns << " query:" << - (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl; + (q.query.valid() ? 
q.query.toString() : "query object is corrupt") << endl; if( q.ntoskip || q.ntoreturn ) log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; } @@ -207,18 +210,18 @@ namespace mongo { resp->setData( msgdata, true ); } - if ( op.shouldDBProfile( 0 ) ){ + if ( op.shouldDBProfile( 0 ) ) { op.debug().str << " bytes:" << resp->header()->dataLen(); } - + dbresponse.response = resp.release(); dbresponse.responseTo = responseTo; - + return ok; } // Returns false when request includes 'end' - bool assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client ) { + void assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client ) { // before we lock... int op = m.operation(); @@ -228,18 +231,18 @@ namespace mongo { if( strstr(ns, ".$cmd") ) { isCommand = true; opwrite(m); - if( strstr(ns, ".$cmd.sys.") ) { + if( strstr(ns, ".$cmd.sys.") ) { if( strstr(ns, "$cmd.sys.inprog") ) { inProgCmd(m, dbresponse); - return true; + return; } - if( strstr(ns, "$cmd.sys.killop") ) { + if( strstr(ns, "$cmd.sys.killop") ) { killOp(m, dbresponse); - return true; + return; } - if( strstr(ns, "$cmd.sys.unlock") ) { + if( strstr(ns, "$cmd.sys.unlock") ) { unlockFsync(ns, m, dbresponse); - return true; + return; } } } @@ -253,30 +256,30 @@ namespace mongo { else { opwrite(m); } - + globalOpCounters.gotOp( op , isCommand ); - + Client& c = cc(); - + auto_ptr nestedOp; CurOp* currentOpP = c.curop(); - if ( currentOpP->active() ){ + if ( currentOpP->active() ) { nestedOp.reset( new CurOp( &c , currentOpP ) ); currentOpP = nestedOp.get(); } CurOp& currentOp = *currentOpP; currentOp.reset(client,op); - + OpDebug& debug = currentOp.debug(); StringBuilder& ss = debug.str; ss << opToString( op ) << " "; int logThreshold = cmdLine.slowMS; bool log = logLevel >= 1; - + if ( op == dbQuery ) { if ( handlePossibleShardedMessage( m , &dbresponse ) ) - return true; + return; receivedQuery(c , dbresponse, m ); } else if ( op == dbGetMore ) { @@ -289,7 +292,7 @@ namespace mongo { int len = strlen(p); if ( len > 400 ) out() << curTimeMillis() % 10000 << - " long msg received, len:" << len << endl; + " long msg received, len:" << len << endl; Message *resp = new Message(); if ( strcmp( "end" , p ) == 0 ) @@ -304,7 +307,7 @@ namespace mongo { const char *ns = m.singleData()->_data + 4; char cl[256]; nsToDatabase(ns, cl); - if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) { + if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) { uassert_nothrow("unauthorized"); } else { @@ -330,37 +333,40 @@ namespace mongo { log = true; } } + catch ( UserException& ue ) { + tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << ue.toString() << endl; + ss << " exception " << ue.toString(); + } catch ( AssertionException& e ) { - static int n; - tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing" << endl; - ss << " exception " + e.toString(); - log = ++n < 10; + tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << e.toString() << endl; + ss << " exception " << e.toString(); + log = true; } } } currentOp.ensureStarted(); currentOp.done(); int ms = currentOp.totalTimeMillis(); - - log = log || (logLevel >= 2 && ++ctr % 512 == 0); - //DEV log = true; + + //DEV log = true; if ( log || ms > logThreshold ) { if( logLevel < 3 && op == dbGetMore && strstr(ns, ".oplog.") && ms < 3000 && !log ) { /* it's normal for getMore on the oplog to be slow because of use of awaitdata flag. 
*/ - } else { + } + else { ss << ' ' << ms << "ms"; mongo::tlog() << ss.str() << endl; } } - - if ( currentOp.shouldDBProfile( ms ) ){ + + if ( currentOp.shouldDBProfile( ms ) ) { // performance profiling is on - if ( dbMutex.getState() < 0 ){ + if ( dbMutex.getState() < 0 ) { mongo::log(1) << "note: not profiling because recursive read lock" << endl; } else { - mongolock lk(true); - if ( dbHolder.isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ){ + writelock lk; + if ( dbHolder.isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ) { Client::Context c( currentOp.getNS() ); profile(ss.str().c_str(), ms); } @@ -370,37 +376,44 @@ namespace mongo { } } - return true; } /* assembleResponse() */ - void killCursors(int n, long long *ids); void receivedKillCursors(Message& m) { int *x = (int *) m.singleData()->_data; x++; // reserved int n = *x++; + + assert( m.dataSize() == 8 + ( 8 * n ) ); + uassert( 13004 , "sent 0 cursors to kill" , n >= 1 ); if ( n > 2000 ) { log( n < 30000 ? LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl; assert( n < 30000 ); } - killCursors(n, (long long *) x); + + int found = ClientCursor::erase(n, (long long *) x); + + if ( logLevel > 0 || found != n ) { + log( found == n ) << "killcursors: found " << found << " of " << n << endl; + } + } /* db - database name path - db directory */ - void closeDatabase( const char *db, const string& path ) { + /*static*/ void Database::closeDatabase( const char *db, const string& path ) { assertInWriteLock(); - + Client::Context * ctx = cc().getContext(); assert( ctx ); assert( ctx->inDB( db , path ) ); Database *database = ctx->db(); assert( database->name == db ); - - oplogCheckCloseDatabase( database ); - if( BackgroundOperation::inProgForDb(db) ) { + oplogCheckCloseDatabase( database ); // oplog caches some things, dirty its caches + + if( BackgroundOperation::inProgForDb(db) ) { log() << "warning: bg op in prog during close db? " << db << endl; } @@ -412,8 +425,8 @@ namespace mongo { NamespaceDetailsTransient::clearForPrefix( prefix.c_str() ); dbHolder.erase( db, path ); - delete database; // closes files ctx->clear(); + delete database; // closes files } void receivedUpdate(Message& m, CurOp& op) { @@ -428,7 +441,7 @@ namespace mongo { assert( d.moreJSObjs() ); assert( query.objsize() < m.header()->dataLen() ); BSONObj toupdate = d.nextJsObj(); - uassert( 10055 , "update object too large", toupdate.objsize() <= MaxBSONObjectSize); + uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize); assert( toupdate.objsize() < m.header()->dataLen() ); assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() ); bool upsert = flags & UpdateOption_Upsert; @@ -436,15 +449,15 @@ namespace mongo { bool broadcast = flags & UpdateOption_Broadcast; { string s = query.toString(); - /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down. - instead, let's just story the query BSON in the debug object, and it can toString() + /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down. + instead, let's just story the query BSON in the debug object, and it can toString() lazily */ op.debug().str << " query: " << s; op.setQuery(query); - } + } - mongolock lk(1); + writelock lk; // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit if ( ! 
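/* Illustrative sketch, not part of the patch: receivedKillCursors above now insists that
   the message body is exactly 4 reserved bytes, a 4-byte count n, and n 8-byte cursor
   ids (dataSize == 8 + 8 * n). A standalone check of that layout over a raw buffer,
   little-endian encoding assumed. */

#include <cstdint>
#include <cstring>
#include <stdexcept>

int validateKillCursors(const char* data, int dataSize) {
    if (dataSize < 8)
        throw std::runtime_error("killCursors message too short");
    int32_t n;
    std::memcpy(&n, data + 4, 4);            // skip the 4 reserved bytes
    if (n < 1 || dataSize != 8 + 8 * n)      // same invariant as the assert above
        throw std::runtime_error("bad killCursors cursor count");
    // the n cursor ids follow as 8-byte integers starting at data + 8
    return n;
}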
broadcast && handlePossibleShardedMessage( m , 0 ) ) @@ -461,6 +474,7 @@ namespace mongo { const char *ns = d.getns(); assert(*ns); uassert( 10056 , "not master", isMasterNs( ns ) ); + op.debug().str << ns << ' '; int flags = d.pullInt(); bool justOne = flags & RemoveOption_JustOne; bool broadcast = flags & RemoveOption_Broadcast; @@ -470,63 +484,63 @@ namespace mongo { string s = pattern.toString(); op.debug().str << " query: " << s; op.setQuery(pattern); - } + } writelock lk(ns); // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit if ( ! broadcast & handlePossibleShardedMessage( m , 0 ) ) return; - + Client::Context ctx(ns); - + long long n = deleteObjects(ns, pattern, justOne, true); lastError.getSafe()->recordDelete( n ); } - + QueryResult* emptyMoreResult(long long); bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { StringBuilder& ss = curop.debug().str; bool ok = true; - + DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); - + ss << ns << " cid:" << cursorid; - if( ntoreturn ) + if( ntoreturn ) ss << " ntoreturn:" << ntoreturn; - time_t start = 0; - int pass = 0; + time_t start = 0; + int pass = 0; bool exhaust = false; QueryResult* msgdata; while( 1 ) { try { - mongolock lk(false); + readlock lk; Client::Context ctx(ns); msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust); } - catch ( GetMoreWaitException& ) { + catch ( GetMoreWaitException& ) { exhaust = false; massert(13073, "shutting down", !inShutdown() ); - if( pass == 0 ) { - start = time(0); - } - else { - if( time(0) - start >= 4 ) { - // after about 4 seconds, return. this is a sanity check. pass stops at 1000 normally - // for DEV this helps and also if sleep is highly inaccurate on a platform. we want to - // return occasionally so slave can checkpoint. - pass = 10000; - } - } + if( pass == 0 ) { + start = time(0); + } + else { + if( time(0) - start >= 4 ) { + // after about 4 seconds, return. this is a sanity check. pass stops at 1000 normally + // for DEV this helps and also if sleep is highly inaccurate on a platform. we want to + // return occasionally so slave can checkpoint. 
+ pass = 10000; + } + } pass++; - DEV - sleepmillis(20); - else + DEV + sleepmillis(20); + else sleepmillis(2); continue; } @@ -545,8 +559,8 @@ namespace mongo { ss << " nreturned:" << msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; - if( exhaust ) { - ss << " exhaust "; + if( exhaust ) { + ss << " exhaust "; dbresponse.exhaust = ns; } return ok; @@ -554,8 +568,8 @@ namespace mongo { void receivedInsert(Message& m, CurOp& op) { DbMessage d(m); - const char *ns = d.getns(); - assert(*ns); + const char *ns = d.getns(); + assert(*ns); uassert( 10058 , "not master", isMasterNs( ns ) ); op.debug().str << ns; @@ -564,31 +578,32 @@ namespace mongo { if ( handlePossibleShardedMessage( m , 0 ) ) return; - Client::Context ctx(ns); + Client::Context ctx(ns); + int n = 0; while ( d.moreJSObjs() ) { BSONObj js = d.nextJsObj(); - uassert( 10059 , "object to insert too large", js.objsize() <= MaxBSONObjectSize); + uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize); + + { + // check no $ modifiers + BSONObjIterator i( js ); + while ( i.more() ) { + BSONElement e = i.next(); + uassert( 13511 , "object to insert can't have $ modifiers" , e.fieldName()[0] != '$' ); + } + } + theDataFileMgr.insertWithObjMod(ns, js, false); logOp("i", ns, js); - globalOpCounters.gotInsert(); + + if( ++n % 4 == 0 ) { + // if we are inserting quite a few, we may need to commit along the way + getDur().commitIfNeeded(); + } } + globalOpCounters.incInsertInWriteLock(n); } - class JniMessagingPort : public AbstractMessagingPort { - public: - JniMessagingPort(Message& _container) : container(_container) { } - void reply(Message& received, Message& response, MSGID) { - container = response; - } - void reply(Message& received, Message& response) { - container = response; - } - unsigned remotePort(){ - return 1; - } - Message & container; - }; - void getDatabaseNames( vector< string > &names , const string& usePath ) { boost::filesystem::path path( usePath ); for ( boost::filesystem::directory_iterator i( path ); @@ -599,7 +614,8 @@ namespace mongo { p /= ( dbName + ".ns" ); if ( MMF::exists( p ) ) names.push_back( dbName ); - } else { + } + else { string fileName = boost::filesystem::path(*i).leaf(); if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" ) names.push_back( fileName.substr( 0, fileName.length() - 3 ) ); @@ -607,14 +623,14 @@ namespace mongo { } } - /* returns true if there is data on this server. useful when starting replication. + /* returns true if there is data on this server. useful when starting replication. local database does NOT count except for rsoplog collection. */ - bool replHasDatabases() { + bool replHasDatabases() { vector names; getDatabaseNames(names); if( names.size() >= 2 ) return true; - if( names.size() == 1 ){ + if( names.size() == 1 ) { if( names[0] != "local" ) return true; // we have a local database. 
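/* Illustrative sketch, not part of the patch: receivedInsert above now rejects documents
   whose top-level field names start with '$' and asks the durability layer to commit
   every fourth object when inserting a large batch. The same shape in isolation;
   insertOne and commitIfNeeded are assumed stand-ins for theDataFileMgr/logOp and
   getDur().commitIfNeeded(). */

#include <map>
#include <stdexcept>
#include <string>
#include <vector>

typedef std::map<std::string, std::string> Doc;   // stand-in for a BSON object

void insertOne(const Doc&) { /* stand-in: write the object and log the op */ }
void commitIfNeeded()      { /* stand-in: let the journal commit if its buffer is full */ }

void insertBatch(const std::vector<Doc>& docs) {
    int n = 0;
    for (const Doc& js : docs) {
        for (const auto& field : js)               // no $ modifiers at the top level
            if (!field.first.empty() && field.first[0] == '$')
                throw std::invalid_argument("object to insert can't have $ modifiers");
        insertOne(js);
        if (++n % 4 == 0)
            commitIfNeeded();                      // commit along the way for big batches
    }
}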
return true if oplog isn't empty @@ -628,7 +644,7 @@ namespace mongo { return false; } - bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk ) { + bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) { if ( lastError._get() ) lastError.startRequest( toSend, lastError._get() ); DbResponse dbResponse; @@ -636,6 +652,7 @@ namespace mongo { assert( dbResponse.response ); dbResponse.response->concat(); // can get rid of this if we make response handling smarter response = *dbResponse.response; + getDur().commitIfNeeded(); return true; } @@ -644,11 +661,12 @@ namespace mongo { lastError.startRequest( toSend, lastError._get() ); DbResponse dbResponse; assembleResponse( toSend, dbResponse ); + getDur().commitIfNeeded(); } auto_ptr DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip , - const BSONObj *fieldsToReturn , int queryOptions ){ - + const BSONObj *fieldsToReturn , int queryOptions ) { + //if ( ! query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions ) return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions ); // @@ -656,128 +674,181 @@ namespace mongo { //throw UserException( (string)"yay:" + ns ); } - void DBDirectClient::killCursor( long long id ){ + void DBDirectClient::killCursor( long long id ) { ClientCursor::erase( id ); } - DBClientBase * createDirectClient(){ - return new DBDirectClient(); + unsigned long long DBDirectClient::count(const string &ns, const BSONObj& query, int options, int limit, int skip ) { + readlock lk( ns ); + string errmsg; + long long res = runCount( ns.c_str() , _countCmd( ns , query , options , limit , skip ) , errmsg ); + if ( res == -1 ) + return 0; + uassert( 13637 , str::stream() << "count failed in DBDirectClient: " << errmsg , res >= 0 ); + return (unsigned long long )res; } - //void recCacheCloseAll(); + DBClientBase * createDirectClient() { + return new DBDirectClient(); + } mongo::mutex exitMutex("exit"); int numExitCalls = 0; - void shutdown(); - bool inShutdown(){ + bool inShutdown() { return numExitCalls > 0; } - void tryToOutputFatal( const string& s ){ + void tryToOutputFatal( const string& s ) { try { rawOut( s ); return; } - catch ( ... ){} + catch ( ... ) {} try { cerr << s << endl; return; } - catch ( ... ){} - + catch ( ... ) {} + // uh - oh, not sure there is anything else we can do... } + /** also called by ntservice.cpp */ + void shutdownServer() { + + log() << "shutdown: going to close listening sockets..." << endl; + ListeningSockets::get()->closeAll(); + + log() << "shutdown: going to flush diaglog..." << endl; + flushDiagLog(); + + /* must do this before unmapping mem or you may get a seg fault */ + log() << "shutdown: going to close sockets..." << endl; + boost::thread close_socket_thread( boost::bind(MessagingPort::closeAllSockets, 0) ); + + // wait until file preallocation finishes + // we would only hang here if the file_allocator code generates a + // synchronous signal, which we don't expect + log() << "shutdown: waiting for fs preallocator..." << endl; + FileAllocator::get()->waitUntilFinished(); + + if( cmdLine.dur ) { + log() << "shutdown: lock for final commit..." << endl; + { + int n = 10; + while( 1 ) { + // we may already be in a read lock from earlier in the call stack, so do read lock here + // to be consistent with that. + readlocktry w("", 20000); + if( w.got() ) { + log() << "shutdown: final commit..." 
<< endl; + getDur().commitNow(); + break; + } + if( --n <= 0 ) { + log() << "shutdown: couldn't acquire write lock, aborting" << endl; + abort(); + } + log() << "shutdown: waiting for write lock..." << endl; + } + } + MemoryMappedFile::flushAll(true); + } + + log() << "shutdown: closing all files..." << endl; + stringstream ss3; + MemoryMappedFile::closeAllFiles( ss3 ); + rawOut( ss3.str() ); + + if( cmdLine.dur ) { + log() << "shutdown: journalCleanup..." << endl; + dur::journalCleanup(); + } + +#if !defined(__sunos__) + if ( lockFile ) { + log() << "shutdown: removing fs lock..." << endl; + /* This ought to be an unlink(), but Eliot says the last + time that was attempted, there was a race condition + with acquirePathLock(). */ +#ifdef WIN32 + if( _chsize( lockFile , 0 ) ) + log() << "couldn't remove fs lock " << getLastError() << endl; + CloseHandle(lockFileHandle); +#else + if( ftruncate( lockFile , 0 ) ) + log() << "couldn't remove fs lock " << errnoWithDescription() << endl; + flock( lockFile, LOCK_UN ); +#endif + } +#endif + } + /* not using log() herein in case we are already locked */ - void dbexit( ExitCode rc, const char *why) { + void dbexit( ExitCode rc, const char *why, bool tryToGetLock ) { + + auto_ptr wlt; + if ( tryToGetLock ) { + wlt.reset( new writelocktry( "" , 2 * 60 * 1000 ) ); + uassert( 13455 , "dbexit timed out getting lock" , wlt->got() ); + } + Client * c = currentClient.get(); { scoped_lock lk( exitMutex ); if ( numExitCalls++ > 0 ) { - if ( numExitCalls > 5 ){ + if ( numExitCalls > 5 ) { // this means something horrible has happened ::_exit( rc ); } stringstream ss; - ss << "dbexit: " << why << "; exiting immediately" << endl; + ss << "dbexit: " << why << "; exiting immediately"; tryToOutputFatal( ss.str() ); if ( c ) c->shutdown(); - ::exit( rc ); + ::exit( rc ); } } - - stringstream ss; - ss << "dbexit: " << why << endl; - tryToOutputFatal( ss.str() ); - + + { + stringstream ss; + ss << "dbexit: " << why; + tryToOutputFatal( ss.str() ); + } + try { - shutdown(); // gracefully shutdown instance + shutdownServer(); // gracefully shutdown instance } - catch ( ... ){ + catch ( ... ) { tryToOutputFatal( "shutdown failed with exception" ); } - try { + try { mutexDebugger.programEnding(); } catch (...) { } - + tryToOutputFatal( "dbexit: really exiting now" ); if ( c ) c->shutdown(); ::exit(rc); } - - void shutdown() { - - log() << "shutdown: going to close listening sockets..." << endl; - ListeningSockets::get()->closeAll(); - log() << "shutdown: going to flush oplog..." << endl; - stringstream ss2; - flushOpLog( ss2 ); - rawOut( ss2.str() ); - - /* must do this before unmapping mem or you may get a seg fault */ - log() << "shutdown: going to close sockets..." << endl; - boost::thread close_socket_thread( boost::bind(MessagingPort::closeAllSockets, 0) ); - - // wait until file preallocation finishes - // we would only hang here if the file_allocator code generates a - // synchronous signal, which we don't expect - log() << "shutdown: waiting for fs preallocator..." << endl; - theFileAllocator().waitUntilFinished(); - - log() << "shutdown: closing all files..." << endl; - stringstream ss3; - MemoryMappedFile::closeAllFiles( ss3 ); - rawOut( ss3.str() ); - - // should we be locked here? we aren't. might be ok as-is. - //recCacheCloseAll(); - -#if !defined(_WIN32) && !defined(__sunos__) - if ( lockFile ){ - log() << "shutdown: removing fs lock..." 
<< endl; - if( ftruncate( lockFile , 0 ) ) - log() << "couldn't remove fs lock " << errnoWithDescription() << endl; - flock( lockFile, LOCK_UN ); - } -#endif - } - -#if !defined(_WIN32) && !defined(__sunos__) +#if !defined(__sunos__) void writePid(int fd) { stringstream ss; ss << getpid() << endl; string s = ss.str(); const char * data = s.c_str(); +#ifdef WIN32 + assert ( _write( fd, data, strlen( data ) ) ); +#else assert ( write( fd, data, strlen( data ) ) ); +#endif } void acquirePathLock() { - string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string(); + string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string(); bool oldFile = false; @@ -785,37 +856,117 @@ namespace mongo { oldFile = true; } +#ifdef WIN32 + lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE, + 0 /* do not allow anyone else access */, NULL, + OPEN_ALWAYS /* success if fh can open */, 0, NULL ); + + if (lockFileHandle == INVALID_HANDLE_VALUE) { + DWORD code = GetLastError(); + char *msg; + FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, + NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPSTR)&msg, 0, NULL); + uasserted( 13627 , msg ); + } + lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0); +#else lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO ); - if( lockFile <= 0 ) { - uasserted( 10309 , str::stream() << "Unable to create / open lock file for lockfilepath: " << name << ' ' << errnoWithDescription()); + if( lockFile <= 0 ) { + uasserted( 10309 , str::stream() << "Unable to create / open lock file for lockfilepath: " << name << ' ' << errnoWithDescription()); } if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) { close ( lockFile ); lockFile = 0; uassert( 10310 , "Unable to acquire lock for lockfilepath: " + name, 0 ); } +#endif - if ( oldFile ){ + if ( oldFile ) { // we check this here because we want to see if we can get the lock // if we can't, then its probably just another mongod running - cout << "************** \n" - << "old lock file: " << name << ". probably means unclean shutdown\n" - << "recommend removing file and running --repair\n" - << "see: http://dochub.mongodb.org/core/repair for more information\n" - << "*************" << endl; - close ( lockFile ); - lockFile = 0; - uassert( 12596 , "old lock file" , 0 ); + + string errmsg; + if (cmdLine.dur) { + if (!dur::haveJournalFiles()) { + + vector dbnames; + getDatabaseNames( dbnames ); + + if ( dbnames.size() == 0 ) { + // this means that mongod crashed + // between initial startup and when journaling was initialized + // it is safe to continue + } + else { + errmsg = str::stream() + << "************** \n" + << "old lock file: " << name << ". probably means unclean shutdown,\n" + << "but there are no journal files to recover.\n" + << "this is likely human error or filesystem corruption.\n" + << "found " << dbnames.size() << " dbs.\n" + << "see: http://dochub.mongodb.org/core/repair for more information\n" + << "*************"; + } + + + } + } + else { + errmsg = str::stream() + << "************** \n" + << "old lock file: " << name << ". 
probably means unclean shutdown\n" + << "recommend removing file and running --repair\n" + << "see: http://dochub.mongodb.org/core/repair for more information\n" + << "*************"; + } + + if (!errmsg.empty()) { + cout << errmsg << endl; +#ifdef WIN32 + CloseHandle( lockFileHandle ); +#else + close ( lockFile ); +#endif + lockFile = 0; + uassert( 12596 , "old lock file" , 0 ); + } + } + + // Not related to lock file, but this is where we handle unclean shutdown + if( !cmdLine.dur && dur::haveJournalFiles() ) { + cout << "**************" << endl; + cout << "Error: journal files are present in journal directory, yet starting without --dur enabled." << endl; + cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl; + cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl; + cout << "**************" << endl; + uasserted(13597, "can't start without --dur enabled when journal/ files are present"); } +#ifdef WIN32 + uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0); + writePid( lockFile ); + _commit( lockFile ); +#else uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0); writePid( lockFile ); fsync( lockFile ); +#endif } #else void acquirePathLock() { - // TODO - this is very bad + // TODO - this is very bad that the code above not running here. + + // Not related to lock file, but this is where we handle unclean shutdown + if( !cmdLine.dur && dur::haveJournalFiles() ) { + cout << "**************" << endl; + cout << "Error: journal files are present in journal directory, yet starting without --dur enabled." << endl; + cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl; + cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl; + cout << "**************" << endl; + uasserted(13618, "can't start without --dur enabled when journal/ files are present"); + } } -#endif - +#endif + } // namespace mongo diff --git a/db/instance.h b/db/instance.h index 5458fc1..2516aec 100644 --- a/db/instance.h +++ b/db/instance.h @@ -21,7 +21,7 @@ #include "../client/dbclient.h" -#include "curop.h" +#include "curop-inl.h" #include "security.h" #include "cmdline.h" #include "client.h" @@ -40,7 +40,7 @@ namespace mongo { DiagLog() : f(0) , level(0), mutex("DiagLog") { } void init() { - if ( ! f && level ){ + if ( ! f && level ) { log() << "diagLogging = " << level << endl; stringstream ss; ss << dbpath << "/diaglog." 
<< hex << time(0); @@ -55,20 +55,20 @@ namespace mongo { /** * @return old */ - int setLevel( int newLevel ){ + int setLevel( int newLevel ) { int old = level; level = newLevel; init(); return old; } void flush() { - if ( level ){ + if ( level ) { scoped_lock lk(mutex); f->flush(); } } void write(char *data,int len) { - if ( level & 1 ){ + if ( level & 1 ) { scoped_lock lk(mutex); f->write(data,len); } @@ -77,7 +77,7 @@ namespace mongo { if ( level & 2 ) { bool log = (level & 4) == 0; OCCASIONALLY log = true; - if ( log ){ + if ( log ) { scoped_lock lk(mutex); assert( f ); f->write(data,len); @@ -102,52 +102,56 @@ namespace mongo { } ~DbResponse() { delete response; } }; - - bool assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client = unknownAddress ); + + void assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client = unknownAddress ); void getDatabaseNames( vector< string > &names , const string& usePath = dbpath ); - /* returns true if there is no data on this server. useful when starting replication. - local database does NOT count. + /* returns true if there is no data on this server. useful when starting replication. + local database does NOT count. */ bool replHasDatabases(); -// --- local client --- - + /** "embedded" calls to the local server directly. + Caller does not need to lock, that is handled within. + */ class DBDirectClient : public DBClientBase { - public: virtual auto_ptr query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0, const BSONObj *fieldsToReturn = 0, int queryOptions = 0); - + virtual bool isFailed() const { return false; } virtual string toString() { return "DBDirectClient"; } - virtual string getServerAddress() const{ + virtual string getServerAddress() const { return "localhost"; // TODO: should this have the port? 
} - virtual bool call( Message &toSend, Message &response, bool assertOk=true ); + virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 ); virtual void say( Message &toSend ); virtual void sayPiggyBack( Message &toSend ) { // don't need to piggy back when connected locally return say( toSend ); } - + virtual void killCursor( long long cursorID ); - - virtual bool callRead( Message& toSend , Message& response ){ + + virtual bool callRead( Message& toSend , Message& response ) { return call( toSend , response ); } - - virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; } - virtual bool isMember( const DBConnector * conn ) const { return this == conn; }; + + virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 ); + + virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; } }; extern int lockFile; +#ifdef WIN32 + extern HANDLE lockFileHandle; +#endif void acquirePathLock(); void maybeCreatePidFile(); - + } // namespace mongo diff --git a/db/introspect.cpp b/db/introspect.cpp index d72bb3f..cee0da8 100644 --- a/db/introspect.cpp +++ b/db/introspect.cpp @@ -26,8 +26,7 @@ namespace mongo { - void profile( const char *str, int millis) - { + void profile( const char *str, int millis) { BSONObjBuilder b; b.appendDate("ts", jsTime()); b.append("info", str); diff --git a/db/jsobj.cpp b/db/jsobj.cpp index 9f613c7..25ab8a8 100644 --- a/db/jsobj.cpp +++ b/db/jsobj.cpp @@ -18,6 +18,7 @@ */ #include "pch.h" +#include "../bson/oid.h" #include "jsobj.h" #include "nonce.h" #include "../bson/util/atomic_int.h" @@ -34,6 +35,7 @@ #define assert MONGO_assert // make sure our assumptions are valid +BOOST_STATIC_ASSERT( sizeof(short) == 2 ); BOOST_STATIC_ASSERT( sizeof(int) == 4 ); BOOST_STATIC_ASSERT( sizeof(long long) == 8 ); BOOST_STATIC_ASSERT( sizeof(double) == 8 ); @@ -48,6 +50,9 @@ namespace mongo { DateNowLabeler DATENOW; + MinKeyLabeler MINKEY; + MaxKeyLabeler MAXKEY; + string escape( string s , bool escape_slash=false) { StringBuilder ret; for ( string::iterator i = s.begin(); i != s.end(); ++i ) { @@ -81,7 +86,8 @@ namespace mongo { //TODO: these should be utf16 code-units not bytes char c = *i; ret << "\\u00" << toHexLower(&c, 1); - } else { + } + else { ret << *i; } } @@ -111,7 +117,8 @@ namespace mongo { number() <= numeric_limits< double >::max() ) { s.precision( 16 ); s << number(); - } else { + } + else { StringBuilder ss; ss << "Number " << number() << " cannot be represented in JSON"; string message = ss.str(); @@ -170,13 +177,15 @@ namespace mongo { case jstOID: if ( format == TenGen ) { s << "ObjectId( "; - } else { + } + else { s << "{ \"$oid\" : "; } s << '"' << __oid() << '"'; if ( format == TenGen ) { s << " )"; - } else { + } + else { s << " }"; } break; @@ -203,7 +212,8 @@ namespace mongo { if( d == 0 ) s << '0'; else s << '"' << date().toString() << '"'; - } else + } + else s << date(); if ( format == Strict ) s << " }"; @@ -211,13 +221,14 @@ namespace mongo { s << " )"; break; case RegEx: - if ( format == Strict ){ + if ( format == Strict ) { s << "{ \"$regex\" : \"" << escape( regex() ); s << "\", \"$options\" : \"" << regexFlags() << "\" }"; - } else { + } + else { s << "/" << escape( regex() , true ) << "/"; // FIXME Worry about alpha order? 
- for ( const char *f = regexFlags(); *f; ++f ){ + for ( const char *f = regexFlags(); *f; ++f ) { switch ( *f ) { case 'g': case 'i': @@ -232,7 +243,7 @@ namespace mongo { case CodeWScope: { BSONObj scope = codeWScopeObject(); - if ( ! scope.isEmpty() ){ + if ( ! scope.isEmpty() ) { s << "{ \"$code\" : " << _asCode() << " , " << " \"$scope\" : " << scope.jsonString() << " }"; break; @@ -243,7 +254,7 @@ namespace mongo { case Code: s << _asCode(); break; - + case Timestamp: s << "{ \"t\" : " << timestampTime() << " , \"i\" : " << timestampInc() << " }"; break; @@ -259,7 +270,7 @@ namespace mongo { default: StringBuilder ss; ss << "Cannot create a properly formatted JSON string with " - << "element: " << toString() << " of type: " << type(); + << "element: " << toString() << " of type: " << type(); string message = ss.str(); massert( 10312 , message.c_str(), false ); } @@ -279,13 +290,13 @@ namespace mongo { else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE; } } - else if ( fn[1] == 'n' && fn[2] == 'e' ){ + else if ( fn[1] == 'n' && fn[2] == 'e' ) { if ( fn[3] == 0 ) return BSONObj::NE; - if ( fn[3] == 'a' && fn[4] == 'r' && fn[5] == 0 ) + if ( fn[3] == 'a' && fn[4] == 'r') // matches anything with $near prefix return BSONObj::opNEAR; } - else if ( fn[1] == 'm' ){ + else if ( fn[1] == 'm' ) { if ( fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 ) return BSONObj::opMOD; if ( fn[2] == 'a' && fn[3] == 'x' && fn[4] == 'D' && fn[5] == 'i' && fn[6] == 's' && fn[7] == 't' && fn[8] == 'a' && fn[9] == 'n' && fn[10] == 'c' && fn[11] == 'e' && fn[12] == 0 ) @@ -301,7 +312,7 @@ namespace mongo { return BSONObj::opALL; else if ( fn[1] == 's' && fn[2] == 'i' && fn[3] == 'z' && fn[4] == 'e' && fn[5] == 0 ) return BSONObj::opSIZE; - else if ( fn[1] == 'e' ){ + else if ( fn[1] == 'e' ) { if ( fn[2] == 'x' && fn[3] == 'i' && fn[4] == 's' && fn[5] == 't' && fn[6] == 's' && fn[7] == 0 ) return BSONObj::opEXISTS; if ( fn[2] == 'l' && fn[3] == 'e' && fn[4] == 'm' && fn[5] == 'M' && fn[6] == 'a' && fn[7] == 't' && fn[8] == 'c' && fn[9] == 'h' && fn[10] == 0 ) @@ -370,22 +381,24 @@ namespace mongo { double left = l.number(); double right = r.number(); bool lNan = !( left <= numeric_limits< double >::max() && - left >= -numeric_limits< double >::max() ); + left >= -numeric_limits< double >::max() ); bool rNan = !( right <= numeric_limits< double >::max() && - right >= -numeric_limits< double >::max() ); + right >= -numeric_limits< double >::max() ); if ( lNan ) { if ( rNan ) { return 0; - } else { + } + else { return -1; } - } else if ( rNan ) { + } + else if ( rNan ) { return 1; } x = left - right; if ( x < 0 ) return -1; return x == 0 ? 
0 : 1; - } + } case jstOID: return memcmp(l.value(), r.value(), 12); case Code: @@ -408,8 +421,7 @@ namespace mongo { if ( lsz - rsz != 0 ) return lsz - rsz; return memcmp(l.value()+4, r.value()+4, lsz+1); } - case RegEx: - { + case RegEx: { int c = strcmp(l.regex(), r.regex()); if ( c ) return c; @@ -462,11 +474,14 @@ namespace mongo { return fe.getGtLtOp(); } - FieldCompareResult compareDottedFieldNames( const string& l , const string& r ){ + FieldCompareResult compareDottedFieldNames( const string& l , const string& r ) { + static int maxLoops = 1024 * 1024; + size_t lstart = 0; size_t rstart = 0; - while ( 1 ){ - if ( lstart >= l.size() ){ + + for ( int i=0; i= l.size() ) { if ( rstart >= r.size() ) return SAME; return RIGHT_SUBFIELD; @@ -493,6 +508,10 @@ namespace mongo { lstart = lend + 1; rstart = rend + 1; } + + log() << "compareDottedFieldNames ERROR l: " << l << " r: " << r << " TOO MANY LOOPS" << endl; + assert(0); + return SAME; // will never get here } /* BSONObj ------------------------------------------------------------*/ @@ -534,33 +553,35 @@ namespace mongo { return s.str(); } -// todo: can be a little faster if we don't use toString() here. bool BSONObj::valid() const { - try{ + try { BSONObjIterator it(*this); - while( it.moreWithEOO() ){ + while( it.moreWithEOO() ) { // both throw exception on failure BSONElement e = it.next(true); e.validate(); - if (e.eoo()){ + if (e.eoo()) { if (it.moreWithEOO()) return false; return true; - }else if (e.isABSONObj()){ + } + else if (e.isABSONObj()) { if(!e.embeddedObject().valid()) return false; - }else if (e.type() == CodeWScope){ + } + else if (e.type() == CodeWScope) { if(!e.codeWScopeObject().valid()) return false; } } - } catch (...) { + } + catch (...) { } return false; } - int BSONObj::woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName) const { + int BSONObj::woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName) const { if ( isEmpty() ) return r.isEmpty() ? 0 : -1; if ( r.isEmpty() ) @@ -619,13 +640,13 @@ namespace mongo { return 1; int x; -/* - if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 && - l.type() == String && r.type() == String ) { - // note: no negative support yet, as this is just sort of a POC - x = _stricmp(l.valuestr(), r.valuestr()); - } - else*/ { + /* + if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 && + l.type() == String && r.type() == String ) { + // note: no negative support yet, as this is just sort of a POC + x = _stricmp(l.valuestr(), r.valuestr()); + } + else*/ { x = l.woCompare( r, considerFieldName ); if ( ordered && o.number() < 0 ) x = -x; @@ -639,7 +660,7 @@ namespace mongo { BSONObj staticNull = fromjson( "{'':null}" ); /* well ordered compare */ - int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const{ + int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const { if ( isEmpty() ) return other.isEmpty() ? 0 : -1; if ( other.isEmpty() ) @@ -648,7 +669,7 @@ namespace mongo { uassert( 10060 , "woSortOrder needs a non-empty sortKey" , ! 
sortKey.isEmpty() ); BSONObjIterator i(sortKey); - while ( 1 ){ + while ( 1 ) { BSONElement f = i.next(); if ( f.eoo() ) return 0; @@ -678,36 +699,41 @@ namespace mongo { const char* next = p+1; BSONElement e = getField( left.c_str() ); - if (e.type() == Object){ + if (e.type() == Object) { e.embeddedObject().getFieldsDotted(next, ret); - } else if (e.type() == Array) { + } + else if (e.type() == Array) { bool allDigits = false; - if ( isdigit( *next ) ){ + if ( isdigit( *next ) ) { const char * temp = next + 1; while ( isdigit( *temp ) ) temp++; - allDigits = *temp == '.'; + allDigits = (*temp == '.' || *temp == '\0'); } if (allDigits) { e.embeddedObject().getFieldsDotted(next, ret); - } else { + } + else { BSONObjIterator i(e.embeddedObject()); - while ( i.more() ){ + while ( i.more() ) { BSONElement e2 = i.next(); if (e2.type() == Object || e2.type() == Array) e2.embeddedObject().getFieldsDotted(next, ret); } } - } else { + } + else { // do nothing: no match } } - } else { - if (e.type() == Array){ + } + else { + if (e.type() == Array) { BSONObjIterator i(e.embeddedObject()); while ( i.more() ) ret.insert(i.next()); - } else { + } + else { ret.insert(e); } } @@ -715,15 +741,18 @@ namespace mongo { BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const { const char *p = strchr(name, '.'); - string left; + + BSONElement sub; + if ( p ) { - left = string(name, p-name); + sub = getField( string(name, p-name) ); name = p + 1; - } else { - left = string(name); + } + else { + sub = getField( name ); name = name + strlen(name); } - BSONElement sub = getField(left.c_str()); + if ( sub.eoo() ) return nullElement; else if ( sub.type() == Array || name[0] == '\0') @@ -778,7 +807,7 @@ namespace mongo { break; BSONElement x = filter.getField( e.fieldName() ); if ( ( x.eoo() && !inFilter ) || - ( !x.eoo() && inFilter ) ) + ( !x.eoo() && inFilter ) ) b.append( e ); } return b.obj(); @@ -858,7 +887,8 @@ namespace mongo { gotId = gotId || strcmp(fname, "_id")==0; if ( n == N && gotId ) break; - } else if ( strcmp(fname, "_id")==0 ) { + } + else if ( strcmp(fname, "_id")==0 ) { b.append(e); gotId = true; if ( n == N && gotId ) @@ -882,20 +912,20 @@ namespace mongo { if ( e.eoo() ) break; switch( e.type() ) { - case MinKey: { - BSONObjBuilder m; - m.append( "$minElement", 1 ); - b.append( e.fieldName(), m.done() ); - break; - } - case MaxKey: { - BSONObjBuilder m; - m.append( "$maxElement", 1 ); - b.append( e.fieldName(), m.done() ); - break; - } - default: - b.append( e ); + case MinKey: { + BSONObjBuilder m; + m.append( "$minElement", 1 ); + b.append( e.fieldName(), m.done() ); + break; + } + case MaxKey: { + BSONObjBuilder m; + m.append( "$maxElement", 1 ); + b.append( e.fieldName(), m.done() ); + break; + } + default: + b.append( e ); } } return b.obj(); @@ -913,7 +943,8 @@ namespace mongo { if ( !f.eoo() ) { b.appendAs( e, f.fieldName() ); f = j.next(); - } else { + } + else { b.append( e ); } } @@ -922,20 +953,20 @@ namespace mongo { bool BSONObj::okForStorage() const { BSONObjIterator i( *this ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); const char * name = e.fieldName(); - + if ( strchr( name , '.' ) || - strchr( name , '$' ) ){ - return + strchr( name , '$' ) ) { + return strcmp( name , "$ref" ) == 0 || strcmp( name , "$id" ) == 0 ; } - - if ( e.mayEncapsulate() ){ - switch ( e.type() ){ + + if ( e.mayEncapsulate() ) { + switch ( e.type() ) { case Object: case Array: if ( ! 
e.embeddedObject().okForStorage() ) @@ -948,7 +979,7 @@ namespace mongo { default: uassert( 12579, "unhandled cases in BSONObj okForStorage" , 0 ); } - + } } return true; @@ -982,25 +1013,26 @@ namespace mongo { return ss.str(); } - void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base){ + void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base) { BSONObjIterator it(obj); - while (it.more()){ + while (it.more()) { BSONElement e = it.next(); - if (e.type() == Object){ + if (e.type() == Object) { string newbase = base + e.fieldName() + "."; nested2dotted(b, e.embeddedObject(), newbase); - }else{ + } + else { string newbase = base + e.fieldName(); b.appendAs(e, newbase); } } } - void dotted2nested(BSONObjBuilder& b, const BSONObj& obj){ + void dotted2nested(BSONObjBuilder& b, const BSONObj& obj) { //use map to sort fields BSONMap sorted = bson2map(obj); EmbeddedBuilder eb(&b); - for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it){ + for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it) { eb.appendAs(it->second, it->first); } eb.done(); @@ -1037,16 +1069,16 @@ namespace mongo { } minkeydata; BSONObj minKey((const char *) &minkeydata); -/* - struct JSObj0 { - JSObj0() { - totsize = 5; - eoo = EOO; - } - int totsize; - char eoo; - } js0; -*/ + /* + struct JSObj0 { + JSObj0() { + totsize = 5; + eoo = EOO; + } + int totsize; + char eoo; + } js0; + */ #pragma pack() struct BsonUnitTest : public UnitTest { @@ -1078,7 +1110,7 @@ namespace mongo { assert( b == id ); } - void testbounds(){ + void testbounds() { BSONObj l , r; { BSONObjBuilder b; @@ -1101,7 +1133,7 @@ namespace mongo { assert( r.woCompare( l ) > 0 ); } - void testorder(){ + void testorder() { { BSONObj x,y,z; { BSONObjBuilder b; b.append( "x" , (long long)2 ); x = b.obj(); } @@ -1176,84 +1208,6 @@ namespace mongo { } } bson_unittest; -/* - BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const char * value ) { - _builder->append( _fieldName , value ); - return *_builder; - } - - BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const int value ) { - _builder->append( _fieldName , value ); - return *_builder; - } - - BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const double value ) { - _builder->append( _fieldName , value ); - return *_builder; - } -*/ - - void OID::init() { - static AtomicUInt inc = getRandomNumber(); - unsigned t = (unsigned) time(0); - char *T = (char *) &t; - data[0] = T[3]; - data[1] = T[2]; - data[2] = T[1]; - data[3] = T[0]; - - (unsigned&) data[4] = _machine; - - int new_inc = inc++; - T = (char *) &new_inc; - char * raw = (char*)&b; - raw[0] = T[3]; - raw[1] = T[2]; - raw[2] = T[1]; - raw[3] = T[0]; - } - - unsigned OID::_machine = (unsigned) security.getNonceInitSafe(); - void OID::newState(){ - unsigned before = _machine; - // using fresh Security object to avoid buffered devrandom - _machine = (unsigned)security.getNonce(); - assert( _machine != before ); - } - - void OID::init( string s ){ - assert( s.size() == 24 ); - const char *p = s.c_str(); - for( int i = 0; i < 12; i++ ) { - data[i] = fromHex(p); - p += 2; - } - } - - void OID::init(Date_t date, bool max){ - int time = (int) (date / 1000); - char* T = (char *) &time; - data[0] = T[3]; - data[1] = T[2]; - data[2] = T[1]; - data[3] = T[0]; - - if (max) - *(long long*)(data + 4) = 0xFFFFFFFFFFFFFFFFll; - else - *(long long*)(data + 4) = 0x0000000000000000ll; - } - - time_t OID::asTimeT(){ - int time; - char* T = (char *) &time; - T[0] = data[3]; - T[1] = 
data[2]; - T[2] = data[1]; - T[3] = data[0]; - return time; - } - Labeler::Label GT( "$gt" ); Labeler::Label GTE( "$gte" ); Labeler::Label LT( "$lt" ); @@ -1268,21 +1222,20 @@ namespace mongo { timestamp = OpTime::now().asDate(); } - void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ){ - switch ( t ){ + void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ) { + switch ( t ) { case MinKey: appendMinKey( fieldName ); return; case MaxKey: appendMinKey( fieldName ); return; case NumberInt: case NumberDouble: case NumberLong: append( fieldName , - numeric_limits::max() ); return; - case jstOID: - { - OID o; - memset(&o, 0, sizeof(o)); - appendOID( fieldName , &o); - return; - } + case jstOID: { + OID o; + memset(&o, 0, sizeof(o)); + appendOID( fieldName , &o); + return; + } case Bool: appendBool( fieldName , false); return; case Date: appendDate( fieldName , 0); return; case jstNULL: appendNull( fieldName ); return; @@ -1296,13 +1249,12 @@ namespace mongo { case Undefined: appendUndefined( fieldName ); return; case RegEx: appendRegex( fieldName , "" ); return; - case DBRef: - { - OID o; - memset(&o, 0, sizeof(o)); - appendDBRef( fieldName , "" , o ); - return; - } + case DBRef: { + OID o; + memset(&o, 0, sizeof(o)); + appendDBRef( fieldName , "" , o ); + return; + } case Code: appendCode( fieldName , "" ); return; case CodeWScope: appendCodeWScope( fieldName , "" , BSONObj() ); return; case Timestamp: appendTimestamp( fieldName , 0); return; @@ -1312,8 +1264,8 @@ namespace mongo { uassert( 10061 , "type not supported for appendMinElementForType" , false ); } - void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ){ - switch ( t ){ + void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ) { + switch ( t ) { case MinKey: appendMaxKey( fieldName ); break; case MaxKey: appendMaxKey( fieldName ); break; case NumberInt: @@ -1324,13 +1276,12 @@ namespace mongo { case BinData: appendMinForType( fieldName , jstOID ); break; - case jstOID: - { - OID o; - memset(&o, 0xFF, sizeof(o)); - appendOID( fieldName , &o); - break; - } + case jstOID: { + OID o; + memset(&o, 0xFF, sizeof(o)); + appendOID( fieldName , &o); + break; + } case Undefined: case jstNULL: appendMinForType( fieldName , NumberInt ); @@ -1349,7 +1300,7 @@ namespace mongo { } const string BSONObjBuilder::numStrs[] = { - "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", @@ -1361,77 +1312,77 @@ namespace mongo { "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", }; - bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ){ + bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ) { if ( data.size() == 0 || data == "-") return false; - + unsigned int pos=0; if ( data[0] == '-' ) pos++; - + bool hasDec = false; - - for ( ; pos( data ); append( fieldName , num ); return true; } - catch(bad_lexical_cast &){ + catch(bad_lexical_cast &) { return false; } } - void BSONObjBuilder::appendKeys( const BSONObj& keyPattern , const BSONObj& values ){ + void BSONObjBuilder::appendKeys( const BSONObj& keyPattern , const BSONObj& values ) { BSONObjIterator i(keyPattern); BSONObjIterator j(values); - - while ( i.more() && j.more() ){ + + while ( i.more() && j.more() ) { 
appendAs( j.next() , i.next().fieldName() ); } - + assert( ! i.more() ); assert( ! j.more() ); } - int BSONElementFieldSorter( const void * a , const void * b ){ + int BSONElementFieldSorter( const void * a , const void * b ) { const char * x = *((const char**)a); const char * y = *((const char**)b); x++; y++; return lexNumCmp( x , y ); } - - BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ){ + + BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ) { _nfields = o.nFields(); _fields = new const char*[_nfields]; int x = 0; BSONObjIterator i( o ); - while ( i.more() ){ + while ( i.more() ) { _fields[x++] = i.next().rawdata(); assert( _fields[x-1] ); } @@ -1441,10 +1392,10 @@ namespace mongo { } /** transform a BSON array into a vector of BSONElements. - we match array # positions with their vector position, and ignore - any non-numeric fields. + we match array # positions with their vector position, and ignore + any fields with non-numeric field names. */ - vector BSONElement::Array() const { + vector BSONElement::Array() const { chk(mongo::Array); vector v; BSONObjIterator i(Obj()); @@ -1453,7 +1404,7 @@ namespace mongo { const char *f = e.fieldName(); try { unsigned u = stringToNum(f); - assert( u < 4096 ); + assert( u < 1000000 ); if( u >= v.size() ) v.resize(u+1); v[u] = e; diff --git a/db/jsobj.h b/db/jsobj.h index 258a952..a6472d5 100644 --- a/db/jsobj.h +++ b/db/jsobj.h @@ -1,4 +1,4 @@ -/** @file jsobj.h +/** @file jsobj.h BSON classes */ @@ -40,7 +40,7 @@ #include "../bson/bsonmisc.h" #include "../bson/bsonobjbuilder.h" #include "../bson/bsonobjiterator.h" -#include "../bson/bsoninlines.h" +#include "../bson/bson-inl.h" #include "../bson/ordering.h" #include "../bson/stringdata.h" diff --git a/db/jsobjmanipulator.h b/db/jsobjmanipulator.h index c43e876..0b3c0c2 100644 --- a/db/jsobjmanipulator.h +++ b/db/jsobjmanipulator.h @@ -19,6 +19,7 @@ #pragma once #include "jsobj.h" +#include "dur.h" namespace mongo { @@ -35,41 +36,68 @@ namespace mongo { OpTime::now().asDate() */ void initTimestamp(); - + /** Change the value, in place, of the number. 
*/ void setNumber(double d) { if ( _element.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d; else if ( _element.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d; else assert(0); } - void setLong(long long n) { + void SetNumber(double d) { + if ( _element.type() == NumberDouble ) + *getDur().writing( reinterpret_cast< double * >( value() ) ) = d; + else if ( _element.type() == NumberInt ) + *getDur().writing( reinterpret_cast< int * >( value() ) ) = (int) d; + else assert(0); + } + void setLong(long long n) { assert( _element.type() == NumberLong ); *reinterpret_cast< long long * >( value() ) = n; } - void setInt(int n) { + void SetLong(long long n) { + assert( _element.type() == NumberLong ); + *getDur().writing( reinterpret_cast< long long * >(value()) ) = n; + } + void setInt(int n) { assert( _element.type() == NumberInt ); *reinterpret_cast< int * >( value() ) = n; } + void SetInt(int n) { + assert( _element.type() == NumberInt ); + getDur().writingInt( *reinterpret_cast< int * >( value() ) ) = n; + } + - /** Replace the type and value of the element with the type and value of e, preserving the original fieldName */ void replaceTypeAndValue( const BSONElement &e ) { *data() = e.type(); memcpy( value(), e.value(), e.valuesize() ); } - - static void lookForTimestamps( const BSONObj& obj ){ + + /* dur:: version */ + void ReplaceTypeAndValue( const BSONElement &e ) { + char *d = data(); + char *v = value(); + int valsize = e.valuesize(); + int ofs = (int) (v-d); + dassert( ofs > 0 ); + char *p = (char *) getDur().writingPtr(d, valsize + ofs); + *p = e.type(); + memcpy( p + ofs, e.value(), valsize ); + } + + static void lookForTimestamps( const BSONObj& obj ) { // If have a Timestamp field as the first or second element, // update it to a Date field set to OpTime::now().asDate(). The // replacement policy is a work in progress. 
- + BSONObjIterator i( obj ); for( int j = 0; i.moreWithEOO() && j < 2; ++j ) { BSONElement e = i.next(); if ( e.eoo() ) break; - if ( e.type() == Timestamp ){ + if ( e.type() == Timestamp ) { BSONElementManipulator( e ).initTimestamp(); break; } diff --git a/db/json.cpp b/db/json.cpp index 185a8ca..4a6fad8 100644 --- a/db/json.cpp +++ b/db/json.cpp @@ -43,12 +43,12 @@ using namespace boost::spirit; namespace mongo { struct ObjectBuilder : boost::noncopyable { - ~ObjectBuilder(){ + ~ObjectBuilder() { unsigned i = builders.size(); - if ( i ){ + if ( i ) { i--; - for ( ; i>=1; i-- ){ - if ( builders[i] ){ + for ( ; i>=1; i-- ) { + if ( builders[i] ) { builders[i]->done(); } } @@ -205,7 +205,8 @@ namespace mongo { else if ( first < 0x08 ) { b.ss << char( 0xc0 | ( ( first << 2 ) | ( second >> 6 ) ) ); b.ss << char( 0x80 | ( ~0xc0 & second ) ); - } else { + } + else { b.ss << char( 0xe0 | ( first >> 4 ) ); b.ss << char( 0x80 | ( ~0xc0 & ( ( first << 2 ) | ( second >> 6 ) ) ) ); b.ss << char( 0x80 | ( ~0xc0 & second ) ); @@ -342,7 +343,7 @@ namespace mongo { struct dbrefEnd { dbrefEnd( ObjectBuilder &_b ) : b( _b ) {} void operator() ( const char *start, const char *end ) const { - b.back()->appendDBRef( b.fieldName(), b.ns.c_str(), b.oid ); + b.back()->appendDBRef( b.fieldName(), b.ns, b.oid ); } ObjectBuilder &b; }; @@ -417,8 +418,7 @@ namespace mongo { struct regexEnd { regexEnd( ObjectBuilder &_b ) : b( _b ) {} void operator() ( const char *start, const char *end ) const { - b.back()->appendRegex( b.fieldName(), b.regex.c_str(), - b.regexOptions.c_str() ); + b.back()->appendRegex( b.fieldName(), b.regex, b.regexOptions ); } ObjectBuilder &b; }; @@ -438,7 +438,7 @@ namespace mongo { // in the original z example on line 3, if the input was "ab", foo() would only // be called once. 
struct JsonGrammar : public grammar< JsonGrammar > { -public: + public: JsonGrammar( ObjectBuilder &_b ) : b( _b ) {} template < typename ScannerT > @@ -472,32 +472,32 @@ public: str = lexeme_d[ ch_p( '"' )[ chClear( self.b ) ] >> *( ( ch_p( '\\' ) >> ( - ch_p( 'b' )[ chE( self.b ) ] | - ch_p( 'f' )[ chE( self.b ) ] | - ch_p( 'n' )[ chE( self.b ) ] | - ch_p( 'r' )[ chE( self.b ) ] | - ch_p( 't' )[ chE( self.b ) ] | - ch_p( 'v' )[ chE( self.b ) ] | - ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) | - ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported + ch_p( 'b' )[ chE( self.b ) ] | + ch_p( 'f' )[ chE( self.b ) ] | + ch_p( 'n' )[ chE( self.b ) ] | + ch_p( 'r' )[ chE( self.b ) ] | + ch_p( 't' )[ chE( self.b ) ] | + ch_p( 'v' )[ chE( self.b ) ] | + ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) | + ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported ) ) | ( ~range_p( 0x00, 0x1f ) & ~ch_p( '"' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '"' ]; singleQuoteStr = lexeme_d[ ch_p( '\'' )[ chClear( self.b ) ] >> - *( ( ch_p( '\\' ) >> - ( - ch_p( 'b' )[ chE( self.b ) ] | - ch_p( 'f' )[ chE( self.b ) ] | - ch_p( 'n' )[ chE( self.b ) ] | - ch_p( 'r' )[ chE( self.b ) ] | - ch_p( 't' )[ chE( self.b ) ] | - ch_p( 'v' )[ chE( self.b ) ] | - ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) | - ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported - ) - ) | - ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ]; + *( ( ch_p( '\\' ) >> + ( + ch_p( 'b' )[ chE( self.b ) ] | + ch_p( 'f' )[ chE( self.b ) ] | + ch_p( 'n' )[ chE( self.b ) ] | + ch_p( 'r' )[ chE( self.b ) ] | + ch_p( 't' )[ chE( self.b ) ] | + ch_p( 'v' )[ chE( self.b ) ] | + ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) | + ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported + ) + ) | + ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ]; // real_p accepts numbers with nonsignificant zero prefixes, which // aren't allowed in JSON. Oh well. 
@@ -548,8 +548,8 @@ public: >> ( *( ch_p( 'i' ) | ch_p( 'g' ) | ch_p( 'm' ) ) )[ regexOptions( self.b ) ] ]; } rule< ScannerT > object, members, array, elements, value, str, number, integer, - dbref, dbrefS, dbrefT, oid, oidS, oidT, bindata, date, dateS, dateT, - regex, regexS, regexT, quotedOid, fieldName, unquotedFieldName, singleQuoteStr; + dbref, dbrefS, dbrefT, oid, oidS, oidT, bindata, date, dateS, dateT, + regex, regexS, regexT, quotedOid, fieldName, unquotedFieldName, singleQuoteStr; const rule< ScannerT > &start() const { return object; } @@ -558,7 +558,7 @@ public: }; BSONObj fromjson( const char *str , int* len) { - if ( str[0] == '\0' ){ + if ( str[0] == '\0' ) { if (len) *len = 0; return BSONObj(); } @@ -568,7 +568,8 @@ public: parse_info<> result = parse( str, parser, space_p ); if (len) { *len = result.stop - str; - } else if ( !result.full ) { + } + else if ( !result.full ) { int limit = strnlen(result.stop , 10); if (limit == -1) limit = 10; msgasserted(10340, "Failure parsing JSON string near: " + string( result.stop, limit )); diff --git a/db/lasterror.cpp b/db/lasterror.cpp index 12fc694..ba52111 100644 --- a/db/lasterror.cpp +++ b/db/lasterror.cpp @@ -34,28 +34,37 @@ namespace mongo { void raiseError(int code , const char *msg) { LastError *le = lastError.get(); if ( le == 0 ) { - /* might be intentional (non-user thread) */ + /* might be intentional (non-user thread) */ DEV { static unsigned n; if( ++n < 4 && !isShell ) log() << "dev: lastError==0 won't report:" << msg << endl; } - } else if ( le->disabled ) { + } + else if ( le->disabled ) { log() << "lastError disabled, can't report: " << code << ":" << msg << endl; - } else { + } + else { le->raiseError(code, msg); } } - - void LastError::appendSelf( BSONObjBuilder &b ) { + + bool LastError::appendSelf( BSONObjBuilder &b , bool blankErr ) { if ( !valid ) { - b.appendNull( "err" ); + if ( blankErr ) + b.appendNull( "err" ); b.append( "n", 0 ); - return; + return false; } - if ( msg.empty() ) - b.appendNull( "err" ); - else + + if ( msg.empty() ) { + if ( blankErr ) { + b.appendNull( "err" ); + } + } + else { b.append( "err", msg ); + } + if ( code ) b.append( "code" , code ); if ( updatedExisting != NotUpdate ) @@ -65,13 +74,24 @@ namespace mongo { if ( writebackId.isSet() ) b.append( "writeback" , writebackId ); b.appendNumber( "n", nObjects ); + + return ! msg.empty(); + } + + LastErrorHolder::~LastErrorHolder() { + for ( IDMap::iterator i = _ids.begin(); i != _ids.end(); ++i ) { + delete i->second.lerr; + i->second.lerr = 0; + } + _ids.clear(); } - void LastErrorHolder::setID( int id ){ + + void LastErrorHolder::setID( int id ) { _id.set( id ); } - - int LastErrorHolder::getID(){ + + int LastErrorHolder::getID() { return _id.get(); } @@ -89,24 +109,24 @@ namespace mongo { return ret; return 0; } - - LastError * LastErrorHolder::_get( bool create ){ + + LastError * LastErrorHolder::_get( bool create ) { int id = _id.get(); - if ( id == 0 ){ + if ( id == 0 ) { LastError * le = _tl.get(); - if ( ! le && create ){ + if ( ! le && create ) { le = new LastError(); _tl.reset( le ); } return le; } - scoped_lock lock(_idsmutex); + scoped_lock lock(_idsmutex); map::iterator i = _ids.find( id ); - if ( i == _ids.end() ){ + if ( i == _ids.end() ) { if ( ! 
create ) return 0; - + LastError * le = new LastError(); Status s; s.time = time(0); @@ -114,42 +134,42 @@ namespace mongo { _ids[id] = s; return le; } - + Status &status = i->second; status.time = time(0); return status.lerr; } - void LastErrorHolder::remove( int id ){ + void LastErrorHolder::remove( int id ) { scoped_lock lock(_idsmutex); map::iterator i = _ids.find( id ); if ( i == _ids.end() ) return; - + delete i->second.lerr; _ids.erase( i ); } - void LastErrorHolder::release(){ + void LastErrorHolder::release() { int id = _id.get(); - if ( id == 0 ){ + if ( id == 0 ) { _tl.release(); return; } - + remove( id ); } /** ok to call more than once. */ - void LastErrorHolder::initThread() { + void LastErrorHolder::initThread() { if( _tl.get() ) return; assert( _id.get() == 0 ); _tl.reset( new LastError() ); } - - void LastErrorHolder::reset( LastError * le ){ + + void LastErrorHolder::reset( LastError * le ) { int id = _id.get(); - if ( id == 0 ){ + if ( id == 0 ) { _tl.reset( le ); return; } @@ -159,17 +179,18 @@ namespace mongo { status.time = time(0); status.lerr = le; } - + void prepareErrForNewRequest( Message &m, LastError * err ) { // a killCursors message shouldn't affect last error if ( m.operation() == dbKillCursors ) { err->disabled = true; - } else { + } + else { err->disabled = false; err->nPrev++; - } + } } - + LastError * LastErrorHolder::startRequest( Message& m , int clientId ) { assert( clientId ); setID( clientId ); @@ -183,33 +204,33 @@ namespace mongo { prepareErrForNewRequest( m, connectionOwned ); } - void LastErrorHolder::disconnect( int clientId ){ + void LastErrorHolder::disconnect( int clientId ) { if ( clientId ) remove(clientId); } struct LastErrorHolderTest : public UnitTest { public: - - void test( int i ){ + + void test( int i ) { _tl.set( i ); assert( _tl.get() == i ); } - - void tlmaptest(){ + + void tlmaptest() { test( 1 ); test( 12123123 ); test( -123123 ); test( numeric_limits::min() ); test( numeric_limits::max() ); } - - void run(){ + + void run() { tlmaptest(); LastError * a = new LastError(); LastError * b = new LastError(); - + LastErrorHolder holder; holder.reset( a ); assert( a == holder.get() ); @@ -219,10 +240,10 @@ namespace mongo { assert( b == holder.get() ); holder.setID( 0 ); assert( a == holder.get() ); - + holder.remove( 1 ); } - + ThreadLocalValue _tl; } lastErrorHolderTest; diff --git a/db/lasterror.h b/db/lasterror.h index 2006f1c..c77ec74 100644 --- a/db/lasterror.h +++ b/db/lasterror.h @@ -33,7 +33,7 @@ namespace mongo { int nPrev; bool valid; bool disabled; - void writeback( OID& oid ){ + void writeback( OID& oid ) { reset( true ); writebackId = oid; } @@ -42,13 +42,13 @@ namespace mongo { code = _code; msg = _msg; } - void recordUpdate( bool _updateObjects , long long _nObjects , OID _upsertedId ){ + void recordUpdate( bool _updateObjects , long long _nObjects , OID _upsertedId ) { reset( true ); nObjects = _nObjects; updatedExisting = _updateObjects ? 
True : False; if ( _upsertedId.isSet() ) upsertedId = _upsertedId; - + } void recordDelete( long long nDeleted ) { reset( true ); @@ -68,20 +68,25 @@ namespace mongo { upsertedId.clear(); writebackId.clear(); } - void appendSelf( BSONObjBuilder &b ); + + /** + * @return if there is an err + */ + bool appendSelf( BSONObjBuilder &b , bool blankErr = true ); struct Disabled : boost::noncopyable { - Disabled( LastError * le ){ + Disabled( LastError * le ) { _le = le; - if ( _le ){ + if ( _le ) { _prev = _le->disabled; _le->disabled = true; - } else { + } + else { _prev = false; } } - - ~Disabled(){ + + ~Disabled() { if ( _le ) _le->disabled = _prev; } @@ -89,18 +94,19 @@ namespace mongo { LastError * _le; bool _prev; }; - + static LastError noError; }; extern class LastErrorHolder { public: LastErrorHolder() : _id( 0 ) {} + ~LastErrorHolder(); LastError * get( bool create = false ); - LastError * getSafe(){ + LastError * getSafe() { LastError * le = get(false); - if ( ! le ){ + if ( ! le ) { log( LL_ERROR ) << " no LastError! id: " << getID() << endl; assert( le ); } @@ -122,11 +128,11 @@ namespace mongo { void remove( int id ); void release(); - + /** when db receives a message/request, call this */ void startRequest( Message& m , LastError * connectionOwned ); LastError * startRequest( Message& m , int clientId ); - + void disconnect( int clientId ); // used to disable lastError reporting while processing a killCursors message @@ -135,13 +141,15 @@ namespace mongo { private: ThreadLocalValue _id; boost::thread_specific_ptr _tl; - + struct Status { time_t time; LastError *lerr; }; + typedef map IDMap; + static mongo::mutex _idsmutex; - map _ids; + IDMap _ids; } lastError; void raiseError(int code , const char *msg); diff --git a/db/matcher.cpp b/db/matcher.cpp index cd62563..38e8e05 100644 --- a/db/matcher.cpp +++ b/db/matcher.cpp @@ -30,7 +30,7 @@ #include "pdfile.h" namespace { - inline pcrecpp::RE_Options flags2options(const char* flags){ + inline pcrecpp::RE_Options flags2options(const char* flags) { pcrecpp::RE_Options options; options.set_utf8(true); while ( flags && *flags ) { @@ -52,7 +52,7 @@ namespace { namespace mongo { extern BSONObj staticNull; - + class Where { public: Where() { @@ -64,22 +64,22 @@ namespace mongo { if ( scope.get() ) scope->execSetup( "_mongo.readOnly = false;" , "make not read only" ); - if ( jsScope ){ + if ( jsScope ) { delete jsScope; jsScope = 0; } func = 0; } - + auto_ptr scope; ScriptingFunction func; BSONObj *jsScope; - + void setFunc(const char *code) { massert( 10341 , "scope has to be created first!" 
, scope.get() ); func = scope->createFunction( code ); } - + }; Matcher::~Matcher() { @@ -87,37 +87,48 @@ namespace mongo { where = 0; } - ElementMatcher::ElementMatcher( BSONElement _e , int _op, bool _isNot ) : toMatch( _e ) , compareOp( _op ), isNot( _isNot ) { - if ( _op == BSONObj::opMOD ){ + ElementMatcher::ElementMatcher( BSONElement _e , int _op, bool _isNot ) + : toMatch( _e ) , compareOp( _op ), isNot( _isNot ), subMatcherOnPrimitives(false) { + if ( _op == BSONObj::opMOD ) { BSONObj o = _e.embeddedObject(); mod = o["0"].numberInt(); modm = o["1"].numberInt(); - + uassert( 10073 , "mod can't be 0" , mod ); } - else if ( _op == BSONObj::opTYPE ){ + else if ( _op == BSONObj::opTYPE ) { type = (BSONType)(_e.numberInt()); } - else if ( _op == BSONObj::opELEM_MATCH ){ + else if ( _op == BSONObj::opELEM_MATCH ) { BSONElement m = _e; uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object ); - subMatcher.reset( new Matcher( m.embeddedObject() ) ); + BSONObj x = m.embeddedObject(); + if ( x.firstElement().getGtLtOp() == 0 ) { + subMatcher.reset( new Matcher( x ) ); + subMatcherOnPrimitives = false; + } + else { + // meant to act on primitives + subMatcher.reset( new Matcher( BSON( "" << x ) ) ); + subMatcherOnPrimitives = true; + } } } - ElementMatcher::ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot ) - : toMatch( _e ) , compareOp( _op ), isNot( _isNot ) { - + ElementMatcher::ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot ) + : toMatch( _e ) , compareOp( _op ), isNot( _isNot ), subMatcherOnPrimitives(false) { + myset.reset( new set() ); - + BSONObjIterator i( array ); while ( i.more() ) { BSONElement ie = i.next(); - if ( _op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ){ + if ( _op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) { shared_ptr s; s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) ); allMatchers.push_back( s ); - } else if ( ie.type() == RegEx ) { + } + else if ( ie.type() == RegEx ) { if ( !myregex.get() ) { myregex.reset( new vector< RegexMatcher >() ); } @@ -132,19 +143,20 @@ namespace mongo { string prefix = simpleRegex(rm.regex, rm.flags, &purePrefix); if (purePrefix) rm.prefix = prefix; - } else { + } + else { myset->insert(ie); } } - - if ( allMatchers.size() ){ + + if ( allMatchers.size() ) { uassert( 13020 , "with $all, can't mix $elemMatch and others" , myset->size() == 0 && !myregex.get()); } - + } - - - void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot){ + + + void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot) { if ( nRegex >= 4 ) { out() << "ERROR: too many regexes in query" << endl; @@ -158,106 +170,106 @@ namespace mongo { rm.isNot = isNot; nRegex++; - if (!isNot){ //TODO something smarter + if (!isNot) { //TODO something smarter bool purePrefix; string prefix = simpleRegex(regex, flags, &purePrefix); if (purePrefix) rm.prefix = prefix; } - } + } } - + bool Matcher::addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ) { const char *fn = fe.fieldName(); int op = fe.getGtLtOp( -1 ); - if ( op == -1 ){ - if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ){ + if ( op == -1 ) { + if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ) { 
return false; // { $ref : xxx } - treat as normal object } uassert( 10068 , (string)"invalid operator: " + fn , op != -1 ); } - - switch ( op ){ - case BSONObj::GT: - case BSONObj::GTE: - case BSONObj::LT: - case BSONObj::LTE:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), op, isNot); - break; - } - case BSONObj::NE:{ - haveNeg = true; - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::NE, isNot); - break; - } - case BSONObj::opALL: - all = true; - case BSONObj::opIN: - uassert( 13276 , "$in needs an array" , fe.isABSONObj() ); - basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) ); - break; - case BSONObj::NIN: - uassert( 13277 , "$nin needs an array" , fe.isABSONObj() ); - haveNeg = true; - basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) ); - break; - case BSONObj::opMOD: - case BSONObj::opTYPE: - case BSONObj::opELEM_MATCH: { - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - // these are types where ElementMatcher has all the info - basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) ); - break; - } - case BSONObj::opSIZE:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot); - haveSize = true; - break; - } - case BSONObj::opEXISTS:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot); - break; - } - case BSONObj::opREGEX:{ - uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot ); - if ( fe.type() == RegEx ){ - regex = fe.regex(); - flags = fe.regexFlags(); - } - else { - regex = fe.valuestrsafe(); - } - break; - } - case BSONObj::opOPTIONS:{ - uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot ); - flags = fe.valuestrsafe(); - break; - } - case BSONObj::opNEAR: - case BSONObj::opWITHIN: - case BSONObj::opMAX_DISTANCE: - break; - default: - uassert( 10069 , (string)"BUG - can't operator for: " + fn , 0 ); - } + + switch ( op ) { + case BSONObj::GT: + case BSONObj::GTE: + case BSONObj::LT: + case BSONObj::LTE: { + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), op, isNot); + break; + } + case BSONObj::NE: { + haveNeg = true; + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::NE, isNot); + break; + } + case BSONObj::opALL: + all = true; + case BSONObj::opIN: + uassert( 13276 , "$in needs an array" , fe.isABSONObj() ); + basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) ); + break; + case BSONObj::NIN: + uassert( 13277 , "$nin needs an array" , fe.isABSONObj() ); + haveNeg = true; + basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) ); + break; + case BSONObj::opMOD: + case BSONObj::opTYPE: + case BSONObj::opELEM_MATCH: { + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + // these are 
types where ElementMatcher has all the info + basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) ); + break; + } + case BSONObj::opSIZE: { + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot); + haveSize = true; + break; + } + case BSONObj::opEXISTS: { + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot); + break; + } + case BSONObj::opREGEX: { + uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot ); + if ( fe.type() == RegEx ) { + regex = fe.regex(); + flags = fe.regexFlags(); + } + else { + regex = fe.valuestrsafe(); + } + break; + } + case BSONObj::opOPTIONS: { + uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot ); + flags = fe.valuestrsafe(); + break; + } + case BSONObj::opNEAR: + case BSONObj::opWITHIN: + case BSONObj::opMAX_DISTANCE: + break; + default: + uassert( 10069 , (string)"BUG - can't operator for: " + fn , 0 ); + } return true; } - + void Matcher::parseOr( const BSONElement &e, bool subMatcher, list< shared_ptr< Matcher > > &matchers ) { uassert( 13090, "nested $or/$nor not allowed", !subMatcher ); uassert( 13086, "$or/$nor must be a nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 ); @@ -276,14 +288,16 @@ namespace mongo { return false; if ( ef[ 1 ] == 'o' && ef[ 2 ] == 'r' && ef[ 3 ] == 0 ) { parseOr( e, subMatcher, _orMatchers ); - } else if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) { + } + else if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) { parseOr( e, subMatcher, _norMatchers ); - } else { + } + else { return false; } return true; } - + /* _jsobj - the query pattern */ Matcher::Matcher(const BSONObj &_jsobj, bool subMatcher) : @@ -293,6 +307,8 @@ namespace mongo { while ( i.more() ) { BSONElement e = i.next(); + uassert( 13629 , "can't have undefined in a query expression" , e.type() != Undefined ); + if ( parseOrNor( e, subMatcher ) ) { continue; } @@ -301,7 +317,7 @@ namespace mongo { // $where: function()... uassert( 10066 , "$where occurs twice?", where == 0 ); uassert( 10067 , "$where query, but no script engine", globalScriptEngine ); - massert( 13089 , "no current client needed for $where" , haveClient() ); + massert( 13089 , "no current client needed for $where" , haveClient() ); where = new Where(); where->scope = globalScriptEngine->getPooledScope( cc().ns() ); where->scope->localConnect( cc().database()->name.c_str() ); @@ -314,7 +330,7 @@ namespace mongo { const char *code = e.valuestr(); where->setFunc(code); } - + where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" ); continue; @@ -324,7 +340,7 @@ namespace mongo { addRegex( e.fieldName(), e.regex(), e.regexFlags() ); continue; } - + // greater than / less than... 
// e.g., e == { a : { $gt : 3 } } // or @@ -333,35 +349,36 @@ namespace mongo { // support {$regex:"a|b", $options:"imx"} const char* regex = NULL; const char* flags = ""; - + // e.g., fe == { $gt : 3 } BSONObjIterator j(e.embeddedObject()); bool isOperator = false; while ( j.more() ) { BSONElement fe = j.next(); const char *fn = fe.fieldName(); - + if ( fn[0] == '$' && fn[1] ) { isOperator = true; - + if ( fn[1] == 'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) { haveNeg = true; switch( fe.type() ) { - case Object: { - BSONObjIterator k( fe.embeddedObject() ); - uassert( 13030, "$not cannot be empty", k.more() ); - while( k.more() ) { - addOp( e, k.next(), true, regex, flags ); - } - break; + case Object: { + BSONObjIterator k( fe.embeddedObject() ); + uassert( 13030, "$not cannot be empty", k.more() ); + while( k.more() ) { + addOp( e, k.next(), true, regex, flags ); } - case RegEx: - addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true ); - break; - default: - uassert( 13031, "invalid use of $not", false ); + break; + } + case RegEx: + addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true ); + break; + default: + uassert( 13031, "invalid use of $not", false ); } - } else { + } + else { if ( !addOp( e, fe, false, regex, flags ) ) { isOperator = false; break; @@ -373,43 +390,43 @@ namespace mongo { break; } } - if (regex){ + if (regex) { addRegex(e.fieldName(), regex, flags); } if ( isOperator ) continue; } - if ( e.type() == Array ){ + if ( e.type() == Array ) { hasArray = true; } else if( strcmp(e.fieldName(), "$atomic") == 0 ) { _atomic = e.trueValue(); continue; } - + // normal, simple case e.g. { a : "foo" } addBasic(e, BSONObj::Equality, false); } } - + Matcher::Matcher( const Matcher &other, const BSONObj &key ) : - where(0), constrainIndexKey_( key ), haveSize(), all(), hasArray(0), haveNeg(), _atomic(false), nRegex(0) { + where(0), constrainIndexKey_( key ), haveSize(), all(), hasArray(0), haveNeg(), _atomic(false), nRegex(0) { // do not include fields which would make keyMatch() false for( vector< ElementMatcher >::const_iterator i = other.basics.begin(); i != other.basics.end(); ++i ) { if ( key.hasField( i->toMatch.fieldName() ) ) { switch( i->compareOp ) { - case BSONObj::opSIZE: - case BSONObj::opALL: - case BSONObj::NE: - case BSONObj::NIN: - break; - default: { - if ( !i->isNot && i->toMatch.type() != Array ) { - basics.push_back( *i ); - } + case BSONObj::opSIZE: + case BSONObj::opALL: + case BSONObj::NE: + case BSONObj::NIN: + break; + default: { + if ( !i->isNot && i->toMatch.type() != Array ) { + basics.push_back( *i ); } } + } } } for( int i = 0; i < other.nRegex; ++i ) { @@ -421,29 +438,29 @@ namespace mongo { _orMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) ); } } - + inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) { - switch (e.type()){ - case String: - case Symbol: - if (rm.prefix.empty()) - return rm.re->PartialMatch(e.valuestr()); - else - return !strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size()); - case RegEx: - return !strcmp(rm.regex, e.regex()) && !strcmp(rm.flags, e.regexFlags()); - default: - return false; + switch (e.type()) { + case String: + case Symbol: + if (rm.prefix.empty()) + return rm.re->PartialMatch(e.valuestr()); + else + return !strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size()); + case RegEx: + return !strcmp(rm.regex, e.regex()) && !strcmp(rm.flags, e.regexFlags()); + default: + return false; } } - + inline int Matcher::valuesMatch(const BSONElement& l, const 
BSONElement& r, int op, const ElementMatcher& bm) { assert( op != BSONObj::NE && op != BSONObj::NIN ); - + if ( op == BSONObj::Equality ) { return l.valuesEqual(r); } - + if ( op == BSONObj::opIN ) { // { $in : [1,2,3] } int count = bm.myset->count(l); @@ -471,15 +488,15 @@ namespace mongo { } return count == r.number(); } - - if ( op == BSONObj::opMOD ){ + + if ( op == BSONObj::opMOD ) { if ( ! l.isNumber() ) return false; - + return l.numberLong() % bm.mod == bm.modm; } - - if ( op == BSONObj::opTYPE ){ + + if ( op == BSONObj::opTYPE ) { return bm.type == l.type(); } @@ -506,7 +523,7 @@ namespace mongo { return 0; return bm.toMatch.boolean() ? -1 : 1; } - + /* Check if a particular field matches. fieldName - field to match "a.b" if we are reaching into an embedded object. @@ -519,8 +536,8 @@ namespace mongo { { "a.b" : 3 } means obj.a.b == 3 { a : { $lt : 3 } } means obj.a < 3 - { a : { $in : [1,2] } } means [1,2].contains(obj.a) - + { a : { $in : [1,2] } } means [1,2].contains(obj.a) + return value -1 mismatch 0 missing element @@ -529,20 +546,20 @@ namespace mongo { int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& em , bool isArr, MatchDetails * details ) { DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? "yes" : "no" ) ); if ( compareOp == BSONObj::opALL ) { - - if ( em.allMatchers.size() ){ + + if ( em.allMatchers.size() ) { BSONElement e = obj.getFieldDotted( fieldName ); uassert( 13021 , "$all/$elemMatch needs to be applied to array" , e.type() == Array ); - - for ( unsigned i=0; imatches( f.embeddedObject() ) ){ + if ( em.allMatchers[i]->matches( f.embeddedObject() ) ) { found = true; break; } @@ -551,36 +568,32 @@ namespace mongo { if ( ! found ) return -1; } - + return 1; } - + if ( em.myset->size() == 0 && !em.myregex.get() ) return -1; // is this desired? 
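A minimal sketch, using stand-in types rather than the real BSONElement/BSONElementSet classes, of the set-membership idea behind the $in and $all handling in the surrounding hunks: the query's values are preloaded into a set, and matching reduces to membership tests against the values found at the (dotted) field.

    #include <set>
    #include <string>
    #include <vector>

    typedef std::string Value;   // illustrative stand-in for BSONElement

    // $in : true if the document's value appears in the query set
    bool inMatches( const std::set<Value>& querySet, const Value& docValue ) {
        return querySet.count( docValue ) > 0;
    }

    // $all : true only if every queried value appears among the document's values
    bool allMatches( const std::set<Value>& docValues, const std::vector<Value>& queryValues ) {
        for ( std::vector<Value>::const_iterator i = queryValues.begin(); i != queryValues.end(); ++i )
            if ( docValues.count( *i ) == 0 )
                return false;
        return true;
    }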
- - BSONObjSetDefaultOrder actualKeys; - IndexSpec( BSON( fieldName << 1 ) ).getKeys( obj, actualKeys ); - if ( actualKeys.size() == 0 ) - return 0; - + + BSONElementSet myValues; + obj.getFieldsDotted( fieldName , myValues ); + for( set< BSONElement, element_lt >::const_iterator i = em.myset->begin(); i != em.myset->end(); ++i ) { // ignore nulls if ( i->type() == jstNULL ) continue; - // parallel traversal would be faster worst case I guess - BSONObjBuilder b; - b.appendAs( *i, "" ); - if ( !actualKeys.count( b.done() ) ) + + if ( myValues.count( *i ) == 0 ) return -1; } if ( !em.myregex.get() ) return 1; - + for( vector< RegexMatcher >::const_iterator i = em.myregex->begin(); i != em.myregex->end(); ++i ) { bool match = false; - for( BSONObjSetDefaultOrder::const_iterator j = actualKeys.begin(); j != actualKeys.end(); ++j ) { - if ( regexMatches( *i, j->firstElement() ) ) { + for( BSONElementSet::const_iterator j = myValues.begin(); j != myValues.end(); ++j ) { + if ( regexMatches( *i, *j ) ) { match = true; break; } @@ -588,10 +601,10 @@ namespace mongo { if ( !match ) return -1; } - + return 1; } // end opALL - + if ( compareOp == BSONObj::NE ) return matchesNe( fieldName, toMatch, obj, em , details ); if ( compareOp == BSONObj::NIN ) { @@ -613,18 +626,19 @@ namespace mongo { } return 1; } - + BSONElement e; bool indexed = !constrainIndexKey_.isEmpty(); if ( indexed ) { e = obj.getFieldUsingIndexNames(fieldName, constrainIndexKey_); - if( e.eoo() ){ + if( e.eoo() ) { cout << "obj: " << obj << endl; cout << "fieldName: " << fieldName << endl; cout << "constrainIndexKey_: " << constrainIndexKey_ << endl; assert( !e.eoo() ); } - } else { + } + else { const char *p = strchr(fieldName, '.'); if ( p ) { @@ -662,7 +676,7 @@ namespace mongo { if ( details ) details->elemMatchKey = z.fieldName(); return 1; - } + } else if ( cmp < 0 ) { found = true; } @@ -671,7 +685,7 @@ namespace mongo { return found ? -1 : retMissing( em ); } - if( p ) { + if( p ) { return retMissing( em ); } else { @@ -681,21 +695,31 @@ namespace mongo { if ( compareOp == BSONObj::opEXISTS ) { return ( e.eoo() ^ ( toMatch.boolean() ^ em.isNot ) ) ? 
1 : -1; - } else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) && - valuesMatch(e, toMatch, compareOp, em ) ) { + } + else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) && + valuesMatch(e, toMatch, compareOp, em ) ) { return 1; - } else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) { + } + else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) { BSONObjIterator ai(e.embeddedObject()); while ( ai.moreWithEOO() ) { BSONElement z = ai.next(); - - if ( compareOp == BSONObj::opELEM_MATCH ){ - // SERVER-377 - if ( z.type() == Object && em.subMatcher->matches( z.embeddedObject() ) ){ - if ( details ) - details->elemMatchKey = z.fieldName(); - return 1; + + if ( compareOp == BSONObj::opELEM_MATCH ) { + if ( z.type() == Object ) { + if ( em.subMatcher->matches( z.embeddedObject() ) ) { + if ( details ) + details->elemMatchKey = z.fieldName(); + return 1; + } + } + else if ( em.subMatcherOnPrimitives ) { + if ( z.type() && em.subMatcher->matches( z.wrap( "" ) ) ) { + if ( details ) + details->elemMatchKey = z.fieldName(); + return 1; + } } } else { @@ -707,12 +731,12 @@ namespace mongo { } } - - if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ){ + + if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ) { // match an entire array to itself return 1; } - + } else if ( e.eoo() ) { // 0 indicates "missing element" @@ -745,7 +769,8 @@ namespace mongo { if ( ( bm.compareOp == BSONObj::NE ) ^ bm.isNot ) { return false; } - } else { + } + else { if ( !bm.isNot ) { return false; } @@ -760,7 +785,8 @@ namespace mongo { BSONElement e = jsobj.getFieldUsingIndexNames(rm.fieldName, constrainIndexKey_); if ( !e.eoo() ) s.insert( e ); - } else { + } + else { jsobj.getFieldsDotted( rm.fieldName, s ); } bool match = false; @@ -770,11 +796,11 @@ namespace mongo { if ( !match ^ rm.isNot ) return false; } - + if ( _orMatchers.size() > 0 ) { bool match = false; for( list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin(); - i != _orMatchers.end(); ++i ) { + i != _orMatchers.end(); ++i ) { // SERVER-205 don't submit details - we don't want to track field // matched within $or, and at this point we've already loaded the // whole document @@ -787,55 +813,56 @@ namespace mongo { return false; } } - + if ( _norMatchers.size() > 0 ) { for( list< shared_ptr< Matcher > >::const_iterator i = _norMatchers.begin(); - i != _norMatchers.end(); ++i ) { + i != _norMatchers.end(); ++i ) { // SERVER-205 don't submit details - we don't want to track field // matched within $nor, and at this point we've already loaded the // whole document if ( (*i)->matches( jsobj ) ) { return false; } - } + } } - + for( vector< shared_ptr< FieldRangeVector > >::const_iterator i = _orConstraints.begin(); - i != _orConstraints.end(); ++i ) { + i != _orConstraints.end(); ++i ) { if ( (*i)->matches( jsobj ) ) { return false; } } - + if ( where ) { if ( where->func == 0 ) { uassert( 10070 , "$where compile error", false); return false; // didn't compile } - - if ( where->jsScope ){ + + if ( where->jsScope ) { where->scope->init( where->jsScope ); } where->scope->setThis( const_cast< BSONObj * >( &jsobj ) ); where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) ); where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant - + int err = where->scope->invoke( where->func , BSONObj() , 1000 * 60 , false ); where->scope->setThis( 0 ); if ( err == -3 ) { // INVOKE_ERROR stringstream 
ss; - ss << "error on invocation of $where function:\n" + ss << "error on invocation of $where function:\n" << where->scope->getError(); uassert( 10071 , ss.str(), false); return false; - } else if ( err != 0 ) { // ! INVOKE_SUCCESS + } + else if ( err != 0 ) { // ! INVOKE_SUCCESS uassert( 10072 , "unknown error in invocation of $where function", false); - return false; + return false; } return where->scope->getBoolean( "return" ) != 0; } - + return true; } @@ -880,9 +907,9 @@ namespace mongo { } } return true; - } - - + } + + /*- just for testing -- */ #pragma pack(1) struct JSObj1 { @@ -946,7 +973,7 @@ namespace mongo { assert( !n.matches(j2) ); BSONObj j0 = BSONObj(); -// BSONObj j0((const char *) &js0); +// BSONObj j0((const char *) &js0); Matcher p(j0); assert( p.matches(j1) ); assert( p.matches(j2) ); @@ -959,7 +986,7 @@ namespace mongo { RXTest() { } - + void run() { /* static const boost::regex e("(\\d{4}[- ]){3}\\d{4}"); @@ -969,7 +996,7 @@ namespace mongo { */ int ret = 0; - + pcre_config( PCRE_CONFIG_UTF8 , &ret ); massert( 10342 , "pcre not compiled with utf8 support" , ret ); @@ -987,7 +1014,7 @@ namespace mongo { pcre_config( PCRE_CONFIG_UNICODE_PROPERTIES , &ret ); if ( ! ret ) cout << "warning: some regex utf8 things will not work. pcre build doesn't have --enable-unicode-properties" << endl; - + } } rxtest; diff --git a/db/matcher.h b/db/matcher.h index a4e1667..d242df6 100644 --- a/db/matcher.h +++ b/db/matcher.h @@ -24,7 +24,7 @@ #include namespace mongo { - + class Cursor; class CoveredIndexMatcher; class Matcher; @@ -40,11 +40,9 @@ namespace mongo { bool isNot; RegexMatcher() : isNot() {} }; - - struct element_lt - { - bool operator()(const BSONElement& l, const BSONElement& r) const - { + + struct element_lt { + bool operator()(const BSONElement& l, const BSONElement& r) const { int x = (int) l.canonicalType() - (int) r.canonicalType(); if ( x < 0 ) return true; else if ( x > 0 ) return false; @@ -52,17 +50,17 @@ namespace mongo { } }; - + class ElementMatcher { public: - + ElementMatcher() { } - + ElementMatcher( BSONElement _e , int _op, bool _isNot ); - + ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot ); - + ~ElementMatcher() { } BSONElement toMatch; @@ -70,13 +68,14 @@ namespace mongo { bool isNot; shared_ptr< set > myset; shared_ptr< vector > myregex; - + // these are for specific operators int mod; int modm; BSONType type; shared_ptr subMatcher; + bool subMatcherOnPrimitives ; vector< shared_ptr > allMatchers; }; @@ -85,15 +84,15 @@ namespace mongo { class DiskLoc; struct MatchDetails { - MatchDetails(){ + MatchDetails() { reset(); } - - void reset(){ + + void reset() { loadedObject = false; elemMatchKey = 0; } - + string toString() const { stringstream ss; ss << "loadedObject: " << loadedObject << " "; @@ -129,7 +128,7 @@ namespace mongo { const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher&bm, MatchDetails * details ); - + public: static int opDirection(int op) { return op <= BSONObj::LTE ? 
-1 : 1; @@ -140,14 +139,14 @@ namespace mongo { ~Matcher(); bool matches(const BSONObj& j, MatchDetails * details = 0 ); - + // fast rough check to see if we must load the real doc - we also // compare field counts against covereed index matcher; for $or clauses // we just compare field counts bool keyMatch() const { return !all && !haveSize && !hasArray && !haveNeg; } bool atomic() const { return _atomic; } - + bool hasType( BSONObj::MatchType type ) const; string toString() const { @@ -157,18 +156,18 @@ namespace mongo { void addOrConstraint( const shared_ptr< FieldRangeVector > &frv ) { _orConstraints.push_back( frv ); } - + void popOrClause() { _orMatchers.pop_front(); } - + bool sameCriteriaCount( const Matcher &other ) const; - + private: // Only specify constrainIndexKey if matches() will be called with // index keys having empty string field names. Matcher( const Matcher &other, const BSONObj &constrainIndexKey ); - + void addBasic(const BSONElement &e, int c, bool isNot) { // TODO May want to selectively ignore these element types based on op type. if ( e.type() == MinKey || e.type() == MaxKey ) @@ -178,7 +177,7 @@ namespace mongo { void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false); bool addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ); - + int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm); bool parseOrNor( const BSONElement &e, bool subMatcher ); @@ -194,7 +193,7 @@ namespace mongo { bool haveNeg; /* $atomic - if true, a multi document operation (some removes, updates) - should be done atomically. in that case, we do not yield - + should be done atomically. in that case, we do not yield - i.e. we stay locked the whole time. http://www.mongodb.org/display/DOCS/Removing[ */ @@ -211,26 +210,27 @@ namespace mongo { friend class CoveredIndexMatcher; }; - + // If match succeeds on index key, then attempt to match full document. class CoveredIndexMatcher : boost::noncopyable { public: CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false ); - bool matches(const BSONObj &o){ return _docMatcher->matches( o ); } - bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 ); + bool matches(const BSONObj &o) { return _docMatcher->matches( o ); } + bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 , bool keyUsable = true ); bool matchesCurrent( Cursor * cursor , MatchDetails * details = 0 ); - bool needRecord(){ return _needRecord; } - + bool needRecord() { return _needRecord; } + Matcher& docMatcher() { return *_docMatcher; } // once this is called, shouldn't use this matcher for matching any more void advanceOrClause( const shared_ptr< FieldRangeVector > &frv ) { _docMatcher->addOrConstraint( frv ); - // TODO this is not an optimal optimization, since we could skip an entire + // TODO this is not yet optimal. Since we could skip an entire // or clause (if a match is impossible) between calls to advanceOrClause() + // we may not pop all the clauses we can. 
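The keyMatch() predicate above decides whether a query can be answered from index keys alone. A hedged illustration of which query shapes trip each flag; the flag names are the ones used in this header, the example queries are illustrative only:

    // all      : { a : { $all : [ 1, 2 ] } }   -- needs the full document
    // haveSize : { a : { $size : 3 } }         -- array length is not recoverable from one key
    // hasArray : { a : [ 1, 2 ] }              -- whole-array equality
    // haveNeg  : { a : { $ne : 5 } }           -- negations need the document
    struct KeyMatchFlags { bool all, haveSize, hasArray, haveNeg; };
    inline bool keyOnly( const KeyMatchFlags& f ) {
        return !f.all && !f.haveSize && !f.hasArray && !f.haveNeg;
    }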
_docMatcher->popOrClause(); } - + CoveredIndexMatcher *nextClauseMatcher( const BSONObj &indexKeyPattern, bool alwaysUseRecord=false ) { return new CoveredIndexMatcher( _docMatcher, indexKeyPattern, alwaysUseRecord ); } @@ -239,7 +239,10 @@ namespace mongo { void init( bool alwaysUseRecord ); shared_ptr< Matcher > _docMatcher; Matcher _keyMatcher; - bool _needRecord; + + bool _needRecord; // if the key itself isn't good enough to determine a positive match + bool _needRecordReject; // if the key itself isn't good enough to determine a negative match + bool _useRecordOnly; }; - + } // namespace mongo diff --git a/db/matcher_covered.cpp b/db/matcher_covered.cpp index 5866505..18892be 100644 --- a/db/matcher_covered.cpp +++ b/db/matcher_covered.cpp @@ -33,48 +33,51 @@ namespace mongo { CoveredIndexMatcher::CoveredIndexMatcher( const BSONObj &jsobj, const BSONObj &indexKeyPattern, bool alwaysUseRecord) : _docMatcher( new Matcher( jsobj ) ), - _keyMatcher( *_docMatcher, indexKeyPattern ) - { + _keyMatcher( *_docMatcher, indexKeyPattern ) { init( alwaysUseRecord ); } - + CoveredIndexMatcher::CoveredIndexMatcher( const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord ) : _docMatcher( docMatcher ), - _keyMatcher( *_docMatcher, indexKeyPattern ) - { + _keyMatcher( *_docMatcher, indexKeyPattern ) { init( alwaysUseRecord ); } void CoveredIndexMatcher::init( bool alwaysUseRecord ) { - _needRecord = - alwaysUseRecord || - ! ( _docMatcher->keyMatch() && - _keyMatcher.sameCriteriaCount( *_docMatcher ) && - ! _keyMatcher.hasType( BSONObj::opEXISTS ) ); - ; + _needRecord = + alwaysUseRecord || + ! ( _docMatcher->keyMatch() && + _keyMatcher.sameCriteriaCount( *_docMatcher ) ); + + _needRecordReject = _keyMatcher.hasType( BSONObj::opEXISTS ); } - - bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ){ - return matches( cursor->currKey() , cursor->currLoc() , details ); + + bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ) { + // bool keyUsable = ! cursor->isMultiKey() && check for $orish like conditions in matcher SERVER-1264 + return matches( cursor->currKey() , cursor->currLoc() , details ); } - - bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details ) { + + bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details , bool keyUsable ) { if ( details ) details->reset(); - - if ( !_keyMatcher.matches(key, details ) ){ - return false; - } - - if ( ! _needRecord ){ - return true; + + if ( _needRecordReject == false && keyUsable ) { + + if ( !_keyMatcher.matches(key, details ) ) { + return false; + } + + if ( ! _needRecord ) { + return true; + } + } if ( details ) details->loadedObject = true; - return _docMatcher->matches(recLoc.rec() , details ); + return _docMatcher->matches(recLoc.obj() , details ); } - + } diff --git a/db/minilex.h b/db/minilex.h index ba8df26..677514a 100644 --- a/db/minilex.h +++ b/db/minilex.h @@ -17,37 +17,39 @@ * along with this program. If not, see . */ +#error does anything use this? 
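A condensed, illustrative restatement of the CoveredIndexMatcher::matches() flow shown in the matcher_covered.cpp hunk above: when the key is usable and there are no reject-needs-record cases, try the cheap key match first and only consult the full document when the key alone cannot decide. The parameters stand in for the checks the real method performs; this is not the server API.

    bool coveredMatch( bool keyUsable, bool needRecordReject,
                       bool keyMatches, bool needRecord, bool docMatches ) {
        if ( !needRecordReject && keyUsable ) {
            if ( !keyMatches )
                return false;          // the key alone can reject
            if ( !needRecord )
                return true;           // the key alone can accept
        }
        return docMatches;             // otherwise fall back to the full document
    }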
+ namespace mongo { #if defined(_WIN32) - + } // namespace mongo #include using namespace stdext; namespace mongo { - + typedef const char * MyStr; struct less_str { bool operator()(const MyStr & x, const MyStr & y) const { if ( strcmp(x, y) > 0) return true; - + return false; } }; - + typedef hash_map > strhashmap; - + #else - + } // namespace mongo #include namespace mongo { - + using namespace __gnu_cxx; typedef const char * MyStr; @@ -55,106 +57,108 @@ namespace mongo { bool operator()(const MyStr & x, const MyStr & y) const { if ( strcmp(x, y) == 0) return true; - + return false; } }; - + typedef hash_map, eq_str > strhashmap; - + #endif - - struct MiniLex { + + /* + struct MiniLexNotUsed { strhashmap reserved; bool ic[256]; // ic=Identifier Character bool starter[256]; // dm: very dumb about comments and escaped quotes -- but we are faster then at least, // albeit returning too much (which is ok for jsbobj current usage). - void grabVariables(char *code /*modified and must stay in scope*/, strhashmap& vars) { - char *p = code; - char last = 0; - while ( *p ) { - if ( starter[*p] ) { - char *q = p+1; - while ( *q && ic[*q] ) q++; - const char *identifier = p; - bool done = *q == 0; - *q = 0; - if ( !reserved.count(identifier) ) { - // we try to be smart about 'obj' but have to be careful as obj.obj - // can happen; this is so that nFields is right for simplistic where cases - // so we can stop scanning in jsobj when we find the field of interest. - if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' ) - ; - else - vars[identifier] = 1; - } - if ( done ) - break; - p = q + 1; - continue; - } - - if ( *p == '\'' ) { - p++; - while ( *p && *p != '\'' ) p++; - } - else if ( *p == '"' ) { - p++; - while ( *p && *p != '"' ) p++; - } - p++; + void grabVariables(char *code , strhashmap& vars) { // 'code' modified and must stay in scope*/ + char *p = code; + char last = 0; + while ( *p ) { + if ( starter[*p] ) { + char *q = p+1; + while ( *q && ic[*q] ) q++; + const char *identifier = p; + bool done = *q == 0; + *q = 0; + if ( !reserved.count(identifier) ) { + // we try to be smart about 'obj' but have to be careful as obj.obj + // can happen; this is so that nFields is right for simplistic where cases + // so we can stop scanning in jsobj when we find the field of interest. + if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' 
) + ; + else + vars[identifier] = 1; } + if ( done ) + break; + p = q + 1; + continue; } - MiniLex() { - strhashmap atest; - atest["foo"] = 3; - assert( atest.count("bar") == 0 ); - assert( atest.count("foo") == 1 ); - assert( atest["foo"] == 3 ); - - for ( int i = 0; i < 256; i++ ) { - ic[i] = starter[i] = false; - } - for ( int i = 'a'; i <= 'z'; i++ ) - ic[i] = starter[i] = true; - for ( int i = 'A'; i <= 'Z'; i++ ) - ic[i] = starter[i] = true; - for ( int i = '0'; i <= '9'; i++ ) - ic[i] = true; - for ( int i = 128; i < 256; i++ ) - ic[i] = starter[i] = true; - ic['$'] = starter['$'] = true; - ic['_'] = starter['_'] = true; - - reserved["break"] = true; - reserved["case"] = true; - reserved["catch"] = true; - reserved["continue"] = true; - reserved["default"] = true; - reserved["delete"] = true; - reserved["do"] = true; - reserved["else"] = true; - reserved["finally"] = true; - reserved["for"] = true; - reserved["function"] = true; - reserved["if"] = true; - reserved["in"] = true; - reserved["instanceof"] = true; - reserved["new"] = true; - reserved["return"] = true; - reserved["switch"] = true; - reserved["this"] = true; - reserved["throw"] = true; - reserved["try"] = true; - reserved["typeof"] = true; - reserved["var"] = true; - reserved["void"] = true; - reserved["while"] = true; - reserved["with "] = true; + if ( *p == '\'' ) { + p++; + while ( *p && *p != '\'' ) p++; } - }; + else if ( *p == '"' ) { + p++; + while ( *p && *p != '"' ) p++; + } + p++; + } +} + +MiniLex() { + strhashmap atest; + atest["foo"] = 3; + assert( atest.count("bar") == 0 ); + assert( atest.count("foo") == 1 ); + assert( atest["foo"] == 3 ); + + for ( int i = 0; i < 256; i++ ) { + ic[i] = starter[i] = false; + } + for ( int i = 'a'; i <= 'z'; i++ ) + ic[i] = starter[i] = true; + for ( int i = 'A'; i <= 'Z'; i++ ) + ic[i] = starter[i] = true; + for ( int i = '0'; i <= '9'; i++ ) + ic[i] = true; + for ( int i = 128; i < 256; i++ ) + ic[i] = starter[i] = true; + ic['$'] = starter['$'] = true; + ic['_'] = starter['_'] = true; + + reserved["break"] = true; + reserved["case"] = true; + reserved["catch"] = true; + reserved["continue"] = true; + reserved["default"] = true; + reserved["delete"] = true; + reserved["do"] = true; + reserved["else"] = true; + reserved["finally"] = true; + reserved["for"] = true; + reserved["function"] = true; + reserved["if"] = true; + reserved["in"] = true; + reserved["instanceof"] = true; + reserved["new"] = true; + reserved["return"] = true; + reserved["switch"] = true; + reserved["this"] = true; + reserved["throw"] = true; + reserved["try"] = true; + reserved["typeof"] = true; + reserved["var"] = true; + reserved["void"] = true; + reserved["while"] = true; + reserved["with "] = true; +} +}; +*/ } // namespace mongo diff --git a/db/module.cpp b/db/module.cpp index 1e4f511..6a182f2 100644 --- a/db/module.cpp +++ b/db/module.cpp @@ -24,29 +24,29 @@ namespace mongo { std::list * Module::_all; Module::Module( const string& name ) - : _name( name ) , _options( (string)"Module " + name + " options" ){ + : _name( name ) , _options( (string)"Module " + name + " options" ) { if ( ! _all ) _all = new list(); _all->push_back( this ); } - Module::~Module(){} + Module::~Module() {} - void Module::addOptions( program_options::options_description& options ){ + void Module::addOptions( program_options::options_description& options ) { if ( ! 
_all ) { return; } - for ( list::iterator i=_all->begin(); i!=_all->end(); i++ ){ + for ( list::iterator i=_all->begin(); i!=_all->end(); i++ ) { Module* m = *i; options.add( m->_options ); } } - void Module::configAll( program_options::variables_map& params ){ + void Module::configAll( program_options::variables_map& params ) { if ( ! _all ) { return; } - for ( list::iterator i=_all->begin(); i!=_all->end(); i++ ){ + for ( list::iterator i=_all->begin(); i!=_all->end(); i++ ) { Module* m = *i; m->config( params ); } @@ -54,11 +54,11 @@ namespace mongo { } - void Module::initAll(){ + void Module::initAll() { if ( ! _all ) { return; } - for ( list::iterator i=_all->begin(); i!=_all->end(); i++ ){ + for ( list::iterator i=_all->begin(); i!=_all->end(); i++ ) { Module* m = *i; m->init(); } diff --git a/db/module.h b/db/module.h index d4939dd..e90923a 100644 --- a/db/module.h +++ b/db/module.h @@ -34,8 +34,8 @@ namespace mongo { public: Module( const string& name ); virtual ~Module(); - - boost::program_options::options_description_easy_init add_options(){ + + boost::program_options::options_description_easy_init add_options() { return _options.add_options(); } @@ -54,10 +54,10 @@ namespace mongo { */ virtual void shutdown() = 0; - const string& getName(){ return _name; } - + const string& getName() { return _name; } + // --- static things - + static void addOptions( program_options::options_description& options ); static void configAll( program_options::variables_map& params ); static void initAll(); diff --git a/db/modules/mms.cpp b/db/modules/mms.cpp index 40e9001..b180262 100644 --- a/db/modules/mms.cpp +++ b/db/modules/mms.cpp @@ -37,54 +37,54 @@ namespace mongo { MMS() : Module( "mms" ) , _baseurl( "" ) , _secsToSleep(1) , _token( "" ) , _name( "" ) { - + add_options() - ( "mms-url" , po::value()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" ) - ( "mms-token" , po::value() , "account token for mongo monitoring server" ) - ( "mms-name" , po::value() , "server name for mongo monitoring server" ) - ( "mms-interval" , po::value()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" ) - ; - } - - ~MMS(){} - - void config( program_options::variables_map& params ){ + ( "mms-url" , po::value()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" ) + ( "mms-token" , po::value() , "account token for mongo monitoring server" ) + ( "mms-name" , po::value() , "server name for mongo monitoring server" ) + ( "mms-interval" , po::value()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" ) + ; + } + + ~MMS() {} + + void config( program_options::variables_map& params ) { _baseurl = params["mms-url"].as(); - if ( params.count( "mms-token" ) ){ + if ( params.count( "mms-token" ) ) { _token = params["mms-token"].as(); } - if ( params.count( "mms-name" ) ){ + if ( params.count( "mms-name" ) ) { _name = params["mms-name"].as(); } _secsToSleep = params["mms-interval"].as(); } - - void run(){ - if ( _token.size() == 0 && _name.size() == 0 ){ + + void run() { + if ( _token.size() == 0 && _name.size() == 0 ) { log(1) << "mms not configured" << endl; return; } - if ( _token.size() == 0 ){ + if ( _token.size() == 0 ) { log() << "no token for mms - not running" << endl; return; } - - if ( _name.size() == 0 ){ + + if ( _name.size() == 0 ) { log() << "no name for mms - not running" << endl; return; } - + log() << "mms monitor staring... 
token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl; Client::initThread( "mms" ); Client& c = cc(); - - + + // TODO: using direct client is bad, but easy for now - - while ( ! inShutdown() ){ + + while ( ! inShutdown() ) { sleepsecs( _secsToSleep ); - + try { stringstream url; url << _baseurl << "?" @@ -92,47 +92,47 @@ namespace mongo { << "name=" << _name << "&" << "ts=" << time(0) ; - + BSONObjBuilder bb; // duplicated so the post has everything bb.append( "token" , _token ); bb.append( "name" , _name ); bb.appendDate( "ts" , jsTime() ); - + // any commands _add( bb , "buildinfo" ); _add( bb , "serverStatus" ); - + BSONObj postData = bb.obj(); - + log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;; - + HttpClient c; HttpClient::Result r; int rc = c.post( url.str() , postData.jsonString() , &r ); log(1) << "\t response code: " << rc << endl; - if ( rc != 200 ){ + if ( rc != 200 ) { log() << "mms error response code:" << rc << endl; log(1) << "mms error body:" << r.getEntireResponse() << endl; } } - catch ( std::exception& e ){ + catch ( std::exception& e ) { log() << "mms exception: " << e.what() << endl; } } - + c.shutdown(); } - - void _add( BSONObjBuilder& postData , const char* cmd ){ + + void _add( BSONObjBuilder& postData , const char* cmd ) { Command * c = Command::findCommand( cmd ); - if ( ! c ){ + if ( ! c ) { log() << "MMS can't find command: " << cmd << endl; postData.append( cmd , "can't find command" ); return; } - - if ( c->locktype() ){ + + if ( c->locktype() ) { log() << "MMS can only use noLocking commands not: " << cmd << endl; postData.append( cmd , "not noLocking" ); return; @@ -147,24 +147,24 @@ namespace mongo { else postData.append( cmd , sub.obj() ); } - - void init(){ go(); } - void shutdown(){ + void init() { go(); } + + void shutdown() { // TODO } private: string _baseurl; int _secsToSleep; - + string _token; string _name; - + } /*mms*/ ; } - + diff --git a/db/mongommf.cpp b/db/mongommf.cpp new file mode 100644 index 0000000..5ae573d --- /dev/null +++ b/db/mongommf.cpp @@ -0,0 +1,391 @@ +// @file mongommf.cpp + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +/* this module adds some of our layers atop memory mapped files - specifically our handling of private views & such + if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class, not this. +*/ + +#include "pch.h" +#include "cmdline.h" +#include "mongommf.h" +#include "dur.h" +#include "dur_journalformat.h" +#include "../util/mongoutils/str.h" + +using namespace mongoutils; + +namespace mongo { + +#if defined(_WIN32) + extern mutex mapViewMutex; + + __declspec(noinline) void makeChunkWritable(size_t chunkno) { + scoped_lock lk(mapViewMutex); + + if( writable.get(chunkno) ) // double check lock + return; + + // remap all maps in this chunk. 
common case is a single map, but could have more than one with smallfiles or .ns files + size_t chunkStart = chunkno * MemoryMappedFile::ChunkSize; + size_t chunkNext = chunkStart + MemoryMappedFile::ChunkSize; + + scoped_lock lk2(privateViews._mutex()); + map::iterator i = privateViews.finditer_inlock((void*) (chunkNext-1)); + while( 1 ) { + const pair x = *(--i); + MongoMMF *mmf = x.second; + if( mmf == 0 ) + break; + + size_t viewStart = (size_t) x.first; + size_t viewEnd = viewStart + mmf->length(); + if( viewEnd <= chunkStart ) + break; + + size_t protectStart = max(viewStart, chunkStart); + dassert(protectStart0&&protectSize<=MemoryMappedFile::ChunkSize); + + DWORD old; + bool ok = VirtualProtect((void*)protectStart, protectSize, PAGE_WRITECOPY, &old); + if( !ok ) { + DWORD e = GetLastError(); + log() << "VirtualProtect failed " << chunkno << hex << protectStart << ' ' << protectSize << ' ' << errnoWithDescription(e) << endl; + assert(false); + } + } + + writable.set(chunkno); + } + + __declspec(noinline) void makeChunkWritableOld(size_t chunkno) { + scoped_lock lk(mapViewMutex); + + if( writable.get(chunkno) ) + return; + + size_t loc = chunkno * MemoryMappedFile::ChunkSize; + void *Loc = (void*) loc; + size_t ofs; + MongoMMF *mmf = privateViews.find( (void *) (loc), ofs ); + MemoryMappedFile *f = (MemoryMappedFile*) mmf; + assert(f); + + size_t len = MemoryMappedFile::ChunkSize; + assert( mmf->getView() <= Loc ); + if( ofs + len > f->length() ) { + // at the very end of the map + len = f->length() - ofs; + } + else { + ; + } + + // todo: check this goes away on remap + DWORD old; + bool ok = VirtualProtect(Loc, len, PAGE_WRITECOPY, &old); + if( !ok ) { + DWORD e = GetLastError(); + log() << "VirtualProtect failed " << Loc << ' ' << len << ' ' << errnoWithDescription(e) << endl; + assert(false); + } + + writable.set(chunkno); + } + + // align so that there is only one map per chunksize so our bitset works right + void* mapaligned(HANDLE h, unsigned long long _len) { + void *loc = 0; + int n = 0; + while( 1 ) { + n++; + void *m = MapViewOfFileEx(h, FILE_MAP_READ, 0, 0, 0, loc); + if( m == 0 ) { + DWORD e = GetLastError(); + if( n == 0 ) { + // if first fails, it isn't going to work + log() << "mapaligned errno: " << e << endl; + break; + } + if( debug && n == 1 ) { + log() << "mapaligned info e:" << e << " at n=1" << endl; + } + if( n > 98 ) { + log() << "couldn't align mapped view of file len:" << _len/1024.0/1024.0 << "MB errno:" << e << endl; + break; + } + loc = (void*) (((size_t)loc)+MemoryMappedFile::ChunkSize); + continue; + } + + size_t x = (size_t) m; + if( x % MemoryMappedFile::ChunkSize == 0 ) { + void *end = (void*) (x+_len); + DEV log() << "mapaligned " << m << '-' << end << " len:" << _len << endl; + return m; + } + + UnmapViewOfFile(m); + x = ((x+MemoryMappedFile::ChunkSize-1) / MemoryMappedFile::ChunkSize) * MemoryMappedFile::ChunkSize; + loc = (void*) x; + if( n % 20 == 0 ) { + log() << "warning mapaligned n=20" << endl; + } + if( n > 100 ) { + log() << "couldn't align mapped view of file len:" << _len/1024.0/1024.0 << "MB" << endl; + break; + } + } + return 0; + } + + void* MemoryMappedFile::createPrivateMap() { + assert( maphandle ); + scoped_lock lk(mapViewMutex); + //void *p = mapaligned(maphandle, len); + void *p = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0); + if ( p == 0 ) { + DWORD e = GetLastError(); + log() << "createPrivateMap failed " << filename() << " " << errnoWithDescription(e) << endl; + } + else { + clearWritableBits(p); + views.push_back(p); + 
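The Windows path above lazily write-enables fixed-size chunks of the private view. A minimal sketch, assuming only that chunks are ChunkSize-aligned address ranges, of how a pointer about to be written would map to the chunk index that makeChunkWritable() flips:

    #include <cstddef>

    // illustrative only: chunk index for an address, given the fixed chunk size
    inline size_t chunkNumberFor( const void* p, size_t chunkSize ) {
        return reinterpret_cast<size_t>( p ) / chunkSize;
    }
    // makeChunkWritable( chunkNumberFor( p, MemoryMappedFile::ChunkSize ) ) would then
    // VirtualProtect every mapped view overlapping that chunk, as the code above does.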
} + return p; + } + + void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) { + dbMutex.assertWriteLocked(); // short window where we are unmapped so must be exclusive + + // the mapViewMutex is to assure we get the same address on the remap + scoped_lock lk(mapViewMutex); + + clearWritableBits(oldPrivateAddr); + + if( !UnmapViewOfFile(oldPrivateAddr) ) { + DWORD e = GetLastError(); + log() << "UnMapViewOfFile failed " << filename() << ' ' << errnoWithDescription(e) << endl; + assert(false); + } + + // we want the new address to be the same as the old address in case things keep pointers around (as namespaceindex does). + void *p = MapViewOfFileEx(maphandle, FILE_MAP_READ, 0, 0, + /*dwNumberOfBytesToMap 0 means to eof*/0 /*len*/, + oldPrivateAddr); + + if ( p == 0 ) { + DWORD e = GetLastError(); + log() << "MapViewOfFileEx failed " << filename() << " " << errnoWithDescription(e) << endl; + assert(p); + } + assert(p == oldPrivateAddr); + return p; + } +#endif + + void MongoMMF::remapThePrivateView() { + assert( cmdLine.dur ); + + // todo 1.9 : it turns out we require that we always remap to the same address. + // so the remove / add isn't necessary and can be removed + privateViews.remove(_view_private); + _view_private = remapPrivateView(_view_private); + privateViews.add(_view_private, this); + } + + /** register view. threadsafe */ + void PointerToMMF::add(void *view, MongoMMF *f) { + assert(view); + assert(f); + mutex::scoped_lock lk(_m); + _views.insert( pair(view,f) ); + } + + /** de-register view. threadsafe */ + void PointerToMMF::remove(void *view) { + if( view ) { + mutex::scoped_lock lk(_m); + _views.erase(view); + } + } + + PointerToMMF::PointerToMMF() : _m("PointerToMMF") { +#if defined(SIZE_MAX) + size_t max = SIZE_MAX; +#else + size_t max = ~((size_t)0); +#endif + assert( max > (size_t) this ); // just checking that no one redef'd SIZE_MAX and that it is sane + + // this way we don't need any boundary checking in _find() + _views.insert( pair((void*)0,(MongoMMF*)0) ); + _views.insert( pair((void*)max,(MongoMMF*)0) ); + } + + /** underscore version of find is for when you are already locked + @param ofs out return our offset in the view + @return the MongoMMF to which this pointer belongs + */ + MongoMMF* PointerToMMF::find_inlock(void *p, /*out*/ size_t& ofs) { + // + // .................memory.......................... + // v1 p v2 + // [--------------------] [-------] + // + // e.g., _find(p) == v1 + // + const pair x = *(--_views.upper_bound(p)); + MongoMMF *mmf = x.second; + if( mmf ) { + size_t o = ((char *)p) - ((char*)x.first); + if( o < mmf->length() ) { + ofs = o; + return mmf; + } + } + return 0; + } + + /** find associated MMF object for a given pointer. + threadsafe + @param ofs out returns offset into the view of the pointer, if found. + @return the MongoMMF to which this pointer belongs. null if not found. 
+ */ + MongoMMF* PointerToMMF::find(void *p, /*out*/ size_t& ofs) { + mutex::scoped_lock lk(_m); + return find_inlock(p, ofs); + } + + PointerToMMF privateViews; + + /* void* MongoMMF::switchToPrivateView(void *readonly_ptr) { + assert( cmdLine.dur ); + assert( testIntent ); + + void *p = readonly_ptr; + + { + size_t ofs=0; + MongoMMF *mmf = ourReadViews.find(p, ofs); + if( mmf ) { + void *res = ((char *)mmf->_view_private) + ofs; + return res; + } + } + + { + size_t ofs=0; + MongoMMF *mmf = privateViews.find(p, ofs); + if( mmf ) { + log() << "dur: perf warning p=" << p << " is already in the writable view of " << mmf->filename() << endl; + return p; + } + } + + // did you call writing() with a pointer that isn't into a datafile? + log() << "dur error switchToPrivateView " << p << endl; + return p; + }*/ + + /* switch to _view_write. normally, this is a bad idea since your changes will not + show up in _view_private if there have been changes there; thus the leading underscore + as a tad of a "warning". but useful when done with some care, such as during + initialization. + */ + void* MongoMMF::_switchToWritableView(void *p) { + size_t ofs; + MongoMMF *f = privateViews.find(p, ofs); + assert( f ); + return (((char *)f->_view_write)+ofs); + } + + extern string dbpath; + + // here so that it is precomputed... + void MongoMMF::setPath(string f) { + string suffix; + string prefix; + bool ok = str::rSplitOn(f, '.', prefix, suffix); + uassert(13520, str::stream() << "MongoMMF only supports filenames in a certain format " << f, ok); + if( suffix == "ns" ) + _fileSuffixNo = dur::JEntry::DotNsSuffix; + else + _fileSuffixNo = (int) str::toUnsigned(suffix); + + _p = RelativePath::fromFullPath(prefix); + } + + bool MongoMMF::open(string fname, bool sequentialHint) { + setPath(fname); + _view_write = mapWithOptions(fname.c_str(), sequentialHint ? SEQUENTIAL : 0); + return finishOpening(); + } + + bool MongoMMF::create(string fname, unsigned long long& len, bool sequentialHint) { + setPath(fname); + _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0); + return finishOpening(); + } + + bool MongoMMF::finishOpening() { + if( _view_write ) { + if( cmdLine.dur ) { + _view_private = createPrivateMap(); + if( _view_private == 0 ) { + massert( 13636 , "createPrivateMap failed (look in log for error)" , false ); + } + privateViews.add(_view_private, this); // note that testIntent builds use this, even though it points to view_write then... + } + else { + _view_private = _view_write; + } + return true; + } + return false; + } + + MongoMMF::MongoMMF() : _willNeedRemap(false) { + _view_write = _view_private = 0; + } + + MongoMMF::~MongoMMF() { + close(); + } + + namespace dur { + void closingFileNotification(); + } + + /*virtual*/ void MongoMMF::close() { + { + if( cmdLine.dur && _view_write/*actually was opened*/ ) { + if( debug ) + log() << "closingFileNotication:" << filename() << endl; + dur::closingFileNotification(); + } + privateViews.remove(_view_private); + } + _view_write = _view_private = 0; + MemoryMappedFile::close(); + } + +} diff --git a/db/mongommf.h b/db/mongommf.h new file mode 100644 index 0000000..5da46fc --- /dev/null +++ b/db/mongommf.h @@ -0,0 +1,140 @@ +/** @file mongommf.h +* +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "../util/mmap.h" +#include "../util/paths.h" + +namespace mongo { + + /** MongoMMF adds some layers atop memory mapped files - specifically our handling of private views & such. + if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class, + not this. + */ + class MongoMMF : private MemoryMappedFile { + public: + MongoMMF(); + virtual ~MongoMMF(); + virtual void close(); + + /** @return true if opened ok. */ + bool open(string fname, bool sequentialHint); + + /** @return file length */ + unsigned long long length() const { return MemoryMappedFile::length(); } + + string filename() const { return MemoryMappedFile::filename(); } + + void flush(bool sync) { MemoryMappedFile::flush(sync); } + + /* Creates with length if DNE, otherwise uses existing file length, + passed length. + @param sequentialHint if true will be sequentially accessed + @return true for ok + */ + bool create(string fname, unsigned long long& len, bool sequentialHint); + + /* Get the "standard" view (which is the private one). + @return the private view. + */ + void* getView() const { return _view_private; } + + /* Get the "write" view (which is required for writing). + @return the write view. + */ + void* view_write() const { return _view_write; } + + + /* switch to _view_write. normally, this is a bad idea since your changes will not + show up in _view_private if there have been changes there; thus the leading underscore + as a tad of a "warning". but useful when done with some care, such as during + initialization. + */ + static void* _switchToWritableView(void *private_ptr); + + /** for a filename a/b/c.3 + filePath() is "a/b/c" + fileSuffixNo() is 3 + if the suffix is "ns", fileSuffixNo -1 + */ + RelativePath relativePath() const { + DEV assert( !_p._p.empty() ); + return _p; + } + + int fileSuffixNo() const { return _fileSuffixNo; } + + /** true if we have written. + set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration. + reset to false in REMAPPRIVATEVIEW + */ + bool& willNeedRemap() { return _willNeedRemap; } + + void remapThePrivateView(); + + virtual bool isMongoMMF() { return true; } + + private: + + void *_view_write; + void *_view_private; + bool _willNeedRemap; + RelativePath _p; // e.g. "somepath/dbname" + int _fileSuffixNo; // e.g. 3. -1="ns" + + void setPath(string pathAndFileName); + bool finishOpening(); + }; + + /** for durability support we want to be able to map pointers to specific MongoMMF objects. + */ + class PointerToMMF : boost::noncopyable { + public: + PointerToMMF(); + + /** register view. \ + threadsafe + */ + void add(void *view, MongoMMF *f); + + /** de-register view. + threadsafe + */ + void remove(void *view); + + /** find associated MMF object for a given pointer. + threadsafe + @param ofs out returns offset into the view of the pointer, if found. + @return the MongoMMF to which this pointer belongs. null if not found. 
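A minimal sketch of the lookup technique PointerToMMF describes above: keep view start addresses in an ordered map, take upper_bound of the pointer and step back one entry to find the candidate view, then range-check. Stand-in types, not the real classes; the real code avoids the begin() check by inserting sentinel entries at 0 and SIZE_MAX.

    #include <map>
    #include <cstddef>

    // views maps a view's start address to its length
    const char* owningViewStart( const std::map<char*, size_t>& views, char* p ) {
        std::map<char*, size_t>::const_iterator i = views.upper_bound( p );
        if ( i == views.begin() )
            return 0;                                   // below every registered view
        --i;                                            // greatest start address <= p
        return p < i->first + i->second ? i->first : 0; // inside this view, or in a gap
    }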
+ */ + MongoMMF* find(void *p, /*out*/ size_t& ofs); + + /** for doing many finds in a row with one lock operation */ + mutex& _mutex() { return _m; } + MongoMMF* find_inlock(void *p, /*out*/ size_t& ofs); + + map::iterator finditer_inlock(void *p) { return _views.upper_bound(p); } + + private: + mutex _m; + map _views; + }; + + // allows a pointer into any private view of a MongoMMF to be resolved to the MongoMMF object + extern PointerToMMF privateViews; +} diff --git a/db/mongomutex.h b/db/mongomutex.h new file mode 100644 index 0000000..fac4113 --- /dev/null +++ b/db/mongomutex.h @@ -0,0 +1,239 @@ +// @file mongomutex.h + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#pragma once + +// note: include concurrency.h, not this. + +namespace mongo { + + /** the 'big lock' we use for most operations. a read/write lock. + there is one of these, dbMutex. + + generally if you need to declare a mutex use the right primitive class, not this. + + use readlock and writelock classes for scoped locks on this rather than direct + manipulation. + */ + class MongoMutex { + public: + MongoMutex(const char * name); + + /** @return + * > 0 write lock + * = 0 no lock + * < 0 read lock + */ + int getState() const { return _state.get(); } + + bool atLeastReadLocked() const { return _state.get() != 0; } + void assertAtLeastReadLocked() const { assert(atLeastReadLocked()); } + bool isWriteLocked() const { return getState() > 0; } + void assertWriteLocked() const { + assert( getState() > 0 ); + DEV assert( !_releasedEarly.get() ); + } + + // write lock. use the writelock scoped lock class, not this directly. + void lock() { + if ( _writeLockedAlready() ) + return; + + _state.set(1); + + Client *c = curopWaitingForLock( 1 ); // stats + _m.lock(); + curopGotLock(c); + + _minfo.entered(); + + MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build + + _acquiredWriteLock(); + } + + // try write lock + bool lock_try( int millis ) { + if ( _writeLockedAlready() ) + return true; + + Client *c = curopWaitingForLock( 1 ); + bool got = _m.lock_try( millis ); + + if ( got ) { + curopGotLock(c); + _minfo.entered(); + _state.set(1); + MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build + _acquiredWriteLock(); + } + + return got; + } + + // un write lock + void unlock() { + int s = _state.get(); + if( s > 1 ) { + _state.set(s-1); // recursive lock case + return; + } + if( s != 1 ) { + if( _releasedEarly.get() ) { + _releasedEarly.set(false); + return; + } + massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false); + } + _releasingWriteLock(); + MongoFile::unmarkAllWritable(); // _DEBUG validation + _state.set(0); + _minfo.leaving(); + _m.unlock(); + } + + /* unlock (write lock), and when unlock() is called later, + be smart then and don't unlock it again. 
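A simplified illustration (write path only, stand-in type, not the real class) of the recursion counting MongoMutex describes above: the underlying lock is touched only on the 0 <-> 1 transitions, while deeper recursive acquisitions just move the counter.

    struct WriteRecursion {
        int depth;
        WriteRecursion() : depth( 0 ) {}
        bool lock()   { return ++depth == 1; }   // true => caller takes the real write lock
        bool unlock() { return --depth == 0; }   // true => caller releases the real write lock
    };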
+ */ + void releaseEarly() { + assert( getState() == 1 ); // must not be recursive + assert( !_releasedEarly.get() ); + _releasedEarly.set(true); + unlock(); + } + + // read lock. don't call directly, use readlock. + void lock_shared() { + int s = _state.get(); + if( s ) { + if( s > 0 ) { + // already in write lock - just be recursive and stay write locked + _state.set(s+1); + } + else { + // already in read lock - recurse + _state.set(s-1); + } + } + else { + _state.set(-1); + Client *c = curopWaitingForLock( -1 ); + _m.lock_shared(); + curopGotLock(c); + } + } + + // try read lock + bool lock_shared_try( int millis ) { + int s = _state.get(); + if ( s ) { + // we already have a lock, so no need to try + lock_shared(); + return true; + } + + /* [dm] should there be + Client *c = curopWaitingForLock( 1 ); + here? i think so. seems to be missing. + */ + bool got = _m.lock_shared_try( millis ); + if ( got ) + _state.set(-1); + return got; + } + + void unlock_shared() { + int s = _state.get(); + if( s > 0 ) { + assert( s > 1 ); /* we must have done a lock write first to have s > 1 */ + _state.set(s-1); + return; + } + if( s < -1 ) { + _state.set(s+1); + return; + } + assert( s == -1 ); + _state.set(0); + _m.unlock_shared(); + } + + MutexInfo& info() { return _minfo; } + + private: + void _acquiredWriteLock(); + void _releasingWriteLock(); + + /* @return true if was already write locked. increments recursive lock count. */ + bool _writeLockedAlready(); + + RWLock _m; + + /* > 0 write lock with recurse count + < 0 read lock + */ + ThreadLocalValue _state; + + MutexInfo _minfo; + + public: + // indicates we need to call dur::REMAPPRIVATEVIEW on the next write lock + bool _remapPrivateViewRequested; + + private: + /* See the releaseEarly() method. + we use a separate TLS value for releasedEarly - that is ok as + our normal/common code path, we never even touch it */ + ThreadLocalValue _releasedEarly; + + /* this is for fsyncAndLock command. otherwise write lock's greediness will + make us block on any attempted write lock the the fsync's lock. + */ + //volatile bool _blockWrites; + }; + + extern MongoMutex &dbMutex; + + namespace dur { + void REMAPPRIVATEVIEW(); + void releasingWriteLock(); // because it's hard to include dur.h here + } + + inline void MongoMutex::_releasingWriteLock() { + dur::releasingWriteLock(); + } + + inline void MongoMutex::_acquiredWriteLock() { + if( _remapPrivateViewRequested ) { + dur::REMAPPRIVATEVIEW(); + dassert( !_remapPrivateViewRequested ); + } + } + + /* @return true if was already write locked. increments recursive lock count. */ + inline bool MongoMutex::_writeLockedAlready() { + int s = _state.get(); + if( s > 0 ) { + _state.set(s+1); + return true; + } + massert( 10293 , string("internal error: locks are not upgradeable: ") + sayClientState() , s == 0 ); + return false; + } + +} diff --git a/db/mr.cpp b/db/mr.cpp deleted file mode 100644 index 7786c85..0000000 --- a/db/mr.cpp +++ /dev/null @@ -1,721 +0,0 @@ -// mr.cpp - -/** - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. 
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "pch.h" -#include "db.h" -#include "instance.h" -#include "commands.h" -#include "../scripting/engine.h" -#include "../client/dbclient.h" -#include "../client/connpool.h" -#include "../client/parallel.h" -#include "queryoptimizer.h" -#include "matcher.h" -#include "clientcursor.h" - -namespace mongo { - - namespace mr { - - typedef vector BSONList; - - class MyCmp { - public: - MyCmp(){} - bool operator()( const BSONObj &l, const BSONObj &r ) const { - return l.firstElement().woCompare( r.firstElement() ) < 0; - } - }; - - typedef pair Data; - //typedef list< Data > InMemory; - typedef map< BSONObj,BSONList,MyCmp > InMemory; - - BSONObj reduceValues( BSONList& values , Scope * s , ScriptingFunction reduce , bool final , ScriptingFunction finalize ){ - uassert( 10074 , "need values" , values.size() ); - - int sizeEstimate = ( values.size() * values.begin()->getField( "value" ).size() ) + 128; - BSONObj key; - - BSONObjBuilder reduceArgs( sizeEstimate ); - boost::scoped_ptr valueBuilder; - - int sizeSoFar = 0; - unsigned n = 0; - for ( ; n ( 4 * 1024 * 1024 ) ){ - assert( n > 1 ); // if not, inf. loop - break; - } - - valueBuilder->append( ee ); - sizeSoFar += ee.size(); - } - assert(valueBuilder); - valueBuilder->done(); - BSONObj args = reduceArgs.obj(); - - s->invokeSafe( reduce , args ); - if ( s->type( "return" ) == Array ){ - uassert( 10075 , "reduce -> multiple not supported yet",0); - return BSONObj(); - } - - int endSizeEstimate = key.objsize() + ( args.objsize() / values.size() ); - - if ( n < values.size() ){ - BSONList x; - for ( ; n < values.size(); n++ ){ - x.push_back( values[n] ); - } - BSONObjBuilder temp( endSizeEstimate ); - temp.append( key.firstElement() ); - s->append( temp , "1" , "return" ); - x.push_back( temp.obj() ); - return reduceValues( x , s , reduce , final , finalize ); - } - - - - if ( finalize ){ - BSONObjBuilder b(endSizeEstimate); - b.appendAs( key.firstElement() , "_id" ); - s->append( b , "value" , "return" ); - s->invokeSafe( finalize , b.obj() ); - } - - BSONObjBuilder b(endSizeEstimate); - b.appendAs( key.firstElement() , final ? "_id" : "0" ); - s->append( b , final ? "value" : "1" , "return" ); - return b.obj(); - } - - class MRSetup { - public: - MRSetup( const string& _dbname , const BSONObj& cmdObj , bool markAsTemp = true ){ - static int jobNumber = 1; - - dbname = _dbname; - ns = dbname + "." + cmdObj.firstElement().valuestr(); - - verbose = cmdObj["verbose"].trueValue(); - keeptemp = cmdObj["keeptemp"].trueValue(); - - { // setup names - stringstream ss; - if ( ! keeptemp ) - ss << "tmp."; - ss << "mr." << cmdObj.firstElement().fieldName() << "_" << time(0) << "_" << jobNumber++; - tempShort = ss.str(); - tempLong = dbname + "." + tempShort; - incLong = tempLong + "_inc"; - - if ( ! keeptemp && markAsTemp ) - cc().addTempCollection( tempLong ); - - replicate = keeptemp; - - if ( cmdObj["out"].type() == String ){ - finalShort = cmdObj["out"].valuestr(); - replicate = true; - } - else - finalShort = tempShort; - - finalLong = dbname + "." 
+ finalShort; - - } - - { // code - mapCode = cmdObj["map"]._asCode(); - reduceCode = cmdObj["reduce"]._asCode(); - if ( cmdObj["finalize"].type() ){ - finalizeCode = cmdObj["finalize"]._asCode(); - } - checkCodeWScope( "map" , cmdObj ); - checkCodeWScope( "reduce" , cmdObj ); - checkCodeWScope( "finalize" , cmdObj ); - - if ( cmdObj["mapparams"].type() == Array ){ - mapparams = cmdObj["mapparams"].embeddedObjectUserCheck(); - } - - if ( cmdObj["scope"].type() == Object ){ - scopeSetup = cmdObj["scope"].embeddedObjectUserCheck(); - } - - } - - { // query options - if ( cmdObj["query"].type() == Object ){ - filter = cmdObj["query"].embeddedObjectUserCheck(); - } - - if ( cmdObj["sort"].type() == Object ){ - sort = cmdObj["sort"].embeddedObjectUserCheck(); - } - - if ( cmdObj["limit"].isNumber() ) - limit = cmdObj["limit"].numberLong(); - else - limit = 0; - } - } - - void checkCodeWScope( const char * field , const BSONObj& o ){ - BSONElement e = o[field]; - if ( e.type() != CodeWScope ) - return; - BSONObj x = e.codeWScopeObject(); - uassert( 13035 , (string)"can't use CodeWScope with map/reduce function: " + field , x.isEmpty() ); - } - - /** - @return number objects in collection - */ - long long renameIfNeeded( DBDirectClient& db ){ - if ( finalLong != tempLong ){ - db.dropCollection( finalLong ); - if ( db.count( tempLong ) ){ - BSONObj info; - uassert( 10076 , "rename failed" , db.runCommand( "admin" , BSON( "renameCollection" << tempLong << "to" << finalLong ) , info ) ); - } - } - return db.count( finalLong ); - } - - string dbname; - string ns; - - // options - bool verbose; - bool keeptemp; - bool replicate; - - // query options - - BSONObj filter; - BSONObj sort; - long long limit; - - // functions - - string mapCode; - string reduceCode; - string finalizeCode; - - BSONObj mapparams; - BSONObj scopeSetup; - - // output tables - string incLong; - - string tempShort; - string tempLong; - - string finalShort; - string finalLong; - - }; // end MRsetup - - class MRState { - public: - MRState( MRSetup& s ) : setup(s){ - scope = globalScriptEngine->getPooledScope( setup.dbname ); - scope->localConnect( setup.dbname.c_str() ); - - map = scope->createFunction( setup.mapCode.c_str() ); - if ( ! map ) - throw UserException( 9012, (string)"map compile failed: " + scope->getError() ); - - reduce = scope->createFunction( setup.reduceCode.c_str() ); - if ( ! reduce ) - throw UserException( 9013, (string)"reduce compile failed: " + scope->getError() ); - - if ( setup.finalizeCode.size() ) - finalize = scope->createFunction( setup.finalizeCode.c_str() ); - else - finalize = 0; - - if ( ! 
setup.scopeSetup.isEmpty() ) - scope->init( &setup.scopeSetup ); - - db.dropCollection( setup.tempLong ); - db.dropCollection( setup.incLong ); - - writelock l( setup.incLong ); - Client::Context ctx( setup.incLong ); - string err; - assert( userCreateNS( setup.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ); - - } - - void finalReduce( BSONList& values ){ - if ( values.size() == 0 ) - return; - - BSONObj key = values.begin()->firstElement().wrap( "_id" ); - BSONObj res = reduceValues( values , scope.get() , reduce , 1 , finalize ); - - writelock l( setup.tempLong ); - Client::Context ctx( setup.incLong ); - if ( setup.replicate ) - theDataFileMgr.insertAndLog( setup.tempLong.c_str() , res , false ); - else - theDataFileMgr.insertWithObjMod( setup.tempLong.c_str() , res , false ); - } - - - MRSetup& setup; - auto_ptr scope; - DBDirectClient db; - - ScriptingFunction map; - ScriptingFunction reduce; - ScriptingFunction finalize; - - }; - - class MRTL { - public: - MRTL( MRState& state ) - : _state( state ) - , _temp(new InMemory()) - { - _size = 0; - numEmits = 0; - } - - void reduceInMemory(){ - boost::shared_ptr old = _temp; - _temp.reset(new InMemory()); - _size = 0; - - for ( InMemory::iterator i=old->begin(); i!=old->end(); i++ ){ - BSONObj key = i->first; - BSONList& all = i->second; - - if ( all.size() == 1 ){ - // this key has low cardinality, so just write to db - writelock l(_state.setup.incLong); - Client::Context ctx(_state.setup.incLong.c_str()); - write( *(all.begin()) ); - } - else if ( all.size() > 1 ){ - BSONObj res = reduceValues( all , _state.scope.get() , _state.reduce , false , 0 ); - insert( res ); - } - } - } - - void dump(){ - writelock l(_state.setup.incLong); - Client::Context ctx(_state.setup.incLong); - - for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ){ - BSONList& all = i->second; - if ( all.size() < 1 ) - continue; - - for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ ) - write( *j ); - } - _temp->clear(); - _size = 0; - - } - - void insert( const BSONObj& a ){ - BSONList& all = (*_temp)[a]; - all.push_back( a ); - _size += a.objsize() + 16; - } - - void checkSize(){ - if ( _size < 1024 * 5 ) - return; - - long before = _size; - reduceInMemory(); - log(1) << " mr: did reduceInMemory " << before << " -->> " << _size << endl; - - if ( _size < 1024 * 15 ) - return; - - dump(); - log(1) << " mr: dumping to db" << endl; - } - - private: - void write( BSONObj& o ){ - theDataFileMgr.insertWithObjMod( _state.setup.incLong.c_str() , o , true ); - } - - MRState& _state; - - boost::shared_ptr _temp; - long _size; - - public: - long long numEmits; - }; - - boost::thread_specific_ptr _tlmr; - - BSONObj fast_emit( const BSONObj& args ){ - uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 ); - uassert( 13069 , "an emit can't be more than 2mb" , args.objsize() < ( 2 * 1024 * 1024 ) ); - _tlmr->insert( args ); - _tlmr->numEmits++; - return BSONObj(); - } - - class MapReduceCommand : public Command { - public: - MapReduceCommand() : Command("mapReduce", false, "mapreduce"){} - virtual bool slaveOk() const { return true; } - - virtual void help( stringstream &help ) const { - help << "Run a map/reduce operation on the server.\n"; - help << "Note this is used for aggregation, not querying, in MongoDB.\n"; - help << "http://www.mongodb.org/display/DOCS/MapReduce"; - } - virtual LockType locktype() const { return NONE; } - bool run(const string& dbname , BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - 
Timer t; - Client::GodScope cg; - Client& client = cc(); - CurOp * op = client.curop(); - - MRSetup mr( dbname , cmd ); - - log(1) << "mr ns: " << mr.ns << endl; - - if ( ! db.exists( mr.ns ) ){ - errmsg = "ns doesn't exist"; - return false; - } - - bool shouldHaveData = false; - - long long num = 0; - long long inReduce = 0; - - BSONObjBuilder countsBuilder; - BSONObjBuilder timingBuilder; - try { - - MRState state( mr ); - state.scope->injectNative( "emit" , fast_emit ); - - MRTL * mrtl = new MRTL( state ); - _tlmr.reset( mrtl ); - - ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , db.count( mr.ns , mr.filter ) ) ); - long long mapTime = 0; - { - readlock lock( mr.ns ); - Client::Context ctx( mr.ns ); - - shared_ptr temp = bestGuessCursor( mr.ns.c_str(), mr.filter, mr.sort ); - auto_ptr cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , mr.ns.c_str() ) ); - - Timer mt; - while ( cursor->ok() ){ - - if ( ! cursor->currentMatches() ){ - cursor->advance(); - continue; - } - - BSONObj o = cursor->current(); - cursor->advance(); - - if ( mr.verbose ) mt.reset(); - - state.scope->setThis( &o ); - if ( state.scope->invoke( state.map , state.setup.mapparams , 0 , true ) ) - throw UserException( 9014, (string)"map invoke failed: " + state.scope->getError() ); - - if ( mr.verbose ) mapTime += mt.micros(); - - num++; - if ( num % 100 == 0 ){ - ClientCursor::YieldLock yield (cursor.get()); - Timer t; - mrtl->checkSize(); - inReduce += t.micros(); - - if ( ! yield.stillOk() ){ - cursor.release(); - break; - } - - killCurrentOp.checkForInterrupt(); - } - pm.hit(); - - if ( mr.limit && num >= mr.limit ) - break; - } - } - pm.finished(); - - killCurrentOp.checkForInterrupt(); - - countsBuilder.appendNumber( "input" , num ); - countsBuilder.appendNumber( "emit" , mrtl->numEmits ); - if ( mrtl->numEmits ) - shouldHaveData = true; - - timingBuilder.append( "mapTime" , mapTime / 1000 ); - timingBuilder.append( "emitLoop" , t.millis() ); - - // final reduce - op->setMessage( "m/r: (2/3) final reduce in memory" ); - mrtl->reduceInMemory(); - mrtl->dump(); - - BSONObj sortKey = BSON( "0" << 1 ); - db.ensureIndex( mr.incLong , sortKey ); - - { - writelock lock( mr.tempLong.c_str() ); - Client::Context ctx( mr.tempLong.c_str() ); - assert( userCreateNS( mr.tempLong.c_str() , BSONObj() , errmsg , mr.replicate ) ); - } - - - { - readlock rl(mr.incLong.c_str()); - Client::Context ctx( mr.incLong ); - - BSONObj prev; - BSONList all; - - assert( pm == op->setMessage( "m/r: (3/3) final reduce to collection" , db.count( mr.incLong ) ) ); - - shared_ptr temp = bestGuessCursor( mr.incLong.c_str() , BSONObj() , sortKey ); - auto_ptr cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , mr.incLong.c_str() ) ); - - while ( cursor->ok() ){ - BSONObj o = cursor->current().getOwned(); - cursor->advance(); - - pm.hit(); - - if ( o.woSortOrder( prev , sortKey ) == 0 ){ - all.push_back( o ); - if ( pm->hits() % 1000 == 0 ){ - if ( ! cursor->yield() ){ - cursor.release(); - break; - } - killCurrentOp.checkForInterrupt(); - } - continue; - } - - ClientCursor::YieldLock yield (cursor.get()); - state.finalReduce( all ); - - all.clear(); - prev = o; - all.push_back( o ); - - if ( ! yield.stillOk() ){ - cursor.release(); - break; - } - - killCurrentOp.checkForInterrupt(); - } - - { - dbtempreleasecond tl; - if ( ! tl.unlocked() ) - log( LL_WARNING ) << "map/reduce can't temp release" << endl; - state.finalReduce( all ); - } - - pm.finished(); - } - - _tlmr.reset( 0 ); - } - catch ( ... 
){ - log() << "mr failed, removing collection" << endl; - db.dropCollection( mr.tempLong ); - db.dropCollection( mr.incLong ); - throw; - } - - long long finalCount = 0; - { - dblock lock; - db.dropCollection( mr.incLong ); - - finalCount = mr.renameIfNeeded( db ); - } - - timingBuilder.append( "total" , t.millis() ); - - result.append( "result" , mr.finalShort ); - result.append( "timeMillis" , t.millis() ); - countsBuilder.appendNumber( "output" , finalCount ); - if ( mr.verbose ) result.append( "timing" , timingBuilder.obj() ); - result.append( "counts" , countsBuilder.obj() ); - - if ( finalCount == 0 && shouldHaveData ){ - result.append( "cmd" , cmd ); - errmsg = "there were emits but no data!"; - return false; - } - - return true; - } - - private: - DBDirectClient db; - - } mapReduceCommand; - - class MapReduceFinishCommand : public Command { - public: - MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ){} - virtual bool slaveOk() const { return true; } - - virtual LockType locktype() const { return NONE; } - bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ - string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe(); - - MRSetup mr( dbname , cmdObj.firstElement().embeddedObjectUserCheck() , false ); - - set servers; - - BSONObjBuilder shardCounts; - map counts; - - BSONObj shards = cmdObj["shards"].embeddedObjectUserCheck(); - vector< auto_ptr > shardCursors; - - { // parse per shard results - BSONObjIterator i( shards ); - while ( i.more() ){ - BSONElement e = i.next(); - string shard = e.fieldName(); - - BSONObj res = e.embeddedObjectUserCheck(); - - uassert( 10078 , "something bad happened" , shardedOutputCollection == res["result"].valuestrsafe() ); - servers.insert( shard ); - shardCounts.appendAs( res["counts"] , shard.c_str() ); - - BSONObjIterator j( res["counts"].embeddedObjectUserCheck() ); - while ( j.more() ){ - BSONElement temp = j.next(); - counts[temp.fieldName()] += temp.numberLong(); - } - - } - - } - - DBDirectClient db; - - { // reduce from each stream - - BSONObj sortKey = BSON( "_id" << 1 ); - - ParallelSortClusteredCursor cursor( servers , dbname + "." + shardedOutputCollection , - Query().sort( sortKey ) ); - cursor.init(); - - auto_ptr s = globalScriptEngine->getPooledScope( dbname ); - s->localConnect( dbname.c_str() ); - ScriptingFunction reduceFunction = s->createFunction( mr.reduceCode.c_str() ); - ScriptingFunction finalizeFunction = 0; - if ( mr.finalizeCode.size() ) - finalizeFunction = s->createFunction( mr.finalizeCode.c_str() ); - - BSONList values; - - result.append( "result" , mr.finalShort ); - - while ( cursor.more() ){ - BSONObj t = cursor.next().getOwned(); - - if ( values.size() == 0 ){ - values.push_back( t ); - continue; - } - - if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ){ - values.push_back( t ); - continue; - } - - - db.insert( mr.tempLong , reduceValues( values , s.get() , reduceFunction , 1 , finalizeFunction ) ); - values.clear(); - values.push_back( t ); - } - - if ( values.size() ) - db.insert( mr.tempLong , reduceValues( values , s.get() , reduceFunction , 1 , finalizeFunction ) ); - } - - long long finalCount = mr.renameIfNeeded( db ); - log(0) << " mapreducefinishcommand " << mr.finalLong << " " << finalCount << endl; - - for ( set::iterator i=servers.begin(); i!=servers.end(); i++ ){ - ScopedDbConnection conn( i->_server ); - conn->dropCollection( dbname + "." 
+ shardedOutputCollection ); - conn.done(); - } - - result.append( "shardCounts" , shardCounts.obj() ); - - { - BSONObjBuilder c; - for ( map::iterator i=counts.begin(); i!=counts.end(); i++ ){ - c.append( i->first , i->second ); - } - result.append( "counts" , c.obj() ); - } - - return 1; - } - } mapReduceFinishCommand; - - } - -} - diff --git a/db/namespace-inl.h b/db/namespace-inl.h new file mode 100644 index 0000000..a777ff8 --- /dev/null +++ b/db/namespace-inl.h @@ -0,0 +1,130 @@ +// @file namespace-inl.h + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "namespace.h" + +namespace mongo { + + inline Namespace& Namespace::operator=(const char *ns) { + // we fill the remaining space with all zeroes here. as the full Namespace struct is in + // the datafiles (the .ns files specifically), that is helpful as then they are deterministic + // in the bytes they have for a given sequence of operations. that makes testing and debugging + // the data files easier. + // + // if profiling indicates this method is a significant bottleneck, we could have a version we + // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes. + // + unsigned len = strlen(ns); + uassert( 10080 , "ns name too long, max size is 128", len < MaxNsLen); + memset(buf, 0, MaxNsLen); + memcpy(buf, ns, len); + return *this; + } + + inline string Namespace::extraName(int i) const { + char ex[] = "$extra"; + ex[5] += i; + string s = string(buf) + ex; + massert( 10348 , "$extra: ns name too long", s.size() < MaxNsLen); + return s; + } + + inline bool Namespace::isExtra() const { + const char *p = strstr(buf, "$extr"); + return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example + } + + inline int Namespace::hash() const { + unsigned x = 0; + const char *p = buf; + while ( *p ) { + x = x * 131 + *p; + p++; + } + return (x & 0x7fffffff) | 0x8000000; // must be > 0 + } + + /* future : this doesn't need to be an inline. */ + inline string Namespace::getSisterNS( const char * local ) const { + assert( local && local[0] != '.' ); + string old(buf); + if ( old.find( "." ) != string::npos ) + old = old.substr( 0 , old.find( "." ) ); + return old + "." + local; + } + + inline IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected ) { + if( idxNo < NIndexesBase ) + return _indexes[idxNo]; + Extra *e = extra(); + if ( ! e ) { + if ( missingExpected ) + throw MsgAssertionException( 13283 , "Missing Extra" ); + massert(13282, "missing Extra", e); + } + int i = idxNo - NIndexesBase; + if( i >= NIndexesExtra ) { + e = e->next(this); + if ( ! 
e ) { + if ( missingExpected ) + throw MsgAssertionException( 13283 , "missing extra" ); + massert(13283, "missing Extra", e); + } + i -= NIndexesExtra; + } + return e->details[i]; + } + + inline int NamespaceDetails::idxNo(IndexDetails& idx) { + IndexIterator i = ii(); + while( i.more() ) { + if( &i.next() == &idx ) + return i.pos()-1; + } + massert( 10349 , "E12000 idxNo fails", false); + return -1; + } + + inline int NamespaceDetails::findIndexByKeyPattern(const BSONObj& keyPattern) { + IndexIterator i = ii(); + while( i.more() ) { + if( i.next().keyPattern() == keyPattern ) + return i.pos()-1; + } + return -1; + } + + // @return offset in indexes[] + inline int NamespaceDetails::findIndexByName(const char *name) { + IndexIterator i = ii(); + while( i.more() ) { + if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 ) + return i.pos()-1; + } + return -1; + } + + inline NamespaceDetails::IndexIterator::IndexIterator(NamespaceDetails *_d) { + d = _d; + i = 0; + n = d->nIndexes; + } + +} diff --git a/db/namespace.cpp b/db/namespace.cpp index 8a1ab6f..fcdaee2 100644 --- a/db/namespace.cpp +++ b/db/namespace.cpp @@ -19,7 +19,7 @@ #include "pch.h" #include "pdfile.h" #include "db.h" -#include "../util/mmap.h" +#include "mongommf.h" #include "../util/hashtab.h" #include "../scripting/engine.h" #include "btree.h" @@ -31,6 +31,8 @@ namespace mongo { + BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 ); + BSONObj idKeyPattern = fromjson("{\"_id\":1}"); /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes @@ -45,7 +47,7 @@ namespace mongo { NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool _capped ) { /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */ firstExtent = lastExtent = capExtent = loc; - datasize = nrecords = 0; + stats.datasize = stats.nrecords = 0; lastExtentSize = 0; nIndexes = 0; capped = _capped; @@ -58,20 +60,23 @@ namespace mongo { // For capped case, signal that we are doing initial extent allocation. 
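The namespace-inl.h hunk above pins sizeof(Namespace) at 128 bytes, zero-fills the buffer on assignment so the .ns files stay byte-for-byte deterministic, and hashes names with a base-131 rolling hash. The standalone sketch below restates that behaviour outside the MongoDB tree; FixedNs is a hypothetical stand-in, not the real class.

    // Illustrative sketch (not MongoDB source): a fixed-width, zero-padded name cell
    // like the Namespace struct above. Zero fill keeps the mapped .ns bytes
    // deterministic; the 131-based rolling hash mirrors Namespace::hash().
    #include <cassert>
    #include <cstring>
    #include <cstdio>

    struct FixedNs {                      // hypothetical stand-in for mongo::Namespace
        enum { MaxLen = 128 };
        char buf[MaxLen];

        FixedNs& operator=(const char* ns) {
            size_t len = strlen(ns);
            assert(len < MaxLen);         // mirrors uassert 10080 ("ns name too long")
            memset(buf, 0, MaxLen);       // deterministic bytes on disk
            memcpy(buf, ns, len);
            return *this;
        }
        int hash() const {                // same scheme as Namespace::hash()
            unsigned x = 0;
            for (const char* p = buf; *p; ++p)
                x = x * 131 + *p;
            return (x & 0x7fffffff) | 0x8000000;   // always > 0
        }
    };

    int main() {
        FixedNs ns;
        ns = "test.foo";
        printf("hash(test.foo) = %d\n", ns.hash());
        return 0;
    }

The final mask keeps the value positive and non-zero, matching the "must be > 0" comment in the hunk.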
if ( capped ) cappedLastDelRecLastExtent().setInvalid(); - assert( sizeof(dataFileVersion) == 2 ); - dataFileVersion = 0; - indexFileVersion = 0; + assert( sizeof(dataFileVersion) == 2 ); + dataFileVersion = 0; + indexFileVersion = 0; multiKeyIndexBits = 0; reservedA = 0; extraOffset = 0; - backgroundIndexBuildInProgress = 0; + indexBuildInProgress = 0; + reservedB = 0; + capped2.cc2_ptr = 0; + capped2.fileNumber = 0; memset(reserved, 0, sizeof(reserved)); } bool NamespaceIndex::exists() const { return !MMF::exists(path()); } - + boost::filesystem::path NamespaceIndex::path() const { boost::filesystem::path ret( dir_ ); if ( directoryperdb ) @@ -88,23 +93,56 @@ namespace mongo { if ( !boost::filesystem::exists( dir ) ) BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( dir ) ); } - - int lenForNewNsFiles = 16 * 1024 * 1024; - - void NamespaceDetails::onLoad(const Namespace& k) { - if( k.isExtra() ) { + + unsigned lenForNewNsFiles = 16 * 1024 * 1024; + +#if defined(_DEBUG) + void NamespaceDetails::dump(const Namespace& k) { + if( !cmdLine.dur ) + cout << "ns offsets which follow will not display correctly with --dur disabled" << endl; + + size_t ofs = 1; // 1 is sentinel that the find call below failed + privateViews.find(this, /*out*/ofs); + + cout << "ns" << hex << setw(8) << ofs << ' '; + cout << k.toString() << '\n'; + + if( k.isExtra() ) { + cout << "ns\t extra" << endl; + return; + } + + cout << "ns " << firstExtent.toString() << ' ' << lastExtent.toString() << " nidx:" << nIndexes << '\n'; + cout << "ns " << stats.datasize << ' ' << stats.nrecords << ' ' << nIndexes << '\n'; + cout << "ns " << capped << ' ' << paddingFactor << ' ' << flags << ' ' << dataFileVersion << '\n'; + cout << "ns " << multiKeyIndexBits << ' ' << indexBuildInProgress << '\n'; + cout << "ns " << (int) reserved[0] << ' ' << (int) reserved[59]; + cout << endl; + } +#endif + + void NamespaceDetails::onLoad(const Namespace& k) { + //dump(k); + + if( k.isExtra() ) { /* overflow storage for indexes - so don't treat as a NamespaceDetails object. */ return; } - assertInWriteLock(); - if( backgroundIndexBuildInProgress ) { - log() << "backgroundIndexBuildInProgress was " << backgroundIndexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl; - backgroundIndexBuildInProgress = 0; + DEV assertInWriteLock(); + + if( indexBuildInProgress || capped2.cc2_ptr ) { + assertInWriteLock(); + if( indexBuildInProgress ) { + log() << "indexBuildInProgress was " << indexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl; + getDur().writingInt( indexBuildInProgress ) = 0; + } + if( capped2.cc2_ptr ) + *getDur().writing(&capped2.cc2_ptr) = 0; } } - static void namespaceOnLoadCallback(const Namespace& k, NamespaceDetails& v) { + static void namespaceOnLoadCallback(const Namespace& k, NamespaceDetails& v) { v.onLoad(k); } @@ -117,105 +155,113 @@ namespace mongo { we need to be sure to clear any cached info for the database in local.*. 
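onLoad() above never pokes the memory-mapped NamespaceDetails directly: each field it repairs after an unclean shutdown is first declared through getDur().writing()/writingInt() so the durability layer can journal the change before the store lands. The sketch below mocks that write-intent idiom with a toy MockDur type; it is a simplified illustration, not the real dur.h interface.

    // Sketch of the write-intent idiom: declare the address to the durability layer,
    // then write through the pointer/reference it hands back.
    #include <cstdio>
    #include <cstddef>
    #include <vector>

    struct MockDur {                          // hypothetical stand-in for mongo::getDur()
        struct Intent { void* p; size_t len; };
        std::vector<Intent> intents;          // pretend journal of declared ranges

        template <typename T>
        T* writing(T* x) {                    // declare intent, return writable pointer
            intents.push_back(Intent{ x, sizeof(T) });
            return x;
        }
        int& writingInt(int& x) { return *writing(&x); }
    };

    static MockDur dur;

    struct Header {                           // pretend mapped structure, like NamespaceDetails
        int indexBuildInProgress;
        unsigned long long cc2_ptr;
    };

    int main() {
        Header h = { 1, 42 };
        // the pattern from onLoad(): reset flags left over from an abnormal shutdown
        dur.writingInt(h.indexBuildInProgress) = 0;
        *dur.writing(&h.cc2_ptr) = 0;
        printf("declared %zu write intents\n", dur.intents.size());
        return 0;
    }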
*/ - /* + /* if ( "local" != database_ ) { DBInfo i(database_.c_str()); i.dbDropped(); } - */ - int len = -1; + */ + + unsigned long long len = 0; boost::filesystem::path nsPath = path(); string pathString = nsPath.string(); - MMF::Pointer p; - if( MMF::exists(nsPath) ) { - p = f.map(pathString.c_str()); - if( !p.isNull() ) { + void *p = 0; + if( MMF::exists(nsPath) ) { + if( f.open(pathString, true) ) { len = f.length(); - if ( len % (1024*1024) != 0 ){ + if ( len % (1024*1024) != 0 ) { log() << "bad .ns file: " << pathString << endl; uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 ); } + p = f.getView(); } - } - else { - // use lenForNewNsFiles, we are making a new database - massert( 10343 , "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 ); + } + else { + // use lenForNewNsFiles, we are making a new database + massert( 10343, "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 ); maybeMkdir(); - long l = lenForNewNsFiles; - p = f.map(pathString.c_str(), l); - if( !p.isNull() ) { - len = (int) l; + unsigned long long l = lenForNewNsFiles; + if( f.create(pathString, l, true) ) { + getDur().createdFile(pathString, l); // always a new file + len = l; assert( len == lenForNewNsFiles ); + p = f.getView(); } - } + } - if ( p.isNull() ) { - problem() << "couldn't open file " << pathString << " terminating" << endl; + if ( p == 0 ) { + /** TODO: this shouldn't terminate? */ + log() << "error couldn't open file " << pathString << " terminating" << endl; dbexit( EXIT_FS ); } - ht = new HashTable(p, len, "namespace index"); + + assert( len <= 0x7fffffff ); + ht = new HashTable(p, (int) len, "namespace index"); if( checkNsFilesOnLoad ) ht->iterAll(namespaceOnLoadCallback); } - + static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , void * extra ) { list * l = (list*)extra; if ( ! k.hasDollarSign() ) l->push_back( (string)k ); } - void NamespaceIndex::getNamespaces( list& tofill , bool onlyCollections ) const { assert( onlyCollections ); // TODO: need to implement this // need boost::bind or something to make this less ugly - + if ( ht ) ht->iterAll( namespaceGetNamespacesCallback , (void*)&tofill ); } void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) { - BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) ); + BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) ); + { + Record *r = (Record *) getDur().writingPtr(d, sizeof(Record)); + d = &r->asDeleted(); // defensive code: try to make us notice if we reference a deleted record - (unsigned&) (((Record *) d)->data) = 0xeeeeeeee; + (unsigned&) (r->data) = 0xeeeeeeee; } - dassert( dloc.drec() == d ); - DEBUGGING out() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl; + DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl; if ( capped ) { if ( !cappedLastDelRecLastExtent().isValid() ) { // Initial extent allocation. Insert at end. 
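NamespaceIndex::init() above opens the .ns file through MongoMMF, rejects any existing file whose length is not a whole number of megabytes, and sizes a brand-new file at lenForNewNsFiles (16MB by default). The helper below sketches only that size policy with ordinary fstreams; the mapping, path handling, and getDur().createdFile() bookkeeping of the real code are omitted, and openOrCreateNsFile is a made-up name.

    #include <cstdio>
    #include <fstream>
    #include <stdexcept>
    #include <string>

    static const unsigned long long kMB = 1024 * 1024;

    unsigned long long openOrCreateNsFile(const std::string& path,
                                          unsigned long long lenForNew = 16 * kMB) {
        std::ifstream in(path.c_str(), std::ios::binary | std::ios::ate);
        if (in) {                                    // existing file: validate its length
            unsigned long long len = static_cast<unsigned long long>(in.tellg());
            if (len % kMB != 0)
                throw std::runtime_error("bad .ns file length, cannot open database");
            return len;
        }
        // new database: pre-size the file so it can be mapped in full
        std::ofstream out(path.c_str(), std::ios::binary);
        out.seekp(static_cast<std::streamoff>(lenForNew - 1));
        out.put('\0');
        return lenForNew;
    }

    int main() {
        try {
            unsigned long long len = openOrCreateNsFile("example.ns");   // hypothetical path
            std::printf("example.ns length: %llu bytes\n", len);
        } catch (const std::exception& e) {
            std::printf("refusing to open: %s\n", e.what());
        }
        return 0;
    }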
d->nextDeleted = DiskLoc(); if ( cappedListOfAllDeletedRecords().isNull() ) - cappedListOfAllDeletedRecords() = dloc; + getDur().writingDiskLoc( cappedListOfAllDeletedRecords() ) = dloc; else { DiskLoc i = cappedListOfAllDeletedRecords(); - for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted ); - i.drec()->nextDeleted = dloc; + for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted ) + ; + i.drec()->nextDeleted.writing() = dloc; } - } else { + } + else { d->nextDeleted = cappedFirstDeletedInCurExtent(); - cappedFirstDeletedInCurExtent() = dloc; + getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = dloc; // always compact() after this so order doesn't matter } - } else { + } + else { int b = bucket(d->lengthWithHeaders); DiskLoc& list = deletedList[b]; DiskLoc oldHead = list; - list = dloc; + getDur().writingDiskLoc(list) = dloc; d->nextDeleted = oldHead; } } - /* - lenToAlloc is WITH header - */ + // lenToAlloc is WITH header DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) { lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; DiskLoc loc = _alloc(ns, lenToAlloc); if ( loc.isNull() ) return loc; - DeletedRecord *r = loc.drec(); + const DeletedRecord *r = loc.drec(); + //r = getDur().writing(r); /* note we want to grab from the front so our next pointers on disk tend to go in a forward direction which is important for performance. */ @@ -229,20 +275,21 @@ namespace mongo { if ( capped == 0 ) { if ( left < 24 || left < (lenToAlloc >> 3) ) { // you get the whole thing. - DataFileMgr::grow(loc, regionlen); + //DataFileMgr::grow(loc, regionlen); return loc; } } /* split off some for further use. */ - r->lengthWithHeaders = lenToAlloc; - DataFileMgr::grow(loc, lenToAlloc); + getDur().writingInt(r->lengthWithHeaders) = lenToAlloc; + //DataFileMgr::grow(loc, lenToAlloc); DiskLoc newDelLoc = loc; newDelLoc.inc(lenToAlloc); DeletedRecord *newDel = DataFileMgr::makeDeletedRecord(newDelLoc, left); - newDel->extentOfs = r->extentOfs; - newDel->lengthWithHeaders = left; - newDel->nextDeleted.Null(); + DeletedRecord *newDelW = getDur().writing(newDel); + newDelW->extentOfs = r->extentOfs; + newDelW->lengthWithHeaders = left; + newDelW->nextDeleted.Null(); addDeletedRec(newDel, newDelLoc); @@ -267,7 +314,7 @@ namespace mongo { int a = cur.a(); if ( a < -1 || a >= 100000 ) { problem() << "~~ Assertion - cur out of range in _alloc() " << cur.toString() << - " a:" << a << " b:" << b << " chain:" << chain << '\n'; + " a:" << a << " b:" << b << " chain:" << chain << '\n'; sayDbContext(); if ( cur == *prev ) prev->Null(); @@ -303,7 +350,7 @@ namespace mongo { cur.Null(); } else { - /*this defensive check only made sense for the mmap storage engine: + /*this defensive check only made sense for the mmap storage engine: if ( r->nextDeleted.getOfs() == 0 ) { problem() << "~~ Assertion - bad nextDeleted " << r->nextDeleted.toString() << " b:" << b << " chain:" << chain << ", fixing.\n"; @@ -316,9 +363,9 @@ namespace mongo { /* unlink ourself from the deleted list */ { - DeletedRecord *bmr = bestmatch.drec(); - *bestprev = bmr->nextDeleted; - bmr->nextDeleted.setInvalid(); // defensive. + const DeletedRecord *bmr = bestmatch.drec(); + *getDur().writing(bestprev) = bmr->nextDeleted; + bmr->nextDeleted.writing().setInvalid(); // defensive. 
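addDeletedRec() and alloc() above maintain bucketed free lists of deleted records: requests are rounded up to 4 bytes, satisfied from the front of a list, and a generous tail is split off and re-filed as a new deleted record. The toy allocator below keeps a single in-memory bucket to show the rounding and split rule; the real bucket sizing and the extent-allocation fallback are not modelled.

    #include <cstdio>
    #include <list>

    struct FreeRec { int len; };

    static std::list<FreeRec> bucket;             // single pretend free list

    int allocRecord(int lenToAlloc) {
        lenToAlloc = (lenToAlloc + 3) & ~3;       // same 4-byte rounding as alloc()
        for (std::list<FreeRec>::iterator it = bucket.begin(); it != bucket.end(); ++it) {
            if (it->len < lenToAlloc)
                continue;
            int left = it->len - lenToAlloc;
            if (left < 24 || left < (lenToAlloc >> 3)) {   // caller gets the whole thing
                int got = it->len;
                bucket.erase(it);
                return got;
            }
            it->len = left;                       // split: remainder stays on the free list
            return lenToAlloc;
        }
        return -1;                                // would fall through to growing an extent
    }

    int main() {
        FreeRec r = { 1000 };
        bucket.push_back(r);
        printf("asked 100, got %d; asked 900, got %d\n", allocRecord(100), allocRecord(900));
        return 0;
    }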
assert(bmr->extentOfs < bestmatch.getOfs()); } @@ -371,9 +418,9 @@ namespace mongo { if ( e == capExtent ) out() << " (capExtent)"; out() << '\n'; - out() << " magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.buf << '\n'; + out() << " magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.toString() << '\n'; out() << " fr: " << e.ext()->firstRecord.toString() << - " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n'; + " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n'; } assert( len * 5 > lastExtentSize ); // assume it is unusually large record; if not, something is broken } @@ -387,12 +434,27 @@ namespace mongo { return cappedAlloc(ns,len); } + void NamespaceIndex::kill_ns(const char *ns) { + if ( !ht ) + return; + Namespace n(ns); + ht->kill(n); + + for( int i = 0; i<=1; i++ ) { + try { + Namespace extra(n.extraName(i).c_str()); + ht->kill(extra); + } + catch(DBException&) { } + } + } + /* extra space for indexes when more than 10 */ NamespaceDetails::Extra* NamespaceIndex::newExtra(const char *ns, int i, NamespaceDetails *d) { assert( i >= 0 && i <= 1 ); Namespace n(ns); Namespace extra(n.extraName(i).c_str()); // throws userexception if ns name too long - + massert( 10350 , "allocExtra: base ns missing?", d ); massert( 10351 , "allocExtra: extra already exists", ht->get(extra) == 0 ); @@ -409,10 +471,10 @@ namespace mongo { long ofs = e->ofsFrom(this); if( i == 0 ) { assert( extraOffset == 0 ); - extraOffset = ofs; + *getDur().writing(&extraOffset) = ofs; assert( extra() == e ); } - else { + else { Extra *hd = extra(); assert( hd->next(this) == 0 ); hd->setNext(ofs); @@ -422,25 +484,23 @@ namespace mongo { /* you MUST call when adding an index. see pdfile.cpp */ IndexDetails& NamespaceDetails::addIndex(const char *thisns, bool resetTransient) { - assert( nsdetails(thisns) == this ); - IndexDetails *id; try { id = &idx(nIndexes,true); } - catch(DBException&) { + catch(DBException&) { allocExtra(thisns, nIndexes); id = &idx(nIndexes,false); } - nIndexes++; + (*getDur().writing(&nIndexes))++; if ( resetTransient ) NamespaceDetailsTransient::get_w(thisns).addedIndex(); return *id; } // must be called when renaming a NS to fix up extra - void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) { + void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) { extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below. 
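newExtra() and addIndex() above extend a collection past its ten inline IndexDetails slots by chaining $extra blocks of thirty slots each. The small helper below only does the slot arithmetic implied by NIndexesBase and NIndexesExtra; the real idx() resolves the block by following Extra::next() offsets inside the mapped file.

    #include <cstdio>
    #include <initializer_list>

    enum { NIndexesBase = 10, NIndexesExtra = 30, NIndexesMax = 64 };

    void locateIndexSlot(int idxNo, int& blockOut, int& slotOut) {
        if (idxNo < NIndexesBase) {               // inline slot in NamespaceDetails itself
            blockOut = 0;
            slotOut  = idxNo;
            return;
        }
        int i = idxNo - NIndexesBase;
        blockOut = 1 + i / NIndexesExtra;          // 1-based $extra block in the chain
        slotOut  = i % NIndexesExtra;
    }

    int main() {
        for (int idx : { 3, 10, 39, 40, 63 }) {
            int block, slot;
            locateIndexSlot(idx, block, slot);
            printf("index %2d -> block %d, slot %2d\n", idx, block, slot);
        }
        return 0;
    }

Two chained blocks are enough for the 64-index cap, which is what the BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 ) in this hunk checks.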
Extra *se = src->extra(); int n = NIndexesBase; @@ -454,7 +514,7 @@ namespace mongo { Extra *nxt = allocExtra(thisns, n); e->setNext( nxt->ofsFrom(this) ); e = nxt; - } + } assert( extraOffset ); } } @@ -473,25 +533,39 @@ namespace mongo { }*/ return -1; } - - long long NamespaceDetails::storageSize( int * numExtents ){ + + long long NamespaceDetails::storageSize( int * numExtents , BSONArrayBuilder * extentInfo ) const { Extent * e = firstExtent.ext(); assert( e ); - + long long total = 0; int n = 0; - while ( e ){ + while ( e ) { total += e->length; - e = e->getNextExtent(); n++; + + if ( extentInfo ) { + extentInfo->append( BSON( "len" << e->length << "loc: " << e->myLoc.toBSONObj() ) ); + } + + e = e->getNextExtent(); } - + if ( numExtents ) *numExtents = n; - + return total; } - + + NamespaceDetails *NamespaceDetails::writingWithExtra() { + vector< pair< long long, unsigned > > writeRanges; + writeRanges.push_back( make_pair( 0, sizeof( NamespaceDetails ) ) ); + for( Extra *e = extra(); e; e = e->next( this ) ) { + writeRanges.push_back( make_pair( (char*)e - (char*)this, sizeof( Extra ) ) ); + } + return reinterpret_cast< NamespaceDetails* >( getDur().writingRangesAtOffsets( this, writeRanges ) ); + } + /* ------------------------------------------------------------------------- */ mongo::mutex NamespaceDetailsTransient::_qcMutex("qc"); @@ -505,14 +579,14 @@ namespace mongo { _keysComputed = false; _indexSpecs.clear(); } - -/* NamespaceDetailsTransient& NamespaceDetailsTransient::get(const char *ns) { - shared_ptr< NamespaceDetailsTransient > &t = map_[ ns ]; - if ( t.get() == 0 ) - t.reset( new NamespaceDetailsTransient(ns) ); - return *t; - } -*/ + + /* NamespaceDetailsTransient& NamespaceDetailsTransient::get(const char *ns) { + shared_ptr< NamespaceDetailsTransient > &t = map_[ ns ]; + if ( t.get() == 0 ) + t.reset( new NamespaceDetailsTransient(ns) ); + return *t; + } + */ void NamespaceDetailsTransient::clearForPrefix(const char *prefix) { assertInWriteLock(); vector< string > found; @@ -523,7 +597,7 @@ namespace mongo { _map[ *i ].reset(); } } - + void NamespaceDetailsTransient::computeIndexKeys() { _keysComputed = true; _indexKeys.clear(); @@ -565,92 +639,92 @@ namespace mongo { void renameNamespace( const char *from, const char *to ) { NamespaceIndex *ni = nsindex( from ); - assert( ni ); + assert( ni ); assert( ni->details( from ) ); assert( ! ni->details( to ) ); - - // Our namespace and index details will move to a different - // memory location. The only references to namespace and - // index details across commands are in cursors and nsd - // transient (including query cache) so clear these. - ClientCursor::invalidate( from ); - NamespaceDetailsTransient::clearForPrefix( from ); - - NamespaceDetails *details = ni->details( from ); - ni->add_ns( to, *details ); + + // Our namespace and index details will move to a different + // memory location. The only references to namespace and + // index details across commands are in cursors and nsd + // transient (including query cache) so clear these. 
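storageSize() above is just a walk of the extent chain, accumulating each extent's length and optionally counting extents. The sketch below restates that loop over a toy Extent type; the per-extent BSONArrayBuilder report of the real method is left out.

    #include <cstdio>

    struct Extent {                 // hypothetical stand-in for mongo::Extent
        int length;
        Extent* next;
        Extent* getNextExtent() const { return next; }
    };

    long long storageSize(const Extent* first, int* numExtents = 0) {
        long long total = 0;
        int n = 0;
        for (const Extent* e = first; e; e = e->getNextExtent()) {
            total += e->length;
            ++n;
        }
        if (numExtents)
            *numExtents = n;
        return total;
    }

    int main() {
        Extent c = { 4096, 0 }, b = { 8192, &c }, a = { 16384, &b };
        int n = 0;
        printf("storageSize = %lld over %d extents\n", storageSize(&a, &n), n);
        return 0;
    }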
+ ClientCursor::invalidate( from ); + NamespaceDetailsTransient::clearForPrefix( from ); + + NamespaceDetails *details = ni->details( from ); + ni->add_ns( to, *details ); NamespaceDetails *todetails = ni->details( to ); - try { + try { todetails->copyingFrom(to, details); // fixes extraOffset } - catch( DBException& ) { + catch( DBException& ) { // could end up here if .ns is full - if so try to clean up / roll back a little ni->kill_ns(to); throw; } - ni->kill_ns( from ); - details = todetails; - - BSONObj oldSpec; - char database[MaxDatabaseLen]; - nsToDatabase(from, database); - string s = database; - s += ".system.namespaces"; - assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) ); - - BSONObjBuilder newSpecB; - BSONObjIterator i( oldSpec.getObjectField( "options" ) ); - while( i.more() ) { - BSONElement e = i.next(); - if ( strcmp( e.fieldName(), "create" ) != 0 ) - newSpecB.append( e ); - else - newSpecB << "create" << to; - } - BSONObj newSpec = newSpecB.done(); - addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 0 : &newSpec ); - - deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true ); - // oldSpec variable no longer valid memory - - BSONObj oldIndexSpec; - s = database; - s += ".system.indexes"; - while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) { - BSONObjBuilder newIndexSpecB; - BSONObjIterator i( oldIndexSpec ); - while( i.more() ) { - BSONElement e = i.next(); - if ( strcmp( e.fieldName(), "ns" ) != 0 ) - newIndexSpecB.append( e ); - else - newIndexSpecB << "ns" << to; - } - BSONObj newIndexSpec = newIndexSpecB.done(); - DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, BSONElement(), false ); - int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) ); - IndexDetails &indexDetails = details->idx(indexI); - string oldIndexNs = indexDetails.indexNamespace(); - indexDetails.info = newIndexSpecLoc; - string newIndexNs = indexDetails.indexNamespace(); - - BtreeBucket::renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() ); - deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true ); - } - } - - bool legalClientSystemNS( const string& ns , bool write ){ + ni->kill_ns( from ); + details = todetails; + + BSONObj oldSpec; + char database[MaxDatabaseNameLen]; + nsToDatabase(from, database); + string s = database; + s += ".system.namespaces"; + assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) ); + + BSONObjBuilder newSpecB; + BSONObjIterator i( oldSpec.getObjectField( "options" ) ); + while( i.more() ) { + BSONElement e = i.next(); + if ( strcmp( e.fieldName(), "create" ) != 0 ) + newSpecB.append( e ); + else + newSpecB << "create" << to; + } + BSONObj newSpec = newSpecB.done(); + addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 
0 : &newSpec ); + + deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true ); + // oldSpec variable no longer valid memory + + BSONObj oldIndexSpec; + s = database; + s += ".system.indexes"; + while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) { + BSONObjBuilder newIndexSpecB; + BSONObjIterator i( oldIndexSpec ); + while( i.more() ) { + BSONElement e = i.next(); + if ( strcmp( e.fieldName(), "ns" ) != 0 ) + newIndexSpecB.append( e ); + else + newIndexSpecB << "ns" << to; + } + BSONObj newIndexSpec = newIndexSpecB.done(); + DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, BSONElement(), false ); + int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) ); + IndexDetails &indexDetails = details->idx(indexI); + string oldIndexNs = indexDetails.indexNamespace(); + indexDetails.info = newIndexSpecLoc; + string newIndexNs = indexDetails.indexNamespace(); + + BtreeBucket::renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() ); + deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true ); + } + } + + bool legalClientSystemNS( const string& ns , bool write ) { if( ns == "local.system.replset" ) return true; if ( ns.find( ".system.users" ) != string::npos ) return true; - if ( ns.find( ".system.js" ) != string::npos ){ + if ( ns.find( ".system.js" ) != string::npos ) { if ( write ) Scope::storedFuncMod(); return true; } - + return false; } - + } // namespace mongo diff --git a/db/namespace.h b/db/namespace.h index abc35bb..4ec1edd 100644 --- a/db/namespace.h +++ b/db/namespace.h @@ -23,130 +23,66 @@ #include "queryutil.h" #include "diskloc.h" #include "../util/hashtab.h" -#include "../util/mmap.h" +#include "mongommf.h" namespace mongo { - /* in the mongo source code, "client" means "database". */ + /* in the mongo source code, "client" means "database". */ - const int MaxDatabaseLen = 256; // max str len for the db name, including null char + const int MaxDatabaseNameLen = 256; // max str len for the db name, including null char - // "database.a.b.c" -> "database" - inline void nsToDatabase(const char *ns, char *database) { - const char *p = ns; - char *q = database; - while ( *p != '.' ) { - if ( *p == 0 ) - break; - *q++ = *p++; - } - *q = 0; - if (q-database>=MaxDatabaseLen) { - log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl; - dbexit( EXIT_POSSIBLE_CORRUPTION ); - } - } - inline string nsToDatabase(const char *ns) { - char buf[MaxDatabaseLen]; - nsToDatabase(ns, buf); - return buf; - } - inline string nsToDatabase(const string& ns) { - size_t i = ns.find( '.' ); - if ( i == string::npos ) - return ns; - return ns.substr( 0 , i ); - } - - /* e.g. - NamespaceString ns("acme.orders"); - cout << ns.coll; // "orders" - */ + /* e.g. + NamespaceString ns("acme.orders"); + cout << ns.coll; // "orders" + */ class NamespaceString { public: string db; string coll; // note collection names can have periods in them for organizing purposes (e.g. "system.indexes") + + NamespaceString( const char * ns ) { init(ns); } + NamespaceString( const string& ns ) { init(ns.c_str()); } + string ns() const { return db + '.' 
+ coll; } + bool isSystem() const { return strncmp(coll.c_str(), "system.", 7) == 0; } private: - void init(const char *ns) { + void init(const char *ns) { const char *p = strchr(ns, '.'); if( p == 0 ) return; db = string(ns, p - ns); coll = p + 1; } - public: - NamespaceString( const char * ns ) { init(ns); } - NamespaceString( const string& ns ) { init(ns.c_str()); } - - string ns() const { - return db + '.' + coll; - } - - bool isSystem() { - return strncmp(coll.c_str(), "system.", 7) == 0; - } }; #pragma pack(1) - /* This helper class is used to make the HashMap below in NamespaceDetails */ + /* This helper class is used to make the HashMap below in NamespaceDetails e.g. see line: + HashTable *ht; + */ class Namespace { public: - enum MaxNsLenValue { MaxNsLen = 128 }; - Namespace(const char *ns) { - *this = ns; - } - Namespace& operator=(const char *ns) { - uassert( 10080 , "ns name too long, max size is 128", strlen(ns) < MaxNsLen); - //memset(buf, 0, MaxNsLen); /* this is just to keep stuff clean in the files for easy dumping and reading */ - strcpy_s(buf, MaxNsLen, ns); - return *this; - } + explicit Namespace(const char *ns) { *this = ns; } + Namespace& operator=(const char *ns); - /* for more than 10 indexes -- see NamespaceDetails::Extra */ - string extraName(int i) { - char ex[] = "$extra"; - ex[5] += i; - string s = string(buf) + ex; - massert( 10348 , "$extra: ns name too long", s.size() < MaxNsLen); - return s; - } - bool isExtra() const { - const char *p = strstr(buf, "$extr"); - return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example - } bool hasDollarSign() const { return strchr( buf , '$' ) > 0; } void kill() { buf[0] = 0x7f; } bool operator==(const char *r) const { return strcmp(buf, r) == 0; } bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; } - int hash() const { - unsigned x = 0; - const char *p = buf; - while ( *p ) { - x = x * 131 + *p; - p++; - } - return (x & 0x7fffffff) | 0x8000000; // must be > 0 - } - - /** - ( foo.bar ).getSisterNS( "blah" ) == foo.blah - perhaps this should move to the NamespaceString helper? + int hash() const; // value returned is always > 0 + string toString() const { return (string) buf; } + operator string() const { return (string) buf; } + + /* NamespaceDetails::Extra was added after fact to allow chaining of data blocks to support more than 10 indexes + (more than 10 IndexDetails). It's a bit hacky because of this late addition with backward + file support. */ + string extraName(int i) const; + bool isExtra() const; /* ends with $extr... -- when true an extra block not a normal NamespaceDetails block */ + + /** ( foo.bar ).getSisterNS( "blah" ) == foo.blah + perhaps this should move to the NamespaceString helper? */ - string getSisterNS( const char * local ) { - assert( local && local[0] != '.' ); - string old(buf); - if ( old.find( "." ) != string::npos ) - old = old.substr( 0 , old.find( "." ) ); - return old + "." 
+ local; - } - - string toString() const { - return (string)buf; - } - - operator string() const { - return (string)buf; - } + string getSisterNS( const char * local ) const; + enum MaxNsLenValue { MaxNsLen = 128 }; + private: char buf[MaxNsLen]; }; #pragma pack() @@ -158,7 +94,9 @@ namespace mongo { namespace mongo { /** @return true if a client can modify this namespace - things like *.system.users */ + things like *.system.users + @param write used when .system.js + */ bool legalClientSystemNS( const string& ns , bool write ); /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes @@ -170,92 +108,106 @@ namespace mongo { extern int bucketSizes[]; #pragma pack(1) - /* this is the "header" for a collection that has all its details. in the .ns file. + /* NamespaceDetails : this is the "header" for a collection that has all its details. + It's in the .ns file and this is a memory mapped region (thus the pack pragma above). */ class NamespaceDetails { - friend class NamespaceIndex; - enum { NIndexesExtra = 30, - NIndexesBase = 10 - }; public: - struct ExtraOld { - // note we could use this field for more chaining later, so don't waste it: - unsigned long long reserved1; - IndexDetails details[NIndexesExtra]; - unsigned reserved2; - unsigned reserved3; - }; - class Extra { + enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 }; + + /*-------- data fields, as present on disk : */ + DiskLoc firstExtent; + DiskLoc lastExtent; + /* NOTE: capped collections v1 override the meaning of deletedList. + deletedList[0] points to a list of free records (DeletedRecord's) for all extents in + the capped namespace. + deletedList[1] points to the last record in the prev extent. When the "current extent" + changes, this value is updated. !deletedList[1].isValid() when this value is not + yet computed. + */ + DiskLoc deletedList[Buckets]; + // ofs 168 (8 byte aligned) + struct Stats { + // datasize and nrecords MUST Be adjacent code assumes! + long long datasize; // this includes padding, but not record headers + long long nrecords; + } stats; + int lastExtentSize; + int nIndexes; + private: + // ofs 192 + IndexDetails _indexes[NIndexesBase]; + public: + // ofs 352 (16 byte aligned) + int capped; + int max; // max # of objects for a capped table. TODO: should this be 64 bit? + double paddingFactor; // 1.0 = no padding. + // ofs 386 (16) + int flags; + DiskLoc capExtent; + DiskLoc capFirstNewRecord; + unsigned short dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. 
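The NamespaceDetails declaration above is a raw on-disk image: it is packed, its field offsets are documented inline (ofs 168, 192, 352, ...), and its total size is pinned at 496 bytes by a static assert later in this hunk. The toy header below shows the same discipline on a smaller scale; its fields and its 64-byte size are invented for illustration, not taken from the real struct.

    #include <cstdint>

    #pragma pack(1)
    struct ToyDiskHeader {
        int64_t  datasize;          // adjacent pair, like NamespaceDetails::Stats
        int64_t  nrecords;
        int32_t  lastExtentSize;
        int32_t  nIndexes;
        uint16_t dataFileVersion;   // room for backward compatibility later
        uint16_t indexFileVersion;
        char     reserved[36];      // pad to a deliberately fixed size
    };
    #pragma pack()

    static_assert(sizeof(ToyDiskHeader) == 64,
                  "on-disk header size changed -- bump the file version instead");

    int main() { return 0; }

A field added in the wrong place then breaks the build instead of silently shifting data already written to existing files.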
See filever.h + unsigned short indexFileVersion; + unsigned long long multiKeyIndexBits; + private: + // ofs 400 (16) + unsigned long long reservedA; + long long extraOffset; // where the $extra info is located (bytes relative to this) + public: + int indexBuildInProgress; // 1 if in prog + unsigned reservedB; + // ofs 424 (8) + struct Capped2 { + unsigned long long cc2_ptr; // see capped.cpp + unsigned fileNumber; + } capped2; + char reserved[60]; + /*-------- end data 496 bytes */ + + explicit NamespaceDetails( const DiskLoc &loc, bool _capped ); + + class Extra { long long _next; - public: + public: IndexDetails details[NIndexesExtra]; - private: + private: unsigned reserved2; unsigned reserved3; - Extra(const Extra&) { assert(false); } - Extra& operator=(const Extra& r) { assert(false); return *this; } + Extra(const Extra&) { assert(false); } + Extra& operator=(const Extra& r) { assert(false); return *this; } public: Extra() { } - long ofsFrom(NamespaceDetails *d) { + long ofsFrom(NamespaceDetails *d) { return ((char *) this) - ((char *) d); } void init() { memset(this, 0, sizeof(Extra)); } - Extra* next(NamespaceDetails *d) { + Extra* next(NamespaceDetails *d) { if( _next == 0 ) return 0; return (Extra*) (((char *) d) + _next); } - void setNext(long ofs) { _next = ofs; } - void copy(NamespaceDetails *d, const Extra& e) { + void setNext(long ofs) { *getDur().writing(&_next) = ofs; } + void copy(NamespaceDetails *d, const Extra& e) { memcpy(this, &e, sizeof(Extra)); _next = 0; } - }; // Extra - - Extra* extra() { + }; + Extra* extra() { if( extraOffset == 0 ) return 0; return (Extra *) (((char *) this) + extraOffset); } - - public: /* add extra space for indexes when more than 10 */ Extra* allocExtra(const char *ns, int nindexessofar); - void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra - enum { NIndexesMax = 64 }; - - BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 ); - BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits - BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 ); - BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 ); - /* called when loaded from disk */ void onLoad(const Namespace& k); - NamespaceDetails( const DiskLoc &loc, bool _capped ); - - DiskLoc firstExtent; - DiskLoc lastExtent; - - /* NOTE: capped collections override the meaning of deleted list. - deletedList[0] points to a list of free records (DeletedRecord's) for all extents in - the capped namespace. - deletedList[1] points to the last record in the prev extent. When the "current extent" - changes, this value is updated. !deletedList[1].isValid() when this value is not - yet computed. - */ - DiskLoc deletedList[Buckets]; + /* dump info on this namespace. for debugging. */ + void dump(const Namespace& k); + /* dump info on all extents for this namespace. for debugging. 
*/ void dumpExtents(); - long long datasize; - long long nrecords; - int lastExtentSize; - int nIndexes; - - private: - IndexDetails _indexes[NIndexesBase]; - private: Extent *theCapExtent() const { return capExtent.ext(); } void advanceCapExtent( const char *ns ); @@ -263,6 +215,7 @@ namespace mongo { DiskLoc cappedAlloc(const char *ns, int len); DiskLoc &cappedFirstDeletedInCurExtent(); bool nextIsInCapExtent( const DiskLoc &dl ) const; + public: DiskLoc& cappedListOfAllDeletedRecords() { return deletedList[0]; } DiskLoc& cappedLastDelRecLastExtent() { return deletedList[1]; } @@ -270,122 +223,79 @@ namespace mongo { bool capLooped() const { return capped && capFirstNewRecord.isValid(); } bool inCapExtent( const DiskLoc &dl ) const; void cappedCheckMigrate(); - void cappedTruncateAfter(const char *ns, DiskLoc after, bool inclusive); /** remove rest of the capped collection from this point onward */ + /** + * Truncate documents newer than the document at 'end' from the capped + * collection. The collection cannot be completely emptied using this + * function. An assertion will be thrown if that is attempted. + * @param inclusive - Truncate 'end' as well iff true + */ + void cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive); + /** Remove all documents from the capped collection */ void emptyCappedCollection(const char *ns); - - int capped; - - int max; // max # of objects for a capped table. TODO: should this be 64 bit? - double paddingFactor; // 1.0 = no padding. - int flags; - - DiskLoc capExtent; - DiskLoc capFirstNewRecord; - - /* NamespaceDetails version. So we can do backward compatibility in the future. - See filever.h - */ - unsigned short dataFileVersion; - unsigned short indexFileVersion; - unsigned long long multiKeyIndexBits; - private: - unsigned long long reservedA; - long long extraOffset; // where the $extra info is located (bytes relative to this) - public: - int backgroundIndexBuildInProgress; // 1 if in prog - char reserved[76]; - - /* when a background index build is in progress, we don't count the index in nIndexes until + /* when a background index build is in progress, we don't count the index in nIndexes until complete, yet need to still use it in _indexRecord() - thus we use this function for that. */ - int nIndexesBeingBuilt() const { return nIndexes + backgroundIndexBuildInProgress; } + int nIndexesBeingBuilt() const { return nIndexes + indexBuildInProgress; } - /* NOTE: be careful with flags. are we manipulating them in read locks? if so, + /* NOTE: be careful with flags. are we manipulating them in read locks? if so, this isn't thread safe. TODO */ enum NamespaceFlags { Flag_HaveIdIndex = 1 << 0 // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called) }; - IndexDetails& idx(int idxNo, bool missingExpected = false ) { - if( idxNo < NIndexesBase ) - return _indexes[idxNo]; - Extra *e = extra(); - if ( ! e ){ - if ( missingExpected ) - throw MsgAssertionException( 13283 , "Missing Extra" ); - massert(13282, "missing Extra", e); - } - int i = idxNo - NIndexesBase; - if( i >= NIndexesExtra ) { - e = e->next(this); - if ( ! 
e ){ - if ( missingExpected ) - throw MsgAssertionException( 13283 , "missing extra" ); - massert(13283, "missing Extra", e); - } - i -= NIndexesExtra; - } - return e->details[i]; - } - IndexDetails& backgroundIdx() { - DEV assert(backgroundIndexBuildInProgress); + IndexDetails& idx(int idxNo, bool missingExpected = false ); + + /** get the IndexDetails for the index currently being built in the background. (there is at most one) */ + IndexDetails& inProgIdx() { + DEV assert(indexBuildInProgress); return idx(nIndexes); } - class IndexIterator { - friend class NamespaceDetails; - int i; - int n; - NamespaceDetails *d; - IndexIterator(NamespaceDetails *_d) { - d = _d; - i = 0; - n = d->nIndexes; - } + class IndexIterator { public: int pos() { return i; } // note this is the next one to come bool more() { return i < n; } IndexDetails& next() { return d->idx(i++); } - }; // IndexIterator + private: + friend class NamespaceDetails; + int i, n; + NamespaceDetails *d; + IndexIterator(NamespaceDetails *_d); + }; IndexIterator ii() { return IndexIterator(this); } - /* hackish - find our index # in the indexes array - */ - int idxNo(IndexDetails& idx) { - IndexIterator i = ii(); - while( i.more() ) { - if( &i.next() == &idx ) - return i.pos()-1; - } - massert( 10349 , "E12000 idxNo fails", false); - return -1; - } + /* hackish - find our index # in the indexes array */ + int idxNo(IndexDetails& idx); /* multikey indexes are indexes where there are more than one key in the index for a single document. see multikey in wiki. for these, we have to do some dedup work on queries. */ - bool isMultikey(int i) { - return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; - } - void setIndexIsMultikey(int i) { + bool isMultikey(int i) const { return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; } + void setIndexIsMultikey(int i) { dassert( i < NIndexesMax ); - multiKeyIndexBits |= (((unsigned long long) 1) << i); + unsigned long long x = ((unsigned long long) 1) << i; + if( multiKeyIndexBits & x ) return; + *getDur().writing(&multiKeyIndexBits) |= x; } - void clearIndexIsMultikey(int i) { + void clearIndexIsMultikey(int i) { dassert( i < NIndexesMax ); - multiKeyIndexBits &= ~(((unsigned long long) 1) << i); + unsigned long long x = ((unsigned long long) 1) << i; + if( (multiKeyIndexBits & x) == 0 ) return; + *getDur().writing(&multiKeyIndexBits) &= ~x; } /* add a new index. does not add to system.indexes etc. - just to NamespaceDetails. - caller must populate returned object. + caller must populate returned object. */ IndexDetails& addIndex(const char *thisns, bool resetTransient=true); - void aboutToDeleteAnIndex() { flags &= ~Flag_HaveIdIndex; } + void aboutToDeleteAnIndex() { + *getDur().writing(&flags) = flags & ~Flag_HaveIdIndex; + } /* returns index of the first index in which the field is present. -1 if not present. 
*/ int fieldIsIndexed(const char *fieldName); @@ -393,49 +303,35 @@ namespace mongo { void paddingFits() { double x = paddingFactor - 0.01; if ( x >= 1.0 ) - paddingFactor = x; + getDur().setNoJournal(&paddingFactor, &x, sizeof(x)); } void paddingTooSmall() { double x = paddingFactor + 0.6; if ( x <= 2.0 ) - paddingFactor = x; + getDur().setNoJournal(&paddingFactor, &x, sizeof(x)); } - //returns offset in indexes[] - int findIndexByName(const char *name) { - IndexIterator i = ii(); - while( i.more() ) { - if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 ) - return i.pos()-1; - } - return -1; - } + // @return offset in indexes[] + int findIndexByName(const char *name); + + // @return offset in indexes[] + int findIndexByKeyPattern(const BSONObj& keyPattern); - //returns offset in indexes[] - int findIndexByKeyPattern(const BSONObj& keyPattern) { - IndexIterator i = ii(); - while( i.more() ) { - if( i.next().keyPattern() == keyPattern ) - return i.pos()-1; - } - return -1; - } - void findIndexByType( const string& name , vector& matches ) { IndexIterator i = ii(); - while ( i.more() ){ + while ( i.more() ) { if ( i.next().getSpec().getTypeName() == name ) matches.push_back( i.pos() - 1 ); } } - /* @return -1 = not found + /* @return -1 = not found generally id is first index, so not that expensive an operation (assuming present). */ int findIdIndex() { IndexIterator i = ii(); while( i.more() ) { - if( i.next().isIdIndex() ) + if( i.next().isIdIndex() ) return i.pos()-1; } return -1; @@ -451,25 +347,46 @@ namespace mongo { /* allocate a new record. lenToAlloc includes headers. */ DiskLoc alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc); - /* add a given record to the deleted chains for this NS */ void addDeletedRec(DeletedRecord *d, DiskLoc dloc); - void dumpDeleted(set *extents = 0); - // Start from firstExtent by default. DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const; - // Start from lastExtent by default. 
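paddingFits() and paddingTooSmall() above implement the adaptive record padding: every update that fits in place nudges paddingFactor down by 0.01, every update that forces a document move bumps it up by 0.6, and the value stays clamped to [1.0, 2.0]. In the patch the store itself goes through getDur().setNoJournal(); the sketch below models just the arithmetic with plain assignments.

    #include <cstdio>

    struct PaddingModel {
        double paddingFactor = 1.0;        // 1.0 = no padding

        void paddingFits()     { double x = paddingFactor - 0.01; if (x >= 1.0) paddingFactor = x; }
        void paddingTooSmall() { double x = paddingFactor + 0.6;  if (x <= 2.0) paddingFactor = x; }
    };

    int main() {
        PaddingModel m;
        m.paddingTooSmall();               // a document had to be moved: pad future inserts more
        for (int i = 0; i < 20; ++i)
            m.paddingFits();               // a run of in-place updates slowly relaxes the padding
        printf("paddingFactor = %.2f\n", m.paddingFactor);   // 1.60 - 20 * 0.01 = 1.40
        return 0;
    }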
DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const; + long long storageSize( int * numExtents = 0 , BSONArrayBuilder * extentInfo = 0 ) const; + + int averageObjectSize() { + if ( stats.nrecords == 0 ) + return 5; + return (int) (stats.datasize / stats.nrecords); + } + + NamespaceDetails *writingWithoutExtra() { + return ( NamespaceDetails* ) getDur().writingPtr( this, sizeof( NamespaceDetails ) ); + } + /** Make all linked Extra objects writeable as well */ + NamespaceDetails *writingWithExtra(); - long long storageSize( int * numExtents = 0 ); - private: DiskLoc _alloc(const char *ns, int len); void maybeComplain( const char *ns, int len ) const; DiskLoc __stdAlloc(int len); void compact(); // combine adjacent deleted records + friend class NamespaceIndex; + struct ExtraOld { + // note we could use this field for more chaining later, so don't waste it: + unsigned long long reserved1; + IndexDetails details[NIndexesExtra]; + unsigned reserved2; + unsigned reserved3; + }; + /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */ + void cappedTruncateLastDelUpdate(); + BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 ); + BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits + BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 ); + BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 ); }; // NamespaceDetails #pragma pack() @@ -486,7 +403,7 @@ namespace mongo { todo: cleanup code, need abstractions and separation */ class NamespaceDetailsTransient : boost::noncopyable { - BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 ); + BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 ); /* general ------------------------------------------------------------- */ private: @@ -494,18 +411,18 @@ namespace mongo { void reset(); static std::map< string, shared_ptr< NamespaceDetailsTransient > > _map; public: - NamespaceDetailsTransient(const char *ns) : _ns(ns), _keysComputed(false), _qcWriteCount(){ } + NamespaceDetailsTransient(const char *ns) : _ns(ns), _keysComputed(false), _qcWriteCount() { } /* _get() is not threadsafe -- see get_inlock() comments */ static NamespaceDetailsTransient& _get(const char *ns); /* use get_w() when doing write operations */ - static NamespaceDetailsTransient& get_w(const char *ns) { + static NamespaceDetailsTransient& get_w(const char *ns) { DEV assertInWriteLock(); return _get(ns); } void addedIndex() { reset(); } void deletedIndex() { reset(); } /* Drop cached information on all namespaces beginning with the specified prefix. - Can be useful as index namespaces share the same start as the regular collection. + Can be useful as index namespaces share the same start as the regular collection. SLOW - sequential scan of all NamespaceDetailsTransient objects */ static void clearForPrefix(const char *prefix); @@ -531,11 +448,11 @@ namespace mongo { map _indexSpecs; static mongo::mutex _isMutex; public: - const IndexSpec& getIndexSpec( const IndexDetails * details ){ + const IndexSpec& getIndexSpec( const IndexDetails * details ) { IndexSpec& spec = _indexSpecs[details]; - if ( ! spec._finishedInit ){ + if ( ! spec._finishedInit ) { scoped_lock lk(_isMutex); - if ( ! spec._finishedInit ){ + if ( ! 
spec._finishedInit ) { spec.reset( details ); assert( spec._finishedInit ); } @@ -591,7 +508,7 @@ namespace mongo { public: NamespaceIndex(const string &dir, const string &database) : - ht( 0 ), dir_( dir ), database_( database ) {} + ht( 0 ), dir_( dir ), database_( database ) {} /* returns true if new db will be created if we init lazily */ bool exists() const; @@ -600,13 +517,13 @@ namespace mongo { void add_ns(const char *ns, DiskLoc& loc, bool capped) { NamespaceDetails details( loc, capped ); - add_ns( ns, details ); + add_ns( ns, details ); } - void add_ns( const char *ns, const NamespaceDetails &details ) { + void add_ns( const char *ns, const NamespaceDetails &details ) { init(); Namespace n(ns); uassert( 10081 , "too many namespaces/collections", ht->put(n, details)); - } + } /* just for diagnostics */ /*size_t detailsOffset(NamespaceDetails *d) { @@ -625,20 +542,7 @@ namespace mongo { return d; } - void kill_ns(const char *ns) { - if ( !ht ) - return; - Namespace n(ns); - ht->kill(n); - - for( int i = 0; i<=1; i++ ) { - try { - Namespace extra(n.extraName(i).c_str()); - ht->kill(extra); - } - catch(DBException&) { } - } - } + void kill_ns(const char *ns); bool find(const char *ns, DiskLoc& loc) { NamespaceDetails *l = details(ns); @@ -658,12 +562,12 @@ namespace mongo { NamespaceDetails::Extra* newExtra(const char *ns, int n, NamespaceDetails *d); boost::filesystem::path path() const; - private: + private: void maybeMkdir() const; - - MMF f; - HashTable *ht; + + MongoMMF f; + HashTable *ht; string dir_; string database_; }; @@ -675,4 +579,31 @@ namespace mongo { // (Arguments should include db name) void renameNamespace( const char *from, const char *to ); + // "database.a.b.c" -> "database" + inline void nsToDatabase(const char *ns, char *database) { + const char *p = ns; + char *q = database; + while ( *p != '.' ) { + if ( *p == 0 ) + break; + *q++ = *p++; + } + *q = 0; + if (q-database>=MaxDatabaseNameLen) { + log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl; + dbexit( EXIT_POSSIBLE_CORRUPTION ); + } + } + inline string nsToDatabase(const char *ns) { + char buf[MaxDatabaseNameLen]; + nsToDatabase(ns, buf); + return buf; + } + inline string nsToDatabase(const string& ns) { + size_t i = ns.find( '.' 
); + if ( i == string::npos ) + return ns; + return ns.substr( 0 , i ); + } + } // namespace mongo diff --git a/db/nonce.cpp b/db/nonce.cpp index 519cfaa..6f35c79 100644 --- a/db/nonce.cpp +++ b/db/nonce.cpp @@ -17,22 +17,25 @@ #include "pch.h" #include "nonce.h" +#include "../util/time_support.h" extern int do_md5_test(void); namespace mongo { - - Security::Security() { - static int n; - massert( 10352 , "Security is a singleton class", ++n == 1); - init(); - } - void Security::init(){ - if( _initialized ) return; - _initialized = true; + BOOST_STATIC_ASSERT( sizeof(nonce) == 8 ); -#if defined(__linux__) || defined(__sunos__) + Security::Security() { + static int n; + massert( 10352 , "Security is a singleton class", ++n == 1); + init(); + } + + void Security::init() { + if( _initialized ) return; + _initialized = true; + +#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__) _devrandom = new ifstream("/dev/urandom", ios::binary|ios::in); massert( 10353 , "can't open dev/urandom", _devrandom->is_open() ); #elif defined(_WIN32) @@ -40,36 +43,41 @@ namespace mongo { #else srandomdev(); #endif - assert( sizeof(nonce) == 8 ); - + #ifndef NDEBUG if ( do_md5_test() ) - massert( 10354 , "md5 unit test fails", false); + massert( 10354 , "md5 unit test fails", false); #endif } - - nonce Security::getNonce(){ + + nonce Security::getNonce() { static mongo::mutex m("getNonce"); scoped_lock lk(m); + + if ( ! _initialized ) + init(); - /* question/todo: /dev/random works on OS X. is it better - to use that than random() / srandom()? - */ + /* question/todo: /dev/random works on OS X. is it better + to use that than random() / srandom()? + */ nonce n; -#if defined(__linux__) || defined(__sunos__) +#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__) _devrandom->read((char*)&n, sizeof(n)); massert( 10355 , "devrandom failed", !_devrandom->fail()); #elif defined(_WIN32) - n = (((unsigned long long)rand())<<32) | rand(); + unsigned a=0, b=0; + assert( rand_s(&a) == 0 ); + assert( rand_s(&b) == 0 ); + n = (((unsigned long long)a)<<32) | b; #else n = (((unsigned long long)random())<<32) | random(); #endif return n; } unsigned getRandomNumber() { return (unsigned) security.getNonce(); } - - bool Security::_initialized; + + bool Security::_initialized; Security security; - + } // namespace mongo diff --git a/db/nonce.h b/db/nonce.h index 593931f..21592ab 100644 --- a/db/nonce.h +++ b/db/nonce.h @@ -20,23 +20,23 @@ namespace mongo { typedef unsigned long long nonce; - + struct Security { Security(); nonce getNonce(); - /** safe during global var initialization */ - nonce getNonceInitSafe() { - init(); - return getNonce(); - } - private: + /** safe during global var initialization */ + nonce getNonceInitSafe() { + init(); + return getNonce(); + } + private: ifstream *_devrandom; - static bool _initialized; - void init(); // can call more than once + static bool _initialized; + void init(); // can call more than once }; - + extern Security security; - + } // namespace mongo diff --git a/db/oplog.cpp b/db/oplog.cpp index 93800c7..1557cbd 100644 --- a/db/oplog.cpp +++ b/db/oplog.cpp @@ -22,18 +22,19 @@ #include "repl.h" #include "commands.h" #include "repl/rs.h" +#include "stats/counters.h" namespace mongo { void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ); - int __findingStartInitialTimeout = 5; // configurable for testing + int __findingStartInitialTimeout = 5; // configurable for testing // cached copies of these...so don't rename 
them, drop them, etc.!!! static NamespaceDetails *localOplogMainDetails = 0; static Database *localDB = 0; static NamespaceDetails *rsOplogDetails = 0; - void oplogCheckCloseDatabase( Database * db ){ + void oplogCheckCloseDatabase( Database * db ) { localDB = 0; localOplogMainDetails = 0; rsOplogDetails = 0; @@ -44,10 +45,10 @@ namespace mongo { uassert(13288, "replSet error write op to db before replSet initialized", str::startsWith(ns, "local.") || *opstr == 'n'); } - /** write an op to the oplog that is already built. + /** write an op to the oplog that is already built. todo : make _logOpRS() call this so we don't repeat ourself? */ - void _logOpObjRS(const BSONObj& op) { + void _logOpObjRS(const BSONObj& op) { DEV assertInWriteLock(); const OpTime ts = op["ts"]._opTime(); @@ -62,11 +63,11 @@ namespace mongo { rsOplogDetails = nsdetails(logns); massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails); } - Client::Context ctx( "" , localDB, false ); + Client::Context ctx( logns , localDB, false ); { int len = op.objsize(); Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len); - memcpy(r->data, op.objdata(), len); + memcpy(getDur().writingPtr(r->data, len), op.objdata(), len); } /* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy. this code (or code in now() maybe) should be improved. @@ -82,11 +83,42 @@ namespace mongo { } } + /** given a BSON object, create a new one at dst which is the existing (partial) object + with a new object element appended at the end with fieldname "o". + + @param partial already build object with everything except the o member. e.g. something like: + { ts:..., ns:..., os2:... } + @param o a bson object to be added with fieldname "o" + @dst where to put the newly built combined object. e.g. ends up as something like: + { ts:..., ns:..., os2:..., o:... } + */ + void append_O_Obj(char *dst, const BSONObj& partial, const BSONObj& o) { + const int size1 = partial.objsize() - 1; // less the EOO char + const int oOfs = size1+3; // 3 = byte BSONOBJTYPE + byte 'o' + byte \0 + + void *p = getDur().writingPtr(dst, oOfs+o.objsize()+1); + + memcpy(p, partial.objdata(), size1); + + // adjust overall bson object size for the o: field + *(static_cast(p)) += o.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/; + + char *b = static_cast(p); + b += size1; + *b++ = (char) Object; + *b++ = 'o'; // { o : ... 
} + *b++ = 0; // null terminate "o" fieldname + memcpy(b, o.objdata(), o.objsize()); + b += o.objsize(); + *b = EOO; + } + static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) { DEV assertInWriteLock(); + // ^- static is safe as we are in write lock static BufBuilder bufbuilder(8*1024); - - if ( strncmp(ns, "local.", 6) == 0 ){ + + if ( strncmp(ns, "local.", 6) == 0 ) { if ( strncmp(ns, "local.slaves", 12) == 0 ) resetSlaveCache(); return; @@ -94,15 +126,15 @@ namespace mongo { const OpTime ts = OpTime::now(); - long long hNew; - if( theReplSet ) { + long long hashNew; + if( theReplSet ) { massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary()); - hNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId(); + hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId(); } else { // must be initiation assert( *ns == 0 ); - hNew = 0; + hashNew = 0; } /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- @@ -113,7 +145,7 @@ namespace mongo { BSONObjBuilder b(bufbuilder); b.appendTimestamp("ts", ts.asDate()); - b.append("h", hNew); + b.append("h", hashNew); b.append("op", opstr); b.append("ns", ns); @@ -136,7 +168,7 @@ namespace mongo { rsOplogDetails = nsdetails(logns); massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails); } - Client::Context ctx( "" , localDB, false ); + Client::Context ctx( logns , localDB, false ); r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len); /* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy. this code (or code in now() maybe) should be improved. @@ -147,22 +179,13 @@ namespace mongo { log() << "replSet " << theReplSet->isPrimary() << rsLog; } theReplSet->lastOpTimeWritten = ts; - theReplSet->lastH = hNew; + theReplSet->lastH = hashNew; ctx.getClient()->setLastOp( ts.asDate() ); } } - char *p = r->data; - memcpy(p, partial.objdata(), posz); - *((unsigned *)p) += obj.objsize() + 1 + 2; - p += posz - 1; - *p++ = (char) Object; - *p++ = 'o'; - *p++ = 0; - memcpy(p, obj.objdata(), obj.objsize()); - p += obj.objsize(); - *p = EOO; - + append_O_Obj(r->data, partial, obj); + if ( logLevel >= 6 ) { BSONObj temp(r); log( 6 ) << "logOp:" << temp << endl; @@ -192,9 +215,9 @@ namespace mongo { static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) { DEV assertInWriteLock(); static BufBuilder bufbuilder(8*1024); - - if ( strncmp(ns, "local.", 6) == 0 ){ - if ( strncmp(ns, "local.slaves", 12) == 0 ){ + + if ( strncmp(ns, "local.", 6) == 0 ) { + if ( strncmp(ns, "local.slaves", 12) == 0 ) { resetSlaveCache(); } return; @@ -202,7 +225,7 @@ namespace mongo { const OpTime ts = OpTime::now(); Client::Context context; - + /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- instead we do a single copy to the destination position in the memory mapped file. */ @@ -216,9 +239,10 @@ namespace mongo { b.appendBool("b", *bb); if ( o2 ) b.append("o2", *o2); - BSONObj partial = b.done(); - int posz = partial.objsize(); - int len = posz + obj.objsize() + 1 + 2 /*o:*/; + BSONObj partial = b.done(); // partial is everything except the o:... part. 
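
For illustration, a minimal standalone sketch of the byte-level trick append_O_Obj() performs: copy the partial document minus its trailing EOO byte, emit the type byte, the "o" field name, and the sub-document, close with a new EOO, and patch the int32 length prefix. The std::vector buffer and the function name here are illustrative only; the real code writes straight into the memory-mapped record via getDur().writingPtr().

#include <cstring>
#include <vector>

// Sketch: append a field  o:<subdocument>  to an already-serialized BSON
// document, using the same layout as append_O_Obj(): keep everything except
// the old EOO byte, write type byte + "o\0" + sub-document bytes, add a new
// EOO, then patch the int32 length prefix (little-endian host assumed, as BSON is).
std::vector<char> appendObjectField(const std::vector<char>& partial,
                                    const std::vector<char>& sub) {
    const int size1 = (int)partial.size() - 1;            // partial minus its trailing EOO
    std::vector<char> out;
    out.reserve(size1 + 3 + sub.size() + 1);
    out.insert(out.end(), partial.begin(), partial.begin() + size1);
    out.push_back((char)0x03);                            // BSON type code: embedded document
    out.push_back('o');                                   // field name "o" ...
    out.push_back('\0');                                  // ... null terminated
    out.insert(out.end(), sub.begin(), sub.end());        // the sub-document bytes
    out.push_back((char)0x00);                            // new EOO for the outer document
    int newLen = (int)out.size();                         // = partial size + sub size + 3
    std::memcpy(&out[0], &newLen, sizeof(newLen));        // patch the length prefix
    return out;
}

The result is partial.objsize() + o.objsize() + 3 bytes long, matching the "+ 1 fieldtype byte + 2 fieldname" length adjustment in the code above.
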
+ + int po_sz = partial.objsize(); + int len = po_sz + obj.objsize() + 1 + 2 /*o:*/; Record *r; if( logNS == 0 ) { @@ -230,25 +254,18 @@ namespace mongo { localOplogMainDetails = nsdetails(logNS); assert( localOplogMainDetails ); } - Client::Context ctx( "" , localDB, false ); + Client::Context ctx( logNS , localDB, false ); r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len); - } else { + } + else { Client::Context ctx( logNS, dbpath, 0, false ); assert( nsdetails( logNS ) ); + // first we allocate the space, then we fill it below. r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len); } - char *p = r->data; - memcpy(p, partial.objdata(), posz); - *((unsigned *)p) += obj.objsize() + 1 + 2; - p += posz - 1; - *p++ = (char) Object; - *p++ = 'o'; - *p++ = 0; - memcpy(p, obj.objdata(), obj.objsize()); - p += obj.objsize(); - *p = EOO; - + append_O_Obj(r->data, partial, obj); + context.getClient()->setLastOp( ts.asDate() ); if ( logLevel >= 6 ) { @@ -259,17 +276,17 @@ namespace mongo { } static void (*_logOp)(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) = _logOpOld; - void newReplUp() { + void newReplUp() { replSettings.master = true; - _logOp = _logOpRS; + _logOp = _logOpRS; } - void newRepl() { + void newRepl() { replSettings.master = true; - _logOp = _logOpUninitialized; + _logOp = _logOpUninitialized; } void oldRepl() { _logOp = _logOpOld; } - void logKeepalive() { + void logKeepalive() { _logOp("n", "", 0, BSONObj(), 0, 0); } void logOpComment(const BSONObj& obj) { @@ -289,13 +306,10 @@ namespace mongo { void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) { if ( replSettings.master ) { _logOp(opstr, ns, 0, obj, patt, b); - // why? : - //char cl[ 256 ]; - //nsToDatabase( ns, cl ); } - + logOpForSharding( opstr , ns , obj , patt ); - } + } void createOplog() { dblock lk; @@ -307,15 +321,15 @@ namespace mongo { ns = rsoplog; Client::Context ctx(ns); - + NamespaceDetails * nsd = nsdetails( ns ); if ( nsd ) { - - if ( cmdLine.oplogSize != 0 ){ + + if ( cmdLine.oplogSize != 0 ) { int o = (int)(nsd->storageSize() / ( 1024 * 1024 ) ); int n = (int)(cmdLine.oplogSize / ( 1024 * 1024 ) ); - if ( n != o ){ + if ( n != o ) { stringstream ss; ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog"; log() << ss.str() << endl; @@ -332,19 +346,19 @@ namespace mongo { } return; } - + /* create an oplog collection, if it doesn't yet exist. */ BSONObjBuilder b; double sz; if ( cmdLine.oplogSize != 0 ) sz = (double)cmdLine.oplogSize; else { - /* not specified. pick a default size */ + /* not specified. pick a default size */ sz = 50.0 * 1000 * 1000; if ( sizeof(int *) >= 8 ) { #if defined(__APPLE__) - // typically these are desktops (dev machines), so keep it smallish - sz = (256-64) * 1000 * 1000; + // typically these are desktops (dev machines), so keep it smallish + sz = (256-64) * 1000 * 1000; #else sz = 990.0 * 1000 * 1000; boost::intmax_t free = freeSpace(); //-1 if call not supported. @@ -356,7 +370,7 @@ namespace mongo { } log() << "******" << endl; - log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB... (use --oplogSize to change)" << endl; + log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." 
<< endl; b.append("size", sz); b.appendBool("capped", 1); @@ -366,7 +380,7 @@ namespace mongo { BSONObj o = b.done(); userCreateNS(ns, o, err, false); if( !rs ) - logOp( "n", "dummy", BSONObj() ); + logOp( "n", "", BSONObj() ); /* sync here so we don't get any surprising lag later when we try to sync */ MemoryMappedFile::flushAll(true); @@ -394,8 +408,8 @@ namespace mongo { void pretouchN(vector& v, unsigned a, unsigned b) { DEV assert( !dbMutex.isWriteLocked() ); - Client *c = &cc(); - if( c == 0 ) { + Client *c = currentClient.get(); + if( c == 0 ) { Client::initThread("pretouchN"); c = &cc(); } @@ -413,7 +427,7 @@ namespace mongo { continue; /* todo : other operations */ - try { + try { BSONObj o = op.getObjectField(which); BSONElement _id; if( o.getObjectID(_id) ) { @@ -426,7 +440,7 @@ namespace mongo { _dummy_z += result.objsize(); // touch } } - catch( DBException& e ) { + catch( DBException& e ) { log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' ' << e.toString() << endl; } } @@ -447,7 +461,7 @@ namespace mongo { return; /* todo : other operations */ - try { + try { BSONObj o = op.getObjectField(which); BSONElement _id; if( o.getObjectID(_id) ) { @@ -461,15 +475,17 @@ namespace mongo { _dummy_z += result.objsize(); // touch } } - catch( DBException& ) { + catch( DBException& ) { log() << "ignoring assertion in pretouchOperation()" << endl; } } - void applyOperation_inlock(const BSONObj& op){ - if( logLevel >= 6 ) + void applyOperation_inlock(const BSONObj& op , bool fromRepl ) { + OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters; + + if( logLevel >= 6 ) log() << "applying op: " << op << endl; - + assertInWriteLock(); OpDebug debug; @@ -479,6 +495,8 @@ namespace mongo { const char *opType = op.getStringField("op"); if ( *opType == 'i' ) { + opCounters->gotInsert(); + const char *p = strchr(ns, '.'); if ( p && strcmp(p, ".system.indexes") == 0 ) { // updates aren't allowed for indexes -- so we will do a regular insert. if index already @@ -499,11 +517,11 @@ namespace mongo { else { BSONObjBuilder b; b.append(_id); - + /* erh 10/16/2009 - this is probably not relevant any more since its auto-created, but not worth removing */ - RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow + RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow - /* todo : it may be better to do an insert here, and then catch the dup key exception and do update + /* todo : it may be better to do an insert here, and then catch the dup key exception and do update then. very few upserts will not be inserts... 
*/ updateObjects(ns, o, b.done(), true, false, false , debug ); @@ -511,10 +529,14 @@ namespace mongo { } } else if ( *opType == 'u' ) { + opCounters->gotUpdate(); + RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow updateObjects(ns, o, op.getObjectField("o2"), /*upsert*/ op.getBoolField("b"), /*multi*/ false, /*logop*/ false , debug ); } else if ( *opType == 'd' ) { + opCounters->gotDelete(); + if ( opType[1] == 0 ) deleteObjects(ns, o, op.getBoolField("b")); else @@ -523,7 +545,9 @@ namespace mongo { else if ( *opType == 'n' ) { // no op } - else if ( *opType == 'c' ){ + else if ( *opType == 'c' ) { + opCounters->gotCommand(); + BufBuilder bb; BSONObjBuilder ob; _runCommands(ns, o, bb, ob, true, 0); @@ -533,9 +557,9 @@ namespace mongo { ss << "unknown opType [" << opType << "]"; throw MsgAssertionException( 13141 , ss.str() ); } - + } - + class ApplyOpsCmd : public Command { public: virtual bool slaveOk() const { return false; } @@ -545,17 +569,18 @@ namespace mongo { help << "examples: { applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... } ] }"; } virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - - if ( cmdObj.firstElement().type() != Array ){ + + if ( cmdObj.firstElement().type() != Array ) { errmsg = "ops has to be an array"; return false; } - + BSONObj ops = cmdObj.firstElement().Obj(); - - { // check input + + { + // check input BSONObjIterator i( ops ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); if ( e.type() == Object ) continue; @@ -564,16 +589,16 @@ namespace mongo { return false; } } - - if ( cmdObj["preCondition"].type() == Array ){ + + if ( cmdObj["preCondition"].type() == Array ) { BSONObjIterator i( cmdObj["preCondition"].Obj() ); - while ( i.more() ){ + while ( i.more() ) { BSONObj f = i.next().Obj(); - + BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() ); - + Matcher m( f["res"].Obj() ); - if ( ! m.matches( realres ) ){ + if ( ! m.matches( realres ) ) { result.append( "got" , realres ); result.append( "whatFailed" , f ); errmsg = "pre-condition failed"; @@ -581,23 +606,32 @@ namespace mongo { } } } - + // apply int num = 0; BSONObjIterator i( ops ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); - applyOperation_inlock( e.Obj() ); + applyOperation_inlock( e.Obj() , false ); num++; } result.append( "applied" , num ); + if ( ! fromRepl ) { + // We want this applied atomically on slaves + // so we re-wrap without the pre-condition for speed + + string tempNS = str::stream() << dbname << ".$cmd"; + + logOp( "c" , tempNS.c_str() , cmdObj.firstElement().wrap() ); + } + return true; } DBDirectClient db; - + } applyOpsCmd; } diff --git a/db/oplog.h b/db/oplog.h index 34c345f..d9073ab 100644 --- a/db/oplog.h +++ b/db/oplog.h @@ -16,7 +16,7 @@ * along with this program. If not, see . */ -/* +/* local.oplog.$main is the default */ @@ -30,6 +30,7 @@ #include "queryoptimizer.h" #include "../client/dbclient.h" #include "../util/optime.h" +#include "../util/timer.h" namespace mongo { @@ -38,7 +39,7 @@ namespace mongo { void _logOpObjRS(const BSONObj& op); /** Write operation to the log (local.oplog.$main) - + @param opstr "i" insert "u" update @@ -47,89 +48,88 @@ namespace mongo { "n" no-op "db" declares presence of a database (ns is set to the db name + '.') - See _logOp() in oplog.cpp for more details. + See _logOp() in oplog.cpp for more details. 
*/ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0); void logKeepalive(); - /** puts obj in the oplog as a comment (a no-op). Just for diags. - convention is + /** puts obj in the oplog as a comment (a no-op). Just for diags. + convention is { msg : "text", ... } */ void logOpComment(const BSONObj& obj); void oplogCheckCloseDatabase( Database * db ); - - extern int __findingStartInitialTimeout; // configurable for testing + + extern int __findingStartInitialTimeout; // configurable for testing class FindingStartCursor { public: - FindingStartCursor( const QueryPlan & qp ) : - _qp( qp ), - _findingStart( true ), - _findingStartMode(), - _findingStartTimer( 0 ), - _findingStartCursor( 0 ) + FindingStartCursor( const QueryPlan & qp ) : + _qp( qp ), + _findingStart( true ), + _findingStartMode(), + _findingStartTimer( 0 ) { init(); } bool done() const { return !_findingStart; } shared_ptr cRelease() { return _c; } void next() { - if ( !_findingStartCursor || !_findingStartCursor->c->ok() ) { + if ( !_findingStartCursor || !_findingStartCursor->ok() ) { _findingStart = false; _c = _qp.newCursor(); // on error, start from beginning destroyClientCursor(); return; } switch( _findingStartMode ) { - case Initial: { - if ( !_matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { - _findingStart = false; // found first record out of query range, so scan normally - _c = _qp.newCursor( _findingStartCursor->c->currLoc() ); - destroyClientCursor(); - return; - } - _findingStartCursor->c->advance(); - RARELY { - if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) { - createClientCursor( startLoc( _findingStartCursor->c->currLoc() ) ); - _findingStartMode = FindExtent; - return; - } - } + case Initial: { + if ( !_matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) { + _findingStart = false; // found first record out of query range, so scan normally + _c = _qp.newCursor( _findingStartCursor->currLoc() ); + destroyClientCursor(); return; } - case FindExtent: { - if ( !_matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { - _findingStartMode = InExtent; - return; - } - DiskLoc prev = prevLoc( _findingStartCursor->c->currLoc() ); - if ( prev.isNull() ) { // hit beginning, so start scanning from here - createClientCursor(); - _findingStartMode = InExtent; + _findingStartCursor->advance(); + RARELY { + if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) { + createClientCursor( startLoc( _findingStartCursor->currLoc() ) ); + _findingStartMode = FindExtent; return; } - // There might be a more efficient implementation than creating new cursor & client cursor each time, - // not worrying about that for now - createClientCursor( prev ); + } + return; + } + case FindExtent: { + if ( !_matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) { + _findingStartMode = InExtent; return; } - case InExtent: { - if ( _matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { - _findingStart = false; // found first record in query range, so scan normally - _c = _qp.newCursor( _findingStartCursor->c->currLoc() ); - destroyClientCursor(); - return; - } - _findingStartCursor->c->advance(); + DiskLoc prev = prevLoc( _findingStartCursor->currLoc() ); + if ( prev.isNull() ) { // hit beginning, so start scanning from here + createClientCursor(); + _findingStartMode = InExtent; 
return; } - default: { - massert( 12600, "invalid _findingStartMode", false ); + // There might be a more efficient implementation than creating new cursor & client cursor each time, + // not worrying about that for now + createClientCursor( prev ); + return; + } + case InExtent: { + if ( _matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) { + _findingStart = false; // found first record in query range, so scan normally + _c = _qp.newCursor( _findingStartCursor->currLoc() ); + destroyClientCursor(); + return; } - } - } + _findingStartCursor->advance(); + return; + } + default: { + massert( 12600, "invalid _findingStartMode", false ); + } + } + } bool prepareToYield() { if ( _findingStartCursor ) { return _findingStartCursor->prepareToYield( _yieldData ); @@ -139,10 +139,10 @@ namespace mongo { void recoverFromYield() { if ( _findingStartCursor ) { if ( !ClientCursor::recoverFromYield( _yieldData ) ) { - _findingStartCursor = 0; + _findingStartCursor.reset( 0 ); } } - } + } private: enum FindingStartMode { Initial, FindExtent, InExtent }; const QueryPlan &_qp; @@ -150,7 +150,7 @@ namespace mongo { FindingStartMode _findingStartMode; auto_ptr< CoveredIndexMatcher > _matcher; Timer _findingStartTimer; - ClientCursor * _findingStartCursor; + ClientCursor::CleanupPointer _findingStartCursor; shared_ptr _c; ClientCursor::YieldData _yieldData; DiskLoc startLoc( const DiskLoc &rec ) { @@ -162,7 +162,7 @@ namespace mongo { // doesn't matter if we start the extent scan with capFirstNewRecord. return _qp.nsd()->capFirstNewRecord; } - + // should never have an empty extent in the oplog, so don't worry about that case DiskLoc prevLoc( const DiskLoc &rec ) { Extent *e = rec.rec()->myExtent( rec ); @@ -173,7 +173,8 @@ namespace mongo { e = e->xprev.ext(); if ( e->myLoc != _qp.nsd()->capExtent ) return e->firstRecord; - } else { + } + else { if ( !e->xprev.isNull() ) { e = e->xprev.ext(); return e->firstRecord; @@ -183,19 +184,16 @@ namespace mongo { } void createClientCursor( const DiskLoc &startLoc = DiskLoc() ) { shared_ptr c = _qp.newCursor( startLoc ); - _findingStartCursor = new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()); + _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) ); } void destroyClientCursor() { - if ( _findingStartCursor ) { - ClientCursor::erase( _findingStartCursor->cursorid ); - _findingStartCursor = 0; - } + _findingStartCursor.reset( 0 ); } void init() { // Use a ClientCursor here so we can release db mutex while scanning // oplog (can take quite a while with large oplogs). shared_ptr c = _qp.newReverseCursor(); - _findingStartCursor = new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()); + _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) ); _findingStartTimer.reset(); _findingStartMode = Initial; BSONElement tsElt = _qp.originalQuery()[ "ts" ]; @@ -210,5 +208,10 @@ namespace mongo { void pretouchOperation(const BSONObj& op); void pretouchN(vector&, unsigned a, unsigned b); - void applyOperation_inlock(const BSONObj& op); + /** + * take an op and apply locally + * used for applying from an oplog + * @param fromRepl really from replication or for testing/internal/command/etc... 
+ */ + void applyOperation_inlock(const BSONObj& op , bool fromRepl = true ); } diff --git a/db/oplogreader.h b/db/oplogreader.h index 5c2881b..54c90d9 100644 --- a/db/oplogreader.h +++ b/db/oplogreader.h @@ -8,7 +8,7 @@ namespace mongo { - /* started abstracting out the querying of the primary/master's oplog + /* started abstracting out the querying of the primary/master's oplog still fairly awkward but a start. */ class OplogReader { @@ -16,28 +16,24 @@ namespace mongo { auto_ptr cursor; public: - OplogReader() { - DEV log() << "TEMP *** OplogReader()" << endl; + OplogReader() { } - ~OplogReader() { - DEV log() << "TEMP *** ~OplogReader()" << endl; + ~OplogReader() { } void resetCursor() { - DEV log() << "TEMP *** OplogReader::resetCursor" << endl; cursor.reset(); } void resetConnection() { - DEV log() << "TEMP *** OplogReader::resetConnection" << endl; cursor.reset(); _conn.reset(); } DBClientConnection* conn() { return _conn.get(); } - BSONObj findOne(const char *ns, const Query& q) { - return conn()->findOne(ns, q); + BSONObj findOne(const char *ns, const Query& q) { + return conn()->findOne(ns, q, 0, QueryOption_SlaveOk); } - BSONObj getLastOp(const char *ns) { + BSONObj getLastOp(const char *ns) { return findOne(ns, Query().sort(reverseNaturalObj)); } @@ -45,7 +41,7 @@ namespace mongo { bool connect(string hostname); void tailCheck() { - if( cursor.get() && cursor->isDead() ) { + if( cursor.get() && cursor->isDead() ) { log() << "repl: old cursor isDead, will initiate a new one" << endl; resetCursor(); } @@ -53,19 +49,19 @@ namespace mongo { bool haveCursor() { return cursor.get() != 0; } - void query(const char *ns, const BSONObj& query) { + void query(const char *ns, const BSONObj& query) { assert( !haveCursor() ); cursor = _conn->query(ns, query, 0, 0, 0, QueryOption_SlaveOk); } - void tailingQuery(const char *ns, const BSONObj& query) { + void tailingQuery(const char *ns, const BSONObj& query) { assert( !haveCursor() ); log(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl; - cursor = _conn->query( ns, query, 0, 0, 0, - QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay | - /* TODO: slaveok maybe shouldn't use? */ - QueryOption_AwaitData - ); + cursor = _conn->query( ns, query, 0, 0, 0, + QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay | + /* TODO: slaveok maybe shouldn't use? */ + QueryOption_AwaitData + ); } void tailingQueryGTE(const char *ns, OpTime t) { @@ -76,34 +72,34 @@ namespace mongo { tailingQuery(ns, query.done()); } - bool more() { + bool more() { assert( cursor.get() ); return cursor->more(); } - bool moreInCurrentBatch() { + bool moreInCurrentBatch() { assert( cursor.get() ); return cursor->moreInCurrentBatch(); } /* old mongod's can't do the await flag... */ - bool awaitCapable() { + bool awaitCapable() { return cursor->hasResultFlag(ResultFlag_AwaitCapable); } - void peek(vector& v, int n) { + void peek(vector& v, int n) { if( cursor.get() ) cursor->peek(v,n); } BSONObj nextSafe() { return cursor->nextSafe(); } - BSONObj next() { + BSONObj next() { return cursor->next(); } - void putBack(BSONObj op) { + void putBack(BSONObj op) { cursor->putBack(op); } }; - + } diff --git a/db/pdfile.cpp b/db/pdfile.cpp index 216f21a..20a7423 100644 --- a/db/pdfile.cpp +++ b/db/pdfile.cpp @@ -20,7 +20,6 @@ todo: _ table scans must be sequential, not next/prev pointers _ coalesce deleted - _ disallow system* manipulations from the database. 
*/ @@ -37,21 +36,21 @@ _ disallow system* manipulations from the database. #include "query.h" #include "repl.h" #include "dbhelpers.h" -#include "namespace.h" +#include "namespace-inl.h" #include "queryutil.h" #include "extsort.h" -#include "curop.h" +#include "curop-inl.h" #include "background.h" namespace mongo { bool inDBRepair = false; struct doingRepair { - doingRepair(){ + doingRepair() { assert( ! inDBRepair ); inDBRepair = true; } - ~doingRepair(){ + ~doingRepair() { inDBRepair = false; } }; @@ -64,42 +63,42 @@ namespace mongo { return dbsInProg[db] != 0; } - bool BackgroundOperation::inProgForNs(const char *ns) { + bool BackgroundOperation::inProgForNs(const char *ns) { assertInWriteLock(); return nsInProg.count(ns) != 0; } - void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) { + void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) { uassert(12586, "cannot perform operation: a background operation is currently running for this database", - !inProgForDb(db)); + !inProgForDb(db)); } - void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) { + void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) { uassert(12587, "cannot perform operation: a background operation is currently running for this collection", - !inProgForNs(ns)); - } + !inProgForNs(ns)); + } - BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) { + BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) { assertInWriteLock(); dbsInProg[_ns.db]++; assert( nsInProg.count(_ns.ns()) == 0 ); nsInProg.insert(_ns.ns()); } - BackgroundOperation::~BackgroundOperation() { + BackgroundOperation::~BackgroundOperation() { assertInWriteLock(); dbsInProg[_ns.db]--; nsInProg.erase(_ns.ns()); } void BackgroundOperation::dump(stringstream& ss) { - if( nsInProg.size() ) { + if( nsInProg.size() ) { ss << "\nBackground Jobs in Progress\n"; for( set::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ ) ss << " " << *i << '\n'; } - for( map::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) { - if( i->second ) + for( map::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) { + if( i->second ) ss << "database " << i->first << ": " << i->second << '\n'; } } @@ -114,24 +113,23 @@ namespace mongo { DataFileMgr theDataFileMgr; DatabaseHolder dbHolder; int MAGIC = 0x1000; -// int curOp = -2; extern int otherTraceLevel; void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0); void ensureIdIndexForNewNs(const char *ns) { if ( ( strstr( ns, ".system." 
) == 0 || legalClientSystemNS( ns , false ) ) && - strstr( ns, ".$freelist" ) == 0 ){ + strstr( ns, ".$freelist" ) == 0 ) { log( 1 ) << "adding _id index for collection " << ns << endl; ensureHaveIdIndex( ns ); - } + } } string getDbContext() { stringstream ss; Client * c = currentClient.get(); - if ( c ){ + if ( c ) { Client::Context * cx = c->getContext(); - if ( cx ){ + if ( cx ) { Database *database = cx->db(); if ( database ) { ss << database->name << ' '; @@ -142,20 +140,44 @@ namespace mongo { return ss.str(); } - BSONObj::BSONObj(const Record *r) { - init(r->data, false); - } - /*---------------------------------------------------------------------*/ - int initialExtentSize(int len) { + // inheritable class to implement an operation that may be applied to all + // files in a database using _applyOpToDataFiles() + class FileOp { + public: + virtual ~FileOp() {} + // Return true if file exists and operation successful + virtual bool apply( const boost::filesystem::path &p ) = 0; + virtual const char * op() const = 0; + }; + + void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath ); + + void _deleteDataFiles(const char *database) { + if ( directoryperdb ) { + FileAllocator::get()->waitUntilFinished(); + BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) ); + return; + } + class : public FileOp { + virtual bool apply( const boost::filesystem::path &p ) { + return boost::filesystem::remove( p ); + } + virtual const char * op() const { + return "remove"; + } + } deleter; + _applyOpToDataFiles( database, deleter, true ); + } + + int Extent::initialSize(int len) { long long sz = len * 16; if ( len < 1000 ) sz = len * 64; if ( sz > 1000000000 ) sz = 1000000000; int z = ((int)sz) & 0xffffff00; assert( z > len ); - //DEV tlog() << "initialExtentSize(" << len << ") returns " << z << endl; return z; } @@ -165,7 +187,7 @@ namespace mongo { return false; } - log(1) << "create collection " << ns << ' ' << options << '\n'; + log(1) << "create collection " << ns << ' ' << options << endl; /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field and then go back and set to ok : 1 after we are done. @@ -174,33 +196,48 @@ namespace mongo { if( !isFreeList ) addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options); - long long size = initialExtentSize(128); - BSONElement e = options.getField("size"); - if ( e.isNumber() ) { - size = e.numberLong(); - size += 256; - size &= 0xffffffffffffff00LL; + long long size = Extent::initialSize(128); + { + BSONElement e = options.getField("size"); + if ( e.isNumber() ) { + size = e.numberLong(); + size += 256; + size &= 0xffffffffffffff00LL; + } } - + uassert( 10083 , "invalid size spec", size > 0 ); bool newCapped = false; int mx = 0; - e = options.getField("capped"); - if ( e.type() == Bool && e.boolean() ) { + if( options.getBoolField("capped") ) { newCapped = true; - e = options.getField("max"); + BSONElement e = options.getField("max"); if ( e.isNumber() ) { mx = e.numberInt(); } } - // $nExtents just for debug/testing. We create '$nExtents' extents, - // each of size 'size'. - e = options.getField( "$nExtents" ); - int nExtents = int( e.number() ); + // $nExtents just for debug/testing. 
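
As a minimal standalone restatement of the sizing rule in Extent::initialSize() above (the free-function form and its name are just for illustration):

#include <cassert>

// Small records get a larger multiplier, the result is capped, and the final
// size is rounded down to a 256-byte boundary (the & 0xffffff00 mask).
int initialExtentSizeFor(int recordLen) {
    long long sz = (long long)recordLen * 16;
    if (recordLen < 1000)
        sz = (long long)recordLen * 64;      // small objects: over-allocate more aggressively
    if (sz > 1000000000)                     // cap a single extent at ~1GB
        sz = 1000000000;
    int aligned = ((int)sz) & 0xffffff00;    // round down to a multiple of 256 bytes
    assert(aligned > recordLen);
    return aligned;
}

Extent::followupSize() further down applies the same cap and alignment while growing relative to the previous extent's length.
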
+ BSONElement e = options.getField( "$nExtents" ); Database *database = cc().database(); - if ( nExtents > 0 ) { + if ( e.type() == Array ) { + // We create one extent per array entry, with size specified + // by the array value. + BSONObjIterator i( e.embeddedObject() ); + while( i.more() ) { + BSONElement e = i.next(); + int size = int( e.number() ); + assert( size <= 0x7fffffff ); + // $nExtents is just for testing - always allocate new extents + // rather than reuse existing extents so we have some predictibility + // in the extent size used by our tests + database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped ); + } + } + else if ( int( e.number() ) > 0 ) { + // We create '$nExtents' extents, each of size 'size'. + int nExtents = int( e.number() ); assert( size <= 0x7fffffff ); for ( int i = 0; i < nExtents; ++i ) { assert( size <= 0x7fffffff ); @@ -209,10 +246,16 @@ namespace mongo { // in the extent size used by our tests database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped ); } - } else { + } + else { + // This is the non test case, where we don't have a $nExtents spec. while ( size > 0 ) { int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize; int desiredExtentSize = (int) (size > max ? max : size); + if ( desiredExtentSize < Extent::minSize() ) { + desiredExtentSize = Extent::minSize(); + } + desiredExtentSize &= 0xffffff00; Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped ); size -= e->length; } @@ -223,15 +266,16 @@ namespace mongo { bool ensure = false; if ( options.getField( "autoIndexId" ).type() ) { - if ( options["autoIndexId"].trueValue() ){ + if ( options["autoIndexId"].trueValue() ) { ensure = true; } - } else { + } + else { if ( !newCapped ) { ensure=true; } } - if( ensure ) { + if( ensure ) { if( deferIdIndex ) *deferIdIndex = true; else @@ -239,7 +283,7 @@ namespace mongo { } if ( mx > 0 ) - d->max = mx; + getDur().writingInt( d->max ) = mx; return true; } @@ -250,7 +294,7 @@ namespace mongo { */ bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) { const char *coll = strchr( ns, '.' 
) + 1; - massert( 10356 , "invalid ns", coll && *coll ); + massert( 10356 , str::stream() << "invalid ns: " << ns , coll && *coll ); char cl[ 256 ]; nsToDatabase( ns, cl ); bool ok = _userCreateNS(ns, options, err, deferIdIndex); @@ -272,14 +316,22 @@ namespace mongo { int MongoDataFile::maxSize() { if ( sizeof( int* ) == 4 ) { return 512 * 1024 * 1024; - } else if ( cmdLine.smallfiles ) { + } + else if ( cmdLine.smallfiles ) { return 0x7ff00000 >> 2; - } else { + } + else { return 0x7ff00000; } } - void MongoDataFile::badOfs(int ofs) const { + void MongoDataFile::badOfs2(int ofs) const { + stringstream ss; + ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; + uasserted(13441, ss.str()); + } + + void MongoDataFile::badOfs(int ofs) const { stringstream ss; ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; uasserted(13440, ss.str()); @@ -293,26 +345,18 @@ namespace mongo { else size = 0x7ff00000; - if ( strstr(filename, "_hudsonSmall") ) { - int mult = 1; - if ( fileNo > 1 && fileNo < 1000 ) - mult = fileNo; - size = 1024 * 512 * mult; - log() << "Warning : using small files for _hudsonSmall" << endl; - } - else if ( cmdLine.smallfiles ){ + if ( cmdLine.smallfiles ) { size = size >> 2; } - - + + return size; } void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) { { /* check quotas - very simple temporary implementation - we will in future look up - the quota from the grid database + very simple temporary implementation for now */ if ( cmdLine.quota && fileNo > cmdLine.quotaFiles && !MMF::exists(filename) ) { /* todo: if we were adding / changing keys in an index did we do some @@ -340,58 +384,66 @@ namespace mongo { if ( size > maxSize() ) size = maxSize(); - assert( ( size >= 64*1024*1024 ) || cmdLine.smallfiles || ( strstr( filename, "_hudsonSmall" ) ) ); + assert( size >= 64*1024*1024 || cmdLine.smallfiles ); assert( size % 4096 == 0 ); if ( preallocateOnly ) { if ( cmdLine.prealloc ) { - theFileAllocator().requestAllocation( filename, size ); + FileAllocator::get()->requestAllocation( filename, size ); } return; } - - _p = mmf.map(filename, size); - header = (DataFileHeader *) _p.at(0, DataFileHeader::HeaderSize); - if( sizeof(char *) == 4 ) - uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", header); + + { + assert( _mb == 0 ); + unsigned long long sz = size; + if( mmf.create(filename, sz, false) ) + _mb = mmf.getView(); + assert( sz <= 0x7fffffff ); + size = (int) sz; + } + //header = (DataFileHeader *) _p; + if( sizeof(char *) == 4 ) + uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0); else - uassert( 10085 , "can't map file memory", header); - header->init(fileNo, size); + uassert( 10085 , "can't map file memory", _mb != 0); + header()->init(fileNo, size, filename); } - void MongoDataFile::flush( bool sync ){ + void MongoDataFile::flush( bool sync ) { mmf.flush( sync ); } - void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) { - DiskLoc oldExtentLoc; + void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) { NamespaceIndex *ni = nsindex(ns); NamespaceDetails *details = ni->details(ns); if ( details ) { assert( !details->lastExtent.isNull() ); assert( !details->firstExtent.isNull() ); - e->xprev = details->lastExtent; - details->lastExtent.ext()->xnext = eloc; 
+ getDur().writingDiskLoc(e->xprev) = details->lastExtent; + getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc; assert( !eloc.isNull() ); - details->lastExtent = eloc; + getDur().writingDiskLoc(details->lastExtent) = eloc; } else { ni->add_ns(ns, eloc, capped); details = ni->details(ns); } - details->lastExtentSize = e->length; - DEBUGGING out() << "temp: newextent adddelrec " << ns << endl; + { + NamespaceDetails *dw = details->writingWithoutExtra(); + dw->lastExtentSize = e->length; + } details->addDeletedRec(emptyLoc.drec(), emptyLoc); } Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) { - massert( 10357 , "shutdown in progress", !goingAway ); - massert( 10358 , "bad new extent size", approxSize >= 0 && approxSize <= Extent::maxSize() ); - massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header ); // null if file open failed - int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength; + massert( 10357 , "shutdown in progress", ! inShutdown() ); + massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() ); + massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed + int ExtentSize = approxSize <= header()->unusedLength ? approxSize : header()->unusedLength; DiskLoc loc; - if ( ExtentSize <= 0 ) { + if ( ExtentSize < Extent::minSize() ) { /* not there could be a lot of looping here is db just started and no files are open yet. we might want to do something about that. */ if ( loops > 8 ) { @@ -401,12 +453,14 @@ namespace mongo { log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n"; return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1); } - int offset = header->unused.getOfs(); - header->unused.setOfs( fileNo, offset + ExtentSize ); - header->unusedLength -= ExtentSize; - loc.setOfs(fileNo, offset); + int offset = header()->unused.getOfs(); + + DataFileHeader *h = getDur().writing(header()); + h->unused.set( fileNo, offset + ExtentSize ); + h->unusedLength -= ExtentSize; + loc.set(fileNo, offset); Extent *e = _getExtent(loc); - DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset); + DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset); addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped); @@ -415,7 +469,7 @@ namespace mongo { return e; } - Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) { + Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) { string s = cc().database()->name + ".$freelist"; NamespaceDetails *f = nsdetails(s.c_str()); if( f ) { @@ -426,7 +480,7 @@ namespace mongo { if( low > 2048 ) low -= 256; high = (int) (approxSize * 1.05) + 256; } - else { + else { low = (int) (approxSize * 0.8); high = (int) (approxSize * 1.4); } @@ -436,20 +490,20 @@ namespace mongo { int bestDiff = 0x7fffffff; { DiskLoc L = f->firstExtent; - while( !L.isNull() ) { + while( !L.isNull() ) { Extent * e = L.ext(); - if( e->length >= low && e->length <= high ) { + if( e->length >= low && e->length <= high ) { int diff = abs(e->length - approxSize); - if( diff < bestDiff ) { + if( diff < bestDiff ) { bestDiff = diff; best = e; - if( diff == 0 ) + if( diff == 0 ) break; } } L = e->xnext; ++n; - + } } OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n"; @@ -457,13 +511,13 @@ namespace mongo { 
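
Many hunks in this file replace plain stores with getDur().writingPtr() / writing<T>() / writingDiskLoc() calls that declare a memory region as about-to-be-written before it is touched. A minimal sketch of that intent-declaration pattern, assuming nothing about MongoDB's actual durability implementation (the WriteIntentLog class below is purely illustrative):

#include <cstddef>
#include <utility>
#include <vector>

// Illustrative sketch: a region is registered with the journal before it is
// modified, and only then handed back as a writable pointer.
class WriteIntentLog {
public:
    void* writingPtr(void* p, size_t len) {               // like getDur().writingPtr(p, len)
        _intents.push_back(std::make_pair(p, len));        // remember what is about to change
        return p;
    }
    template <class T>
    T* writing(T* t) {                                      // like getDur().writing(obj)
        return static_cast<T*>(writingPtr(t, sizeof(T)));
    }
    size_t pending() const { return _intents.size(); }
private:
    std::vector< std::pair<void*, size_t> > _intents;       // regions to journal before the data files
};
// In this sketch, a converted call site such as
//     getDur().writingInt( d->max ) = mx;
// corresponds to
//     *intentLog.writing(&someInt) = mx;
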
Extent *e = best; // remove from the free list if( !e->xprev.isNull() ) - e->xprev.ext()->xnext = e->xnext; + e->xprev.ext()->xnext.writing() = e->xnext; if( !e->xnext.isNull() ) - e->xnext.ext()->xprev = e->xprev; + e->xnext.ext()->xprev.writing() = e->xprev; if( f->firstExtent == e->myLoc ) - f->firstExtent = e->xnext; + f->firstExtent.writing() = e->xnext; if( f->lastExtent == e->myLoc ) - f->lastExtent = e->xprev; + f->lastExtent.writing() = e->xprev; // use it OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n"; @@ -479,9 +533,11 @@ namespace mongo { /*---------------------------------------------------------------------*/ - DiskLoc Extent::reuse(const char *nsname) { - /*TODOMMF - work to do when extent is freed. */ - log(3) << "reset extent was:" << nsDiagnostic.buf << " now:" << nsname << '\n'; + DiskLoc Extent::reuse(const char *nsname) { + return getDur().writing(this)->_reuse(nsname); + } + DiskLoc Extent::_reuse(const char *nsname) { + log(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n'; massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 ); xnext.Null(); xprev.Null(); @@ -493,12 +549,9 @@ namespace mongo { emptyLoc.inc( (int) (_extentData-(char*)this) ); int delRecLength = length - (_extentData - (char *) this); - //DeletedRecord *empty1 = (DeletedRecord *) extentData; - DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc); - //assert( empty == empty1 ); - - // do we want to zero the record? memset(empty, ...) + DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc); + empty = getDur().writing(empty); empty->lengthWithHeaders = delRecLength; empty->extentOfs = myLoc.getOfs(); empty->nextDeleted.Null(); @@ -509,7 +562,7 @@ namespace mongo { /* assumes already zeroed -- insufficient for block 'reuse' perhaps */ DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) { magic = 0x41424344; - myLoc.setOfs(_fileNo, _offset); + myLoc.set(_fileNo, _offset); xnext.Null(); xprev.Null(); nsDiagnostic = nsname; @@ -521,9 +574,7 @@ namespace mongo { emptyLoc.inc( (int) (_extentData-(char*)this) ); int l = _length - (_extentData - (char *) this); - //DeletedRecord *empty1 = (DeletedRecord *) extentData; - DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, l); - //assert( empty == empty1 ); + DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, l) ); empty->lengthWithHeaders = l; empty->extentOfs = myLoc.getOfs(); return emptyLoc; @@ -582,7 +633,7 @@ namespace mongo { } return maxExtentSize; } - + /*---------------------------------------------------------------------*/ shared_ptr DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) { @@ -612,12 +663,12 @@ namespace mongo { d->dumpDeleted(&extents); } - if ( d->capped ) + if ( d->capped ) return shared_ptr( new ForwardCappedCursor( d , startLoc ) ); - + if ( !startLoc.isNull() ) - return shared_ptr(new BasicCursor( startLoc )); - + return shared_ptr(new BasicCursor( startLoc )); + while ( e->firstRecord.isNull() && !e->xnext.isNull() ) { /* todo: if extent is empty, free it for reuse elsewhere. that is a bit complicated have to clean up the freelists. 
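
The free-list scan in allocFromFreeList() above reduces to a bounded best-fit search; a standalone sketch, with a std::vector standing in for the on-disk $freelist extent chain:

#include <cstdlib>
#include <vector>

struct FreeExtent { int length; };

// Only extents whose length falls in a band around the requested size are
// considered; among those the closest match wins, and an exact match ends
// the scan early.
FreeExtent* pickBestFit(std::vector<FreeExtent>& freeList, int approxSize, bool capped) {
    int low, high;
    if (capped) {
        low = approxSize;
        if (low > 2048) low -= 256;                 // small tolerance below the requested size
        high = (int)(approxSize * 1.05) + 256;
    }
    else {
        low  = (int)(approxSize * 0.8);
        high = (int)(approxSize * 1.4);
    }
    FreeExtent* best = 0;
    int bestDiff = 0x7fffffff;
    for (size_t i = 0; i < freeList.size(); i++) {
        int len = freeList[i].length;
        if (len < low || len > high)
            continue;
        int diff = std::abs(len - approxSize);
        if (diff < bestDiff) {
            bestDiff = diff;
            best = &freeList[i];
            if (diff == 0)
                break;                              // exact fit: stop scanning
        }
    }
    return best;                                    // 0 if nothing suitable was found
}
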
@@ -638,37 +689,38 @@ namespace mongo { if ( el.number() >= 0 ) return DataFileMgr::findAll(ns, startLoc); - + // "reverse natural order" NamespaceDetails *d = nsdetails(ns); - + if ( !d ) return shared_ptr(new BasicCursor(DiskLoc())); - + if ( !d->capped ) { if ( !startLoc.isNull() ) - return shared_ptr(new ReverseCursor( startLoc )); + return shared_ptr(new ReverseCursor( startLoc )); Extent *e = d->lastExtent.ext(); while ( e->lastRecord.isNull() && !e->xprev.isNull() ) { OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl; e = e->getPrevExtent(); } return shared_ptr(new ReverseCursor( e->lastRecord )); - } else { + } + else { return shared_ptr( new ReverseCappedCursor( d, startLoc ) ); } } - void printFreeList() { + void printFreeList() { string s = cc().database()->name + ".$freelist"; log() << "dump freelist " << s << '\n'; NamespaceDetails *freeExtents = nsdetails(s.c_str()); - if( freeExtents == 0 ) { + if( freeExtents == 0 ) { log() << " freeExtents==0" << endl; return; } DiskLoc a = freeExtents->firstExtent; - while( !a.isNull() ) { + while( !a.isNull() ) { Extent *e = a.ext(); log() << " " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << '\n'; a = e->xnext; @@ -687,7 +739,7 @@ namespace mongo { NamespaceString s(nsToDrop); assert( s.db == cc().database()->name ); if( s.isSystem() ) { - if( s.coll == "system.profile" ) + if( s.coll == "system.profile" ) uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 ); else uasserted( 12502, "can't drop system ns" ); @@ -698,32 +750,31 @@ namespace mongo { BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" } string system_namespaces = cc().database()->name + ".system.namespaces"; /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true); - // no check of return code as this ns won't exist for some of the new storage engines + // no check of return code as this ns won't exist for some of the new storage engines } // free extents if( !d->firstExtent.isNull() ) { string s = cc().database()->name + ".$freelist"; NamespaceDetails *freeExtents = nsdetails(s.c_str()); - if( freeExtents == 0 ) { + if( freeExtents == 0 ) { string err; _userCreateNS(s.c_str(), BSONObj(), err, 0); freeExtents = nsdetails(s.c_str()); massert( 10361 , "can't create .$freelist", freeExtents); } - if( freeExtents->firstExtent.isNull() ) { - freeExtents->firstExtent = d->firstExtent; - freeExtents->lastExtent = d->lastExtent; + if( freeExtents->firstExtent.isNull() ) { + freeExtents->firstExtent.writing() = d->firstExtent; + freeExtents->lastExtent.writing() = d->lastExtent; } - else { + else { DiskLoc a = freeExtents->firstExtent; assert( a.ext()->xprev.isNull() ); - a.ext()->xprev = d->lastExtent; - d->lastExtent.ext()->xnext = a; - freeExtents->firstExtent = d->firstExtent; - - d->firstExtent.setInvalid(); - d->lastExtent.setInvalid(); + getDur().writingDiskLoc( a.ext()->xprev ) = d->lastExtent; + getDur().writingDiskLoc( d->lastExtent.ext()->xnext ) = a; + getDur().writingDiskLoc( freeExtents->firstExtent ) = d->firstExtent; + getDur().writingDiskLoc( d->firstExtent ).setInvalid(); + getDur().writingDiskLoc( d->lastExtent ).setInvalid(); } } @@ -740,7 +791,7 @@ namespace mongo { BackgroundOperation::assertNoBgOpInProgForNs(name.c_str()); if ( d->nIndexes != 0 ) { - try { + try { assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) ); } catch( DBException& e ) { @@ -754,11 +805,10 @@ namespace mongo { log(1) 
<< "\t dropIndexes done" << endl; result.append("ns", name.c_str()); ClientCursor::invalidate(name.c_str()); - Client::invalidateNS( name ); Top::global.collectionDropped( name ); - dropNS(name); + dropNS(name); } - + int nUnindexes = 0; /* unindex all keys in index for this record. */ @@ -797,63 +847,69 @@ namespace mongo { int n = d->nIndexes; for ( int i = 0; i < n; i++ ) _unindexRecord(d->idx(i), obj, dl, !noWarn); - if( d->backgroundIndexBuildInProgress ) { + if( d->indexBuildInProgress ) { // background index // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it - _unindexRecord(d->idx(n), obj, dl, false); + _unindexRecord(d->idx(n), obj, dl, false); } } - /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc. + /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc. caller must check if capped */ - void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) - { + void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) { /* remove ourself from the record next/prev chain */ { if ( todelete->prevOfs != DiskLoc::NullOfs ) - todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs; + getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs; if ( todelete->nextOfs != DiskLoc::NullOfs ) - todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs; + getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs; } /* remove ourself from extent pointers */ { - Extent *e = todelete->myExtent(dl); + Extent *e = getDur().writing( todelete->myExtent(dl) ); if ( e->firstRecord == dl ) { if ( todelete->nextOfs == DiskLoc::NullOfs ) e->firstRecord.Null(); else - e->firstRecord.setOfs(dl.a(), todelete->nextOfs); + e->firstRecord.set(dl.a(), todelete->nextOfs); } if ( e->lastRecord == dl ) { if ( todelete->prevOfs == DiskLoc::NullOfs ) e->lastRecord.Null(); else - e->lastRecord.setOfs(dl.a(), todelete->prevOfs); + e->lastRecord.set(dl.a(), todelete->prevOfs); } } /* add to the free list */ { - d->nrecords--; - d->datasize -= todelete->netLength(); - /* temp: if in system.indexes, don't reuse, and zero out: we want to be - careful until validated more, as IndexDetails has pointers - to this disk location. so an incorrectly done remove would cause - a lot of problems. - */ + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize -= todelete->netLength(); + s->nrecords--; + } + if ( strstr(ns, ".system.indexes") ) { - memset(todelete, 0, todelete->lengthWithHeaders); + /* temp: if in system.indexes, don't reuse, and zero out: we want to be + careful until validated more, as IndexDetails has pointers + to this disk location. so an incorrectly done remove would cause + a lot of problems. + */ + memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders); } else { - DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse. + DEV { + unsigned long long *p = (unsigned long long *) todelete->data; + *getDur().writing(p) = 0; + //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse. 
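
The first half of _deleteRecord() above is a plain doubly-linked-list splice expressed through record offsets; a compact sketch, with vector indices standing in for DiskLoc offsets:

#include <vector>

// Records form a doubly linked chain through integer offsets with a sentinel
// for "no neighbour", and the extent tracks its first and last record.
const int NullOfs = -1;                        // stand-in for DiskLoc::NullOfs

struct Rec     { int prevOfs; int nextOfs; };
struct ExtEnds { int firstRecord; int lastRecord; };

void unlinkRecord(std::vector<Rec>& records, ExtEnds& extent, int dl) {
    Rec& r = records[dl];
    if (r.prevOfs != NullOfs)                  // splice out of the prev/next chain
        records[r.prevOfs].nextOfs = r.nextOfs;
    if (r.nextOfs != NullOfs)
        records[r.nextOfs].prevOfs = r.prevOfs;
    if (extent.firstRecord == dl)              // fix extent endpoints if we were first/last
        extent.firstRecord = r.nextOfs;        // NullOfs if the extent is now empty
    if (extent.lastRecord == dl)
        extent.lastRecord = r.prevOfs;
}
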
+ } d->addDeletedRec((DeletedRecord*)todelete, dl); } } } - void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) - { + void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) { dassert( todelete == dl.rec() ); NamespaceDetails* d = nsdetails(ns); @@ -880,8 +936,7 @@ namespace mongo { NamespaceDetails *d, NamespaceDetailsTransient *nsdt, Record *toupdate, const DiskLoc& dl, - const char *_buf, int _len, OpDebug& debug, bool &changedId, bool god) - { + const char *_buf, int _len, OpDebug& debug, bool god) { StringBuilder& ss = debug.str; dassert( toupdate == dl.rec() ); @@ -891,7 +946,7 @@ namespace mongo { DEV assert( objNew.objdata() == _buf ); if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) { - /* add back the old _id value if the update removes it. Note this implementation is slow + /* add back the old _id value if the update removes it. Note this implementation is slow (copies entire object multiple times), but this shouldn't happen often, so going for simple code, not speed. */ @@ -903,11 +958,13 @@ namespace mongo { objNew = b.obj(); } - /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further + /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks... */ vector changes; + bool changedId = false; getIndexChanges(changes, *d, objNew, objOld, changedId); + uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId ); dupCheck(changes, *d, dl); if ( toupdate->netLength() < objNew.objsize() ) { @@ -946,8 +1003,8 @@ namespace mongo { try { /* we did the dupCheck() above. so we don't have to worry about it here. */ idx.head.btree()->bt_insert( - idx.head, - dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx); + idx.head, + dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx); } catch (AssertionException& e) { ss << " exception update index "; @@ -959,25 +1016,30 @@ namespace mongo { ss << '\n' << keyUpdates << " key updates "; } - // update in place - memcpy(toupdate->data, objNew.objdata(), objNew.objsize()); + // update in place + int sz = objNew.objsize(); + memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz); return dl; } - int followupExtentSize(int len, int lastExtentLen) { + int Extent::followupSize(int len, int lastExtentLen) { assert( len < Extent::maxSize() ); - int x = initialExtentSize(len); + int x = initialSize(len); int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2); int sz = y > x ? 
y : x; - if ( sz < lastExtentLen ) - sz = lastExtentLen; - else if ( sz > Extent::maxSize() ) + if ( sz < lastExtentLen ) { + // this means there was an int overflow + // so we should turn it into maxSize + sz = Extent::maxSize(); + } + else if ( sz > Extent::maxSize() ) { sz = Extent::maxSize(); - + } + sz = ((int)sz) & 0xffffff00; assert( sz > len ); - + return sz; } @@ -990,7 +1052,7 @@ namespace mongo { Ordering ordering = Ordering::make(order); int n = 0; for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) { - if( ++n == 2 ) { + if( ++n == 2 ) { d->setIndexIsMultikey(idxNo); } assert( !recordLoc.isNull() ); @@ -999,7 +1061,7 @@ namespace mongo { *i, ordering, dupsAllowed, idx); } catch (AssertionException& e) { - if( e.getCode() == 10287 && idxNo == d->nIndexes ) { + if( e.getCode() == 10287 && idxNo == d->nIndexes ) { DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; continue; } @@ -1012,8 +1074,7 @@ namespace mongo { } } - void testSorting() - { + void testSorting() { BSONObjBuilder b; b.appendNull(""); BSONObj x = b.obj(); @@ -1027,9 +1088,9 @@ namespace mongo { sorter.add(x, DiskLoc(3,77)); sorter.sort(); - + auto_ptr i = sorter.iterator(); - while( i->more() ) { + while( i->more() ) { BSONObjExternalSorter::Data d = i->next(); /*cout << d.second.toString() << endl; cout << d.first.objsize() << endl; @@ -1039,7 +1100,6 @@ namespace mongo { // throws DBException unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { - assert( d->backgroundIndexBuildInProgress == 0 ); CurOp * op = cc().curop(); Timer t; @@ -1050,17 +1110,17 @@ namespace mongo { bool dropDups = idx.dropDups() || inDBRepair; BSONObj order = idx.keyPattern(); - idx.head.Null(); - + getDur().writingDiskLoc(idx.head).Null(); + if ( logLevel > 1 ) printMemInfo( "before index start" ); /* get and sort all the keys ----- */ unsigned long long n = 0; shared_ptr c = theDataFileMgr.findAll(ns); BSONObjExternalSorter sorter(order); - sorter.hintNumObjects( d->nrecords ); + sorter.hintNumObjects( d->stats.nrecords ); unsigned long long nkeys = 0; - ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->nrecords , 10 ) ); + ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) ); while ( c->ok() ) { BSONObj o = c->current(); DiskLoc loc = c->currLoc(); @@ -1069,17 +1129,17 @@ namespace mongo { idx.getKeysFromObject(o, keys); int k = 0; for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) { - if( ++k == 2 ) + if( ++k == 2 ) { d->setIndexIsMultikey(idxNo); - //cout<<"SORTER ADD " << i->toString() << ' ' << loc.toString() << endl; + } sorter.add(*i, loc); nkeys++; } - + c->advance(); n++; pm.hit(); - if ( logLevel > 1 && n % 10000 == 0 ){ + if ( logLevel > 1 && n % 10000 == 0 ) { printMemInfo( "\t iterating objects" ); } @@ -1089,37 +1149,37 @@ namespace mongo { if ( logLevel > 1 ) printMemInfo( "before final sort" ); sorter.sort(); if ( logLevel > 1 ) printMemInfo( "after final sort" ); - + log(t.seconds() > 5 ? 
0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl; list dupsToDrop; - /* build index --- */ + /* build index --- */ { BtreeBuilder btBuilder(dupsAllowed, idx); BSONObj keyLast; auto_ptr i = sorter.iterator(); assert( pm == op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ) ); - while( i->more() ) { + while( i->more() ) { RARELY killCurrentOp.checkForInterrupt(); BSONObjExternalSorter::Data d = i->next(); - try { + try { btBuilder.addKey(d.first, d.second); } - catch( AssertionException& e ) { - if ( dupsAllowed ){ + catch( AssertionException& e ) { + if ( dupsAllowed ) { // unknow exception?? throw; } - + if( e.interrupted() ) throw; if ( ! dropDups ) throw; - /* we could queue these on disk, but normally there are very few dups, so instead we + /* we could queue these on disk, but normally there are very few dups, so instead we keep in ram and have a limit. */ dupsToDrop.push_back(d.second); @@ -1131,9 +1191,11 @@ namespace mongo { op->setMessage( "index: (3/3) btree-middle" ); log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl; btBuilder.commit(); - wassert( btBuilder.getn() == nkeys || dropDups ); + if ( btBuilder.getn() != nkeys && ! dropDups ) { + warning() << "not all entries were added to the index, probably some keys were too large" << endl; + } } - + log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl; for( list::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ) @@ -1142,13 +1204,13 @@ namespace mongo { return n; } - class BackgroundIndexBuildJob : public BackgroundOperation { + class BackgroundIndexBuildJob : public BackgroundOperation { unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { bool dupsAllowed = !idx.unique(); bool dropDups = idx.dropDups(); - ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->nrecords ); + ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords ); unsigned long long n = 0; auto_ptr cc; @@ -1156,25 +1218,26 @@ namespace mongo { shared_ptr c = theDataFileMgr.findAll(ns); cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) ); } - CursorId id = cc->cursorid; + CursorId id = cc->cursorid(); - while ( cc->c->ok() ) { - BSONObj js = cc->c->current(); - try { - _indexRecord(d, idxNo, js, cc->c->currLoc(), dupsAllowed); - cc->c->advance(); - } catch( AssertionException& e ) { + while ( cc->ok() ) { + BSONObj js = cc->current(); + try { + _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed); + cc->advance(); + } + catch( AssertionException& e ) { if( e.interrupted() ) throw; if ( dropDups ) { - DiskLoc toDelete = cc->c->currLoc(); - bool ok = cc->c->advance(); + DiskLoc toDelete = cc->currLoc(); + bool ok = cc->advance(); cc->updateLocation(); theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true ); if( ClientCursor::find(id, false) == 0 ) { cc.release(); - if( !ok ) { + if( !ok ) { /* we were already at the end. normal. 
*/ } else { @@ -1182,7 +1245,8 @@ namespace mongo { } break; } - } else { + } + else { log() << "background addExistingToIndex exception " << e.what() << endl; throw; } @@ -1200,7 +1264,7 @@ namespace mongo { return n; } - /* we do set a flag in the namespace for quick checking, but this is our authoritative info - + /* we do set a flag in the namespace for quick checking, but this is our authoritative info - that way on a crash/restart, we don't think we are still building one. */ set bgJobsInProgress; @@ -1208,12 +1272,8 @@ namespace mongo { assertInWriteLock(); uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , dbMutex.getState() == 1 ); bgJobsInProgress.insert(d); - d->backgroundIndexBuildInProgress = 1; - d->nIndexes--; } void done(const char *ns, NamespaceDetails *d) { - d->nIndexes++; - d->backgroundIndexBuildInProgress = 0; NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache assertInWriteLock(); } @@ -1221,16 +1281,16 @@ namespace mongo { public: BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { } - unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { + unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { unsigned long long n = 0; prep(ns.c_str(), d); assert( idxNo == d->nIndexes ); - try { + try { idx.head = BtreeBucket::addBucket(idx); n = addExistingToIndex(ns.c_str(), d, idx, idxNo); } - catch(...) { + catch(...) { if( cc().database() && nsdetails(ns.c_str()) == d ) { assert( idxNo == d->nIndexes ); done(ns.c_str(), d); @@ -1246,25 +1306,51 @@ namespace mongo { } }; + /** + * For the lifetime of this object, an index build is indicated on the specified + * namespace and the newest index is marked as absent. This simplifies + * the cleanup required on recovery. + */ + class RecoverableIndexState { + public: + RecoverableIndexState( NamespaceDetails *d ) : _d( d ) { + indexBuildInProgress() = 1; + nIndexes()--; + } + ~RecoverableIndexState() { + DESTRUCTOR_GUARD ( + nIndexes()++; + indexBuildInProgress() = 0; + ) + } + private: + int &nIndexes() { return getDur().writingInt( _d->nIndexes ); } + int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); } + NamespaceDetails *_d; + }; + // throws DBException - static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) { + static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) { tlog() << "building new index on " << idx.keyPattern() << " for " << ns << ( background ? " background" : "" ) << endl; Timer t; - unsigned long long n; + unsigned long long n; if( background ) { log(2) << "buildAnIndex: background=true\n"; } assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be... 
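
The RecoverableIndexState class above combines two conventions that recur throughout this patch: every write into a memory-mapped structure is first declared to the durability layer (getDur().writingInt() and friends return a reference that may then be modified), and any state that must be undone on an exception is wrapped in an RAII guard. A minimal sketch of that pattern; CountGuard and its counter are hypothetical, only the getDur()/DESTRUCTOR_GUARD usage mirrors the class above:

    class CountGuard {
        int *_p;
    public:
        CountGuard( int *p ) : _p( p ) {
            getDur().writingInt( *_p )++;      // declare write intent, then bump
        }
        ~CountGuard() {
            DESTRUCTOR_GUARD(
                getDur().writingInt( *_p )--;  // undone even if the protected scope throws
            )
        }
    };
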
+ assert( d->indexBuildInProgress == 0 ); + assertInWriteLock(); + RecoverableIndexState recoverable( d ); if( inDBRepair || !background ) { - n = fastBuildIndex(ns.c_str(), d, idx, idxNo); - assert( !idx.head.isNull() ); - } - else { + n = fastBuildIndex(ns.c_str(), d, idx, idxNo); + assert( !idx.head.isNull() ); + } + else { BackgroundIndexBuildJob j(ns.c_str()); n = j.go(ns, d, idx, idxNo); - } + } tlog() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl; } @@ -1272,20 +1358,20 @@ namespace mongo { static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) { int n = d->nIndexesBeingBuilt(); for ( int i = 0; i < n; i++ ) { - try { + try { bool unique = d->idx(i).unique(); _indexRecord(d, i, obj, loc, /*dupsAllowed*/!unique); } - catch( DBException& ) { + catch( DBException& ) { /* try to roll back previously added index entries note <= i (not < i) is important here as the index we were just attempted may be multikey and require some cleanup. */ - for( int j = 0; j <= i; j++ ) { + for( int j = 0; j <= i; j++ ) { try { _unindexRecord(d->idx(j), obj, loc, false); } - catch(...) { + catch(...) { log(3) << "unindex fails on rollback after unique failure\n"; } } @@ -1301,7 +1387,7 @@ namespace mongo { if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) ) return; - d->flags |= NamespaceDetails::Flag_HaveIdIndex; + *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex; { NamespaceDetails::IndexIterator i = d->ii(); @@ -1324,7 +1410,7 @@ namespace mongo { } #pragma pack(1) - struct IDToInsert_ { + struct IDToInsert_ { char type; char _id[4]; OID oid; @@ -1338,13 +1424,13 @@ namespace mongo { IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {} } idToInsert; #pragma pack() - + void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) { BSONObj tmp = o; insertWithObjMod( ns, tmp, god ); logOp( "i", ns, tmp ); } - + DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) { DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god ); if ( !loc.isNull() ) @@ -1356,12 +1442,12 @@ namespace mongo { insert( ns, o.objdata(), o.objsize(), god ); } - bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection); + bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ); // We are now doing two btree scans for all unique indexes (one here, and one when we've // written the record to the collection. This could be made more efficient inserting // dummy data here, keeping pointers to the btree nodes holding the dummy data and then - // updating the dummy data with the DiskLoc of the real record. + // updating the dummy data with the DiskLoc of the real record. 
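
The packed IDToInsert_ struct above hand-assembles a single BSON element -- one type byte (0x07, ObjectId), the field name "_id" with its NUL terminator, and a 12-byte ObjectId -- so that insert() can splice an _id into a document that lacks one without rebuilding the whole object. A rough illustration of that byte layout; the struct and field names here are made up, only the layout follows the BSON spec and the addID branch further down:

    #pragma pack(1)
    struct IdElementBytes {              // 17 bytes, same shape as IDToInsert_ above
        char type;                       // 0x07 = BSON ObjectId
        char name[4];                    // '_', 'i', 'd', '\0'
        unsigned char oid[12];           // the ObjectId payload
    };
    #pragma pack()
    // spliced in as: [4-byte total length + 17][IdElementBytes][rest of the original body]
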
void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) { for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) { if( d->idx(idxNo).unique() ) { @@ -1371,19 +1457,19 @@ namespace mongo { BSONObj order = idx.keyPattern(); for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) { uassert( 12582, "duplicate key insert for unique index of capped collection", - idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() ); + idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() ); } } - } + } } - /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc + /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc after the call -- that will prevent a double buffer copy in some cases (btree.cpp). */ DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) { bool wouldAddIndex = false; - massert( 10093 , "cannot insert into reserved $ collection", god || nsDollarCheck( ns ) ); - uassert( 10094 , "invalid ns", strchr( ns , '.' ) > 0 ); + massert( 10093 , "cannot insert into reserved $ collection", god || isANormalNSName( ns ) ); + uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) ); const char *sys = strstr(ns, "system."); if ( sys ) { uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns); @@ -1411,7 +1497,7 @@ namespace mongo { also if this is an addIndex, those checks should happen before this! */ // This may create first file in the database. - cc().database()->allocExtent(ns, initialExtentSize(len), false); + cc().database()->allocExtent(ns, Extent::initialSize(len), false); d = nsdetails(ns); if ( !god ) ensureIdIndexForNewNs(ns); @@ -1421,17 +1507,24 @@ namespace mongo { NamespaceDetails *tableToIndex = 0; string tabletoidxns; + BSONObj fixedIndexObject; if ( addIndex ) { assert( obuf ); BSONObj io((const char *) obuf); - if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) ) + if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) return DiskLoc(); + + if ( ! fixedIndexObject.isEmpty() ) { + obuf = fixedIndexObject.objdata(); + len = fixedIndexObject.objsize(); + } + } const BSONElement *newId = &writeId; int addID = 0; if( !god ) { - /* Check if we have an _id field. If we don't, we'll add it. + /* Check if we have an _id field. If we don't, we'll add it. Note that btree buckets which we insert aren't BSONObj's, but in that case god==true. */ BSONObj io((const char *) obuf); @@ -1446,7 +1539,7 @@ namespace mongo { } len += newId->size(); } - + BSONElementManipulator::lookForTimestamps( io ); } @@ -1456,28 +1549,28 @@ namespace mongo { if ( lenWHdr == 0 ) { // old datafiles, backward compatible here. assert( d->paddingFactor == 0 ); - d->paddingFactor = 1.0; + *getDur().writing(&d->paddingFactor) = 1.0; lenWHdr = len + Record::HeaderSize; } - + // If the collection is capped, check if the new object will violate a unique index // constraint before allocating space. 
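
When the first allocation attempt below fails, the code asks Extent::followupSize() (reworked earlier in this patch) for the next extent size: roughly 4x the previous extent while extents are under 4MB, 1.2x afterwards, never smaller than the record needs, clamped to Extent::maxSize() on overflow, and rounded down to a 256-byte boundary. A standalone sketch of that policy; initialSize and maxSize are passed in here as stand-ins for Extent::initialSize(len) and Extent::maxSize(), which are defined elsewhere:

    #include <cassert>

    int followupSizeSketch( int len, int lastExtentLen, int initialSize, int maxSize ) {
        int x = initialSize;
        int y = (int)( lastExtentLen < 4000000 ? lastExtentLen * 4.0
                                               : lastExtentLen * 1.2 );
        int sz = y > x ? y : x;
        if ( sz < lastExtentLen )            // the multiply overflowed int
            sz = maxSize;
        else if ( sz > maxSize )
            sz = maxSize;
        sz = sz & 0xffffff00;                // keep extents 256-byte aligned
        assert( sz > len );
        return sz;
    }
    // e.g. lastExtentLen = 1,000,000 -> 4,000,000; lastExtentLen = 8,000,000 -> 9,600,000
    // (assuming initialSize is smaller and maxSize larger than those results)
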
if ( d->nIndexes && d->capped && !god ) { checkNoIndexConflicts( d, BSONObj( reinterpret_cast( obuf ) ) ); } - + DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc); if ( loc.isNull() ) { // out of space if ( d->capped == 0 ) { // size capped doesn't grow log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl; - cc().database()->allocExtent(ns, followupExtentSize(lenWHdr, d->lastExtentSize), false); + cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false); loc = d->alloc(ns, lenWHdr, extentLoc); - if ( loc.isNull() ){ + if ( loc.isNull() ) { log() << "WARNING: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n"; - for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ){ + for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ) { log() << "try #" << zzz << endl; - cc().database()->allocExtent(ns, followupExtentSize(len, d->lastExtentSize), false); + cc().database()->allocExtent(ns, Extent::followupSize(len, d->lastExtentSize), false); loc = d->alloc(ns, lenWHdr, extentLoc); if ( ! loc.isNull() ) break; @@ -1492,45 +1585,55 @@ namespace mongo { } Record *r = loc.rec(); - assert( r->lengthWithHeaders >= lenWHdr ); - if( addID ) { - /* a little effort was made here to avoid a double copy when we add an ID */ - ((int&)*r->data) = *((int*) obuf) + newId->size(); - memcpy(r->data+4, newId->rawdata(), newId->size()); - memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4); - } - else { - if( obuf ) - memcpy(r->data, obuf, len); - } - Extent *e = r->myExtent(loc); - if ( e->lastRecord.isNull() ) { - e->firstRecord = e->lastRecord = loc; - r->prevOfs = r->nextOfs = DiskLoc::NullOfs; + { + assert( r->lengthWithHeaders >= lenWHdr ); + r = (Record*) getDur().writingPtr(r, lenWHdr); + if( addID ) { + /* a little effort was made here to avoid a double copy when we add an ID */ + ((int&)*r->data) = *((int*) obuf) + newId->size(); + memcpy(r->data+4, newId->rawdata(), newId->size()); + memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4); + } + else { + if( obuf ) + memcpy(r->data, obuf, len); + } } - else { - Record *oldlast = e->lastRecord.rec(); - r->prevOfs = e->lastRecord.getOfs(); - r->nextOfs = DiskLoc::NullOfs; - oldlast->nextOfs = loc.getOfs(); - e->lastRecord = loc; + { + Extent *e = r->myExtent(loc); + if ( e->lastRecord.isNull() ) { + Extent::FL *fl = getDur().writing(e->fl()); + fl->firstRecord = fl->lastRecord = loc; + r->prevOfs = r->nextOfs = DiskLoc::NullOfs; + } + else { + Record *oldlast = e->lastRecord.rec(); + r->prevOfs = e->lastRecord.getOfs(); + r->nextOfs = DiskLoc::NullOfs; + getDur().writingInt(oldlast->nextOfs) = loc.getOfs(); + getDur().writingDiskLoc(e->lastRecord) = loc; + } } - d->nrecords++; - d->datasize += r->netLength(); + /* durability todo : this could be a bit annoying / slow to record constantly */ + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize += r->netLength(); + s->nrecords++; + } // we don't bother clearing those stats for the god tables - also god is true when adidng a btree bucket if ( !god ) NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp(); - + if ( tableToIndex ) { uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos ); BSONObj info = loc.obj(); bool background = info["background"].trueValue(); - if( background && cc().isSyncThread() ) { - /* don't do background 
indexing on slaves. there are nuances. this could be added later + if( background && cc().isSyncThread() ) { + /* don't do background indexing on slaves. there are nuances. this could be added later but requires more code. */ log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl; @@ -1539,10 +1642,11 @@ namespace mongo { int idxNo = tableToIndex->nIndexes; IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes - idx.info = loc; + getDur().writingDiskLoc(idx.info) = loc; try { buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background); - } catch( DBException& e ) { + } + catch( DBException& e ) { // save our error msg string as an exception or dropIndexes will overwrite our message LastError *le = lastError.get(); int savecode = 0; @@ -1564,7 +1668,7 @@ namespace mongo { if( !ok ) { log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl; } - + assert( le && !saveerrmsg.empty() ); raiseError(savecode,saveerrmsg.c_str()); throw; @@ -1573,20 +1677,20 @@ namespace mongo { /* add this record to our indexes */ if ( d->nIndexes ) { - try { + try { BSONObj obj(r->data); indexRecord(d, obj, loc); - } - catch( AssertionException& e ) { + } + catch( AssertionException& e ) { // should be a dup key error on _id index if( tableToIndex || d->capped ) { massert( 12583, "unexpected index insertion failure on capped collection", !d->capped ); string s = e.toString(); s += " : on addIndex/capped - collection and its index will not match"; uassert_nothrow(s.c_str()); - log() << s << '\n'; + error() << s << endl; } - else { + else { // normal case -- we can roll back _deleteRecord(d, ns, r, loc); throw; @@ -1594,7 +1698,7 @@ namespace mongo { } } - // out() << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl; + // out() << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl; return loc; } @@ -1619,18 +1723,27 @@ namespace mongo { Extent *e = r->myExtent(loc); if ( e->lastRecord.isNull() ) { - e->firstRecord = e->lastRecord = loc; - r->prevOfs = r->nextOfs = DiskLoc::NullOfs; + Extent::FL *fl = getDur().writing( e->fl() ); + fl->firstRecord = fl->lastRecord = loc; + + Record::NP *np = getDur().writing(r->np()); + np->nextOfs = np->prevOfs = DiskLoc::NullOfs; } else { Record *oldlast = e->lastRecord.rec(); - r->prevOfs = e->lastRecord.getOfs(); - r->nextOfs = DiskLoc::NullOfs; - oldlast->nextOfs = loc.getOfs(); - e->lastRecord = loc; + Record::NP *np = getDur().writing(r->np()); + np->prevOfs = e->lastRecord.getOfs(); + np->nextOfs = DiskLoc::NullOfs; + getDur().writingInt( oldlast->nextOfs ) = loc.getOfs(); + e->lastRecord.writing() = loc; } - d->nrecords++; + /* todo: don't update for oplog? seems wasteful. 
*/ + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize += r->netLength(); + s->nrecords++; + } return r; } @@ -1641,7 +1754,7 @@ namespace mongo { namespace mongo { - void dropAllDatabasesExceptLocal() { + void dropAllDatabasesExceptLocal() { writelock lk(""); vector n; @@ -1658,14 +1771,17 @@ namespace mongo { void dropDatabase(string db) { log(1) << "dropDatabase " << db << endl; - assert( cc().database() ); - assert( cc().database()->name == db ); + Database *d = cc().database(); + assert( d ); + assert( d->name == db ); - BackgroundOperation::assertNoBgOpInProgForDb(db.c_str()); + BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str()); - Client::invalidateDB( db ); + getDur().syncDataAndTruncateJournal(); + + Database::closeDatabase( d->name.c_str(), d->path ); + d = 0; // d is now deleted - closeDatabase( db.c_str() ); _deleteDataFiles( db.c_str() ); } @@ -1674,13 +1790,14 @@ namespace mongo { void boostRenameWrapper( const Path &from, const Path &to ) { try { boost::filesystem::rename( from, to ); - } catch ( const boost::filesystem::filesystem_error & ) { + } + catch ( const boost::filesystem::filesystem_error & ) { // boost rename doesn't work across partitions boost::filesystem::copy_file( from, to); boost::filesystem::remove( from ); } } - + // back up original database files to 'temp' dir void _renameForBackup( const char *database, const Path &reservedPath ) { Path newPath( reservedPath ); @@ -1738,7 +1855,8 @@ namespace mongo { ss << prefix << "_repairDatabase_" << i++; reservedPath = repairPath / ss.str(); BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) ); - } while ( exists ); + } + while ( exists ); return reservedPath; } @@ -1790,12 +1908,15 @@ namespace mongo { stringstream ss; ss << "localhost:" << cmdLine.port; string localhost = ss.str(); - + problem() << "repairDatabase " << dbName << endl; assert( cc().database()->name == dbName ); + assert( cc().database()->path == dbpath ); BackgroundOperation::assertNoBgOpInProgForDb(dbName); + getDur().syncDataAndTruncateJournal(); // Must be done before and after repair + boost::intmax_t totalSize = dbSize( dbName ); boost::intmax_t freeSize = freeSpace( repairpath ); if ( freeSize > -1 && freeSize < totalSize ) { @@ -1812,30 +1933,37 @@ namespace mongo { "backup" : "$tmp" ); BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) ); string reservedPathString = reservedPath.native_directory_string(); - + bool res; - { // clone to temp location, which effectively does repair + { + // clone to temp location, which effectively does repair Client::Context ctx( dbName, reservedPathString ); assert( ctx.justCreated() ); - - res = cloneFrom(localhost.c_str(), errmsg, dbName, - /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false); - closeDatabase( dbName, reservedPathString.c_str() ); + + res = cloneFrom(localhost.c_str(), errmsg, dbName, + /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false); + Database::closeDatabase( dbName, reservedPathString.c_str() ); } if ( !res ) { problem() << "clone failed for " << dbName << " with error: " << errmsg << endl; if ( !preserveClonedFilesOnFailure ) BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) ); + + getDur().syncDataAndTruncateJournal(); // Must be done before and after repair + return false; } + MongoFile::flushAll(true); + Client::Context ctx( dbName ); - closeDatabase( dbName ); + Database::closeDatabase( dbName, dbpath ); if ( 
backupOriginalFiles ) { _renameForBackup( dbName, reservedPath ); - } else { + } + else { _deleteDataFiles( dbName ); BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) ); } @@ -1845,12 +1973,14 @@ namespace mongo { if ( !backupOriginalFiles ) BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) ); + getDur().syncDataAndTruncateJournal(); // Must be done before and after repair + return true; } void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) { if ( afterAllocator ) - theFileAllocator().waitUntilFinished(); + FileAllocator::get()->waitUntilFinished(); string c = database; c += '.'; boost::filesystem::path p(path); @@ -1871,8 +2001,8 @@ namespace mongo { q = p / ss.str(); BOOST_CHECK_EXCEPTION( ok = fo.apply(q) ); if ( ok ) { - if ( extra != 10 ){ - log(1) << fo.op() << " file " << q.string() << '\n'; + if ( extra != 10 ) { + log(1) << fo.op() << " file " << q.string() << endl; log() << " _applyOpToDataFiles() warning: extra == " << extra << endl; } } @@ -1883,19 +2013,20 @@ namespace mongo { } NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); } - - bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ){ + + bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) { log() << "DatabaseHolder::closeAll path:" << path << endl; dbMutex.assertWriteLocked(); - + map& m = _paths[path]; _size -= m.size(); - + set< string > dbs; for ( map::iterator i = m.begin(); i != m.end(); i++ ) { + wassert( i->second->path == path ); dbs.insert( i->first ); } - + currentClient.get()->getContext()->clear(); BSONObjBuilder bb( result.subarrayStart( "dbs" ) ); @@ -1910,7 +2041,7 @@ namespace mongo { nNotClosed++; } else { - closeDatabase( name.c_str() , path ); + Database::closeDatabase( name.c_str() , path ); bb.append( bb.numStr( n++ ) , name ); } } @@ -1923,6 +2054,17 @@ namespace mongo { return true; } - + + bool isValidNS( const StringData& ns ) { + // TODO: should check for invalid characters + + const char * x = strchr( ns.data() , '.' ); + if ( ! x ) + return false; + + x++; + return *x > 0; + } + } // namespace mongo diff --git a/db/pdfile.h b/db/pdfile.h index d268aac..91f4877 100644 --- a/db/pdfile.h +++ b/db/pdfile.h @@ -29,8 +29,9 @@ #include "../util/mmap.h" #include "diskloc.h" #include "jsobjmanipulator.h" -#include "namespace.h" +#include "namespace-inl.h" #include "client.h" +#include "mongommf.h" namespace mongo { @@ -45,53 +46,60 @@ namespace mongo { /* low level - only drops this ns */ void dropNS(const string& dropNs); - + /* deletes this ns, indexes and cursors */ - void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ); + void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ); bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0); shared_ptr findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc()); -// -1 if library unavailable. + // -1 if library unavailable. 
boost::intmax_t freeSpace( const string &path = dbpath ); + bool isValidNS( const StringData& ns ); + /*---------------------------------------------------------------------*/ class MongoDataFile { friend class DataFileMgr; friend class BasicCursor; public: - MongoDataFile(int fn) : fileNo(fn) { } + MongoDataFile(int fn) : _mb(0), fileNo(fn) { } void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false); - /* allocate a new extent from this datafile. + /* allocate a new extent from this datafile. @param capped - true if capped collection @param loops is our recursion check variable - you want to pass in zero */ Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0); - DataFileHeader *getHeader() { - return header; - } + DataFileHeader *getHeader() { return header(); } + + unsigned long long length() const { return mmf.length(); } /* return max size an extent may be */ static int maxSize(); - + + /** fsync */ void flush( bool sync ); - + + /** only use fore debugging */ + Extent* debug_getExtent(DiskLoc loc) { return _getExtent( loc ); } private: void badOfs(int) const; - + void badOfs2(int) const; int defaultSize( const char *filename ) const; - Extent* getExtent(DiskLoc loc); - Extent* _getExtent(DiskLoc loc); + Extent* getExtent(DiskLoc loc) const; + Extent* _getExtent(DiskLoc loc) const; Record* recordAt(DiskLoc dl); Record* makeRecord(DiskLoc dl, int size); - void grow(DiskLoc dl, int size); + void grow(DiskLoc dl, int size); - MMF mmf; - MMF::Pointer _p; - DataFileHeader *header; + char* p() const { return (char *) _mb; } + DataFileHeader* header() { return (DataFileHeader*) _mb; } + + MongoMMF mmf; + void *_mb; // the memory mapped view int fileNo; }; @@ -110,9 +118,9 @@ namespace mongo { NamespaceDetails *d, NamespaceDetailsTransient *nsdt, Record *toupdate, const DiskLoc& dl, - const char *buf, int len, OpDebug& debug, bool &changedId, bool god=false); + const char *buf, int len, OpDebug& debug, bool god=false); - // The object o may be updated if modified on insert. + // The object o may be updated if modified on insert. void insertAndLog( const char *ns, const BSONObj &o, bool god = false ); /** @param obj both and in and out param -- insert can sometimes modify an object (such as add _id). */ @@ -122,7 +130,6 @@ namespace mongo { void insertNoReturnVal(const char *ns, BSONObj o, bool god = false); DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, const BSONElement &writeId = BSONElement(), bool mayAddIndex = true); - void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false); static shared_ptr findAll(const char *ns, const DiskLoc &startLoc = DiskLoc()); /* special version of insert for transaction logging -- streamlined a bit. @@ -134,9 +141,10 @@ namespace mongo { static Extent* getExtent(const DiskLoc& dl); static Record* getRecord(const DiskLoc& dl); static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len); - static void grow(const DiskLoc& dl, int len); - /* does not clean up indexes, etc. : just deletes the record in the pdfile. */ + void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false); + + /* does not clean up indexes, etc. : just deletes the record in the pdfile. 
use deleteRecord() to unindex */ void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl); private: @@ -175,7 +183,10 @@ namespace mongo { int extentOfs; int nextOfs; int prevOfs; + + /** be careful when referencing this that your write intent was correct */ char data[4]; + int netLength() { return lengthWithHeaders - HeaderSize; } @@ -192,6 +203,12 @@ namespace mongo { /* get the next record in the namespace, traversing extents as necessary */ DiskLoc getNext(const DiskLoc& myLoc); DiskLoc getPrev(const DiskLoc& myLoc); + + struct NP { + int nextOfs; + int prevOfs; + }; + NP* np() { return (NP*) &nextOfs; } }; /* extents are datafile regions where all the records within the region @@ -206,13 +223,14 @@ namespace mongo { DiskLoc myLoc; DiskLoc xnext, xprev; /* next/prev extent for this namespace */ - /* which namespace this extent is for. this is just for troubleshooting really + /* which namespace this extent is for. this is just for troubleshooting really and won't even be correct if the collection were renamed! */ - Namespace nsDiagnostic; + Namespace nsDiagnostic; int length; /* size of the extent, including these fields */ - DiskLoc firstRecord, lastRecord; + DiskLoc firstRecord; + DiskLoc lastRecord; char _extentData[4]; static int HeaderSize() { return sizeof(Extent)-4; } @@ -224,7 +242,7 @@ namespace mongo { void dump(iostream& s) { s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n'; - s << " nsdiag:" << nsDiagnostic.buf << '\n'; + s << " nsdiag:" << nsDiagnostic.toString() << '\n'; s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n'; } @@ -237,9 +255,8 @@ namespace mongo { /* like init(), but for a reuse case */ DiskLoc reuse(const char *nsname); - void assertOk() { - assert(magic == 0x41424344); - } + bool isOk() const { return magic == 0x41424344; } + void assertOk() const { assert(isOk()); } Record* newRecord(int len); @@ -251,19 +268,38 @@ namespace mongo { return (Record *) (((char *) this) + x); } - Extent* getNextExtent() { - return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); - } - Extent* getPrevExtent() { - return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); - } - + Extent* getNextExtent() { return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); } + Extent* getPrevExtent() { return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); } + static int maxSize(); + static int minSize() { return 0x100; } + /** + * @param len lengt of record we need + * @param lastRecord size of last extent which is a factor in next extent size + */ + static int followupSize(int len, int lastExtentLen); + + /** + * @param len lengt of record we need + */ + static int initialSize(int len); + + struct FL { + DiskLoc firstRecord; + DiskLoc lastRecord; + }; + /** often we want to update just the firstRecord and lastRecord fields. + this helper is for that -- for use with getDur().writing() method + */ + FL* fl() { return (FL*) &firstRecord; } + private: + DiskLoc _reuse(const char *nsname); }; - /* + /* a datafile - i.e. the "dbname.<#>" files : + ---------------------- - Header + DataFileHeader ---------------------- Extent (for a particular namespace) Record @@ -273,7 +309,6 @@ namespace mongo { more Extents... 
---------------------- */ - class DataFileHeader { public: int version; @@ -287,35 +322,27 @@ namespace mongo { enum { HeaderSize = 8192 }; - bool currentVersion() const { - return ( version == VERSION ) && ( versionMinor == VERSION_MINOR ); - } - - bool uninitialized() const { - if ( version == 0 ) return true; - return false; - } + bool isCurrentVersion() const { return ( version == VERSION ) && ( versionMinor == VERSION_MINOR ); } - /*Record* __getRecord(DiskLoc dl) { - int ofs = dl.getOfs(); - assert( ofs >= HeaderSize ); - return (Record*) (((char *) this) + ofs); - }*/ + bool uninitialized() const { return version == 0; } - void init(int fileno, int filelength) { + void init(int fileno, int filelength, const char* filename) { if ( uninitialized() ) { - assert(filelength > 32768 ); + if( !(filelength > 32768 ) ) { + massert(13640, str::stream() << "DataFileHeader looks corrupt at file open filelength:" << filelength << " fileno:" << fileno, false); + } + getDur().createdFile(filename, filelength); assert( HeaderSize == 8192 ); - fileLength = filelength; - version = VERSION; - versionMinor = VERSION_MINOR; - unused.setOfs( fileno, HeaderSize ); + DataFileHeader *h = getDur().writing(this); + h->fileLength = filelength; + h->version = VERSION; + h->versionMinor = VERSION_MINOR; + h->unused.set( fileno, HeaderSize ); assert( (data-(char*)this) == HeaderSize ); - unusedLength = fileLength - HeaderSize - 16; - //memcpy(data+unusedLength, " \nthe end\n", 16); + h->unusedLength = fileLength - HeaderSize - 16; } } - + bool isEmpty() const { return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 ); } @@ -323,13 +350,13 @@ namespace mongo { #pragma pack() - inline Extent* MongoDataFile::_getExtent(DiskLoc loc) { + inline Extent* MongoDataFile::_getExtent(DiskLoc loc) const { loc.assertOk(); - Extent *e = (Extent *) _p.at(loc.getOfs(), Extent::HeaderSize()); + Extent *e = (Extent *) (p()+loc.getOfs()); return e; } - inline Extent* MongoDataFile::getExtent(DiskLoc loc) { + inline Extent* MongoDataFile::getExtent(DiskLoc loc) const { Extent *e = _getExtent(loc); e->assertOk(); return e; @@ -344,18 +371,13 @@ namespace mongo { inline Record* MongoDataFile::recordAt(DiskLoc dl) { int ofs = dl.getOfs(); if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path - return (Record*) _p.at(ofs, -1); + return (Record*) (p()+ofs); } - inline void MongoDataFile::grow(DiskLoc dl, int size) { - int ofs = dl.getOfs(); - _p.grow(ofs, size); - } - - inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) { + inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) { int ofs = dl.getOfs(); - assert( ofs >= DataFileHeader::HeaderSize ); - return (Record*) _p.at(ofs, size); + if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path + return (Record*) (p()+ofs); } inline DiskLoc Record::getNext(const DiskLoc& myLoc) { @@ -395,50 +417,23 @@ namespace mongo { return BSONObj(rec()); } inline DeletedRecord* DiskLoc::drec() const { - assert( fileNo != -1 ); + assert( _a != -1 ); return (DeletedRecord*) rec(); } inline Extent* DiskLoc::ext() const { return DataFileMgr::getExtent(*this); } - - /*---------------------------------------------------------------------*/ + inline const BtreeBucket* DiskLoc::btree() const { + assert( _a != -1 ); + return (const BtreeBucket *) rec()->data; + } } // namespace mongo -#include "rec.h" #include "database.h" namespace mongo { - // 
Heritable class to implement an operation that may be applied to all - // files in a database using _applyOpToDataFiles() - class FileOp { - public: - virtual ~FileOp() {} - // Return true if file exists and operation successful - virtual bool apply( const boost::filesystem::path &p ) = 0; - virtual const char * op() const = 0; - }; - - void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath ); - - inline void _deleteDataFiles(const char *database) { - if ( directoryperdb ) { - BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) ); - return; - } - class : public FileOp { - virtual bool apply( const boost::filesystem::path &p ) { - return boost::filesystem::remove( p ); - } - virtual const char * op() const { - return "remove"; - } - } deleter; - _applyOpToDataFiles( database, deleter, true ); - } - boost::intmax_t dbSize( const char *database ); inline NamespaceIndex* nsindex(const char *ns) { @@ -462,11 +457,6 @@ namespace mongo { return nsindex(ns)->details(ns); } - inline MongoDataFile& DiskLoc::pdf() const { - assert( fileNo != -1 ); - return *cc().database()->getFile(fileNo); - } - inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) { assert( dl.a() != -1 ); return cc().database()->getFile(dl.a())->getExtent(dl); @@ -477,30 +467,30 @@ namespace mongo { return cc().database()->getFile(dl.a())->recordAt(dl); } - BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) ); - - inline void DataFileMgr::grow(const DiskLoc& dl, int len) { - assert( dl.a() != -1 ); - cc().database()->getFile(dl.a())->grow(dl, len); - } + BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) ); - inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) { + inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) { assert( dl.a() != -1 ); return (DeletedRecord*) cc().database()->getFile(dl.a())->makeRecord(dl, sizeof(DeletedRecord)); } - + void ensureHaveIdIndex(const char *ns); - + bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex ); /** - * @return true if ns is ok + * @return true if ns is 'normal'. $ used for collections holding index data, which do not contain BSON objects in their records. + * special case for the local.oplog.$main ns -- naming it as such was a mistake. */ - inline bool nsDollarCheck( const char* ns ){ + inline bool isANormalNSName( const char* ns ) { if ( strchr( ns , '$' ) == 0 ) return true; - return strcmp( ns, "local.oplog.$main" ) == 0; } + + inline BSONObj::BSONObj(const Record *r) { + init(r->data, false); + } + } // namespace mongo diff --git a/db/projection.cpp b/db/projection.cpp new file mode 100644 index 0000000..3dcfef7 --- /dev/null +++ b/db/projection.cpp @@ -0,0 +1,301 @@ +// projection.cpp + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pch.h" +#include "projection.h" +#include "../util/mongoutils/str.h" + +namespace mongo { + + void Projection::init( const BSONObj& o ) { + massert( 10371 , "can only add to Projection once", _source.isEmpty()); + _source = o; + + BSONObjIterator i( o ); + int true_false = -1; + while ( i.more() ) { + BSONElement e = i.next(); + + if ( ! e.isNumber() ) + _hasNonSimple = true; + + if (e.type() == Object) { + BSONObj obj = e.embeddedObject(); + BSONElement e2 = obj.firstElement(); + if ( strcmp(e2.fieldName(), "$slice") == 0 ) { + if (e2.isNumber()) { + int i = e2.numberInt(); + if (i < 0) + add(e.fieldName(), i, -i); // limit is now positive + else + add(e.fieldName(), 0, i); + + } + else if (e2.type() == Array) { + BSONObj arr = e2.embeddedObject(); + uassert(13099, "$slice array wrong size", arr.nFields() == 2 ); + + BSONObjIterator it(arr); + int skip = it.next().numberInt(); + int limit = it.next().numberInt(); + uassert(13100, "$slice limit must be positive", limit > 0 ); + add(e.fieldName(), skip, limit); + + } + else { + uassert(13098, "$slice only supports numbers and [skip, limit] arrays", false); + } + } + else { + uassert(13097, string("Unsupported projection option: ") + obj.firstElement().fieldName(), false); + } + + } + else if (!strcmp(e.fieldName(), "_id") && !e.trueValue()) { + _includeID = false; + + } + else { + + add (e.fieldName(), e.trueValue()); + + // validate input + if (true_false == -1) { + true_false = e.trueValue(); + _include = !e.trueValue(); + } + else { + uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." , + (bool)true_false == e.trueValue() ); + } + } + } + } + + void Projection::add(const string& field, bool include) { + if (field.empty()) { // this is the field the user referred to + _include = include; + } + else { + _include = !include; + + const size_t dot = field.find('.'); + const string subfield = field.substr(0,dot); + const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos)); + + boost::shared_ptr& fm = _fields[subfield]; + if (!fm) + fm.reset(new Projection()); + + fm->add(rest, include); + } + } + + void Projection::add(const string& field, int skip, int limit) { + _special = true; // can't include or exclude whole object + + if (field.empty()) { // this is the field the user referred to + _skip = skip; + _limit = limit; + } + else { + const size_t dot = field.find('.'); + const string subfield = field.substr(0,dot); + const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos)); + + boost::shared_ptr& fm = _fields[subfield]; + if (!fm) + fm.reset(new Projection()); + + fm->add(rest, skip, limit); + } + } + + void Projection::transform( const BSONObj& in , BSONObjBuilder& b ) const { + BSONObjIterator i(in); + while ( i.more() ) { + BSONElement e = i.next(); + if ( mongoutils::str::equals( "_id" , e.fieldName() ) ) { + if ( _includeID ) + b.append( e ); + } + else { + append( b , e ); + } + } + } + + BSONObj Projection::transform( const BSONObj& in ) const { + BSONObjBuilder b; + transform( in , b ); + return b.obj(); + } + + + //b will be the value part of an array-typed BSONElement + void Projection::appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested) const { + int skip = nested ? 0 : _skip; + int limit = nested ? 
-1 : _limit; + + if (skip < 0) { + skip = max(0, skip + a.nFields()); + } + + int i=0; + BSONObjIterator it(a); + while (it.more()) { + BSONElement e = it.next(); + + if (skip) { + skip--; + continue; + } + + if (limit != -1 && (limit-- == 0)) { + break; + } + + switch(e.type()) { + case Array: { + BSONObjBuilder subb; + appendArray(subb , e.embeddedObject(), true); + b.appendArray(b.numStr(i++), subb.obj()); + break; + } + case Object: { + BSONObjBuilder subb; + BSONObjIterator jt(e.embeddedObject()); + while (jt.more()) { + append(subb , jt.next()); + } + b.append(b.numStr(i++), subb.obj()); + break; + } + default: + if (_include) + b.appendAs(e, b.numStr(i++)); + } + } + } + + void Projection::append( BSONObjBuilder& b , const BSONElement& e ) const { + FieldMap::const_iterator field = _fields.find( e.fieldName() ); + + if (field == _fields.end()) { + if (_include) + b.append(e); + } + else { + Projection& subfm = *field->second; + + if ((subfm._fields.empty() && !subfm._special) || !(e.type()==Object || e.type()==Array) ) { + if (subfm._include) + b.append(e); + } + else if (e.type() == Object) { + BSONObjBuilder subb; + BSONObjIterator it(e.embeddedObject()); + while (it.more()) { + subfm.append(subb, it.next()); + } + b.append(e.fieldName(), subb.obj()); + + } + else { //Array + BSONObjBuilder subb; + subfm.appendArray(subb, e.embeddedObject()); + b.appendArray(e.fieldName(), subb.obj()); + } + } + } + + Projection::KeyOnly* Projection::checkKey( const BSONObj& keyPattern ) const { + if ( _include ) { + // if we default to including then we can't + // use an index because we don't know what we're missing + return 0; + } + + if ( _hasNonSimple ) + return 0; + + if ( _includeID && keyPattern["_id"].eoo() ) + return 0; + + // at this point we know its all { x : 1 } style + + auto_ptr p( new KeyOnly() ); + + int got = 0; + BSONObjIterator i( keyPattern ); + while ( i.more() ) { + BSONElement k = i.next(); + + if ( _source[k.fieldName()].type() ) { + + if ( strchr( k.fieldName() , '.' ) ) { + // TODO we currently don't support dotted fields + // SERVER-2104 + return 0; + } + + if ( ! _includeID && mongoutils::str::equals( k.fieldName() , "_id" ) ) { + p->addNo(); + } + else { + p->addYes( k.fieldName() ); + got++; + } + } + else if ( mongoutils::str::equals( "_id" , k.fieldName() ) && _includeID ) { + p->addYes( "_id" ); + } + else { + p->addNo(); + } + + } + + int need = _source.nFields(); + if ( ! _includeID ) + need--; + + if ( got == need ) + return p.release(); + + return 0; + } + + BSONObj Projection::KeyOnly::hydrate( const BSONObj& key ) const { + assert( _include.size() == _names.size() ); + + BSONObjBuilder b( key.objsize() + _stringSize + 16 ); + + BSONObjIterator i(key); + unsigned n=0; + while ( i.more() ) { + assert( n < _include.size() ); + BSONElement e = i.next(); + if ( _include[n] ) { + b.appendAs( e , _names[n] ); + } + n++; + } + + return b.obj(); + } +} diff --git a/db/projection.h b/db/projection.h new file mode 100644 index 0000000..fd3b856 --- /dev/null +++ b/db/projection.h @@ -0,0 +1,127 @@ +// projection.h + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pch.h" +#include "jsobj.h" + +namespace mongo { + + /** + * given a document and a projection specification + * can transform the document + * currently supports specifying which fields and $slice + */ + class Projection { + public: + + class KeyOnly { + public: + + KeyOnly() : _stringSize(0) {} + + BSONObj hydrate( const BSONObj& key ) const; + + void addNo() { _add( false , "" ); } + void addYes( const string& name ) { _add( true , name ); } + + private: + + void _add( bool b , const string& name ) { + _include.push_back( b ); + _names.push_back( name ); + _stringSize += name.size(); + } + + vector _include; // one entry per field in key. true iff should be in output + vector _names; // name of field since key doesn't have names + + int _stringSize; + }; + + Projection() : + _include(true) , + _special(false) , + _includeID(true) , + _skip(0) , + _limit(-1) , + _hasNonSimple(false) { + } + + /** + * called once per lifetime + * e.g. { "x" : 1 , "a.y" : 1 } + */ + void init( const BSONObj& spec ); + + /** + * @return the spec init was called with + */ + BSONObj getSpec() const { return _source; } + + /** + * transforms in according to spec + */ + BSONObj transform( const BSONObj& in ) const; + + + /** + * transforms in according to spec + */ + void transform( const BSONObj& in , BSONObjBuilder& b ) const; + + + /** + * @return if the keyPattern has all the information needed to return then + * return a new KeyOnly otherwise null + * NOTE: a key may have modified the actual data + * which has to be handled above this (arrays, geo) + */ + KeyOnly* checkKey( const BSONObj& keyPattern ) const; + + private: + + /** + * appends e to b if user wants it + * will descend into e if needed + */ + void append( BSONObjBuilder& b , const BSONElement& e ) const; + + + void add( const string& field, bool include ); + void add( const string& field, int skip, int limit ); + void appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested=false) const; + + bool _include; // true if default at this level is to include + bool _special; // true if this level can't be skipped or included without recursing + + //TODO: benchmark vector vs map + typedef map > FieldMap; + FieldMap _fields; + BSONObj _source; + bool _includeID; + + // used for $slice operator + int _skip; + int _limit; + + bool _hasNonSimple; + }; + + +} diff --git a/db/query.cpp b/db/query.cpp index 154fd15..df09fce 100644 --- a/db/query.cpp +++ b/db/query.cpp @@ -30,7 +30,7 @@ #include "replpair.h" #include "scanandorder.h" #include "security.h" -#include "curop.h" +#include "curop-inl.h" #include "commands.h" #include "queryoptimizer.h" #include "lasterror.h" @@ -67,7 +67,7 @@ namespace mongo { _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , qp().ns() ) ); } return _cc->prepareToYield( _yieldData ); - } + } virtual void recoverFromYield() { if ( !ClientCursor::recoverFromYield( _yieldData ) ) { _cc.reset(); @@ -75,24 +75,28 @@ namespace mongo { massert( 13340, "cursor dropped during delete", false ); } } + virtual long long nscanned() { + assert( c_.get() ); + return 
c_->nscanned(); + } virtual void next() { if ( !c_->ok() ) { setComplete(); return; } - + DiskLoc rloc = c_->currLoc(); - + if ( matcher()->matches(c_->currKey(), rloc ) ) { if ( !c_->getsetdup(rloc) ) ++count_; } c_->advance(); - ++_nscanned; + _nscanned = c_->nscanned(); if ( count_ > bestCount_ ) bestCount_ = count_; - + if ( count_ > 0 ) { if ( justOne_ ) setComplete(); @@ -115,7 +119,7 @@ namespace mongo { ClientCursor::CleanupPointer _cc; ClientCursor::YieldData _yieldData; }; - + /* ns: namespace, e.g. . pattern: the "where" clause / criteria justOne: stop after 1 match @@ -124,13 +128,13 @@ namespace mongo { long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) { if( !god ) { if ( strstr(ns, ".system.") ) { - /* note a delete from system.indexes would corrupt the db - if done here, as there are pointers into those objects in + /* note a delete from system.indexes would corrupt the db + if done here, as there are pointers into those objects in NamespaceDetails. */ uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) ); } - if ( strchr( ns , '$' ) ){ + if ( strchr( ns , '$' ) ) { log() << "cannot delete from collection with reserved $ in name: " << ns << endl; uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 ); } @@ -145,55 +149,56 @@ namespace mongo { int best = 0; shared_ptr< MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) ); - shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, true ) ); - + shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, !god ) ); + if( !creal->ok() ) return nDeleted; - + shared_ptr< Cursor > cPtr = creal; auto_ptr cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) ); cc->setDoingDeletes( true ); - - CursorId id = cc->cursorid; - + + CursorId id = cc->cursorid(); + bool justOne = justOneOrig; bool canYield = !god && !creal->matcher()->docMatcher().atomic(); + do { - if ( canYield && ! cc->yieldSometimes() ){ + if ( canYield && ! cc->yieldSometimes() ) { cc.release(); // has already been deleted elsewhere // TODO should we assert or something? break; } - if ( !cc->c->ok() ) { + if ( !cc->ok() ) { break; // if we yielded, could have hit the end } - + // this way we can avoid calling updateLocation() every time (expensive) // as well as some other nuances handled cc->setDoingDeletes( true ); - - DiskLoc rloc = cc->c->currLoc(); - BSONObj key = cc->c->currKey(); - // NOTE Calling advance() may change the matcher, so it's important + DiskLoc rloc = cc->currLoc(); + BSONObj key = cc->currKey(); + + // NOTE Calling advance() may change the matcher, so it's important // to try to match first. bool match = creal->matcher()->matches( key , rloc ); - - if ( ! cc->c->advance() ) + + if ( ! cc->advance() ) justOne = true; - + if ( ! match ) continue; - - assert( !cc->c->getsetdup(rloc) ); // can't be a dup, we deleted it! - + + assert( !cc->c()->getsetdup(rloc) ); // can't be a dup, we deleted it! + if ( !justOne ) { /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore blocks. here we might call millions of times which would be bad. 
*/ - cc->c->noteLocation(); + cc->c()->noteLocation(); } - + if ( logop ) { BSONElement e; if( BSONObj( rloc.rec() ).getObjectID( e ) ) { @@ -201,7 +206,8 @@ namespace mongo { b.append( e ); bool replJustOne = true; logOp( "d", ns, b.done(), 0, &replJustOne ); - } else { + } + else { problem() << "deleted object without id, not logging" << endl; } } @@ -214,14 +220,20 @@ namespace mongo { if ( justOne ) { break; } - cc->c->checkLocation(); - - } while ( cc->c->ok() ); + cc->c()->checkLocation(); + + if( !god ) + getDur().commitIfNeeded(); - if ( cc.get() && ClientCursor::find( id , false ) == 0 ){ + if( debug && god && nDeleted == 100 ) + log() << "warning high number of deletes with god=true which could use significant memory" << endl; + } + while ( cc->ok() ); + + if ( cc.get() && ClientCursor::find( id , false ) == 0 ) { cc.release(); } - + return nDeleted; } @@ -246,16 +258,6 @@ namespace mongo { int nCaught = 0; - void killCursors(int n, long long *ids) { - int k = 0; - for ( int i = 0; i < n; i++ ) { - if ( ClientCursor::erase(ids[i]) ) - k++; - } - if ( logLevel > 0 || k != n ){ - log( k == n ) << "killcursors: found " << k << " of " << n << endl; - } - } BSONObj id_obj = fromjson("{\"_id\":1}"); BSONObj empty_obj = fromjson("{}"); @@ -278,21 +280,20 @@ namespace mongo { } QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) { -// log() << "TEMP GETMORE " << ns << ' ' << cursorid << ' ' << pass << endl; exhaust = false; ClientCursor::Pointer p(cursorid); - ClientCursor *cc = p._c; - + ClientCursor *cc = p.c(); + int bufSize = 512; - if ( cc ){ + if ( cc ) { bufSize += sizeof( QueryResult ); - bufSize += ( ntoreturn ? 4 : 1 ) * 1024 * 1024; + bufSize += MaxBytesToReturnToClientAtOnce; } BufBuilder b( bufSize ); b.skip(sizeof(QueryResult)); - + int resultFlags = ResultFlag_AwaitCapable; int start = 0; int n = 0; @@ -306,23 +307,27 @@ namespace mongo { if ( pass == 0 ) cc->updateSlaveLocation( curop ); - int queryOptions = cc->_queryOptions; + int queryOptions = cc->queryOptions(); if( pass == 0 ) { StringBuilder& ss = curop.debug().str; - ss << " getMore: " << cc->query.toString() << " "; + ss << " getMore: " << cc->query().toString() << " "; } - - start = cc->pos; - Cursor *c = cc->c.get(); + + start = cc->pos(); + Cursor *c = cc->c(); c->checkLocation(); DiskLoc last; + scoped_ptr keyFieldsOnly; + if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields ) + keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) ); + while ( 1 ) { if ( !c->ok() ) { if ( c->tailable() ) { - /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however - advance() can still be retries as a reactivation attempt. when there is new data, it will + /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however + advance() can still be retries as a reactivation attempt. when there is new data, it will return true. that's what we are doing here. */ if ( c->advance() ) @@ -356,27 +361,40 @@ namespace mongo { } else { last = c->currLoc(); - BSONObj js = c->current(); - - // show disk loc should be part of the main query, not in an $or clause, so this should be ok - fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? 
&last : 0)); n++; - if ( (ntoreturn>0 && (n >= ntoreturn || b.len() > MaxBytesToReturnToClientAtOnce)) || - (ntoreturn==0 && b.len()>1*1024*1024) ) { + + if ( keyFieldsOnly ) { + fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) ); + } + else { + BSONObj js = c->current(); + // show disk loc should be part of the main query, not in an $or clause, so this should be ok + fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0)); + } + + if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) { c->advance(); - cc->pos += n; + cc->incPos( n ); break; } } } c->advance(); + + if ( ! cc->yieldSometimes() ) { + ClientCursor::erase(cursorid); + cursorid = 0; + cc = 0; + p.deleted(); + break; + } } if ( cc ) { cc->updateLocation(); cc->mayUpgradeStorage(); cc->storeOpForSlave( last ); - exhaust = cc->_queryOptions & QueryOption_Exhaust; + exhaust = cc->queryOptions() & QueryOption_Exhaust; } } @@ -395,104 +413,120 @@ namespace mongo { class CountOp : public QueryOp { public: CountOp( const string& ns , const BSONObj &spec ) : - _ns(ns), count_(), - skip_( spec["skip"].numberLong() ), - limit_( spec["limit"].numberLong() ), - bc_(){ + _ns(ns), _capped(false), _count(), _myCount(), + _skip( spec["skip"].numberLong() ), + _limit( spec["limit"].numberLong() ), + _bc() { } - + virtual void _init() { - c_ = qp().newCursor(); - + _c = qp().newCursor(); + _capped = _c->capped(); if ( qp().exactKeyMatch() && ! matcher()->needRecord() ) { - query_ = qp().simplifiedQuery( qp().indexKey() ); - bc_ = dynamic_cast< BtreeCursor* >( c_.get() ); - bc_->forgetEndKey(); + _query = qp().simplifiedQuery( qp().indexKey() ); + _bc = dynamic_cast< BtreeCursor* >( _c.get() ); + _bc->forgetEndKey(); } } + virtual long long nscanned() { + assert( _c.get() ); + return _c->nscanned(); + } + virtual bool prepareToYield() { if ( ! _cc ) { - _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , _ns.c_str() ) ); + _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _ns.c_str() ) ); } return _cc->prepareToYield( _yieldData ); } - + virtual void recoverFromYield() { if ( !ClientCursor::recoverFromYield( _yieldData ) ) { - c_.reset(); + _c.reset(); _cc.reset(); - massert( 13337, "cursor dropped during count", false ); - // TODO maybe we want to prevent recording the winning plan as well? + + if ( _capped ) { + msgassertedNoTrace( 13337, str::stream() << "capped cursor overrun during count: " << _ns ); + } + else { + // we don't fail query since we're fine with returning partial data if collection dropped + } } } - + virtual void next() { - if ( !c_->ok() ) { + if ( ! _c || !_c->ok() ) { setComplete(); return; } - if ( bc_ ) { - if ( firstMatch_.isEmpty() ) { - firstMatch_ = bc_->currKeyNode().key; + if ( _bc ) { + if ( _firstMatch.isEmpty() ) { + _firstMatch = _bc->currKeyNode().key.copy(); // if not match - if ( query_.woCompare( firstMatch_, BSONObj(), false ) ) { + if ( _query.woCompare( _firstMatch, BSONObj(), false ) ) { setComplete(); return; } _gotOne(); - } else { - if ( !firstMatch_.woEqual( bc_->currKeyNode().key ) ) { + } + else { + if ( ! 
_firstMatch.woEqual( _bc->currKeyNode().key ) ) { setComplete(); return; } _gotOne(); } - } + } else { - if ( !matcher()->matches(c_->currKey(), c_->currLoc() ) ) { + if ( !matcher()->matches(_c->currKey(), _c->currLoc() ) ) { } - else if( !c_->getsetdup(c_->currLoc()) ) { + else if( !_c->getsetdup(_c->currLoc()) ) { _gotOne(); - } + } } - c_->advance(); + _c->advance(); } virtual QueryOp *_createChild() const { CountOp *ret = new CountOp( _ns , BSONObj() ); - ret->count_ = count_; - ret->skip_ = skip_; - ret->limit_ = limit_; + ret->_count = _count; + ret->_skip = _skip; + ret->_limit = _limit; return ret; } - long long count() const { return count_; } - virtual bool mayRecordPlan() const { return true; } + long long count() const { return _count; } + virtual bool mayRecordPlan() const { + return ( _myCount > _limit / 2 ) || ( complete() && !stopRequested() ); + } private: - - void _gotOne(){ - if ( skip_ ){ - skip_--; + + void _gotOne() { + if ( _skip ) { + _skip--; return; } - - if ( limit_ > 0 && count_ >= limit_ ){ + + if ( _limit > 0 && _count >= _limit ) { setStop(); return; } - count_++; + _count++; + _myCount++; } string _ns; - - long long count_; - long long skip_; - long long limit_; - shared_ptr c_; - BSONObj query_; - BtreeCursor *bc_; - BSONObj firstMatch_; + bool _capped; + + long long _count; + long long _myCount; + long long _skip; + long long _limit; + shared_ptr _c; + BSONObj _query; + BtreeCursor * _bc; + BSONObj _firstMatch; ClientCursor::CleanupPointer _cc; ClientCursor::YieldData _yieldData; @@ -500,7 +534,7 @@ namespace mongo { /* { count: "collectionname"[, query: ] } returns -1 on ns does not exist error. - */ + */ long long runCount( const char *ns, const BSONObj &cmd, string &err ) { Client::Context cx(ns); NamespaceDetails *d = nsdetails( ns ); @@ -509,10 +543,10 @@ namespace mongo { return -1; } BSONObj query = cmd.getObjectField("query"); - + // count of all objects - if ( query.isEmpty() ){ - return applySkipLimit( d->nrecords , cmd ); + if ( query.isEmpty() ) { + return applySkipLimit( d->stats.nrecords , cmd ); } MultiPlanScanner mps( ns, query, BSONObj(), 0, true, BSONObj(), BSONObj(), false, true ); CountOp original( ns , cmd ); @@ -525,8 +559,11 @@ namespace mongo { } return res->count(); } - + class ExplainBuilder { + // Note: by default we filter out allPlans and oldPlan in the shell's + // explain() function. If you add any recursive structures, make sure to + // edit the JS to make sure everything gets filtered. 
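The count path above consumes the skip amount before anything is counted and stops once a positive limit is reached; a minimal free-standing sketch of that accounting (helper name is hypothetical, not the server's own applySkipLimit):

    // Sketch only: mirrors the skip/limit handling in CountOp::_gotOne and the
    // empty-query fast path; a limit of 0 means "no limit".
    long long countWithSkipLimit( long long totalMatches, long long skip, long long limit ) {
        long long n = totalMatches - skip;   // skip is charged first
        if ( n < 0 )
            n = 0;
        if ( limit > 0 && n > limit )        // a positive limit caps the result
            n = limit;
        return n;
    }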
public: ExplainBuilder() : _i() {} void ensureStartScan() { @@ -539,14 +576,16 @@ namespace mongo { b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds(); b.done(); } - void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder, int millis, bool hint ) { + void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder, + int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) { if ( _i == 1 ) { _c.reset( new BSONArrayBuilder() ); *_c << _b->obj(); } if ( _i == 0 ) { _b.reset( new BSONObjBuilder() ); - } else { + } + else { _b.reset( new BSONObjBuilder( _c->subobjStart() ) ); } *_b << "cursor" << c->toString(); @@ -559,6 +598,11 @@ namespace mongo { *_b << "millis" << millis; + *_b << "nYields" << nYields; + *_b << "nChunkSkips" << nChunkSkips; + *_b << "isMultiKey" << c->isMultiKey(); + *_b << "indexOnly" << indexOnly; + *_b << "indexBounds" << c->prettyIndexBounds(); if ( !hint ) { @@ -570,19 +614,20 @@ namespace mongo { _a.reset( 0 ); ++_i; } - BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) { + BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) { if ( _i > 1 ) { BSONObjBuilder b; b << "clauses" << _c->arr(); b.appendNumber( "nscanned", nscanned ); - b.appendNumber( "nscanneObjects", nscannedObjects ); + b.appendNumber( "nscannedObjects", nscannedObjects ); b << "n" << n; b << "millis" << millis; b.appendElements( suffix ); return b.obj(); - } else { + } + else { _b->appendElements( suffix ); - return _b->obj(); + return _b->obj(); } } private: @@ -591,11 +636,11 @@ namespace mongo { auto_ptr< BSONArrayBuilder > _c; int _i; }; - + // Implements database 'query' requests using the query optimizer's QueryOp interface class UserQueryOp : public QueryOp { public: - + UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) : _buf( 32768 ) , // TODO be smarter here _pq( pq ) , @@ -603,8 +648,12 @@ namespace mongo { _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0), _n(0), _oldN(0), - _chunkMatcher(shardingState.getChunkMatcher(pq.ns())), + _nYields(), + _nChunkSkips(), + _chunkManager( shardingState.needShardChunkManager(pq.ns()) ? + shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ), _inMemSort(false), + _capped(false), _saveClientCursor(false), _wouldSaveClientCursor(false), _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ), @@ -612,82 +661,111 @@ namespace mongo { _eb( eb ), _curop( curop ) {} - + virtual void _init() { // only need to put the QueryResult fields there if we're building the first buffer in the message. 
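For each plan, the builder above emits an explain document roughly of this shape (values are purely illustrative; nYields, nChunkSkips, isMultiKey and indexOnly are the newly reported fields):

    {
        "cursor" : "BtreeCursor a_1",
        "nscanned" : 120,
        "nscannedObjects" : 100,
        "n" : 50,
        "millis" : 3,
        "nYields" : 0,
        "nChunkSkips" : 0,
        "isMultiKey" : false,
        "indexOnly" : false,
        "indexBounds" : { "a" : [ [ 1, 10 ] ] }
    }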
if ( _response.empty() ) { _buf.skip( sizeof( QueryResult ) ); } - + if ( _oplogReplay ) { _findingStartCursor.reset( new FindingStartCursor( qp() ) ); - } else { + _capped = true; + } + else { _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() ); + _capped = _c->capped(); + + // setup check for if we can only use index to extract + if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) { + _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) ); + } } if ( qp().scanAndOrderRequired() ) { _inMemSort = true; _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder() ) ); } - + if ( _pq.isExplain() ) { _eb.noteCursor( _c.get() ); } + } - + virtual bool prepareToYield() { if ( _findingStartCursor.get() ) { return _findingStartCursor->prepareToYield(); - } else { + } + else { if ( ! _cc ) { _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) ); } return _cc->prepareToYield( _yieldData ); } } - + virtual void recoverFromYield() { + _nYields++; + if ( _findingStartCursor.get() ) { _findingStartCursor->recoverFromYield(); - } else { - if ( !ClientCursor::recoverFromYield( _yieldData ) ) { - _c.reset(); - _cc.reset(); - _so.reset(); - massert( 13338, "cursor dropped during query", false ); - // TODO maybe we want to prevent recording the winning plan as well? - } + } + else if ( ! ClientCursor::recoverFromYield( _yieldData ) ) { + _c.reset(); + _cc.reset(); + _so.reset(); + + if ( _capped ) { + msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() ); + } + else { + // we don't fail query since we're fine with returning partial data if collection dropped + + // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran + } + } } - + + virtual long long nscanned() { + if ( _findingStartCursor.get() ) { + return 0; // should only be one query plan, so value doesn't really matter. + } + assert( _c.get() ); + return _c->nscanned(); + } + virtual void next() { if ( _findingStartCursor.get() ) { if ( _findingStartCursor->done() ) { _c = _findingStartCursor->cRelease(); _findingStartCursor.reset( 0 ); - } else { + } + else { _findingStartCursor->next(); } + _capped = true; return; } - - if ( !_c->ok() ) { + + if ( !_c || !_c->ok() ) { finish( false ); return; } bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors; - - if( 0 ) { + + if( 0 ) { cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl; } - - if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ){ + + if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) { finish( true ); //? return; } - _nscanned++; + _nscanned = _c->nscanned(); if ( !matcher()->matches(_c->currKey(), _c->currLoc() , &_details ) ) { // not a match, continue onward if ( _details.loadedObject ) @@ -696,22 +774,23 @@ namespace mongo { else { _nscannedObjects++; DiskLoc cl = _c->currLoc(); - if ( _chunkMatcher && ! _chunkMatcher->belongsToMe( _c->currKey(), _c->currLoc() ) ){ - // cout << "TEMP skipping un-owned chunk: " << _c->current() << endl; + if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) { + _nChunkSkips++; + // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl; } - else if( _c->getsetdup(cl) ) { + else if( _c->getsetdup(cl) ) { // dup } else { // got a match. - + if ( _inMemSort ) { // note: no cursors for non-indexed, ordered results. results must be fairly small. 
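When no index supplies the requested order, matching documents are buffered and sorted in memory as noted above, which is why such result sets must stay small; a rough self-contained sketch of that bounded buffer, simplified to integer sort keys and a positive limit (names hypothetical; ScanAndOrder itself works on BSON keys and documents):

    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    // Keep at most skip + limit candidates ordered by sort key, then emit the
    // window that starts after 'skip'.
    std::vector<std::string> sortInMemory( const std::vector< std::pair<int, std::string> > &matches,
                                           size_t skip, size_t limit ) {
        std::multimap<int, std::string> best;
        for ( size_t i = 0; i < matches.size(); ++i ) {
            best.insert( std::make_pair( matches[i].first, matches[i].second ) );
            if ( best.size() > skip + limit )
                best.erase( --best.end() );          // drop the current worst candidate
        }
        std::vector<std::string> out;
        size_t n = 0;
        for ( std::multimap<int, std::string>::iterator it = best.begin(); it != best.end(); ++it, ++n ) {
            if ( n >= skip )
                out.push_back( it->second );
        }
        return out;
    }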
_so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 ); } else if ( _ntoskip > 0 ) { _ntoskip--; - } + } else { if ( _pq.isExplain() ) { _n++; @@ -723,16 +802,19 @@ namespace mongo { } else { - if ( _pq.returnKey() ){ + if ( _pq.returnKey() ) { BSONObjBuilder bb( _buf ); bb.appendKeys( _c->indexKeyPattern() , _c->currKey() ); bb.done(); } + else if ( _keyFieldsOnly ) { + fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) ); + } else { BSONObj js = _c->current(); assert( js.isValid() ); - if ( _oplogReplay ){ + if ( _oplogReplay ) { BSONElement e = js["ts"]; if ( e.type() == Date || e.type() == Timestamp ) _slaveReadTill = e._opTime(); @@ -741,13 +823,13 @@ namespace mongo { fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0)); } _n++; - if ( ! _c->supportGetMore() ){ - if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ){ + if ( ! _c->supportGetMore() ) { + if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) { finish( true ); return; } } - else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ){ + else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) { /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */ if ( mayCreateCursor1 ) { _wouldSaveClientCursor = true; @@ -763,60 +845,73 @@ namespace mongo { } } } - _c->advance(); + _c->advance(); } // this plan won, so set data for response broadly void finish( bool stop ) { + if ( _pq.isExplain() ) { _n = _inMemSort ? _so->size() : _n; - } + } else if ( _inMemSort ) { if( _so.get() ) _so->fill( _buf, _pq.getFields() , _n ); } - - if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 ) - _c->setTailable(); - - // If the tailing request succeeded. - if ( _c->tailable() ) - _saveClientCursor = true; - - if ( _pq.isExplain()) { - _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(), _curop.elapsedMillis(), useHints && !_pq.getHint().eoo() ); - } else { - if (_buf.len()) { + + if ( _c.get() ) { + _nscanned = _c->nscanned(); + + if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 ) + _c->setTailable(); + + // If the tailing request succeeded. 
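The batch cutoffs applied above depend on whether the client supplied an explicit ntoreturn; a minimal sketch of the first-batch rule (threshold values are illustrative, not the server's constants):

    // With ntoreturn == 0 the first reply stops after about a hundred documents or
    // roughly a megabyte of data, whichever comes first; with ntoreturn set, the
    // requested count applies unless the size cap is hit.  Later getMore batches
    // are bounded by size only.
    bool firstBatchIsFull( int ntoreturn, int numDocs, int bufLen ) {
        if ( ntoreturn == 0 )
            return numDocs >= 101 || bufLen > 1024 * 1024;
        return numDocs >= ntoreturn || bufLen > 4 * 1024 * 1024;
    }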
+ if ( _c->tailable() ) + _saveClientCursor = true; + } + + if ( _pq.isExplain() ) { + massert( 13638, "client cursor dropped during explain query yield", _c.get() ); + _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(), + _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields , + _nChunkSkips, _keyFieldsOnly.get() > 0 ); + } + else { + if ( _buf.len() ) { _response.appendData( _buf.buf(), _buf.len() ); _buf.decouple(); } } + if ( stop ) { setStop(); - } else { + } + else { setComplete(); } } - + void finishExplain( const BSONObj &suffix ) { - BSONObj obj = _eb.finishWithSuffix( nscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix); + BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix); fillQueryResultFromObj(_buf, 0, obj); _n = 1; _oldN = 0; _response.appendData( _buf.buf(), _buf.len() ); _buf.decouple(); } - - virtual bool mayRecordPlan() const { return _pq.getNumToReturn() != 1; } - + + virtual bool mayRecordPlan() const { + return ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) ); + } + virtual QueryOp *_createChild() const { if ( _pq.isExplain() ) { _eb.ensureStartScan(); } UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop ); ret->_oldN = n(); - ret->_oldNscanned = nscanned(); + ret->_oldNscanned = totalNscanned(); ret->_oldNscannedObjects = nscannedObjects(); ret->_ntoskip = _ntoskip; return ret; @@ -825,19 +920,20 @@ namespace mongo { bool scanAndOrderRequired() const { return _inMemSort; } shared_ptr cursor() { return _c; } int n() const { return _oldN + _n; } - long long nscanned() const { return _nscanned + _oldNscanned; } + long long totalNscanned() const { return _nscanned + _oldNscanned; } long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; } bool saveClientCursor() const { return _saveClientCursor; } bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; } - - void finishForOplogReplay( ClientCursor * cc ){ + + void finishForOplogReplay( ClientCursor * cc ) { if ( _oplogReplay && ! _slaveReadTill.isNull() ) - cc->_slaveReadTill = _slaveReadTill; + cc->slaveReadTill( _slaveReadTill ); } private: BufBuilder _buf; const ParsedQuery& _pq; + scoped_ptr _keyFieldsOnly; long long _ntoskip; long long _nscanned; @@ -846,30 +942,36 @@ namespace mongo { long long _oldNscannedObjects; int _n; // found so far int _oldN; - + + int _nYields; + int _nChunkSkips; + MatchDetails _details; - ChunkMatcherPtr _chunkMatcher; - + ShardChunkManagerPtr _chunkManager; + bool _inMemSort; auto_ptr< ScanAndOrder > _so; - + shared_ptr _c; ClientCursor::CleanupPointer _cc; ClientCursor::YieldData _yieldData; + bool _capped; bool _saveClientCursor; bool _wouldSaveClientCursor; bool _oplogReplay; auto_ptr< FindingStartCursor > _findingStartCursor; - + Message &_response; ExplainBuilder &_eb; CurOp &_curop; OpTime _slaveReadTill; }; - - /* run a query -- includes checking for and running a Command */ + + /* run a query -- includes checking for and running a Command \ + @return points to ns if exhaust mode. 
0=normal mode + */ const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { StringBuilder& ss = curop.debug().str; shared_ptr pq_shared( new ParsedQuery(q) ); @@ -878,25 +980,26 @@ namespace mongo { BSONObj jsobj = q.query; int queryOptions = q.queryOptions; const char *ns = q.ns; - + if( logLevel >= 2 ) log() << "query: " << ns << jsobj << endl; - + ss << ns; { - // only say ntoreturn if nonzero. + // only say ntoreturn if nonzero. int n = pq.getNumToReturn(); - if( n ) + if( n ) ss << " ntoreturn:" << n; } curop.setQuery(jsobj); - + if ( pq.couldBeCommand() ) { BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { - ss << " command: " << jsobj.toString(); + ss << " command: "; + jsobj.toString( ss ); curop.markCommand(); auto_ptr< QueryResult > qr; qr.reset( (QueryResult *) bb.buf() ); @@ -910,9 +1013,12 @@ namespace mongo { qr->nReturned = 1; result.setData( qr.release(), true ); } - return false; + else { + uasserted(13530, "bad or malformed command request?"); + } + return 0; } - + /* --- regular query --- */ int n = 0; @@ -932,7 +1038,7 @@ namespace mongo { out() << query.toString() << endl; uassert( 10110 , "bad query object", false); } - + /* --- read lock --- */ mongolock lk(false); @@ -947,17 +1053,18 @@ namespace mongo { const BSONObj nat1 = BSON( "$natural" << 1 ); if ( order.isEmpty() ) { order = nat1; - } else { + } + else { uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 ); } } - + BSONObj snapshotHint; // put here to keep the data in scope - if( snapshot ) { + if( snapshot ) { NamespaceDetails *d = nsdetails(ns); - if ( d ){ + if ( d ) { int i = d->findIdIndex(); - if( i < 0 ) { + if( i < 0 ) { if ( strstr( ns , ".system." ) == 0 ) log() << "warning: no _id index on $snapshot query, ns:" << ns << endl; } @@ -973,7 +1080,7 @@ namespace mongo { } } } - + if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) { bool nsFound = false; bool indexFound = false; @@ -981,12 +1088,12 @@ namespace mongo { BSONObj resObject; Client& c = cc(); bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound ); - if ( nsFound == false || indexFound == true ){ + if ( nsFound == false || indexFound == true ) { BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32); bb.skip(sizeof(QueryResult)); - + ss << " idhack "; - if ( found ){ + if ( found ) { n = 1; fillQueryResultFromObj( bb , pq.getFields() , resObject ); } @@ -999,16 +1106,16 @@ namespace mongo { qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; - qr->nReturned = n; + qr->nReturned = n; result.setData( qr.release(), true ); return false; - } + } } - + // regular, not QO bypass query - + BSONObj oldPlan; - if ( explain && ! pq.hasIndexSpecifier() ){ + if ( explain && ! 
pq.hasIndexSpecifier() ) { MultiPlanScanner mps( ns, query, order ); if ( mps.usingPrerecordedPlan() ) oldPlan = mps.oldExplain(); @@ -1031,7 +1138,7 @@ namespace mongo { dqo.finishExplain( explainSuffix ); } n = dqo.n(); - long long nscanned = dqo.nscanned(); + long long nscanned = dqo.totalNscanned(); if ( dqo.scanAndOrderRequired() ) ss << " scanAndOrder "; shared_ptr cursor = dqo.cursor(); @@ -1046,18 +1153,19 @@ namespace mongo { // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher(), dqo ) ); cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned()); - } else { + } + else { cursor->setMatcher( dqo.matcher() ); cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() ); } - cursorid = cc->cursorid; + cursorid = cc->cursorid(); DEV tlog(2) << "query has more, cursorid: " << cursorid << endl; - cc->pos = n; + cc->setPos( n ); cc->pq = pq_shared; cc->fields = pq.getFieldPtr(); cc->originalMessage = m; cc->updateLocation(); - if ( !cc->c->ok() && cc->c->tailable() ) + if ( !cc->ok() && cc->c()->tailable() ) DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl; if( queryOptions & QueryOption_Exhaust ) { exhaust = ns; @@ -1087,6 +1195,6 @@ namespace mongo { } ss << " nreturned:" << n; return exhaust; - } - + } + } // namespace mongo diff --git a/db/query.h b/db/query.h index cc88e5c..5de7ced 100644 --- a/db/query.h +++ b/db/query.h @@ -23,6 +23,7 @@ #include "dbmessage.h" #include "jsobj.h" #include "diskloc.h" +#include "projection.h" /* db request message format @@ -37,29 +38,29 @@ a series of JSObjects dbDelete: string collection; - int flags=0; // 1=DeleteSingle + int flags=0; // 1=DeleteSingle JSObject query; dbUpdate: string collection; - int flags; // 1=upsert + int flags; // 1=upsert JSObject query; - JSObject objectToUpdate; + JSObject objectToUpdate; objectToUpdate may include { $inc: } or { $set: ... }, see struct Mod. dbQuery: string collection; - int nToSkip; - int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit) + int nToSkip; + int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit) // greater than zero is simply a hint on how many objects to send back per "cursor batch". // a negative number indicates a hard limit. JSObject query; - [JSObject fieldsToReturn] + [JSObject fieldsToReturn] dbGetMore: - string collection; // redundant, might use for security. + string collection; // redundant, might use for security. int nToReturn; int64 cursorID; dbKillCursors=2007: int n; - int64 cursorIDs[n]; + int64 cursorIDs[n]; Note that on Update, there is only one object, which is different from insert where you can pass a list of objects to insert in the db. @@ -77,7 +78,7 @@ namespace mongo { struct GetMoreWaitException { }; QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op, int pass, bool& exhaust); - + struct UpdateResult { bool existing; // if existing objects were modified bool mod; // was this a $ mod @@ -85,25 +86,25 @@ namespace mongo { OID upserted; // if something was upserted, the new _id of the object UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() ) - : existing(e) , mod(m), num(n){ + : existing(e) , mod(m), num(n) { upserted.clear(); BSONElement id = upsertedObject["_id"]; - if ( ! e && n == 1 && id.type() == jstOID ){ + if ( ! 
e && n == 1 && id.type() == jstOID ) { upserted = id.OID(); } } - + }; class RemoveSaver; - + /* returns true if an existing object was updated, false if no existing object was found. multi - update multiple objects - mostly useful with things like $set god - allow access to system namespaces */ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug ); - UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern, + UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 ); // If justOne is true, deletedId is set to the id of the deleted object. @@ -112,7 +113,7 @@ namespace mongo { long long runCount(const char *ns, const BSONObj& cmd, string& err); const char * runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result); - + /* This is for languages whose "objects" are not well ordered (JSON is well ordered). [ { a : ... } , { b : ... } ] -> { a : ..., b : ... } */ @@ -144,24 +145,24 @@ namespace mongo { class ParsedQuery { public: ParsedQuery( QueryMessage& qm ) - : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ){ + : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ) { init( qm.query ); initFields( qm.fields ); } ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields ) - : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ){ + : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ) { init( query ); initFields( fields ); } - - ~ParsedQuery(){} + + ~ParsedQuery() {} const char * ns() const { return _ns; } bool isLocalDB() const { return strncmp(_ns, "local.", 6) == 0; } const BSONObj& getFilter() const { return _filter; } - FieldMatcher* getFields() const { return _fields.get(); } - shared_ptr getFieldPtr() const { return _fields; } + Projection* getFields() const { return _fields.get(); } + shared_ptr getFieldPtr() const { return _fields; } int getSkip() const { return _ntoskip; } int getNumToReturn() const { return _ntoreturn; } @@ -169,7 +170,7 @@ namespace mongo { int getOptions() const { return _options; } bool hasOption( int x ) const { return x & _options; } - + bool isExplain() const { return _explain; } bool isSnapshot() const { return _snapshot; } bool returnKey() const { return _returnKey; } @@ -180,7 +181,7 @@ namespace mongo { const BSONObj& getOrder() const { return _order; } const BSONElement& getHint() const { return _hint; } int getMaxScan() const { return _maxScan; } - + bool couldBeCommand() const { /* we assume you are using findOne() for running a cmd... */ return _ntoreturn == 1 && strstr( _ns , ".$cmd" ); @@ -193,7 +194,7 @@ namespace mongo { /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there is only a size limit. The idea is that on a find() where one doesn't use much results, we don't return much, but once getmore kicks in, we start pushing significant quantities. - + The n limit (vs. size) is important when someone fetches only one small field from big objects, which causes massive scanning server-side. 
*/ @@ -208,14 +209,14 @@ namespace mongo { return false; return n >= _ntoreturn; } - + private: - void init( const BSONObj& q ){ + void init( const BSONObj& q ) { _reset(); uassert( 10105 , "bad skip value in query", _ntoskip >= 0); - - if ( _ntoreturn < 0 ){ - /* _ntoreturn greater than zero is simply a hint on how many objects to send back per + + if ( _ntoreturn < 0 ) { + /* _ntoreturn greater than zero is simply a hint on how many objects to send back per "cursor batch". A negative number indicates a hard limit. */ @@ -223,12 +224,12 @@ namespace mongo { _ntoreturn = -_ntoreturn; } - + BSONElement e = q["query"]; if ( ! e.isABSONObj() ) e = q["$query"]; - - if ( e.isABSONObj() ){ + + if ( e.isABSONObj() ) { _filter = e.embeddedObject(); _initTop( q ); } @@ -237,7 +238,7 @@ namespace mongo { } } - void _reset(){ + void _reset() { _wantMore = true; _explain = false; _snapshot = false; @@ -246,20 +247,23 @@ namespace mongo { _maxScan = 0; } - void _initTop( const BSONObj& top ){ + void _initTop( const BSONObj& top ) { BSONObjIterator i( top ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); const char * name = e.fieldName(); if ( strcmp( "$orderby" , name ) == 0 || - strcmp( "orderby" , name ) == 0 ){ - if ( e.type() == Object ) + strcmp( "orderby" , name ) == 0 ) { + if ( e.type() == Object ) { _order = e.embeddedObject(); - else if ( e.type() == Array ) + } + else if ( e.type() == Array ) { _order = transformOrderFromArrayFormat( _order ); - else - assert( 0 ); + } + else { + uassert(13513, "sort must be an object or array", 0); + } } else if ( strcmp( "$explain" , name ) == 0 ) _explain = e.trueValue(); @@ -277,25 +281,25 @@ namespace mongo { _maxScan = e.numberInt(); else if ( strcmp( "$showDiskLoc" , name ) == 0 ) _showDiskLoc = e.trueValue(); - + } - if ( _snapshot ){ + if ( _snapshot ) { uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() ); uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() ); } - + } - void initFields( const BSONObj& fields ){ + void initFields( const BSONObj& fields ) { if ( fields.isEmpty() ) return; - _fields.reset( new FieldMatcher() ); - _fields->add( fields ); + _fields.reset( new Projection() ); + _fields->init( fields ); } - ParsedQuery( const ParsedQuery& other ){ + ParsedQuery( const ParsedQuery& other ) { assert(0); } @@ -303,10 +307,10 @@ namespace mongo { int _ntoskip; int _ntoreturn; int _options; - + BSONObj _filter; - shared_ptr< FieldMatcher > _fields; - + shared_ptr< Projection > _fields; + bool _wantMore; bool _explain; @@ -319,7 +323,7 @@ namespace mongo { BSONObj _order; int _maxScan; }; - + } // namespace mongo diff --git a/db/queryoptimizer.cpp b/db/queryoptimizer.cpp index e7068c2..0b9dce7 100644 --- a/db/queryoptimizer.cpp +++ b/db/queryoptimizer.cpp @@ -24,24 +24,25 @@ #include "queryoptimizer.h" #include "cmdline.h" #include "clientcursor.h" +#include //#define DEBUGQO(x) cout << x << endl; #define DEBUGQO(x) namespace mongo { - void checkTableScanAllowed( const char * ns ){ - if ( ! cmdLine.notablescan ) + void checkTableScanAllowed( const char * ns ) { + if ( ! cmdLine.noTableScan ) return; - + if ( strstr( ns , ".system." ) || - strstr( ns , "local." ) ) + strstr( ns , "local." ) ) return; - + if ( ! nsdetails( ns ) ) return; - uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.notablescan ); + uassert( 10111 , (string)"table scans not allowed:" + ns , ! 
cmdLine.noTableScan ); } double elementDirection( const BSONElement &e ) { @@ -49,58 +50,59 @@ namespace mongo { return e.number(); return 1; } - - QueryPlan::QueryPlan( - NamespaceDetails *_d, int _idxNo, - const FieldRangeSet &fbs, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) : - d(_d), idxNo(_idxNo), - fbs_( fbs ), - _originalQuery( originalQuery ), - order_( order ), - index_( 0 ), - optimal_( false ), - scanAndOrderRequired_( true ), - exactKeyMatch_( false ), - direction_( 0 ), - endKeyInclusive_( endKey.isEmpty() ), - unhelpful_( false ), - _special( special ), - _type(0), - _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ){ - - if ( !fbs_.matchPossible() ) { - unhelpful_ = true; - scanAndOrderRequired_ = false; + + QueryPlan::QueryPlan( + NamespaceDetails *d, int idxNo, + const FieldRangeSet &fbs, const FieldRangeSet &originalFrs, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) : + _d(d), _idxNo(idxNo), + _fbs( fbs ), + _originalQuery( originalQuery ), + _order( order ), + _index( 0 ), + _optimal( false ), + _scanAndOrderRequired( true ), + _exactKeyMatch( false ), + _direction( 0 ), + _endKeyInclusive( endKey.isEmpty() ), + _unhelpful( false ), + _special( special ), + _type(0), + _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ) { + + if ( !_fbs.matchPossible() ) { + _unhelpful = true; + _scanAndOrderRequired = false; return; } - if( idxNo >= 0 ) { - index_ = &d->idx(idxNo); - } else { + if( _idxNo >= 0 ) { + _index = &d->idx(_idxNo); + } + else { // full table scan case - if ( order_.isEmpty() || !strcmp( order_.firstElement().fieldName(), "$natural" ) ) - scanAndOrderRequired_ = false; + if ( _order.isEmpty() || !strcmp( _order.firstElement().fieldName(), "$natural" ) ) + _scanAndOrderRequired = false; return; } - if ( _special.size() ){ - optimal_ = true; - _type = index_->getSpec().getType(); + if ( _special.size() ) { + _optimal = true; + _type = _index->getSpec().getType(); massert( 13040 , (string)"no type for special: " + _special , _type ); // hopefully safe to use original query in these contexts - don't think we can mix special with $or clause separation yet - scanAndOrderRequired_ = _type->scanAndOrderRequired( _originalQuery , order ); + _scanAndOrderRequired = _type->scanAndOrderRequired( _originalQuery , order ); return; } - BSONObj idxKey = index_->keyPattern(); + BSONObj idxKey = _index->keyPattern(); BSONObjIterator o( order ); BSONObjIterator k( idxKey ); if ( !o.moreWithEOO() ) - scanAndOrderRequired_ = false; + _scanAndOrderRequired = false; while( o.moreWithEOO() ) { BSONElement oe = o.next(); if ( oe.eoo() ) { - scanAndOrderRequired_ = false; + _scanAndOrderRequired = false; break; } if ( !k.moreWithEOO() ) @@ -116,14 +118,14 @@ namespace mongo { goto doneCheckOrder; } int d = elementDirection( oe ) == elementDirection( ke ) ? 
1 : -1; - if ( direction_ == 0 ) - direction_ = d; - else if ( direction_ != d ) + if ( _direction == 0 ) + _direction = d; + else if ( _direction != d ) break; } - doneCheckOrder: - if ( scanAndOrderRequired_ ) - direction_ = 0; +doneCheckOrder: + if ( _scanAndOrderRequired ) + _direction = 0; BSONObjIterator i( idxKey ); int exactIndexedQueryCount = 0; int optimalIndexedQueryCount = 0; @@ -140,7 +142,8 @@ namespace mongo { ++optimalIndexedQueryCount; if ( !fb.equality() ) stillOptimalIndexedQueryCount = false; - } else { + } + else { if ( fb.nontrivial() ) optimalIndexedQueryCount = -1; } @@ -151,16 +154,17 @@ namespace mongo { } orderFieldsUnindexed.erase( e.fieldName() ); } - if ( !scanAndOrderRequired_ && - ( optimalIndexedQueryCount == fbs.nNontrivialRanges() ) ) - optimal_ = true; + if ( !_scanAndOrderRequired && + ( optimalIndexedQueryCount == fbs.nNontrivialRanges() ) ) + _optimal = true; if ( exactIndexedQueryCount == fbs.nNontrivialRanges() && - orderFieldsUnindexed.size() == 0 && - exactIndexedQueryCount == index_->keyPattern().nFields() && - exactIndexedQueryCount == _originalQuery.nFields() ) { - exactKeyMatch_ = true; + orderFieldsUnindexed.size() == 0 && + exactIndexedQueryCount == _index->keyPattern().nFields() && + exactIndexedQueryCount == _originalQuery.nFields() ) { + _exactKeyMatch = true; } - _frv.reset( new FieldRangeVector( fbs, idxKey, direction_ ) ); + _frv.reset( new FieldRangeVector( fbs, idxKey, _direction ) ); + _originalFrv.reset( new FieldRangeVector( originalFrs, idxKey, _direction ) ); if ( _startOrEndSpec ) { BSONObj newStart, newEnd; if ( !startKey.isEmpty() ) @@ -173,100 +177,124 @@ namespace mongo { _endKey = _frv->endKey(); } - if ( ( scanAndOrderRequired_ || order_.isEmpty() ) && - !fbs.range( idxKey.firstElement().fieldName() ).nontrivial() ) { - unhelpful_ = true; + if ( ( _scanAndOrderRequired || _order.isEmpty() ) && + !fbs.range( idxKey.firstElement().fieldName() ).nontrivial() ) { + _unhelpful = true; } } - + shared_ptr QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const { if ( _type ) { - // hopefully safe to use original query in these contexts - don't think we can mix type with $or clause separation yet - return _type->newCursor( _originalQuery , order_ , numWanted ); + // hopefully safe to use original query in these contexts - don't think we can mix type with $or clause separation yet + return _type->newCursor( _originalQuery , _order , numWanted ); } - - if ( !fbs_.matchPossible() ){ - if ( fbs_.nNontrivialRanges() ) - checkTableScanAllowed( fbs_.ns() ); + + if ( !_fbs.matchPossible() ) { + if ( _fbs.nNontrivialRanges() ) + checkTableScanAllowed( _fbs.ns() ); return shared_ptr( new BasicCursor( DiskLoc() ) ); } - if ( !index_ ){ - if ( fbs_.nNontrivialRanges() ) - checkTableScanAllowed( fbs_.ns() ); - return findTableScan( fbs_.ns(), order_, startLoc ); + if ( !_index ) { + if ( _fbs.nNontrivialRanges() ) + checkTableScanAllowed( _fbs.ns() ); + return findTableScan( _fbs.ns(), _order, startLoc ); } massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() ); - + if ( _startOrEndSpec ) { - // we are sure to spec endKeyInclusive_ - return shared_ptr( new BtreeCursor( d, idxNo, *index_, _startKey, _endKey, endKeyInclusive_, direction_ >= 0 ? 1 : -1 ) ); - } else if ( index_->getSpec().getType() ) { - return shared_ptr( new BtreeCursor( d, idxNo, *index_, _frv->startKey(), _frv->endKey(), true, direction_ >= 0 ? 
1 : -1 ) ); - } else { - return shared_ptr( new BtreeCursor( d, idxNo, *index_, _frv, direction_ >= 0 ? 1 : -1 ) ); + // we are sure to spec _endKeyInclusive + return shared_ptr( new BtreeCursor( _d, _idxNo, *_index, _startKey, _endKey, _endKeyInclusive, _direction >= 0 ? 1 : -1 ) ); + } + else if ( _index->getSpec().getType() ) { + return shared_ptr( new BtreeCursor( _d, _idxNo, *_index, _frv->startKey(), _frv->endKey(), true, _direction >= 0 ? 1 : -1 ) ); + } + else { + return shared_ptr( new BtreeCursor( _d, _idxNo, *_index, _frv, _direction >= 0 ? 1 : -1 ) ); } } - + shared_ptr QueryPlan::newReverseCursor() const { - if ( !fbs_.matchPossible() ) + if ( !_fbs.matchPossible() ) return shared_ptr( new BasicCursor( DiskLoc() ) ); - if ( !index_ ) { - int orderSpec = order_.getIntField( "$natural" ); + if ( !_index ) { + int orderSpec = _order.getIntField( "$natural" ); if ( orderSpec == INT_MIN ) orderSpec = 1; - return findTableScan( fbs_.ns(), BSON( "$natural" << -orderSpec ) ); + return findTableScan( _fbs.ns(), BSON( "$natural" << -orderSpec ) ); } massert( 10364 , "newReverseCursor() not implemented for indexed plans", false ); return shared_ptr(); } - + BSONObj QueryPlan::indexKey() const { - if ( !index_ ) + if ( !_index ) return BSON( "$natural" << 1 ); - return index_->keyPattern(); + return _index->keyPattern(); } - + void QueryPlan::registerSelf( long long nScanned ) const { - if ( fbs_.matchPossible() ) { + if ( _fbs.matchPossible() ) { scoped_lock lk(NamespaceDetailsTransient::_qcMutex); - NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( fbs_.pattern( order_ ), indexKey(), nScanned ); - } - } - - QueryPlanSet::QueryPlanSet( const char *_ns, auto_ptr< FieldRangeSet > frs, const BSONObj &originalQuery, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) : - ns(_ns), - _originalQuery( originalQuery ), - fbs_( frs ), - mayRecordPlan_( true ), - usingPrerecordedPlan_( false ), - hint_( BSONObj() ), - order_( order.getOwned() ), - oldNScanned_( 0 ), - honorRecordedPlan_( honorRecordedPlan ), - min_( min.getOwned() ), - max_( max.getOwned() ), - _bestGuessOnly( bestGuessOnly ), - _mayYield( mayYield ), - _yieldSometimesTracker( 256, 20 ){ + NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( _fbs.pattern( _order ), indexKey(), nScanned ); + } + } + + bool QueryPlan::isMultiKey() const { + if ( _idxNo < 0 ) + return false; + return _d->isMultikey( _idxNo ); + } + + QueryPlanSet::QueryPlanSet( const char *ns, auto_ptr< FieldRangeSet > frs, auto_ptr< FieldRangeSet > originalFrs, const BSONObj &originalQuery, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) : + _ns(ns), + _originalQuery( originalQuery ), + _fbs( frs ), + _originalFrs( originalFrs ), + _mayRecordPlan( true ), + _usingPrerecordedPlan( false ), + _hint( BSONObj() ), + _order( order.getOwned() ), + _oldNScanned( 0 ), + _honorRecordedPlan( honorRecordedPlan ), + _min( min.getOwned() ), + _max( max.getOwned() ), + _bestGuessOnly( bestGuessOnly ), + _mayYield( mayYield ), + _yieldSometimesTracker( 256, 20 ) { if ( hint && !hint->eoo() ) { - hint_ = hint->wrap(); + _hint = hint->wrap(); } init(); } - + + bool QueryPlanSet::modifiedKeys() const { + for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) + if ( (*i)->isMultiKey() ) + return true; + return false; + } + + bool 
QueryPlanSet::hasMultiKey() const { + for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) + if ( (*i)->isMultiKey() ) + return true; + return false; + } + + void QueryPlanSet::addHint( IndexDetails &id ) { - if ( !min_.isEmpty() || !max_.isEmpty() ) { + if ( !_min.isEmpty() || !_max.isEmpty() ) { string errmsg; BSONObj keyPattern = id.keyPattern(); - // This reformats min_ and max_ to be used for index lookup. - massert( 10365 , errmsg, indexDetailsForRange( fbs_->ns(), errmsg, min_, max_, keyPattern ) ); + // This reformats _min and _max to be used for index lookup. + massert( 10365 , errmsg, indexDetailsForRange( _fbs->ns(), errmsg, _min, _max, keyPattern ) ); } - NamespaceDetails *d = nsdetails(ns); - plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(id), *fbs_, _originalQuery, order_, min_, max_ ) ) ); + NamespaceDetails *d = nsdetails(_ns); + _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_fbs, *_originalFrs, _originalQuery, _order, _min, _max ) ) ); } - + // returns an IndexDetails * for a hint, 0 if hint is $natural. // hint must not be eoo() IndexDetails *parseHint( const BSONElement &hint, NamespaceDetails *d ) { @@ -281,7 +309,7 @@ namespace mongo { } } } - else if( hint.type() == Object ) { + else if( hint.type() == Object ) { BSONObj hintobj = hint.embeddedObject(); uassert( 10112 , "bad hint", !hintobj.isEmpty() ); if ( !strcmp( hintobj.firstElement().fieldName(), "$natural" ) ) { @@ -294,92 +322,93 @@ namespace mongo { return ⅈ } } - } + } uassert( 10113 , "bad hint", false ); return 0; } - + void QueryPlanSet::init() { DEBUGQO( "QueryPlanSet::init " << ns << "\t" << _originalQuery ); - plans_.clear(); - mayRecordPlan_ = true; - usingPrerecordedPlan_ = false; - - const char *ns = fbs_->ns(); + _plans.clear(); + _mayRecordPlan = true; + _usingPrerecordedPlan = false; + + const char *ns = _fbs->ns(); NamespaceDetails *d = nsdetails( ns ); - if ( !d || !fbs_->matchPossible() ) { + if ( !d || !_fbs->matchPossible() ) { // Table scan plan, when no matches are possible - plans_.push_back( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ) ); + _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) ); return; } - - BSONElement hint = hint_.firstElement(); + + BSONElement hint = _hint.firstElement(); if ( !hint.eoo() ) { - mayRecordPlan_ = false; + _mayRecordPlan = false; IndexDetails *id = parseHint( hint, d ); if ( id ) { addHint( *id ); - } else { - massert( 10366 , "natural order cannot be specified with $min/$max", min_.isEmpty() && max_.isEmpty() ); + } + else { + massert( 10366 , "natural order cannot be specified with $min/$max", _min.isEmpty() && _max.isEmpty() ); // Table scan plan - plans_.push_back( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ) ); + _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) ); } return; } - - if ( !min_.isEmpty() || !max_.isEmpty() ) { + + if ( !_min.isEmpty() || !_max.isEmpty() ) { string errmsg; BSONObj keyPattern; - IndexDetails *idx = indexDetailsForRange( ns, errmsg, min_, max_, keyPattern ); + IndexDetails *idx = indexDetailsForRange( ns, errmsg, _min, _max, keyPattern ); massert( 10367 , errmsg, idx ); - plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(*idx), *fbs_, _originalQuery, order_, min_, max_ ) ) ); + _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_fbs, *_originalFrs, _originalQuery, _order, _min, _max ) ) ); return; } - if ( 
isSimpleIdQuery( _originalQuery ) ){ + if ( isSimpleIdQuery( _originalQuery ) ) { int idx = d->findIdIndex(); - if ( idx >= 0 ){ - usingPrerecordedPlan_ = true; - mayRecordPlan_ = false; - plans_.push_back( PlanPtr( new QueryPlan( d , idx , *fbs_ , _originalQuery, order_ ) ) ); + if ( idx >= 0 ) { + _usingPrerecordedPlan = true; + _mayRecordPlan = false; + _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_fbs , *_fbs , _originalQuery, _order ) ) ); return; } } - if ( _originalQuery.isEmpty() && order_.isEmpty() ){ - plans_.push_back( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ) ); + if ( _originalQuery.isEmpty() && _order.isEmpty() ) { + _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) ); return; } - DEBUGQO( "\t special : " << fbs_->getSpecial() ); - if ( fbs_->getSpecial().size() ){ - _special = fbs_->getSpecial(); + DEBUGQO( "\t special : " << _fbs->getSpecial() ); + if ( _fbs->getSpecial().size() ) { + _special = _fbs->getSpecial(); NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { int j = i.pos(); IndexDetails& ii = i.next(); const IndexSpec& spec = ii.getSpec(); - if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , order_ ) ){ - usingPrerecordedPlan_ = true; - mayRecordPlan_ = false; - plans_.push_back( PlanPtr( new QueryPlan( d , j , *fbs_ , _originalQuery, order_ , - BSONObj() , BSONObj() , _special ) ) ); + if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , _order ) ) { + _usingPrerecordedPlan = true; + _mayRecordPlan = false; + _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_fbs , *_fbs , _originalQuery, _order , + BSONObj() , BSONObj() , _special ) ) ); return; } } uassert( 13038 , (string)"can't find special index: " + _special + " for: " + _originalQuery.toString() , 0 ); } - if ( honorRecordedPlan_ ) { + if ( _honorRecordedPlan ) { scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( ns ); - BSONObj bestIndex = nsd.indexForPattern( fbs_->pattern( order_ ) ); + BSONObj bestIndex = nsd.indexForPattern( _fbs->pattern( _order ) ); if ( !bestIndex.isEmpty() ) { - PlanPtr p; - oldNScanned_ = nsd.nScannedForPattern( fbs_->pattern( order_ ) ); + QueryPlanPtr p; + _oldNScanned = nsd.nScannedForPattern( _fbs->pattern( _order ) ); if ( !strcmp( bestIndex.firstElement().fieldName(), "$natural" ) ) { // Table scan plan - p.reset( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ); + p.reset( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ); } NamespaceDetails::IndexIterator i = d->ii(); @@ -387,55 +416,56 @@ namespace mongo { int j = i.pos(); IndexDetails& ii = i.next(); if( ii.keyPattern().woCompare(bestIndex) == 0 ) { - p.reset( new QueryPlan( d, j, *fbs_, _originalQuery, order_ ) ); + p.reset( new QueryPlan( d, j, *_fbs, *_originalFrs, _originalQuery, _order ) ); } } massert( 10368 , "Unable to locate previously recorded index", p.get() ); if ( !( _bestGuessOnly && p->scanAndOrderRequired() ) ) { - usingPrerecordedPlan_ = true; - mayRecordPlan_ = false; - plans_.push_back( p ); + _usingPrerecordedPlan = true; + _mayRecordPlan = false; + _plans.push_back( p ); return; } } } - + addOtherPlans( false ); } - + void QueryPlanSet::addOtherPlans( bool checkFirst ) { - const char *ns = fbs_->ns(); + const char *ns = _fbs->ns(); NamespaceDetails *d = nsdetails( ns ); if ( !d ) return; // If table scan is optimal or natural order requested or 
tailable cursor requested - if ( !fbs_->matchPossible() || ( fbs_->nNontrivialRanges() == 0 && order_.isEmpty() ) || - ( !order_.isEmpty() && !strcmp( order_.firstElement().fieldName(), "$natural" ) ) ) { + if ( !_fbs->matchPossible() || ( _fbs->nNontrivialRanges() == 0 && _order.isEmpty() ) || + ( !_order.isEmpty() && !strcmp( _order.firstElement().fieldName(), "$natural" ) ) ) { // Table scan plan - addPlan( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ), checkFirst ); + addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ), checkFirst ); return; } - - bool normalQuery = hint_.isEmpty() && min_.isEmpty() && max_.isEmpty(); + + bool normalQuery = _hint.isEmpty() && _min.isEmpty() && _max.isEmpty(); PlanSet plans; for( int i = 0; i < d->nIndexes; ++i ) { IndexDetails& id = d->idx(i); const IndexSpec& spec = id.getSpec(); IndexSuitability suitability = HELPFUL; - if ( normalQuery ){ - suitability = spec.suitability( fbs_->simplifiedQuery() , order_ ); + if ( normalQuery ) { + suitability = spec.suitability( _fbs->simplifiedQuery() , _order ); if ( suitability == USELESS ) continue; } - PlanPtr p( new QueryPlan( d, i, *fbs_, _originalQuery, order_ ) ); + QueryPlanPtr p( new QueryPlan( d, i, *_fbs, *_originalFrs, _originalQuery, _order ) ); if ( p->optimal() ) { addPlan( p, checkFirst ); return; - } else if ( !p->unhelpful() ) { + } + else if ( !p->unhelpful() ) { plans.push_back( p ); } } @@ -443,29 +473,29 @@ namespace mongo { addPlan( *i, checkFirst ); // Table scan plan - addPlan( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ), checkFirst ); + addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ), checkFirst ); } - + shared_ptr< QueryOp > QueryPlanSet::runOp( QueryOp &op ) { - if ( usingPrerecordedPlan_ ) { + if ( _usingPrerecordedPlan ) { Runner r( *this, op ); shared_ptr< QueryOp > res = r.run(); - // plans_.size() > 1 if addOtherPlans was called in Runner::run(). - if ( _bestGuessOnly || res->complete() || plans_.size() > 1 ) + // _plans.size() > 1 if addOtherPlans was called in Runner::run(). + if ( _bestGuessOnly || res->complete() || _plans.size() > 1 ) return res; { scoped_lock lk(NamespaceDetailsTransient::_qcMutex); - NamespaceDetailsTransient::get_inlock( fbs_->ns() ).registerIndexForPattern( fbs_->pattern( order_ ), BSONObj(), 0 ); + NamespaceDetailsTransient::get_inlock( _fbs->ns() ).registerIndexForPattern( _fbs->pattern( _order ), BSONObj(), 0 ); } init(); } Runner r( *this, op ); return r.run(); } - + BSONObj QueryPlanSet::explain() const { vector< BSONObj > arr; - for( PlanSet::const_iterator i = plans_.begin(); i != plans_.end(); ++i ) { + for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) { shared_ptr c = (*i)->newCursor(); BSONObjBuilder explain; explain.append( "cursor", c->toString() ); @@ -477,37 +507,37 @@ namespace mongo { return b.obj(); } - QueryPlanSet::PlanPtr QueryPlanSet::getBestGuess() const { - assert( plans_.size() ); - if ( plans_[ 0 ]->scanAndOrderRequired() ){ - for ( unsigned i=1; iscanAndOrderRequired() ) - return plans_[i]; + QueryPlanSet::QueryPlanPtr QueryPlanSet::getBestGuess() const { + assert( _plans.size() ); + if ( _plans[ 0 ]->scanAndOrderRequired() ) { + for ( unsigned i=1; i<_plans.size(); i++ ) { + if ( ! 
_plans[i]->scanAndOrderRequired() ) + return _plans[i]; } - + stringstream ss; ss << "best guess plan requested, but scan and order required:"; - ss << " query: " << fbs_->simplifiedQuery(); - ss << " order: " << order_; + ss << " query: " << _fbs->simplifiedQuery(); + ss << " order: " << _order; ss << " choices: "; - for ( unsigned i=0; iindexKey() << " "; + for ( unsigned i=0; i<_plans.size(); i++ ) { + ss << _plans[i]->indexKey() << " "; } string s = ss.str(); msgassertedNoTrace( 13284, s.c_str() ); } - return plans_[0]; + return _plans[0]; } - + QueryPlanSet::Runner::Runner( QueryPlanSet &plans, QueryOp &op ) : - op_( op ), - plans_( plans ) { + _op( op ), + _plans( plans ) { } - + void QueryPlanSet::Runner::mayYield( const vector< shared_ptr< QueryOp > > &ops ) { - if ( plans_._mayYield ) { - if ( plans_._yieldSometimesTracker.ping() ) { + if ( _plans._mayYield ) { + if ( _plans._yieldSometimesTracker.ping() ) { int micros = ClientCursor::yieldSuggest(); if ( micros > 0 ) { for( vector< shared_ptr< QueryOp > >::const_iterator i = ops.begin(); i != ops.end(); ++i ) { @@ -515,28 +545,38 @@ namespace mongo { return; } } - ClientCursor::staticYield( micros ); + ClientCursor::staticYield( micros , _plans._ns ); for( vector< shared_ptr< QueryOp > >::const_iterator i = ops.begin(); i != ops.end(); ++i ) { recoverFromYield( **i ); - } + } } } - } + } } - + + struct OpHolder { + OpHolder( const shared_ptr< QueryOp > &op ) : _op( op ), _offset() {} + shared_ptr< QueryOp > _op; + long long _offset; + bool operator<( const OpHolder &other ) const { + return _op->nscanned() + _offset > other._op->nscanned() + other._offset; + } + }; + shared_ptr< QueryOp > QueryPlanSet::Runner::run() { - massert( 10369 , "no plans", plans_.plans_.size() > 0 ); - + massert( 10369 , "no plans", _plans._plans.size() > 0 ); + vector< shared_ptr< QueryOp > > ops; - if ( plans_._bestGuessOnly ) { - shared_ptr< QueryOp > op( op_.createChild() ); - op->setQueryPlan( plans_.getBestGuess().get() ); - ops.push_back( op ); - } else { - if ( plans_.plans_.size() > 1 ) - log(1) << " running multiple plans" << endl; - for( PlanSet::iterator i = plans_.plans_.begin(); i != plans_.plans_.end(); ++i ) { - shared_ptr< QueryOp > op( op_.createChild() ); + if ( _plans._bestGuessOnly ) { + shared_ptr< QueryOp > op( _op.createChild() ); + op->setQueryPlan( _plans.getBestGuess().get() ); + ops.push_back( op ); + } + else { + if ( _plans._plans.size() > 1 ) + log(1) << " running multiple plans" << endl; + for( PlanSet::iterator i = _plans._plans.begin(); i != _plans._plans.end(); ++i ) { + shared_ptr< QueryOp > op( _op.createChild() ); op->setQueryPlan( i->get() ); ops.push_back( op ); } @@ -547,53 +587,51 @@ namespace mongo { if ( (*i)->complete() ) return *i; } - - long long nScanned = 0; - long long nScannedBackup = 0; - while( 1 ) { - ++nScanned; - unsigned errCount = 0; - bool first = true; - for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) { - mayYield( ops ); - QueryOp &op = **i; - nextOp( op ); - if ( op.complete() ) { - if ( first ) { - nScanned += nScannedBackup; - } - if ( plans_.mayRecordPlan_ && op.mayRecordPlan() ) { - op.qp().registerSelf( nScanned ); - } - return *i; + + std::priority_queue< OpHolder > queue; + for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) { + if ( !(*i)->error() ) { + queue.push( *i ); + } + } + + while( !queue.empty() ) { + mayYield( ops ); + OpHolder holder = queue.top(); + queue.pop(); + QueryOp &op = *holder._op; + nextOp( op 
); + if ( op.complete() ) { + if ( _plans._mayRecordPlan && op.mayRecordPlan() ) { + op.qp().registerSelf( op.nscanned() ); } - if ( op.error() ) - ++errCount; - first = false; + return holder._op; } - if ( errCount == ops.size() ) - break; - if ( !plans_._bestGuessOnly && plans_.usingPrerecordedPlan_ && nScanned > plans_.oldNScanned_ * 10 && plans_._special.empty() ) { - plans_.addOtherPlans( true ); - PlanSet::iterator i = plans_.plans_.begin(); + if ( op.error() ) { + continue; + } + queue.push( holder ); + if ( !_plans._bestGuessOnly && _plans._usingPrerecordedPlan && op.nscanned() > _plans._oldNScanned * 10 && _plans._special.empty() ) { + holder._offset = -op.nscanned(); + _plans.addOtherPlans( true ); + PlanSet::iterator i = _plans._plans.begin(); ++i; - for( ; i != plans_.plans_.end(); ++i ) { - shared_ptr< QueryOp > op( op_.createChild() ); + for( ; i != _plans._plans.end(); ++i ) { + shared_ptr< QueryOp > op( _op.createChild() ); op->setQueryPlan( i->get() ); ops.push_back( op ); initOp( *op ); if ( op->complete() ) return op; - } - plans_.mayRecordPlan_ = true; - plans_.usingPrerecordedPlan_ = false; - nScannedBackup = nScanned; - nScanned = 0; + queue.push( op ); + } + _plans._mayRecordPlan = true; + _plans._usingPrerecordedPlan = false; } } return ops[ 0 ]; } - + #define GUARD_OP_EXCEPTION( op, expression ) \ try { \ expression; \ @@ -607,8 +645,8 @@ namespace mongo { catch ( ... ) { \ op.setException( ExceptionInfo( "Caught unknown exception" , 0 ) ); \ } - - + + void QueryPlanSet::Runner::initOp( QueryOp &op ) { GUARD_OP_EXCEPTION( op, op.init() ); } @@ -619,39 +657,39 @@ namespace mongo { bool QueryPlanSet::Runner::prepareToYield( QueryOp &op ) { GUARD_OP_EXCEPTION( op, - if ( op.error() ) { - return true; - } else { - return op.prepareToYield(); - } ); + if ( op.error() ) { + return true; + } + else { + return op.prepareToYield(); + } ); return true; } void QueryPlanSet::Runner::recoverFromYield( QueryOp &op ) { GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.recoverFromYield(); } ); } - - + + MultiPlanScanner::MultiPlanScanner( const char *ns, - const BSONObj &query, - const BSONObj &order, - const BSONElement *hint, - bool honorRecordedPlan, - const BSONObj &min, - const BSONObj &max, - bool bestGuessOnly, - bool mayYield ) : - _ns( ns ), - _or( !query.getField( "$or" ).eoo() ), - _query( query.getOwned() ), - _fros( ns, _query ), - _i(), - _honorRecordedPlan( honorRecordedPlan ), - _bestGuessOnly( bestGuessOnly ), - _hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ), - _mayYield( mayYield ), - _tableScanned() - { + const BSONObj &query, + const BSONObj &order, + const BSONElement *hint, + bool honorRecordedPlan, + const BSONObj &min, + const BSONObj &max, + bool bestGuessOnly, + bool mayYield ) : + _ns( ns ), + _or( !query.getField( "$or" ).eoo() ), + _query( query.getOwned() ), + _fros( ns, _query ), + _i(), + _honorRecordedPlan( honorRecordedPlan ), + _bestGuessOnly( bestGuessOnly ), + _hint( ( hint && !hint->eoo() ) ? 
hint->wrap() : BSONObj() ), + _mayYield( mayYield ), + _tableScanned() { if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() || !_fros.getSpecial().empty() ) { _or = false; } @@ -661,8 +699,10 @@ namespace mongo { // if _or == false, don't use or clauses for index selection if ( !_or ) { auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns, _query ) ); - _currentQps.reset( new QueryPlanSet( ns, frs, _query, order, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) ); - } else { + auto_ptr< FieldRangeSet > oldFrs( new FieldRangeSet( *frs ) ); + _currentQps.reset( new QueryPlanSet( ns, frs, oldFrs, _query, order, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) ); + } + else { BSONElement e = _query.getField( "$or" ); massert( 13268, "invalid $or spec", e.type() == Array && e.embeddedObject().nFields() > 0 ); } @@ -676,16 +716,17 @@ namespace mongo { } ++_i; auto_ptr< FieldRangeSet > frs( _fros.topFrs() ); + auto_ptr< FieldRangeSet > originalFrs( _fros.topFrsOriginal() ); BSONElement hintElt = _hint.firstElement(); - _currentQps.reset( new QueryPlanSet( _ns, frs, _query, BSONObj(), &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) ); + _currentQps.reset( new QueryPlanSet( _ns, frs, originalFrs, _query, BSONObj(), &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) ); shared_ptr< QueryOp > ret( _currentQps->runOp( op ) ); if ( ret->qp().willScanTable() ) { _tableScanned = true; } - _fros.popOrClause(); + _fros.popOrClause( ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() ); return ret; } - + shared_ptr< QueryOp > MultiPlanScanner::runOp( QueryOp &op ) { shared_ptr< QueryOp > ret = runOpOnce( op ); while( !ret->stopRequested() && mayRunMore() ) { @@ -693,7 +734,7 @@ namespace mongo { } return ret; } - + bool MultiPlanScanner::uselessOr( const BSONElement &hint ) const { NamespaceDetails *nsd = nsdetails( _ns ); if ( !nsd ) { @@ -713,7 +754,8 @@ namespace mongo { if ( id->getSpec().suitability( *i, BSONObj() ) == USELESS ) { return true; } - } else { + } + else { bool useful = false; NamespaceDetails::IndexIterator j = nsd->ii(); while( j.more() ) { @@ -725,12 +767,12 @@ namespace mongo { } if ( !useful ) { return true; - } + } } } return false; } - + bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) { BSONObjIterator p( idxPattern ); BSONObjIterator k( sampleKey ); @@ -761,19 +803,19 @@ namespace mongo { int idxDirection = e.number() >= 0 ? 1 : -1; int direction = idxDirection * baseDirection; switch( direction ) { - case 1: - b.appendMaxKey( e.fieldName() ); - break; - case -1: - b.appendMinKey( e.fieldName() ); - break; - default: - assert( false ); + case 1: + b.appendMaxKey( e.fieldName() ); + break; + case -1: + b.appendMinKey( e.fieldName() ); + break; + default: + assert( false ); } } - return b.obj(); + return b.obj(); } - + pair< int, int > keyAudit( const BSONObj &min, const BSONObj &max ) { int direction = 0; int firstSignificantField = 0; @@ -802,18 +844,19 @@ namespace mongo { pair< int, int > flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) { if ( min.isEmpty() || max.isEmpty() ) { return make_pair( 1, -1 ); - } else { + } + else { return keyAudit( min, max ); } } - + // NOTE min, max, and keyPattern will be updated to be consistent with the selected index. 
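extremeKeyForIndex above synthesizes the largest or smallest possible key for an index pattern by appending MaxKey or MinKey per field, with the sign chosen from the product of the field's direction and the requested direction. A simplified standalone sketch of that sign rule, using +/-infinity doubles in place of MinKey/MaxKey, might look like this:

// Standalone sketch of extremeKeyForIndex's direction rule (illustrative only).
#include <iostream>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>

std::map<std::string, double> extremeKeyForIndex(
        const std::vector<std::pair<std::string, int>>& idxPattern,
        int baseDirection) {
    std::map<std::string, double> key;
    for (const auto& field : idxPattern) {
        int idxDirection = field.second >= 0 ? 1 : -1;
        int direction = idxDirection * baseDirection;
        key[field.first] = (direction == 1)
            ? std::numeric_limits<double>::infinity()     // stands in for MaxKey
            : -std::numeric_limits<double>::infinity();   // stands in for MinKey
    }
    return key;
}

int main() {
    // Index { a: 1, b: -1 }, asking for the maximal key (direction +1).
    auto key = extremeKeyForIndex({{"a", 1}, {"b", -1}}, 1);
    for (const auto& kv : key) std::cout << kv.first << " -> " << kv.second << "\n";
}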
IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) { if ( min.isEmpty() && max.isEmpty() ) { errmsg = "one of min or max must be specified"; return 0; } - + Client::Context ctx( ns ); IndexDetails *id = 0; NamespaceDetails *d = nsdetails( ns ); @@ -821,7 +864,7 @@ namespace mongo { errmsg = "ns not found"; return 0; } - + pair< int, int > ret = flexibleKeyAudit( min, max ); if ( ret == make_pair( -1, -1 ) ) { errmsg = "min and max keys do not share pattern"; @@ -832,15 +875,16 @@ namespace mongo { while( i.more() ) { IndexDetails& ii = i.next(); if ( indexWorks( ii.keyPattern(), min.isEmpty() ? max : min, ret.first, ret.second ) ) { - if ( ii.getSpec().getType() == 0 ){ + if ( ii.getSpec().getType() == 0 ) { id = ⅈ keyPattern = ii.keyPattern(); break; } } } - - } else { + + } + else { if ( !indexWorks( keyPattern, min.isEmpty() ? max : min, ret.first, ret.second ) ) { errmsg = "requested keyPattern does not match specified keys"; return 0; @@ -853,30 +897,31 @@ namespace mongo { break; } if ( keyPattern.nFields() == 1 && ii.keyPattern().nFields() == 1 && - IndexDetails::isIdIndexPattern( keyPattern ) && - ii.isIdIndex() ){ + IndexDetails::isIdIndexPattern( keyPattern ) && + ii.isIdIndex() ) { id = ⅈ break; } - + } } if ( min.isEmpty() ) { min = extremeKeyForIndex( keyPattern, -1 ); - } else if ( max.isEmpty() ) { + } + else if ( max.isEmpty() ) { max = extremeKeyForIndex( keyPattern, 1 ); } - + if ( !id ) { errmsg = (string)"no index found for specified keyPattern: " + keyPattern.toString(); return 0; } - + min = min.extractFieldsUnDotted( keyPattern ); max = max.extractFieldsUnDotted( keyPattern ); return id; } - + } // namespace mongo diff --git a/db/queryoptimizer.h b/db/queryoptimizer.h index 8314bfa..cf3180a 100644 --- a/db/queryoptimizer.h +++ b/db/queryoptimizer.h @@ -25,15 +25,17 @@ #include "../util/message.h" namespace mongo { - + class IndexDetails; class IndexType; class QueryPlan : boost::noncopyable { public: - QueryPlan(NamespaceDetails *_d, - int _idxNo, // -1 = no index + + QueryPlan(NamespaceDetails *d, + int idxNo, // -1 = no index const FieldRangeSet &fbs, + const FieldRangeSet &originalFrs, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey = BSONObj(), @@ -41,44 +43,50 @@ namespace mongo { string special="" ); /* If true, no other index can do better. */ - bool optimal() const { return optimal_; } + bool optimal() const { return _optimal; } /* ScanAndOrder processing will be required if true */ - bool scanAndOrderRequired() const { return scanAndOrderRequired_; } + bool scanAndOrderRequired() const { return _scanAndOrderRequired; } /* When true, the index we are using has keys such that it can completely resolve the query expression to match by itself without ever checking the main object. 
*/ - bool exactKeyMatch() const { return exactKeyMatch_; } - /* If true, the startKey and endKey are unhelpful and the index order doesn't match the + bool exactKeyMatch() const { return _exactKeyMatch; } + /* If true, the startKey and endKey are unhelpful and the index order doesn't match the requested sort order */ - bool unhelpful() const { return unhelpful_; } - int direction() const { return direction_; } + bool unhelpful() const { return _unhelpful; } + int direction() const { return _direction; } shared_ptr newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const; shared_ptr newReverseCursor() const; BSONObj indexKey() const; - bool willScanTable() const { return !index_ && fbs_.matchPossible(); } - const char *ns() const { return fbs_.ns(); } - NamespaceDetails *nsd() const { return d; } + bool indexed() const { return _index; } + bool willScanTable() const { return !_index && _fbs.matchPossible(); } + const char *ns() const { return _fbs.ns(); } + NamespaceDetails *nsd() const { return _d; } BSONObj originalQuery() const { return _originalQuery; } - BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return fbs_.simplifiedQuery( fields ); } - const FieldRange &range( const char *fieldName ) const { return fbs_.range( fieldName ); } + BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return _fbs.simplifiedQuery( fields ); } + const FieldRange &range( const char *fieldName ) const { return _fbs.range( fieldName ); } void registerSelf( long long nScanned ) const; + shared_ptr< FieldRangeVector > originalFrv() const { return _originalFrv; } + // just for testing shared_ptr< FieldRangeVector > frv() const { return _frv; } + bool isMultiKey() const; + private: - NamespaceDetails *d; - int idxNo; - const FieldRangeSet &fbs_; + NamespaceDetails * _d; + int _idxNo; + const FieldRangeSet &_fbs; const BSONObj &_originalQuery; - const BSONObj &order_; - const IndexDetails *index_; - bool optimal_; - bool scanAndOrderRequired_; - bool exactKeyMatch_; - int direction_; + const BSONObj &_order; + const IndexDetails * _index; + bool _optimal; + bool _scanAndOrderRequired; + bool _exactKeyMatch; + int _direction; shared_ptr< FieldRangeVector > _frv; + shared_ptr< FieldRangeVector > _originalFrv; BSONObj _startKey; BSONObj _endKey; - bool endKeyInclusive_; - bool unhelpful_; + bool _endKeyInclusive; + bool _unhelpful; string _special; IndexType * _type; bool _startOrEndSpec; @@ -93,16 +101,17 @@ namespace mongo { // Used when handing off from one QueryOp type to another QueryOp( const QueryOp &other ) : - _complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ), - _orConstraint( other._orConstraint ) {} - + _complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ), + _orConstraint( other._orConstraint ) {} + virtual ~QueryOp() {} - + /** these gets called after a query plan is set */ - void init() { + void init() { if ( _oldMatcher.get() ) { _matcher.reset( _oldMatcher->nextClauseMatcher( qp().indexKey() ) ); - } else { + } + else { _matcher.reset( new CoveredIndexMatcher( qp().originalQuery(), qp().indexKey(), alwaysUseRecord() ) ); } _init(); @@ -110,10 +119,12 @@ namespace mongo { virtual void next() = 0; virtual bool mayRecordPlan() const = 0; - + virtual bool prepareToYield() { massert( 13335, "yield not supported", false ); return false; } virtual void recoverFromYield() { massert( 13336, "yield not supported", false ); } - + + virtual long long nscanned() = 0; + /** @return a copy of the inheriting class, 
which will be run with its own query plan. If multiple plan sets are required for an $or query, the QueryOp of the winning plan from a given set will be cloned @@ -143,17 +154,17 @@ namespace mongo { shared_ptr< CoveredIndexMatcher > matcher() const { return _matcher; } protected: void setComplete() { - _orConstraint = qp().frv(); + _orConstraint = qp().originalFrv(); _complete = true; } void setStop() { setComplete(); _stopRequested = true; } virtual void _init() = 0; - + virtual QueryOp *_createChild() const = 0; - + virtual bool alwaysUseRecord() const { return false; } - + private: bool _complete; bool _stopRequested; @@ -164,42 +175,47 @@ namespace mongo { shared_ptr< CoveredIndexMatcher > _oldMatcher; shared_ptr< FieldRangeVector > _orConstraint; }; - + // Set of candidate query plans for a particular query. Used for running // a QueryOp on these plans. class QueryPlanSet { public: - typedef boost::shared_ptr< QueryPlan > PlanPtr; - typedef vector< PlanPtr > PlanSet; + typedef boost::shared_ptr< QueryPlan > QueryPlanPtr; + typedef vector< QueryPlanPtr > PlanSet; QueryPlanSet( const char *ns, - auto_ptr< FieldRangeSet > frs, - const BSONObj &originalQuery, - const BSONObj &order, - const BSONElement *hint = 0, - bool honorRecordedPlan = true, - const BSONObj &min = BSONObj(), - const BSONObj &max = BSONObj(), - bool bestGuessOnly = false, - bool mayYield = false); - int nPlans() const { return plans_.size(); } + auto_ptr< FieldRangeSet > frs, + auto_ptr< FieldRangeSet > originalFrs, + const BSONObj &originalQuery, + const BSONObj &order, + const BSONElement *hint = 0, + bool honorRecordedPlan = true, + const BSONObj &min = BSONObj(), + const BSONObj &max = BSONObj(), + bool bestGuessOnly = false, + bool mayYield = false); + int nPlans() const { return _plans.size(); } shared_ptr< QueryOp > runOp( QueryOp &op ); template< class T > shared_ptr< T > runOp( T &op ) { return dynamic_pointer_cast< T >( runOp( static_cast< QueryOp& >( op ) ) ); } BSONObj explain() const; - bool usingPrerecordedPlan() const { return usingPrerecordedPlan_; } - PlanPtr getBestGuess() const; + bool usingPrerecordedPlan() const { return _usingPrerecordedPlan; } + QueryPlanPtr getBestGuess() const; //for testing - const FieldRangeSet &fbs() const { return *fbs_; } + const FieldRangeSet &fbs() const { return *_fbs; } + const FieldRangeSet &originalFrs() const { return *_originalFrs; } + bool modifiedKeys() const; + bool hasMultiKey() const; + private: void addOtherPlans( bool checkFirst ); - void addPlan( PlanPtr plan, bool checkFirst ) { - if ( checkFirst && plan->indexKey().woCompare( plans_[ 0 ]->indexKey() ) == 0 ) + void addPlan( QueryPlanPtr plan, bool checkFirst ) { + if ( checkFirst && plan->indexKey().woCompare( _plans[ 0 ]->indexKey() ) == 0 ) return; - plans_.push_back( plan ); + _plans.push_back( plan ); } void init(); void addHint( IndexDetails &id ); @@ -207,25 +223,27 @@ namespace mongo { Runner( QueryPlanSet &plans, QueryOp &op ); shared_ptr< QueryOp > run(); void mayYield( const vector< shared_ptr< QueryOp > > &ops ); - QueryOp &op_; - QueryPlanSet &plans_; + QueryOp &_op; + QueryPlanSet &_plans; static void initOp( QueryOp &op ); static void nextOp( QueryOp &op ); static bool prepareToYield( QueryOp &op ); static void recoverFromYield( QueryOp &op ); }; - const char *ns; + + const char *_ns; BSONObj _originalQuery; - auto_ptr< FieldRangeSet > fbs_; - PlanSet plans_; - bool mayRecordPlan_; - bool usingPrerecordedPlan_; - BSONObj hint_; - BSONObj order_; - long long oldNScanned_; - bool 
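A QueryOp now has to report nscanned() so the runner can rank competing plans, alongside the existing init()/next()/complete() lifecycle. The following self-contained sketch of that lifecycle uses invented stand-in classes (PlanOp, CountingScanOp) rather than the real QueryOp hierarchy:

// Self-contained sketch of the plan-op lifecycle: init once per plan, call
// next() until complete, and expose nscanned for ranking (illustrative names).
#include <iostream>
#include <memory>

class PlanOp {
public:
    virtual ~PlanOp() {}
    virtual void init() = 0;                 // called after a plan is assigned
    virtual void next() = 0;                 // advance by one document
    virtual long long nscanned() const = 0;  // used to order competing plans
    bool complete() const { return _complete; }
protected:
    void setComplete() { _complete = true; }
private:
    bool _complete = false;
};

class CountingScanOp : public PlanOp {
public:
    void init() override { _n = 0; }
    void next() override { if (++_n >= 3) setComplete(); }  // pretend result found
    long long nscanned() const override { return _n; }
private:
    long long _n = 0;
};

int main() {
    std::unique_ptr<PlanOp> op(new CountingScanOp());
    op->init();
    while (!op->complete()) op->next();
    std::cout << "scanned " << op->nscanned() << " docs\n";
}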
honorRecordedPlan_; - BSONObj min_; - BSONObj max_; + auto_ptr< FieldRangeSet > _fbs; + auto_ptr< FieldRangeSet > _originalFrs; + PlanSet _plans; + bool _mayRecordPlan; + bool _usingPrerecordedPlan; + BSONObj _hint; + BSONObj _order; + long long _oldNScanned; + bool _honorRecordedPlan; + BSONObj _min; + BSONObj _max; string _special; bool _bestGuessOnly; bool _mayYield; @@ -258,24 +276,24 @@ namespace mongo { class MultiPlanScanner { public: MultiPlanScanner( const char *ns, - const BSONObj &query, - const BSONObj &order, - const BSONElement *hint = 0, - bool honorRecordedPlan = true, - const BSONObj &min = BSONObj(), - const BSONObj &max = BSONObj(), - bool bestGuessOnly = false, - bool mayYield = false); + const BSONObj &query, + const BSONObj &order, + const BSONElement *hint = 0, + bool honorRecordedPlan = true, + const BSONObj &min = BSONObj(), + const BSONObj &max = BSONObj(), + bool bestGuessOnly = false, + bool mayYield = false); shared_ptr< QueryOp > runOp( QueryOp &op ); template< class T > shared_ptr< T > runOp( T &op ) { return dynamic_pointer_cast< T >( runOp( static_cast< QueryOp& >( op ) ) ); - } + } shared_ptr< QueryOp > runOpOnce( QueryOp &op ); template< class T > shared_ptr< T > runOpOnce( T &op ) { return dynamic_pointer_cast< T >( runOpOnce( static_cast< QueryOp& >( op ) ) ); - } + } bool mayRunMore() const { return _or ? ( !_tableScanned && !_fros.orFinished() ) : _i == 0; } BSONObj oldExplain() const { assertNotOr(); return _currentQps->explain(); } // just report this when only one query op @@ -284,6 +302,9 @@ namespace mongo { } void setBestGuessOnly() { _bestGuessOnly = true; } void mayYield( bool val ) { _mayYield = val; } + bool modifiedKeys() const { return _currentQps->modifiedKeys(); } + bool hasMultiKey() const { return _currentQps->hasMultiKey(); } + private: void assertNotOr() const { massert( 13266, "not implemented for $or query", !_or ); @@ -301,21 +322,22 @@ namespace mongo { bool _mayYield; bool _tableScanned; }; - + class MultiCursor : public Cursor { public: class CursorOp : public QueryOp { public: CursorOp() {} CursorOp( const QueryOp &other ) : QueryOp( other ) {} - virtual shared_ptr< Cursor > newCursor() const = 0; + virtual shared_ptr< Cursor > newCursor() const = 0; }; // takes ownership of 'op' MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr< CursorOp > op = shared_ptr< CursorOp >(), bool mayYield = false ) - : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ) { + : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ), _nscanned() { if ( op.get() ) { _op = op; - } else { + } + else { _op.reset( new NoOp() ); } if ( _mps->mayRunMore() ) { @@ -323,13 +345,14 @@ namespace mongo { if ( !ok() ) { advance(); } - } else { + } + else { _c.reset( new BasicCursor( DiskLoc() ) ); } } // used to handoff a query to a getMore() MultiCursor( auto_ptr< MultiPlanScanner > mps, const shared_ptr< Cursor > &c, const shared_ptr< CoveredIndexMatcher > &matcher, const QueryOp &op ) - : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ) { + : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ), _nscanned( -1 ) { _mps->setBestGuessOnly(); _mps->mayYield( false ); // with a NoOp, there's no need to yield in QueryPlanSet if ( !ok() ) { @@ -355,16 +378,24 @@ namespace mongo { } virtual void checkLocation() { _c->checkLocation(); - } + } virtual bool supportGetMore() { return true; } virtual bool 
supportYields() { return _c->supportYields(); } + // with update we could potentially get the same document on multiple // indexes, but update appears to already handle this with seenObjects // so we don't have to do anything special here. virtual bool getsetdup(DiskLoc loc) { - return _c->getsetdup( loc ); + return _c->getsetdup( loc ); } + + virtual bool modifiedKeys() const { return _mps->modifiedKeys(); } + + virtual bool isMultiKey() const { return _mps->hasMultiKey(); } + virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); } + // return -1 if we're a getmore handoff + virtual long long nscanned() { return _nscanned >= 0 ? _nscanned + _c->nscanned() : _nscanned; } // just for testing shared_ptr< Cursor > sub_c() const { return _c; } private: @@ -377,8 +408,12 @@ namespace mongo { virtual bool mayRecordPlan() const { return false; } virtual QueryOp *_createChild() const { return new NoOp(); } virtual shared_ptr< Cursor > newCursor() const { return qp().newCursor(); } + virtual long long nscanned() { assert( false ); return 0; } }; void nextClause() { + if ( _nscanned >= 0 && _c.get() ) { + _nscanned += _c->nscanned(); + } shared_ptr< CursorOp > best = _mps->runOpOnce( *_op ); if ( ! best->complete() ) throw MsgAssertionException( best->exception() ); @@ -390,12 +425,13 @@ namespace mongo { shared_ptr< Cursor > _c; auto_ptr< MultiPlanScanner > _mps; shared_ptr< CoveredIndexMatcher > _matcher; + long long _nscanned; }; - + // NOTE min, max, and keyPattern will be updated to be consistent with the selected index. IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ); - inline bool isSimpleIdQuery( const BSONObj& query ){ + inline bool isSimpleIdQuery( const BSONObj& query ) { BSONObjIterator i(query); if( !i.more() ) return false; BSONElement e = i.next(); @@ -403,14 +439,16 @@ namespace mongo { if( strcmp("_id", e.fieldName()) != 0 ) return false; return e.isSimpleType(); // e.g. not something like { _id : { $gt : ... 
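MultiCursor::nscanned() above folds a banked per-clause total into the live cursor's count, with -1 reserved for a getMore handoff where the earlier total is unknown. A hedged standalone sketch of that accumulation follows; MultiCursorCounter and ClauseCursor are invented names for illustration.

// Sketch of accumulating a scan counter across $or clause handoffs, with -1
// as a sentinel for "unknown" (illustrative stand-in types only).
#include <iostream>

struct ClauseCursor { long long scanned; };

class MultiCursorCounter {
public:
    explicit MultiCursorCounter(bool getMoreHandoff) : _nscanned(getMoreHandoff ? -1 : 0) {}
    // Called when moving to the next clause: bank the finished clause's count.
    void nextClause(long long finishedClauseScanned) {
        if (_nscanned >= 0)
            _nscanned += finishedClauseScanned;
    }
    // Total = banked clauses plus the live cursor, unless we are a handoff.
    long long nscanned(const ClauseCursor& current) const {
        return _nscanned >= 0 ? _nscanned + current.scanned : _nscanned;
    }
private:
    long long _nscanned;
};

int main() {
    MultiCursorCounter counter(false);
    counter.nextClause(10);                       // first $or clause scanned 10
    ClauseCursor live{4};                         // current clause so far
    std::cout << counter.nscanned(live) << "\n";  // 14
}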
} - + // matcher() will always work on the returned cursor inline shared_ptr< Cursor > bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ) { if( !query.getField( "$or" ).eoo() ) { return shared_ptr< Cursor >( new MultiCursor( ns, query, sort ) ); - } else { + } + else { auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns, query ) ); - shared_ptr< Cursor > ret = QueryPlanSet( ns, frs, query, sort ).getBestGuess()->newCursor(); + auto_ptr< FieldRangeSet > origFrs( new FieldRangeSet( *frs ) ); + shared_ptr< Cursor > ret = QueryPlanSet( ns, frs, origFrs, query, sort ).getBestGuess()->newCursor(); if ( !query.isEmpty() ) { shared_ptr< CoveredIndexMatcher > matcher( new CoveredIndexMatcher( query, ret->indexKeyPattern() ) ); ret->setMatcher( matcher ); @@ -418,5 +456,5 @@ namespace mongo { return ret; } } - + } // namespace mongo diff --git a/db/queryutil.cpp b/db/queryutil.cpp index 2153046..1cd750b 100644 --- a/db/queryutil.cpp +++ b/db/queryutil.cpp @@ -23,111 +23,119 @@ #include "queryoptimizer.h" #include "../util/unittest.h" #include "dbmessage.h" +#include "indexkey.h" namespace mongo { extern BSONObj staticNull; - + /** returns a string that when used as a matcher, would match a super set of regex() returns "" for complex regular expressions used to optimize queries in some simple regex cases that start with '^' if purePrefix != NULL, sets it to whether the regex can be converted to a range query */ - string simpleRegex(const char* regex, const char* flags, bool* purePrefix){ + string simpleRegex(const char* regex, const char* flags, bool* purePrefix) { string r = ""; if (purePrefix) *purePrefix = false; bool multilineOK; - if ( regex[0] == '\\' && regex[1] == 'A'){ + if ( regex[0] == '\\' && regex[1] == 'A') { multilineOK = true; regex += 2; - } else if (regex[0] == '^') { + } + else if (regex[0] == '^') { multilineOK = false; regex += 1; - } else { + } + else { return r; } bool extended = false; - while (*flags){ - switch (*(flags++)){ - case 'm': // multiline - if (multilineOK) - continue; - else - return r; - case 'x': // extended - extended = true; - break; - default: - return r; // cant use index + while (*flags) { + switch (*(flags++)) { + case 'm': // multiline + if (multilineOK) + continue; + else + return r; + case 'x': // extended + extended = true; + break; + default: + return r; // cant use index } } stringstream ss; - while(*regex){ + while(*regex) { char c = *(regex++); - if ( c == '*' || c == '?' ){ + if ( c == '*' || c == '?' 
) { // These are the only two symbols that make the last char optional r = ss.str(); r = r.substr( 0 , r.size() - 1 ); return r; //breaking here fails with /^a?/ - } else if (c == '\\'){ + } + else if (c == '\\') { // slash followed by non-alphanumeric represents the following char c = *(regex++); if ((c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - (c >= '0' && c <= '0') || - (c == '\0')) - { + (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '0') || + (c == '\0')) { r = ss.str(); break; - } else { + } + else { ss << c; } - } else if (strchr("^$.[|()+{", c)){ + } + else if (strchr("^$.[|()+{", c)) { // list of "metacharacters" from man pcrepattern r = ss.str(); break; - } else if (extended && c == '#'){ + } + else if (extended && c == '#') { // comment r = ss.str(); break; - } else if (extended && isspace(c)){ + } + else if (extended && isspace(c)) { continue; - } else { + } + else { // self-matching char ss << c; } } - if ( r.empty() && *regex == 0 ){ + if ( r.empty() && *regex == 0 ) { r = ss.str(); if (purePrefix) *purePrefix = !r.empty(); } return r; } - inline string simpleRegex(const BSONElement& e){ - switch(e.type()){ - case RegEx: - return simpleRegex(e.regex(), e.regexFlags()); - case Object:{ - BSONObj o = e.embeddedObject(); - return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe()); - } - default: assert(false); return ""; //return squashes compiler warning + inline string simpleRegex(const BSONElement& e) { + switch(e.type()) { + case RegEx: + return simpleRegex(e.regex(), e.regexFlags()); + case Object: { + BSONObj o = e.embeddedObject(); + return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe()); + } + default: assert(false); return ""; //return squashes compiler warning } } string simpleRegexEnd( string regex ) { ++regex[ regex.length() - 1 ]; return regex; - } - - + } + + FieldRange::FieldRange( const BSONElement &e, bool isNot, bool optimize ) { // NOTE with $not, we could potentially form a complementary set of intervals. if ( !isNot && !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) { @@ -139,7 +147,8 @@ namespace mongo { BSONElement ie = i.next(); if ( ie.type() == RegEx ) { regexes.push_back( FieldRange( ie, false, optimize ) ); - } else { + } + else { vals.insert( ie ); } } @@ -149,22 +158,22 @@ namespace mongo { for( vector< FieldRange >::const_iterator i = regexes.begin(); i != regexes.end(); ++i ) *this |= *i; - + return; } - - if ( e.type() == Array && e.getGtLtOp() == BSONObj::Equality ){ - + + if ( e.type() == Array && e.getGtLtOp() == BSONObj::Equality ) { + _intervals.push_back( FieldInterval(e) ); - + const BSONElement& temp = e.embeddedObject().firstElement(); - if ( ! temp.eoo() ){ + if ( ! 
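simpleRegex and simpleRegexEnd above convert an anchored pattern such as /^foo/ into the index range ["foo", "fop"), where the exclusive upper bound is the prefix with its last byte incremented. A minimal sketch of that prefix-to-range conversion, assuming a non-empty ASCII prefix:

// Sketch of turning an anchored regex prefix into an index range; the real
// code also validates flags and escape sequences before trusting the prefix.
#include <iostream>
#include <string>
#include <utility>

std::pair<std::string, std::string> prefixRange(std::string prefix) {
    // Assumes prefix is non-empty; "foo" -> exclusive upper bound "fop".
    std::string upper = prefix;
    ++upper[upper.size() - 1];
    return { std::move(prefix), std::move(upper) };
}

int main() {
    auto range = prefixRange("foo");
    std::cout << "[" << range.first << ", " << range.second << ")\n";
}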
temp.eoo() ) { if ( temp < e ) _intervals.insert( _intervals.begin() , temp ); else _intervals.push_back( FieldInterval(temp) ); } - + return; } @@ -181,17 +190,19 @@ namespace mongo { if ( e.eoo() ) return; + int op = e.getGtLtOp(); if ( e.type() == RegEx - || (e.type() == Object && !e.embeddedObject()["$regex"].eoo()) - ) - { + || (e.type() == Object && !e.embeddedObject()["$regex"].eoo()) + ) { + uassert( 13454, "invalid regular expression operator", op == BSONObj::Equality || op == BSONObj::opREGEX ); if ( !isNot ) { // no optimization for negated regex - we could consider creating 2 intervals comprising all nonmatching prefixes const string r = simpleRegex(e); if ( r.size() ) { lower = addObj( BSON( "" << r ) ).firstElement(); upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement(); upperInclusive = false; - } else { + } + else { BSONObjBuilder b1(32), b2(32); b1.appendMinForType( "" , String ); lower = addObj( b1.obj() ).firstElement(); @@ -202,10 +213,11 @@ namespace mongo { } // regex matches self - regex type > string type - if (e.type() == RegEx){ + if (e.type() == RegEx) { BSONElement re = addObj( BSON( "" << e ) ).firstElement(); _intervals.push_back( FieldInterval(re) ); - } else { + } + else { BSONObj orig = e.embeddedObject(); BSONObjBuilder b; b.appendRegex("", orig["$regex"].valuestrsafe(), orig["$options"].valuestrsafe()); @@ -216,38 +228,53 @@ namespace mongo { } return; } - int op = e.getGtLtOp(); if ( isNot ) { switch( op ) { - case BSONObj::Equality: - case BSONObj::opALL: - case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in) - case BSONObj::opTYPE: - op = BSONObj::NE; // no bound calculation - break; - case BSONObj::NE: - op = BSONObj::Equality; - break; - case BSONObj::LT: - op = BSONObj::GTE; - break; - case BSONObj::LTE: - op = BSONObj::GT; - break; - case BSONObj::GT: - op = BSONObj::LTE; - break; - case BSONObj::GTE: - op = BSONObj::LT; - break; - default: // otherwise doesn't matter - break; + case BSONObj::Equality: + return; +// op = BSONObj::NE; +// break; + case BSONObj::opALL: + case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in) + case BSONObj::opTYPE: + // no bound calculation + return; + case BSONObj::NE: + op = BSONObj::Equality; + break; + case BSONObj::LT: + op = BSONObj::GTE; + break; + case BSONObj::LTE: + op = BSONObj::GT; + break; + case BSONObj::GT: + op = BSONObj::LTE; + break; + case BSONObj::GTE: + op = BSONObj::LT; + break; + default: // otherwise doesn't matter + break; } } switch( op ) { case BSONObj::Equality: lower = upper = e; break; + case BSONObj::NE: { + // this will invalidate the upper/lower references above + _intervals.push_back( FieldInterval() ); + // optimize doesn't make sense for negative ranges + _intervals[ 0 ]._upper._bound = e; + _intervals[ 0 ]._upper._inclusive = false; + _intervals[ 1 ]._lower._bound = e; + _intervals[ 1 ]._lower._inclusive = false; + _intervals[ 1 ]._upper._bound = maxKey.firstElement(); + _intervals[ 1 ]._upper._inclusive = true; + optimize = false; // don't run optimize code below + break; + } case BSONObj::LT: upperInclusive = false; case BSONObj::LTE: @@ -262,9 +289,9 @@ namespace mongo { massert( 10370 , "$all requires array", e.type() == Array ); BSONObjIterator i( e.embeddedObject() ); bool bound = false; - while ( i.more() ){ + while ( i.more() ) { 
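The new $ne handling above builds two open-ended intervals around the excluded value, roughly (MinKey, v) and (v, MaxKey), instead of leaving the bound calculation to a full scan. The standalone illustration below uses doubles with +/-infinity in place of BSON values and MinKey/MaxKey:

// Standalone illustration of the $ne bound construction: two intervals that
// exclude only the value itself (doubles are stand-ins for BSON values).
#include <iostream>
#include <limits>
#include <vector>

struct Interval {
    double lower, upper;
    bool lowerInclusive, upperInclusive;
};

std::vector<Interval> neIntervals(double v) {
    const double kMin = -std::numeric_limits<double>::infinity();  // ~ MinKey
    const double kMax =  std::numeric_limits<double>::infinity();  // ~ MaxKey
    return {
        { kMin, v, true,  false },   // everything strictly below v
        { v, kMax, false, true  },   // everything strictly above v
    };
}

int main() {
    for (const auto& i : neIntervals(7)) {
        std::cout << (i.lowerInclusive ? "[" : "(") << i.lower << ", "
                  << i.upper << (i.upperInclusive ? "]" : ")") << "\n";
    }
}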
BSONElement x = i.next(); - if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ){ + if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) { // taken care of elsewhere } else if ( x.type() != RegEx ) { @@ -299,7 +326,7 @@ namespace mongo { BSONObjBuilder b; b.appendMaxForType( "" , NumberDouble ); upper = addObj( b.obj() ).firstElement(); - } + } break; } case BSONObj::opTYPE: { @@ -314,7 +341,7 @@ namespace mongo { b.appendMaxForType( "" , t ); upper = addObj( b.obj() ).firstElement(); } - + break; } case BSONObj::opREGEX: @@ -332,14 +359,14 @@ namespace mongo { default: break; } - - if ( optimize ){ - if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ){ // TODO: get rid of isSimpleType + + if ( optimize ) { + if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ) { // TODO: get rid of isSimpleType BSONObjBuilder b; b.appendMaxForType( lower.fieldName() , lower.type() ); upper = addObj( b.obj() ).firstElement(); } - else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ){ // TODO: get rid of isSimpleType + else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ) { // TODO: get rid of isSimpleType BSONObjBuilder b; b.appendMinForType( upper.fieldName() , upper.type() ); lower = addObj( b.obj() ).firstElement(); @@ -355,7 +382,7 @@ namespace mongo { if ( _special.size() == 0 && other._special.size() ) _special = other._special; } - + // as called, these functions find the max/min of a bound in the // opposite direction, so inclusive bounds are considered less // superlative @@ -378,41 +405,46 @@ namespace mongo { result._upper = minFieldBound( one._upper, two._upper ); return result.strictValid(); } - - // NOTE Not yet tested for complex $or bounds, just for simple bounds generated by $in + const FieldRange &FieldRange::operator&=( const FieldRange &other ) { vector< FieldInterval > newIntervals; vector< FieldInterval >::const_iterator i = _intervals.begin(); vector< FieldInterval >::const_iterator j = other._intervals.begin(); while( i != _intervals.end() && j != other._intervals.end() ) { FieldInterval overlap; - if ( fieldIntervalOverlap( *i, *j, overlap ) ) + if ( fieldIntervalOverlap( *i, *j, overlap ) ) { newIntervals.push_back( overlap ); - if ( i->_upper == minFieldBound( i->_upper, j->_upper ) ) + } + if ( i->_upper == minFieldBound( i->_upper, j->_upper ) ) { ++i; - else - ++j; + } + else { + ++j; + } } finishOperation( newIntervals, other ); return *this; } - + void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector< FieldInterval > &newIntervals ) { if ( low._bound.eoo() ) { low = lower._lower; high = lower._upper; - } else { - if ( high._bound.woCompare( lower._lower._bound, false ) < 0 ) { // when equal but neither inclusive, just assume they overlap, since current btree scanning code just as efficient either way + } + else { + int cmp = high._bound.woCompare( lower._lower._bound, false ); + if ( ( cmp < 0 ) || ( cmp == 0 && !high._inclusive && !lower._lower._inclusive ) ) { FieldInterval tmp; tmp._lower = low; tmp._upper = high; newIntervals.push_back( tmp ); - low = lower._lower; high = lower._upper; - } else { + low = lower._lower; high = lower._upper; + } + else { high = lower._upper; } - } + } } - + const FieldRange &FieldRange::operator|=( const FieldRange &other ) { vector< FieldInterval > newIntervals; FieldBound low; @@ -424,90 
+456,107 @@ namespace mongo { if ( ( cmp == 0 && i->_lower._inclusive ) || cmp < 0 ) { handleInterval( *i, low, high, newIntervals ); ++i; - } else { + } + else { handleInterval( *j, low, high, newIntervals ); ++j; - } + } } while( i != _intervals.end() ) { handleInterval( *i, low, high, newIntervals ); - ++i; + ++i; } while( j != other._intervals.end() ) { handleInterval( *j, low, high, newIntervals ); - ++j; + ++j; } FieldInterval tmp; tmp._lower = low; tmp._upper = high; - newIntervals.push_back( tmp ); + newIntervals.push_back( tmp ); finishOperation( newIntervals, other ); - return *this; + return *this; } - + const FieldRange &FieldRange::operator-=( const FieldRange &other ) { + vector< FieldInterval > newIntervals; vector< FieldInterval >::iterator i = _intervals.begin(); vector< FieldInterval >::const_iterator j = other._intervals.begin(); while( i != _intervals.end() && j != other._intervals.end() ) { int cmp = i->_lower._bound.woCompare( j->_lower._bound, false ); if ( cmp < 0 || - ( cmp == 0 && i->_lower._inclusive && !j->_lower._inclusive ) ) { + ( cmp == 0 && i->_lower._inclusive && !j->_lower._inclusive ) ) { int cmp2 = i->_upper._bound.woCompare( j->_lower._bound, false ); if ( cmp2 < 0 ) { + newIntervals.push_back( *i ); ++i; - } else if ( cmp2 == 0 ) { - if ( i->_upper._inclusive && j->_lower._inclusive ) { - i->_upper._inclusive = false; + } + else if ( cmp2 == 0 ) { + newIntervals.push_back( *i ); + if ( newIntervals.back()._upper._inclusive && j->_lower._inclusive ) { + newIntervals.back()._upper._inclusive = false; } ++i; - } else { + } + else { + newIntervals.push_back( *i ); + newIntervals.back()._upper = j->_lower; + newIntervals.back()._upper.flipInclusive(); int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false ); if ( cmp3 < 0 || - ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) { - i->_upper = j->_lower; - i->_upper.flipInclusive(); + ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) { ++i; - } else { + } + else { + i->_lower = j->_upper; + i->_lower.flipInclusive(); ++j; } } - } else { + } + else { int cmp2 = i->_lower._bound.woCompare( j->_upper._bound, false ); if ( cmp2 > 0 || - ( cmp2 == 0 && ( !i->_lower._inclusive || !j->_lower._inclusive ) ) ) { + ( cmp2 == 0 && ( !i->_lower._inclusive || !j->_upper._inclusive ) ) ) { ++j; - } else { + } + else { int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false ); if ( cmp3 < 0 || - ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) { - i = _intervals.erase( i ); - } else { + ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) { + ++i; + } + else { i->_lower = j->_upper; - i->_lower.flipInclusive(); + i->_lower.flipInclusive(); ++j; } - } + } } } - finishOperation( _intervals, other ); - return *this; + while( i != _intervals.end() ) { + newIntervals.push_back( *i ); + ++i; + } + finishOperation( newIntervals, other ); + return *this; } - + // TODO write a proper implementation that doesn't do a full copy bool FieldRange::operator<=( const FieldRange &other ) { FieldRange temp = *this; temp -= other; return temp.empty(); } - + BSONObj FieldRange::addObj( const BSONObj &o ) { _objData.push_back( o ); return o; } - + string FieldRangeSet::getSpecial() const { string s = ""; - for ( map::iterator i=_ranges.begin(); i!=_ranges.end(); i++ ){ + for ( map::iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) { if ( i->second.getSpecial().size() == 0 ) continue; uassert( 13033 , "can't have 2 special fields" , s.size() == 0 ); @@ 
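FieldRange::operator-= above now rebuilds the interval list, clipping or splitting each interval of *this against the intervals of other rather than editing in place. The sketch below shows the same subtraction over sorted, disjoint half-open [lo, hi) intervals on doubles; the real code additionally tracks inclusive/exclusive flags on each bound.

// Simplified sketch of subtracting one sorted list of disjoint intervals
// from another, using half-open [lo, hi) intervals to sidestep inclusivity.
#include <algorithm>
#include <iostream>
#include <vector>

struct Interval { double lo, hi; };  // represents [lo, hi)

std::vector<Interval> subtract(const std::vector<Interval>& a,
                               const std::vector<Interval>& b) {
    std::vector<Interval> out;
    for (Interval cur : a) {
        for (const Interval& cut : b) {           // b assumed sorted, disjoint
            if (cut.hi <= cur.lo || cut.lo >= cur.hi)
                continue;                          // no overlap with this piece
            if (cut.lo > cur.lo)
                out.push_back({cur.lo, cut.lo});   // keep the part before the cut
            cur.lo = std::max(cur.lo, cut.hi);     // drop the covered part
            if (cur.lo >= cur.hi)
                break;                             // nothing of cur remains
        }
        if (cur.lo < cur.hi)
            out.push_back(cur);
    }
    return out;
}

int main() {
    // [1,10) minus { [3,4), [6,12) }  ->  [1,3) and [4,6)
    for (const auto& i : subtract({{1, 10}}, {{3, 4}, {6, 12}}))
        std::cout << "[" << i.lo << ", " << i.hi << ") ";
    std::cout << "\n";
}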
-533,34 +582,35 @@ namespace mongo { } if ( op2 == BSONObj::opELEM_MATCH ) { BSONObjIterator k( g.embeddedObjectUserCheck() ); - while ( k.more() ){ + while ( k.more() ) { BSONElement h = k.next(); StringBuilder buf(32); buf << fieldName << "." << h.fieldName(); string fullname = buf.str(); - + int op3 = getGtLtOp( h ); - if ( op3 == BSONObj::Equality ){ + if ( op3 == BSONObj::Equality ) { _ranges[ fullname ] &= FieldRange( h , isNot , optimize ); } else { BSONObjIterator l( h.embeddedObject() ); - while ( l.more() ){ + while ( l.more() ) { _ranges[ fullname ] &= FieldRange( l.next() , isNot , optimize ); } } - } - } else { + } + } + else { _ranges[ fieldName ] &= FieldRange( f , isNot , optimize ); - } + } } - + void FieldRangeSet::processQueryField( const BSONElement &e, bool optimize ) { bool equality = ( getGtLtOp( e ) == BSONObj::Equality ); if ( equality && e.type() == Object ) { equality = ( strcmp( e.embeddedObject().firstElement().fieldName(), "$not" ) != 0 ); } - + if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) { _ranges[ e.fieldName() ] &= FieldRange( e , false , optimize ); } @@ -570,67 +620,69 @@ namespace mongo { BSONElement f = j.next(); if ( strcmp( f.fieldName(), "$not" ) == 0 ) { switch( f.type() ) { - case Object: { - BSONObjIterator k( f.embeddedObject() ); - while( k.more() ) { - BSONElement g = k.next(); - uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality ); - processOpElement( e.fieldName(), g, true, optimize ); - } - break; + case Object: { + BSONObjIterator k( f.embeddedObject() ); + while( k.more() ) { + BSONElement g = k.next(); + uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality ); + processOpElement( e.fieldName(), g, true, optimize ); } - case RegEx: - processOpElement( e.fieldName(), f, true, optimize ); - break; - default: - uassert( 13041, "invalid use of $not", false ); + break; } - } else { + case RegEx: + processOpElement( e.fieldName(), f, true, optimize ); + break; + default: + uassert( 13041, "invalid use of $not", false ); + } + } + else { processOpElement( e.fieldName(), f, false, optimize ); } - } - } + } + } } - + FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query , bool optimize ) : _ns( ns ), _queries( 1, query.getOwned() ) { - BSONObjIterator i( _queries[ 0 ] ); - - while( i.more() ) { - BSONElement e = i.next(); - // e could be x:1 or x:{$gt:1} - - if ( strcmp( e.fieldName(), "$where" ) == 0 ) { - continue; - } - - if ( strcmp( e.fieldName(), "$or" ) == 0 ) { - continue; - } - - if ( strcmp( e.fieldName(), "$nor" ) == 0 ) { - continue; - } - - processQueryField( e, optimize ); - } + BSONObjIterator i( _queries[ 0 ] ); + + while( i.more() ) { + BSONElement e = i.next(); + // e could be x:1 or x:{$gt:1} + + if ( strcmp( e.fieldName(), "$where" ) == 0 ) { + continue; + } + + if ( strcmp( e.fieldName(), "$or" ) == 0 ) { + continue; + } + + if ( strcmp( e.fieldName(), "$nor" ) == 0 ) { + continue; + } + + processQueryField( e, optimize ); } + } FieldRangeOrSet::FieldRangeOrSet( const char *ns, const BSONObj &query , bool optimize ) : _baseSet( ns, query, optimize ), _orFound() { BSONObjIterator i( _baseSet._queries[ 0 ] ); - + while( i.more() ) { BSONElement e = i.next(); - if ( strcmp( e.fieldName(), "$or" ) == 0 ) { - massert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 ); - BSONObjIterator j( e.embeddedObject() ); - while( j.more() ) { - BSONElement f = j.next(); - massert( 13263, "$or array must 
contain objects", f.type() == Object ); + if ( strcmp( e.fieldName(), "$or" ) == 0 ) { + massert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 ); + BSONObjIterator j( e.embeddedObject() ); + while( j.more() ) { + BSONElement f = j.next(); + massert( 13263, "$or array must contain objects", f.type() == Object ); _orSets.push_back( FieldRangeSet( ns, f.embeddedObject(), optimize ) ); massert( 13291, "$or may not contain 'special' query", _orSets.back().getSpecial().empty() ); + _originalOrSets.push_back( _orSets.back() ); } _orFound = true; continue; @@ -638,13 +690,41 @@ namespace mongo { } } + void FieldRangeOrSet::popOrClause( const BSONObj &indexSpec ) { + massert( 13274, "no or clause to pop", !orFinished() ); + auto_ptr< FieldRangeSet > holder; + FieldRangeSet *toDiff = &_originalOrSets.front(); + if ( toDiff->matchPossible() && !indexSpec.isEmpty() ) { + holder.reset( toDiff->subset( indexSpec ) ); + toDiff = holder.get(); + } + list< FieldRangeSet >::iterator i = _orSets.begin(); + list< FieldRangeSet >::iterator j = _originalOrSets.begin(); + ++i; + ++j; + while( i != _orSets.end() ) { + *i -= *toDiff; + if( !i->matchPossible() ) { + i = _orSets.erase( i ); + j = _originalOrSets.erase( j ); + } + else { + ++i; + ++j; + } + } + _oldOrSets.push_front( _orSets.front() ); + _orSets.pop_front(); + _originalOrSets.pop_front(); + } + FieldRange *FieldRangeSet::trivialRange_ = 0; FieldRange &FieldRangeSet::trivialRange() { if ( trivialRange_ == 0 ) trivialRange_ = new FieldRange(); return *trivialRange_; } - + BSONObj FieldRangeSet::simplifiedQuery( const BSONObj &_fields ) const { BSONObj fields = _fields; if ( fields.isEmpty() ) { @@ -676,14 +756,15 @@ namespace mongo { } return b.obj(); } - + QueryPattern FieldRangeSet::pattern( const BSONObj &sort ) const { QueryPattern qp; for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { assert( !i->second.empty() ); if ( i->second.equality() ) { qp._fieldTypes[ i->first ] = QueryPattern::Equality; - } else if ( i->second.nontrivial() ) { + } + else if ( i->second.nontrivial() ) { bool upper = i->second.max().type() != MaxKey; bool lower = i->second.min().type() != MinKey; if ( upper && lower ) @@ -691,18 +772,18 @@ namespace mongo { else if ( upper ) qp._fieldTypes[ i->first ] = QueryPattern::UpperBound; else if ( lower ) - qp._fieldTypes[ i->first ] = QueryPattern::LowerBound; + qp._fieldTypes[ i->first ] = QueryPattern::LowerBound; } } qp.setSort( sort ); return qp; } - + // TODO get rid of this BoundList FieldRangeSet::indexBounds( const BSONObj &keyPattern, int direction ) const { typedef vector< pair< shared_ptr< BSONObjBuilder >, shared_ptr< BSONObjBuilder > > > BoundBuilders; BoundBuilders builders; - builders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) ); + builders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) ); BSONObjIterator i( keyPattern ); bool ineq = false; // until ineq is true, we are just dealing with equality and $in bounds while( i.more() ) { @@ -716,7 +797,8 @@ namespace mongo { j->first->appendAs( fr.min(), "" ); j->second->appendAs( fr.min(), "" ); } - } else { + } + else { if ( !fr.inQuery() ) { ineq = true; } @@ -725,18 +807,21 @@ namespace mongo { for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) { BSONObj first = i->first->obj(); 
BSONObj second = i->second->obj(); + + const unsigned maxCombinations = 4000000; if ( forward ) { for( vector< FieldInterval >::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) { - uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < 1000000 ); + uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations ); newBuilders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) ); newBuilders.back().first->appendElements( first ); newBuilders.back().second->appendElements( second ); newBuilders.back().first->appendAs( j->_lower._bound, "" ); newBuilders.back().second->appendAs( j->_upper._bound, "" ); } - } else { + } + else { for( vector< FieldInterval >::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) { - uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < 1000000 ); + uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations ); newBuilders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) ); newBuilders.back().first->appendElements( first ); newBuilders.back().second->appendElements( second ); @@ -747,7 +832,8 @@ namespace mongo { } builders = newBuilders; } - } else { + } + else { for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) { j->first->appendAs( forward ? fr.min() : fr.max(), "" ); j->second->appendAs( forward ? fr.max() : fr.min(), "" ); @@ -758,204 +844,45 @@ namespace mongo { for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) ret.push_back( make_pair( i->first->obj(), i->second->obj() ) ); return ret; - } - - /////////////////// - // FieldMatcher // - /////////////////// - - void FieldMatcher::add( const BSONObj& o ){ - massert( 10371 , "can only add to FieldMatcher once", _source.isEmpty()); - _source = o; - - BSONObjIterator i( o ); - int true_false = -1; - while ( i.more() ){ - BSONElement e = i.next(); - - if (e.type() == Object){ - BSONObj obj = e.embeddedObject(); - BSONElement e2 = obj.firstElement(); - if ( strcmp(e2.fieldName(), "$slice") == 0 ){ - if (e2.isNumber()){ - int i = e2.numberInt(); - if (i < 0) - add(e.fieldName(), i, -i); // limit is now positive - else - add(e.fieldName(), 0, i); - - } else if (e2.type() == Array) { - BSONObj arr = e2.embeddedObject(); - uassert(13099, "$slice array wrong size", arr.nFields() == 2 ); - - BSONObjIterator it(arr); - int skip = it.next().numberInt(); - int limit = it.next().numberInt(); - uassert(13100, "$slice limit must be positive", limit > 0 ); - add(e.fieldName(), skip, limit); - - } else { - uassert(13098, "$slice only supports numbers and [skip, limit] arrays", false); - } - } else { - uassert(13097, string("Unsupported projection option: ") + obj.firstElement().fieldName(), false); - } - - } else if (!strcmp(e.fieldName(), "_id") && !e.trueValue()){ - _includeID = false; - - } else { - - add (e.fieldName(), e.trueValue()); - - // validate input - if (true_false == -1){ - true_false = e.trueValue(); - _include = !e.trueValue(); - } - else{ - uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." 
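Each $in list on a compound index multiplies the number of start/end bound pairs, which is why the partitioning cap above is now a named maxCombinations of 4,000,000 rather than the old hard-coded 1,000,000. A toy illustration of that cartesian growth and the guard:

// Toy illustration of how $in values on a compound index multiply into bound
// pairs, and of the combinatorial cap (values and strings are made up).
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

int main() {
    const std::vector<int> aVals = {1, 2, 3};      // { a: { $in: [1,2,3] } }
    const std::vector<int> bVals = {10, 20};       // { b: { $in: [10,20] } }
    const size_t maxCombinations = 4000000;        // cap from the patch

    std::vector<std::pair<std::string, std::string>> bounds;
    for (int a : aVals) {
        for (int b : bVals) {
            if (bounds.size() >= maxCombinations)
                throw std::runtime_error(
                    "combinatorial limit of $in partitioning exceeded");
            std::string point = "{a:" + std::to_string(a) +
                                ",b:" + std::to_string(b) + "}";
            bounds.emplace_back(point, point);     // point interval per combination
        }
    }
    std::cout << bounds.size() << " bound pairs\n"; // 6
}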
, - (bool)true_false == e.trueValue() ); - } - } - } - } - - void FieldMatcher::add(const string& field, bool include){ - if (field.empty()){ // this is the field the user referred to - _include = include; - } else { - _include = !include; - - const size_t dot = field.find('.'); - const string subfield = field.substr(0,dot); - const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos)); - - boost::shared_ptr& fm = _fields[subfield]; - if (!fm) - fm.reset(new FieldMatcher()); - - fm->add(rest, include); - } - } - - void FieldMatcher::add(const string& field, int skip, int limit){ - _special = true; // can't include or exclude whole object - - if (field.empty()){ // this is the field the user referred to - _skip = skip; - _limit = limit; - } else { - const size_t dot = field.find('.'); - const string subfield = field.substr(0,dot); - const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos)); - - boost::shared_ptr& fm = _fields[subfield]; - if (!fm) - fm.reset(new FieldMatcher()); - - fm->add(rest, skip, limit); - } } - BSONObj FieldMatcher::getSpec() const{ - return _source; - } - - //b will be the value part of an array-typed BSONElement - void FieldMatcher::appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested) const { - int skip = nested ? 0 : _skip; - int limit = nested ? -1 : _limit; - - if (skip < 0){ - skip = max(0, skip + a.nFields()); - } - - int i=0; - BSONObjIterator it(a); - while (it.more()){ - BSONElement e = it.next(); - - if (skip){ - skip--; - continue; - } - - if (limit != -1 && (limit-- == 0)){ - break; - } - - switch(e.type()){ - case Array:{ - BSONObjBuilder subb; - appendArray(subb , e.embeddedObject(), true); - b.appendArray(b.numStr(i++), subb.obj()); - break; - } - case Object:{ - BSONObjBuilder subb; - BSONObjIterator jt(e.embeddedObject()); - while (jt.more()){ - append(subb , jt.next()); - } - b.append(b.numStr(i++), subb.obj()); - break; - } - default: - if (_include) - b.appendAs(e, b.numStr(i++)); + FieldRangeSet *FieldRangeSet::subset( const BSONObj &fields ) const { + FieldRangeSet *ret = new FieldRangeSet( _ns, BSONObj() ); + BSONObjIterator i( fields ); + while( i.more() ) { + BSONElement e = i.next(); + if ( _ranges[ e.fieldName() ].nontrivial() ) { + ret->_ranges[ e.fieldName() ] = _ranges[ e.fieldName() ]; } } + ret->_queries = _queries; + return ret; } - void FieldMatcher::append( BSONObjBuilder& b , const BSONElement& e ) const { - FieldMap::const_iterator field = _fields.find( e.fieldName() ); - - if (field == _fields.end()){ - if (_include) - b.append(e); - } - else { - FieldMatcher& subfm = *field->second; - - if ((subfm._fields.empty() && !subfm._special) || !(e.type()==Object || e.type()==Array) ){ - if (subfm._include) - b.append(e); - } - else if (e.type() == Object){ - BSONObjBuilder subb; - BSONObjIterator it(e.embeddedObject()); - while (it.more()){ - subfm.append(subb, it.next()); - } - b.append(e.fieldName(), subb.obj()); - - } - else { //Array - BSONObjBuilder subb; - subfm.appendArray(subb, e.embeddedObject()); - b.appendArray(e.fieldName(), subb.obj()); - } - } - } - bool FieldRangeVector::matchesElement( const BSONElement &e, int i, bool forward ) const { - int l = matchingLowElement( e, i, forward ); - return ( l % 2 == 0 ); // if we're inside an interval + bool eq; + int l = matchingLowElement( e, i, forward, eq ); + return ( l % 2 == 0 ); // if we're inside an interval } - + // binary search for interval containing the specified element // an even return value indicates that the 
element is contained within a valid interval - int FieldRangeVector::matchingLowElement( const BSONElement &e, int i, bool forward ) const { + int FieldRangeVector::matchingLowElement( const BSONElement &e, int i, bool forward, bool &lowEquality ) const { + lowEquality = false; int l = -1; int h = _ranges[ i ].intervals().size() * 2; while( l + 1 < h ) { int m = ( l + h ) / 2; BSONElement toCmp; + bool toCmpInclusive; + const FieldInterval &interval = _ranges[ i ].intervals()[ m / 2 ]; if ( m % 2 == 0 ) { - toCmp = _ranges[ i ].intervals()[ m / 2 ]._lower._bound; - } else { - toCmp = _ranges[ i ].intervals()[ m / 2 ]._upper._bound; + toCmp = interval._lower._bound; + toCmpInclusive = interval._lower._inclusive; + } + else { + toCmp = interval._upper._bound; + toCmpInclusive = interval._upper._inclusive; } int cmp = toCmp.woCompare( e, false ); if ( !forward ) { @@ -963,41 +890,60 @@ namespace mongo { } if ( cmp < 0 ) { l = m; - } else if ( cmp > 0 ) { + } + else if ( cmp > 0 ) { h = m; - } else { - return ( m % 2 == 0 ) ? m : m - 1; + } + else { + if ( m % 2 == 0 ) { + lowEquality = true; + } + int ret = m; + // if left match and inclusive, all good + // if left match and not inclusive, return right before left bound + // if right match and inclusive, return left bound + // if right match and not inclusive, return right bound + if ( ( m % 2 == 0 && !toCmpInclusive ) || ( m % 2 == 1 && toCmpInclusive ) ) { + --ret; + } + return ret; } } assert( l + 1 == h ); return l; } - + bool FieldRangeVector::matches( const BSONObj &obj ) const { - BSONObjIterator k( _keyPattern ); - for( int i = 0; i < (int)_ranges.size(); ++i ) { - if ( _ranges[ i ].empty() ) { - return false; - } - BSONElement kk = k.next(); - int number = (int) kk.number(); - bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0; - BSONElementSet keys; - obj.getFieldsDotted( kk.fieldName(), keys ); - bool match = false; - for( BSONElementSet::const_iterator j = keys.begin(); j != keys.end(); ++j ) { - if ( matchesElement( *j, i, forward ) ) { - match = true; + if ( !_indexSpec.get() ) { + _indexSpec.reset( new IndexSpec( _keyPattern ) ); + } + // TODO The representation of matching keys could potentially be optimized + // more for the case at hand. (For example, we can potentially consider + // fields individually instead of constructing several bson objects using + // multikey arrays.) But getKeys() canonically defines the key set for a + // given object and for now we are using it as is. + BSONObjSetDefaultOrder keys; + _indexSpec->getKeys( obj, keys ); + for( BSONObjSetDefaultOrder::const_iterator i = keys.begin(); i != keys.end(); ++i ) { + BSONObjIterator j( *i ); + BSONObjIterator k( _keyPattern ); + bool match = true; + for( int l = 0; l < (int)_ranges.size(); ++l ) { + int number = (int) k.next().number(); + bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0; + if ( !matchesElement( j.next(), l, forward ) ) { + match = false; break; } } - if ( !match ) { - return false; + if ( match ) { + // The *i key matched a valid range for every element. 
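FieldRangeVector::matches above now asks IndexSpec::getKeys for every key the document would generate (which handles multikey arrays) and accepts the document if any one generated key lies inside every field's interval list. A simplified standalone version of that check over integer keys and inclusive ranges:

// Simplified "does any generated key satisfy every field's intervals" check;
// keys are vectors of ints, one entry per index field (illustrative only).
#include <iostream>
#include <vector>

struct Range { int lo, hi; };  // inclusive

bool inRanges(int v, const std::vector<Range>& ranges) {
    for (const Range& r : ranges)
        if (v >= r.lo && v <= r.hi) return true;
    return false;
}

bool matches(const std::vector<std::vector<int>>& generatedKeys,
             const std::vector<std::vector<Range>>& fieldRanges) {
    for (const auto& key : generatedKeys) {        // e.g. one key per array element
        bool ok = true;
        for (size_t f = 0; f < fieldRanges.size(); ++f)
            if (!inRanges(key[f], fieldRanges[f])) { ok = false; break; }
        if (ok) return true;                       // one fully matching key suffices
    }
    return false;
}

int main() {
    // Index { a: 1, b: 1 }, document { a: [1, 9], b: 5 } -> keys {1,5} and {9,5}.
    std::vector<std::vector<int>> keys = {{1, 5}, {9, 5}};
    std::vector<std::vector<Range>> ranges = {{{8, 10}}, {{0, 6}}};
    std::cout << (matches(keys, ranges) ? "matches" : "no match") << "\n";
}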
+ return true; } } - return true; + return false; } - + // TODO optimize more int FieldRangeVector::Iterator::advance( const BSONObj &curr ) { BSONObjIterator j( curr ); @@ -1009,7 +955,8 @@ namespace mongo { for( int i = 0; i < (int)_i.size(); ++i ) { if ( i > 0 && !_v._ranges[ i - 1 ].intervals()[ _i[ i - 1 ] ].equality() ) { // if last bound was inequality, we don't know anything about where we are for this field - // TODO if possible avoid this certain cases when field in prev key is the same + // TODO if possible avoid this certain cases when value in previous field of the previous + // key is the same as value of previous field in current key setMinus( i ); } bool eq = false; @@ -1017,20 +964,23 @@ namespace mongo { bool reverse = ( ( oo.number() < 0 ) ^ ( _v._direction < 0 ) ); BSONElement jj = j.next(); if ( _i[ i ] == -1 ) { // unknown position for this field, do binary search - int l = _v.matchingLowElement( jj, i, !reverse ); + bool lowEquality; + int l = _v.matchingLowElement( jj, i, !reverse, lowEquality ); if ( l % 2 == 0 ) { // we are in a valid range for this field _i[ i ] = l / 2; int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ]; if ( diff > 1 ) { latestNonEndpoint = i; - } else if ( diff == 1 ) { + } + else if ( diff == 1 ) { int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false ); if ( x != 0 ) { latestNonEndpoint = i; } } continue; - } else { // not in a valid range for this field - determine if and how to advance + } + else { // not in a valid range for this field - determine if and how to advance // check if we're after the last interval for this field if ( l == (int)_v._ranges[ i ].intervals().size() * 2 - 1 ) { if ( latestNonEndpoint == -1 ) { @@ -1038,18 +988,24 @@ namespace mongo { } setZero( latestNonEndpoint + 1 ); // skip to curr / latestNonEndpoint + 1 / superlative - for( int j = latestNonEndpoint + 1; j < (int)_i.size(); ++j ) { - _cmp[ j ] = _superlative[ j ]; - } - return latestNonEndpoint + 1; + _after = true; + return latestNonEndpoint + 1; } _i[ i ] = ( l + 1 ) / 2; + if ( lowEquality ) { + // skip to curr / i + 1 / superlative + _after = true; + return i + 1; + } // skip to curr / i / nextbounds _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound; + _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive; for( int j = i + 1; j < (int)_i.size(); ++j ) { _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound; + _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive; } - return i; + _after = false; + return i; } } bool first = true; @@ -1062,7 +1018,7 @@ namespace mongo { if ( reverse ) { x = -x; } - if ( x == 0 ) { + if ( x == 0 && _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._inclusive ) { eq = true; break; } @@ -1081,16 +1037,27 @@ namespace mongo { x = -x; } } + // if we're equal to and not inclusive the lower bound, advance + if ( ( x == 0 && !_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive ) ) { + setZero( i + 1 ); + // skip to curr / i + 1 / superlative + _after = true; + return i + 1; + } // if we're less than the lower bound, advance if ( x > 0 ) { setZero( i + 1 ); // skip to curr / i / nextbounds _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound; + _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive; for( int j = i + 1; j < (int)_i.size(); ++j ) { _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound; + _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive; } + _after = false; return i; - } else { 
+ } + else { break; } } @@ -1101,26 +1068,32 @@ namespace mongo { } int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ]; if ( diff > 1 || ( !eq && diff == 1 ) ) { - // check if we're not at the end of valid values for this field + // check if we're not at the end of valid values for this field latestNonEndpoint = i; - } else if ( diff == 0 ) { // check if we're past the last interval for this field + } + else if ( diff == 0 ) { // check if we're past the last interval for this field if ( latestNonEndpoint == -1 ) { return -2; } // more values possible, skip... setZero( latestNonEndpoint + 1 ); // skip to curr / latestNonEndpoint + 1 / superlative - for( int j = latestNonEndpoint + 1; j < (int)_i.size(); ++j ) { - _cmp[ j ] = _superlative[ j ]; - } + _after = true; return latestNonEndpoint + 1; } } - return -1; + return -1; } - + + void FieldRangeVector::Iterator::prepDive() { + for( int j = 0; j < (int)_i.size(); ++j ) { + _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound; + _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive; + } + } + struct SimpleRegexUnitTest : UnitTest { - void run(){ + void run() { { BSONObjBuilder b; b.appendRegex("r", "^foo"); @@ -1179,38 +1152,39 @@ namespace mongo { } simple_regex_unittest; - long long applySkipLimit( long long num , const BSONObj& cmd ){ + long long applySkipLimit( long long num , const BSONObj& cmd ) { BSONElement s = cmd["skip"]; BSONElement l = cmd["limit"]; - - if ( s.isNumber() ){ + + if ( s.isNumber() ) { num = num - s.numberLong(); if ( num < 0 ) { num = 0; } } - - if ( l.isNumber() ){ + + if ( l.isNumber() ) { long long limit = l.numberLong(); - if ( limit < num ){ + if ( limit < num ) { num = limit; } } - return num; + return num; } - string debugString( Message& m ){ + string debugString( Message& m ) { stringstream ss; ss << "op: " << opToString( m.operation() ) << " len: " << m.size(); - if ( m.operation() >= 2000 && m.operation() < 2100 ){ + if ( m.operation() >= 2000 && m.operation() < 2100 ) { DbMessage d(m); ss << " ns: " << d.getns(); - switch ( m.operation() ){ + switch ( m.operation() ) { case dbUpdate: { int flags = d.pullInt(); BSONObj q = d.nextJsObj(); - ss << " flags: " << flags << " query: " << q; + BSONObj o = d.nextJsObj(); + ss << " flags: " << flags << " query: " << q << " update: " << o; break; } case dbInsert: @@ -1225,10 +1199,10 @@ namespace mongo { default: ss << " CANNOT HANDLE YET"; } - - + + } return ss.str(); - } + } } // namespace mongo diff --git a/db/queryutil.h b/db/queryutil.h index 37dfa2a..2746695 100644 --- a/db/queryutil.h +++ b/db/queryutil.h @@ -26,7 +26,7 @@ namespace mongo { bool _inclusive; bool operator==( const FieldBound &other ) const { return _bound.woCompare( other._bound ) == 0 && - _inclusive == other._inclusive; + _inclusive == other._inclusive; } void flipInclusive() { _inclusive = !_inclusive; } }; @@ -59,8 +59,6 @@ namespace mongo { FieldRange( const BSONElement &e = BSONObj().firstElement() , bool isNot=false , bool optimize=true ); const FieldRange &operator&=( const FieldRange &other ); const FieldRange &operator|=( const FieldRange &other ); - // does not remove fully contained ranges (eg [1,3] - [2,2] doesn't remove anything) - // in future we can change so that an or on $in:[3] combined with $in:{$gt:2} doesn't scan 3 a second time const FieldRange &operator-=( const FieldRange &other ); // true iff other includes this bool operator<=( const FieldRange &other ); @@ -79,7 +77,7 @@ namespace mongo { if ( equality() ) { return true; } - for( 
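applySkipLimit above adjusts a count result for the skip and limit options: subtract skip (clamped at zero), then cap at limit. A small standalone restatement of that arithmetic, with explicit has-flags standing in for the BSONElement presence checks:

// Standalone restatement of the skip/limit adjustment on a count result.
#include <algorithm>
#include <iostream>

long long applySkipLimit(long long num, long long skip, long long limit,
                         bool hasSkip, bool hasLimit) {
    if (hasSkip)
        num = std::max<long long>(0, num - skip);   // skipped docs don't count
    if (hasLimit && limit < num)
        num = limit;                                // cap at the requested limit
    return num;
}

int main() {
    std::cout << applySkipLimit(100, 30, 50, true, true) << "\n";  // 50
    std::cout << applySkipLimit(100, 90, 50, true, true) << "\n";  // 10
}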
vector< FieldInterval >::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) { + for( vector< FieldInterval >::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) { if ( !i->equality() ) { return false; } @@ -88,13 +86,14 @@ namespace mongo { } bool nontrivial() const { return - ! empty() && - ( minKey.firstElement().woCompare( min(), false ) != 0 || + ! empty() && + ( _intervals.size() != 1 || + minKey.firstElement().woCompare( min(), false ) != 0 || maxKey.firstElement().woCompare( max(), false ) != 0 ); } bool empty() const { return _intervals.empty(); } void makeEmpty() { _intervals.clear(); } - const vector< FieldInterval > &intervals() const { return _intervals; } + const vector< FieldInterval > &intervals() const { return _intervals; } string getSpecial() const { return _special; } void setExclusiveBounds() { for( vector< FieldInterval >::iterator i = _intervals.begin(); i != _intervals.end(); ++i ) { @@ -122,7 +121,7 @@ namespace mongo { vector< BSONObj > _objData; string _special; }; - + // implements query pattern matching, used to determine if a query is // similar to an earlier query and should use the same plan class QueryPattern { @@ -193,8 +192,8 @@ namespace mongo { // the specified direction of traversal. For example, given a simple index {i:1} // and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList // would be valid for index {i:-1} with direction -1. - typedef vector< pair< BSONObj, BSONObj > > BoundList; - + typedef vector< pair< BSONObj, BSONObj > > BoundList; + // ranges of fields' value that may be determined from query -- used to // determine index limits class FieldRangeSet { @@ -210,19 +209,20 @@ namespace mongo { map< string, FieldRange >::const_iterator f = _ranges.find( fieldName ); if ( f == _ranges.end() ) return trivialRange(); - return f->second; + return f->second; } FieldRange &range( const char *fieldName ) { map< string, FieldRange >::iterator f = _ranges.find( fieldName ); if ( f == _ranges.end() ) return trivialRange(); - return f->second; + return f->second; } int nNontrivialRanges() const { int count = 0; - for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) + for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { if ( i->second.nontrivial() ) ++count; + } return count; } const char *ns() const { return _ns; } @@ -236,6 +236,18 @@ namespace mongo { } QueryPattern pattern( const BSONObj &sort = BSONObj() ) const; string getSpecial() const; + // Btree scanning for a multidimentional key range will yield a + // multidimensional box. The idea here is that if an 'other' + // multidimensional box contains the current box we don't have to scan + // the current box. If the 'other' box contains the current box in + // all dimensions but one, we can safely subtract the values of 'other' + // along that one dimension from the values for the current box on the + // same dimension. In other situations, subtracting the 'other' + // box from the current box yields a result that is not a box (but + // rather can be expressed as a union of boxes). We don't support + // such splitting currently in calculating index ranges. Note that + // where I have said 'box' above, I actually mean sets of boxes because + // a field range can consist of multiple intervals. 
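// Aside on the comment above (illustrative standalone sketch, not part of this
// patch): within a single field the subtraction stays representable because
// removing one interval from another leaves at most two intervals, whereas the
// difference of multi-field boxes is generally not a box.  Closed integer
// intervals stand in for FieldInterval here.
#include <iostream>
#include <vector>

struct Interval { int lo, hi; };   // closed interval [lo, hi], empty if lo > hi

// Subtract b from a in one dimension; the result is a union of at most two
// intervals, so a per-field interval list can still hold it.
std::vector<Interval> subtract(const Interval& a, const Interval& b) {
    std::vector<Interval> out;
    if (b.hi < a.lo || b.lo > a.hi) { out.push_back(a); return out; }  // no overlap
    if (a.lo < b.lo) out.push_back(Interval{a.lo, b.lo - 1});          // piece left of b
    if (b.hi < a.hi) out.push_back(Interval{b.hi + 1, a.hi});          // piece right of b
    return out;                                    // empty when b covers a entirely
}

int main() {
    for (const Interval& i : subtract(Interval{1, 10}, Interval{4, 6}))
        std::cout << '[' << i.lo << ',' << i.hi << "] ";               // [1,3] [7,10]
    std::cout << '\n';
    return 0;
}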
const FieldRangeSet &operator-=( const FieldRangeSet &other ) { int nUnincluded = 0; string unincludedKey; @@ -246,22 +258,25 @@ namespace mongo { if ( cmp == 0 ) { if ( i->second <= j->second ) { // nothing - } else { + } + else { ++nUnincluded; unincludedKey = i->first; } ++i; ++j; - } else if ( cmp < 0 ) { + } + else if ( cmp < 0 ) { ++i; - } else { + } + else { // other has a bound we don't, nothing can be done return *this; } } if ( j != other._ranges.end() ) { // other has a bound we don't, nothing can be done - return *this; + return *this; } if ( nUnincluded > 1 ) { return *this; @@ -284,27 +299,37 @@ namespace mongo { i->second &= j->second; ++i; ++j; - } else if ( cmp < 0 ) { + } + else if ( cmp < 0 ) { ++i; - } else { + } + else { _ranges[ j->first ] = j->second; ++j; } } while( j != other._ranges.end() ) { _ranges[ j->first ] = j->second; - ++j; + ++j; } appendQueries( other ); return *this; } // TODO get rid of this BoundList indexBounds( const BSONObj &keyPattern, int direction ) const; + + /** + * @param return - A new FieldRangeSet based on this FieldRangeSet, but with only + * a subset of the fields. + * @param fields - Only fields which are represented as field names in this object + * will be included in the returned FieldRangeSet. + */ + FieldRangeSet *subset( const BSONObj &fields ) const; private: void appendQueries( const FieldRangeSet &other ) { for( vector< BSONObj >::const_iterator i = other._queries.begin(); i != other._queries.end(); ++i ) { - _queries.push_back( *i ); - } + _queries.push_back( *i ); + } } void makeEmpty() { for( map< string, FieldRange >::iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { @@ -321,11 +346,21 @@ namespace mongo { vector< BSONObj > _queries; }; + class IndexSpec; + + /** + * This class manages the ranges of valid element values for each field in + * an ordered list of signed fields corresponding to an index specification. + */ class FieldRangeVector { public: + /** + * @param frs The valid ranges for all fields, as defined by the query spec + * @prarm keyPattern The index key pattern + * @param direction The direction of index traversal + */ FieldRangeVector( const FieldRangeSet &frs, const BSONObj &keyPattern, int direction ) - :_keyPattern( keyPattern ), _direction( direction >= 0 ? 1 : -1 ) - { + :_keyPattern( keyPattern ), _direction( direction >= 0 ? 1 : -1 ) { _queries = frs._queries; BSONObjIterator i( _keyPattern ); while( i.more() ) { @@ -334,7 +369,8 @@ namespace mongo { bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 
1 : -1 ) > 0 ); if ( forward ) { _ranges.push_back( frs.range( e.fieldName() ) ); - } else { + } + else { _ranges.push_back( FieldRange() ); frs.range( e.fieldName() ).reverse( _ranges.back() ); } @@ -348,14 +384,14 @@ namespace mongo { ret *= i->intervals().size(); } return ret; - } + } BSONObj startKey() const { BSONObjBuilder b; for( vector< FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { const FieldInterval &fi = i->intervals().front(); b.appendAs( fi._lower._bound, "" ); } - return b.obj(); + return b.obj(); } BSONObj endKey() const { BSONObjBuilder b; @@ -363,7 +399,7 @@ namespace mongo { const FieldInterval &fi = i->intervals().back(); b.appendAs( fi._upper._bound, "" ); } - return b.obj(); + return b.obj(); } BSONObj obj() const { BSONObjBuilder b; @@ -371,27 +407,23 @@ namespace mongo { for( int i = 0; i < (int)_ranges.size(); ++i ) { BSONArrayBuilder a( b.subarrayStart( k.next().fieldName() ) ); for( vector< FieldInterval >::const_iterator j = _ranges[ i ].intervals().begin(); - j != _ranges[ i ].intervals().end(); ++j ) { + j != _ranges[ i ].intervals().end(); ++j ) { a << BSONArray( BSON_ARRAY( j->_lower._bound << j->_upper._bound ).clientReadable() ); } a.done(); } return b.obj(); } + /** + * @return true iff the provided document matches valid ranges on all + * of this FieldRangeVector's fields, which is the case iff this document + * would be returned while scanning the index corresponding to this + * FieldRangeVector. This function is used for $or clause deduping. + */ bool matches( const BSONObj &obj ) const; class Iterator { public: - Iterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _superlative( _v._ranges.size(), 0 ) { - static BSONObj minObj = minObject(); - static BSONElement minElt = minObj.firstElement(); - static BSONObj maxObj = maxObject(); - static BSONElement maxElt = maxObj.firstElement(); - BSONObjIterator i( _v._keyPattern ); - for( int j = 0; j < (int)_superlative.size(); ++j ) { - int number = (int) i.next().number(); - bool forward = ( ( number >= 0 ? 1 : -1 ) * ( _v._direction >= 0 ? 1 : -1 ) > 0 ); - _superlative[ j ] = forward ? 
&maxElt : &minElt; - } + Iterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _inc( _v._ranges.size(), false ), _after() { } static BSONObj minObject() { BSONObjBuilder b; @@ -413,7 +445,8 @@ namespace mongo { for( unsigned j = i + 1; j < _i.size(); ++j ) { _i[ j ] = 0; } - } else { + } + else { _i[ 0 ] = _v._ranges[ 0 ].intervals().size(); } return ok(); @@ -424,6 +457,9 @@ namespace mongo { // >= 0 skip parameter int advance( const BSONObj &curr ); const vector< const BSONElement * > &cmp() const { return _cmp; } + const vector< bool > &inc() const { return _inc; } + bool after() const { return _after; } + void prepDive(); void setZero( int i ) { for( int j = i; j < (int)_i.size(); ++j ) { _i[ j ] = 0; @@ -452,55 +488,61 @@ namespace mongo { const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ]; b.appendAs( fi._upper._bound, "" ); } - return b.obj(); + return b.obj(); } // check private: const FieldRangeVector &_v; vector< int > _i; vector< const BSONElement* > _cmp; - vector< const BSONElement* > _superlative; + vector< bool > _inc; + bool _after; }; private: - int matchingLowElement( const BSONElement &e, int i, bool direction ) const; + int matchingLowElement( const BSONElement &e, int i, bool direction, bool &lowEquality ) const; bool matchesElement( const BSONElement &e, int i, bool direction ) const; vector< FieldRange > _ranges; BSONObj _keyPattern; int _direction; vector< BSONObj > _queries; // make sure mem owned + // This IndexSpec is lazily constructed directly from _keyPattern if needed. + mutable shared_ptr< IndexSpec > _indexSpec; }; - + // generages FieldRangeSet objects, accounting for or clauses class FieldRangeOrSet { public: FieldRangeOrSet( const char *ns, const BSONObj &query , bool optimize=true ); // if there's a useless or clause, we won't use or ranges to help with scanning bool orFinished() const { return _orFound && _orSets.empty(); } - // removes first or clause, and removes the field ranges it covers from all subsequent or clauses - // this could invalidate the result of the last topFrs() - void popOrClause() { - massert( 13274, "no or clause to pop", !orFinished() ); - const FieldRangeSet &toPop = _orSets.front(); - list< FieldRangeSet >::iterator i = _orSets.begin(); - ++i; - while( i != _orSets.end() ) { - *i -= toPop; - if( !i->matchPossible() ) { - i = _orSets.erase( i ); - } else { - ++i; - } - } - _oldOrSets.push_front( toPop ); - _orSets.pop_front(); - } + /** + * Removes the top or clause, which would have been recently scanned, and + * removes the field ranges it covers from all subsequent or clauses. As a + * side effect, this function may invalidate the return values of topFrs() + * calls made before this function was called. + * @param indexSpec - Keys of the index that was used to satisfy the last or + * clause. Used to determine the range of keys that were scanned. If + * empty we do not constrain the previous clause's ranges using index keys, + * which may reduce opportunities for range elimination. 
+ */ + void popOrClause( const BSONObj &indexSpec = BSONObj() ); FieldRangeSet *topFrs() const { FieldRangeSet *ret = new FieldRangeSet( _baseSet ); - if (_orSets.size()){ + if (_orSets.size()) { *ret &= _orSets.front(); } return ret; } + // while the original bounds are looser, they are composed of fewer + // ranges and it is faster to do operations with them; when they can be + // used instead of more precise bounds, they should + FieldRangeSet *topFrsOriginal() const { + FieldRangeSet *ret = new FieldRangeSet( _baseSet ); + if (_originalOrSets.size()) { + *ret &= _originalOrSets.front(); + } + return ret; + } void allClausesSimplified( vector< BSONObj > &ret ) const { for( list< FieldRangeSet >::const_iterator i = _orSets.begin(); i != _orSets.end(); ++i ) { if ( i->matchPossible() ) { @@ -514,47 +556,10 @@ namespace mongo { private: FieldRangeSet _baseSet; list< FieldRangeSet > _orSets; + list< FieldRangeSet > _originalOrSets; list< FieldRangeSet > _oldOrSets; // make sure memory is owned bool _orFound; }; - - /** - used for doing field limiting - */ - class FieldMatcher { - public: - FieldMatcher() - : _include(true) - , _special(false) - , _includeID(true) - , _skip(0) - , _limit(-1) - {} - - void add( const BSONObj& o ); - - void append( BSONObjBuilder& b , const BSONElement& e ) const; - - BSONObj getSpec() const; - bool includeID() { return _includeID; } - private: - - void add( const string& field, bool include ); - void add( const string& field, int skip, int limit ); - void appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested=false) const; - - bool _include; // true if default at this level is to include - bool _special; // true if this level can't be skipped or included without recursing - //TODO: benchmark vector vs map - typedef map > FieldMap; - FieldMap _fields; - BSONObj _source; - bool _includeID; - - // used for $slice operator - int _skip; - int _limit; - }; /** returns a string that when used as a matcher, would match a super set of regex() returns "" for complex regular expressions diff --git a/db/rec.h b/db/rec.h deleted file mode 100644 index 7b79c73..0000000 --- a/db/rec.h +++ /dev/null @@ -1,137 +0,0 @@ -// rec.h -/* - * Copyright (C) 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - - -/* TODO for _RECSTORE - - _ support > 2GB data per file - _ multiple files, not just indexes.dat - _ lazier writes? (may be done?) 
- _ configurable cache size - _ fix on abnormal terminations to be able to restart some -*/ - -#pragma once - -#include "reci.h" -//#include "reccache.h" - -namespace mongo { - -/* -------------------------------------------------------------------------- - A RecStoreInterface for the normal mongo mem mapped file (MongoDataFile) - storage -*/ - -NamespaceDetails* nsdetails_notinline(const char *ns); - -class MongoMemMapped_RecStore : public RecStoreInterface { -public: - VIRT char* get(DiskLoc d, unsigned len) { return d.rec()->data; } - - VIRT DiskLoc insert(const char *ns, const void *obuf, int len, bool god) { - return theDataFileMgr.insert(ns, obuf, len, god); - } - - VIRT void deleteRecord(const char *ns, DiskLoc d) { - theDataFileMgr._deleteRecord(nsdetails_notinline(ns), ns, d.rec(), d); - } - - VIRT void modified(DiskLoc d) { } - - VIRT void drop(const char *ns) { - dropNS(ns); - } - - VIRT void rename(const char *fromNs, const char *toNs) { - renameNamespace( fromNs, toNs ); - } - - /* close datafiles associated with the db specified. */ - VIRT void closeFiles(string dbname, string path) { - /* as this is only used for indexes so far, and we are in the same - PDFiles as the nonindex data, we just rely on them having been closed - at the same time. one day this may need to change. - */ - } - -}; - -/* An in memory RecStoreInterface implementation ---------------------------- -*/ - -#if 0 -class InMem_RecStore : public RecStoreInterface { - enum InmemfileValue { INMEMFILE = 0x70000000 }; -public: - static char* get(DiskLoc d, unsigned len) { - assert( d.a() == INMEMFILE ); -#ifdef __LP64__ - massert( 10372 , "64 bit not done", false); - return 0; -#else - return (char *) d.getOfs(); -#endif - } - - static DiskLoc insert(const char *ns, const void *obuf, int len, bool god) { -#ifdef __LP64__ - assert( 0 ); - throw -1; -#else - char *p = (char *) malloc(len); - assert( p ); - memcpy(p, obuf, len); - int b = (int) p; - assert( b > 0 ); - return DiskLoc(INMEMFILE, b); -#endif - } - - static void modified(DiskLoc d) { } - - static void drop(const char *ns) { - log() << "warning: drop() not yet implemented for InMem_RecStore" << endl; - } - - virtual void rename(const char *fromNs, const char *toNs) { - massert( 10373 , "rename not yet implemented for InMem_RecStore", false ); - } -}; -#endif - -/* Glue btree to RecStoreInterface: ---------------------------- */ - -typedef MongoMemMapped_RecStore StoreToUse; - -extern StoreToUse *btreeStore; - -const int BucketSize = 8192; - -inline BtreeBucket* DiskLoc::btree() const { - assert( fileNo != -1 ); - return (BtreeBucket*) btreeStore->get(*this, BucketSize); -} - -inline BtreeBucket* DiskLoc::btreemod() const { - assert( fileNo != -1 ); - BtreeBucket *b = (BtreeBucket*) btreeStore->get(*this, BucketSize); - btreeStore->modified(*this); - return b; -} - -} diff --git a/db/reccache.cpp b/db/reccache.cpp deleted file mode 100644 index eb20728..0000000 --- a/db/reccache.cpp +++ /dev/null @@ -1,419 +0,0 @@ -/* - * Copyright (C) 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. 
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -// storage.cpp - -#include "pch.h" -#include "pdfile.h" -//#include "reccache.h" -#include "rec.h" -#include "db.h" - -#error deprecated - do not include in project - -namespace mongo { - -//RecCache theRecCache(BucketSize); - -// 100k * 8KB = 800MB -unsigned RecCache::MAXNODES = 50000; - -void setRecCacheSize(unsigned mb) { - unsigned long long MB = mb; - log(2) << "reccache size: " << MB << "MB\n"; - uassert( 10114 , "bad cache size", MB > 0 && MB < 1000000 ); - RecCache::MAXNODES = (unsigned) MB * 1024 * 1024 / 8192; - log(3) << "RecCache::MAXNODES=" << RecCache::MAXNODES << '\n'; -} - -void writerThread() { - sleepsecs(10); - while( 1 ) { - try { - theRecCache.writeLazily(); - } - catch(...) { - log() << "exception in writerThread()" << endl; - sleepsecs(3); - } - } -} - -// called on program exit. -void recCacheCloseAll() { -#if defined(_RECSTORE) - theRecCache.closing(); -#endif -} - -int ndirtywritten; - -inline static string escape(const char *ns) { - char buf[256]; - char *p = buf; - while( 1 ) { - if( *ns == '$' ) *p = '~'; - else - *p = *ns; - if( *ns == 0 ) - break; - p++; ns++; - } - assert( p - buf < (int) sizeof(buf) ); - return buf; -} - -inline static string unescape(const char *ns) { - char buf[256]; - char *p = buf; - while( 1 ) { - if( *ns == '~' ) *p = '$'; - else - *p = *ns; - if( *ns == 0 ) - break; - p++; ns++; - } - assert( p - buf < (int) sizeof(buf) ); - return buf; -} - -string RecCache::directory() { - return cc().database()->path; -} - -/* filename format is - - -.idx -*/ - -BasicRecStore* RecCache::_initStore(string fname) { - - assert( strchr(fname.c_str(), '/') == 0 ); - assert( strchr(fname.c_str(), '\\') == 0 ); - - stringstream ss(fname); - int n; - ss >> n; - assert( n >= 0 ); - char ch; - ss >> ch; - assert( ch == '-' ); - string rest; - ss >> rest; - const char *p = rest.c_str(); - const char *q = strstr(p, ".idx"); - assert( q ); - string escaped_ns(p, q-p); - - // arbitrary limit. if you are hitting, we should use fewer files and put multiple - // indexes in a single file (which is easy to do) - massert( 10374 , "too many index files", n < 10000 ); - - if( stores.size() < (unsigned)n+1 ) - stores.resize(n+1); - assert( stores[n] == 0 ); - BasicRecStore *rs = new BasicRecStore(n); - path pf(directory()); - pf /= fname; - string full = pf.string(); - rs->init(full.c_str(), recsize); - stores[n] = rs; - string ns = unescape(escaped_ns.c_str()); - storesByNsKey[mknskey(ns.c_str())] = rs; - return rs; -} - -BasicRecStore* RecCache::initStore(int n) { - string ns; - { - stringstream ss; - ss << '/' << n << '-'; - ns = ss.str(); - } - - /* this will be slow if there are thousands of files */ - path dir(directory()); - directory_iterator end; - try { - directory_iterator i(dir); - while ( i != end ) { - string s = i->string(); - const char *p = strstr(s.c_str(), ns.c_str()); - if( p && strstr(p, ".idx") ) { - // found it - path P = *i; - return _initStore(P.leaf()); - } - i++; - } - } - catch( DBException & ) { - throw; - } - catch (...) { - string s = string("i/o error looking for .idx file in ") + directory(); - massert( 10375 , s, false); - } - stringstream ss; - ss << "index datafile missing? n=" << n; - uasserted(12500,ss.str()); - return 0; -} - -/* find the filename for a given ns. - format is - -.idx - returns filename. found is true if found. If false, a proposed name is returned for (optional) creation - of the file. 
-*/ -string RecCache::findStoreFilename(const char *_ns, bool& found) { - string namefrag; - { - stringstream ss; - ss << '-'; - ss << escape(_ns); - ss << ".idx"; - namefrag = ss.str(); - } - - path dir(directory()); - directory_iterator end; - int nmax = -1; - try { - directory_iterator i(dir); - while ( i != end ) { - string s = path(*i).leaf(); - const char *p = strstr(s.c_str(), namefrag.c_str()); - if( p ) { - found = true; - return s; - } - if( strstr(s.c_str(), ".idx") ) { - stringstream ss(s); - int n = -1; - ss >> n; - if( n > nmax ) - nmax = n; - } - i++; - } - } - catch (...) { - string s = string("i/o error looking for .idx file in ") + directory(); - massert( 10376 , s, false); - } - - // DNE. return a name that would work. - stringstream ss; - ss << nmax+1 << namefrag; - found = false; - return ss.str(); -} - -void RecCache::initStoreByNs(const char *_ns, const string& nskey) { - bool found; - string fn = findStoreFilename(_ns, found); - _initStore(fn); -} - -inline void RecCache::writeIfDirty(Node *n) { - if( n->dirty ) { - ndirtywritten++; - n->dirty = false; - store(n->loc).update(fileOfs(n->loc), n->data, recsize); - } -} - -void RecCache::closeFiles(string dbname, string path) { - assertInWriteLock(); - scoped_lock lk(rcmutex); - - // first we write all dirty pages. it is not easy to check which Nodes are for a particular - // db, so we just write them all. - writeDirty( dirtyl.begin(), true ); - - string key = path + dbname + '.'; - unsigned sz = key.size(); - for( map::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); i++ ) { - map::iterator j = i; - i++; - if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) { - assert( stores[j->second->fileNumber] != 0 ); - stores[j->second->fileNumber] = 0; - delete j->second; - storesByNsKey.erase(j); - } - } -} - -void RecCache::closing() { - scoped_lock lk(rcmutex); - (cout << "TEMP: recCacheCloseAll() writing dirty pages...\n").flush(); - writeDirty( dirtyl.begin(), true ); - for( unsigned i = 0; i < stores.size(); i++ ) { - if( stores[i] ) { - delete stores[i]; - } - } - (cout << "TEMP: write dirty done\n").flush(); -} - -/* note that this is written in order, as much as possible, given that dirtyl is of type set. */ -void RecCache::writeDirty( set::iterator startAt, bool rawLog ) { - try { - ndirtywritten=0; - for( set::iterator i = startAt; i != dirtyl.end(); i++ ) { - map::iterator j = m.find(*i); - if( j != m.end() ) - writeIfDirty(j->second); - } - OCCASIONALLY out() << "TEMP: ndirtywritten: " << ndirtywritten << endl; - } - catch(...) { - const char *message = "Problem: bad() in RecCache::writeDirty, file io error\n"; - - if ( rawLog ) - rawOut( message ); - else - ( log() << message ).flush(); - } - dirtyl.clear(); -} - -void RecCache::writeLazily() { - int sleep = 0; - int k; - { - scoped_lock lk(rcmutex); - Timer t; - set::iterator i = dirtyl.end(); - for( k = 0; k < 100; k++ ) { - if( i == dirtyl.begin() ) { - // we're not very far behind - sleep = k < 20 ? 
2000 : 1000; - break; - } - i--; - } - writeDirty(i); - if( sleep == 0 ) { - sleep = t.millis() * 4 + 10; - } - } - - OCCASIONALLY cout << "writeLazily " << k << " sleep:" << sleep << '\n'; - sleepmillis(sleep); -} - -void RecCache::_ejectOld() { - scoped_lock lk(rcmutex); - if( nnodes <= MAXNODES ) - return; - Node *n = oldest; - while( 1 ) { - if( nnodes <= MAXNODES - 4 ) { - n->older = 0; - oldest = n; - assert( oldest ) ; - break; - } - nnodes--; - assert(n); - Node *nxt = n->newer; - writeIfDirty(n); - m.erase(n->loc); - delete n; - n = nxt; - } -} - -void RecCache::dump() { - Node *n = oldest; - Node *last = 0; - while( n ) { - assert( n->older == last ); - last = n; -// cout << n << ' ' << n->older << ' ' << n->newer << '\n'; - n=n->newer; - } - assert( newest == last ); -// cout << endl; -} - -/* cleans up everything EXCEPT storesByNsKey. - note this function is slow should not be invoked often -*/ -void RecCache::closeStore(BasicRecStore *rs) { - int n = rs->fileNumber + Base; - for( set::iterator i = dirtyl.begin(); i != dirtyl.end(); ) { - DiskLoc k = *i++; - if( k.a() == n ) - dirtyl.erase(k); - } - - for( map::iterator i = m.begin(); i != m.end(); ) { - DiskLoc k = i->first; - i++; - if( k.a() == n ) - m.erase(k); - } - - assert( stores[rs->fileNumber] != 0 ); - stores[rs->fileNumber] = 0; -/* - for( unsigned i = 0; i < stores.size(); i++ ) { - if( stores[i] == rs ) { - stores[i] = 0; - break; - } - }*/ - delete rs; // closes file -} - -void RecCache::drop(const char *_ns) { - // todo: test with a non clean shutdown file - scoped_lock lk(rcmutex); - - map::iterator it = storesByNsKey.find(mknskey(_ns)); - string fname; - if( it != storesByNsKey.end() ) { - fname = it->second->filename; - closeStore(it->second); // cleans up stores[] etc. - storesByNsKey.erase(it); - } - else { - bool found; - fname = findStoreFilename(_ns, found); - if( !found ) { - log() << "RecCache::drop: no idx file found for " << _ns << endl; - return; - } - path pf(directory()); - pf /= fname; - fname = pf.string(); - } - try { - if( !boost::filesystem::exists(fname) ) - log() << "RecCache::drop: can't find file to remove " << fname << endl; - boost::filesystem::remove(fname); - } - catch(...) { - log() << "RecCache::drop: exception removing file " << fname << endl; - } -} - -} diff --git a/db/reccache.h b/db/reccache.h deleted file mode 100644 index d0fd118..0000000 --- a/db/reccache.h +++ /dev/null @@ -1,262 +0,0 @@ -// reccache.h -/* - * Copyright (C) 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - - -/* CachedBasicRecStore - This is our store which implements a traditional page-cache type of storage - (not memory mapped files). -*/ - -/* LOCK HIERARCHY - - dblock - RecCache::rcmutex - - i.e. 
always lock dblock first if you lock both - -*/ - -#pragma once - -#error deprecated - -#include "reci.h" -#include "recstore.h" - -namespace mongo { - -class RecCache { - struct Node { - Node(void* _data) : data((char *) _data) { dirty = false; newer = 0; } - ~Node() { - free(data); - data = 0; - } - char *data; - DiskLoc loc; - bool dirty; - Node *older, *newer; // lru - }; - mongo::mutex rcmutex; // mainly to coordinate with the lazy writer thread - unsigned recsize; - map m; // the cache - Node *newest, *oldest; - unsigned nnodes; - set dirtyl; - vector stores; // DiskLoc::a() indicates the index into this vector - map storesByNsKey; // nskey -> BasicRecStore* -public: - static unsigned MAXNODES; - enum BaseValue { Base = 10000 }; -private: - BasicRecStore* _initStore(string fname); - BasicRecStore* initStore(int n); - string findStoreFilename(const char *_ns, bool& found); - void initStoreByNs(const char *ns, const string& nskey); - void closeStore(BasicRecStore *rs); - - static string directory(); - static string mknskey(const char *ns) { - return directory() + ns; - } - - /* get the right file for a given diskloc */ - BasicRecStore& store(DiskLoc& d) { - int n = d.a() - Base; - if( (int) stores.size() > n ) { - BasicRecStore *rs = stores[n]; - if( rs ) { - assert( rs->fileNumber == n ); - return *rs; - } - } - return *initStore(n); - } - BasicRecStore& store(const char *ns) { - string nskey = mknskey(ns); - BasicRecStore *&rs = storesByNsKey[nskey]; - if( rs ) - return *rs; - initStoreByNs(ns, nskey); - return *rs; - } - - void writeDirty( set::iterator i, bool rawLog = false ); - void writeIfDirty(Node *n); - void touch(Node* n) { - if( n == newest ) - return; - if( n == oldest ) { - oldest = oldest->newer; - assert( oldest || nnodes == 1 ); - } - if( n->older ) - n->older->newer = n->newer; - if( n->newer ) - n->newer->older = n->older; - n->newer = 0; - n->older = newest; - newest->newer = n; - newest = n; - } - Node* mkNode() { - Node *n = new Node(calloc(recsize,1)); // calloc is TEMP for testing. change to malloc - n->older = newest; - if( newest ) - newest->newer = n; - else { - assert( oldest == 0 ); - oldest = n; - } - newest = n; - nnodes++; - return n; - } - fileofs fileOfs(DiskLoc d) { - return ((fileofs) d.getOfs()) * recsize; - } - - void dump(); - void _ejectOld(); - -public: - /* all public functions (except constructor) should use the mutex */ - - RecCache(unsigned recsz) : recsize(recsz) { - nnodes = 0; - newest = oldest = 0; - } - - /* call this after doing some work, after you are sure you are done with modifications. - we call it from dbunlocking(). - */ - void ejectOld() { - if( nnodes > MAXNODES ) // just enough here to be inlineable for speed reasons. _ejectOld does the real work - _ejectOld(); - } - - /* bg writer thread invokes this */ - void writeLazily(); - - /* Note that this may be called BEFORE the actual writing to the node - takes place. We do flushing later on a dbunlocking() call, which happens - after the writing. 
- */ - void dirty(DiskLoc d) { - assert( d.a() >= Base ); - scoped_lock lk(rcmutex); - map::iterator i = m.find(d); - if( i != m.end() ) { - Node *n = i->second; - if( !n->dirty ) { - n->dirty = true; - dirtyl.insert(n->loc); - } - } - } - - char* get(DiskLoc d, unsigned len) { - assert( d.a() >= Base ); - assert( len == recsize ); - - scoped_lock lk(rcmutex); - map::iterator i = m.find(d); - if( i != m.end() ) { - touch(i->second); - return i->second->data; - } - - Node *n = mkNode(); - n->loc = d; - store(d).get(fileOfs(d), n->data, recsize); // could throw exception - m.insert( pair(d, n) ); - return n->data; - } - - void drop(const char *ns); - - DiskLoc insert(const char *ns, const void *obuf, int len, bool god) { - scoped_lock lk(rcmutex); - BasicRecStore& rs = store(ns); - fileofs o = rs.insert((const char *) obuf, len); - assert( o % recsize == 0 ); - fileofs recnum = o / recsize; - massert( 10377 , "RecCache file too large?", recnum <= 0x7fffffff ); - Node *n = mkNode(); - memcpy(n->data, obuf, len); - DiskLoc d(rs.fileNumber + Base, (int) recnum); - n->loc = d; - m[d] = n; - return d; - } - - void closeFiles(string dbname, string path); - - // at termination: write dirty pages and close all files - void closing(); -}; - -extern RecCache theRecCache; - -class CachedBasicRecStore : public RecStoreInterface { -public: - VIRT char* get(DiskLoc d, unsigned len) { - return theRecCache.get(d, len); - } - - VIRT DiskLoc insert(const char *ns, const void *obuf, int len, bool god) { - return theRecCache.insert(ns, obuf, len, god); - } - - VIRT void modified(DiskLoc d) { - theRecCache.dirty(d); - } - - /* drop collection */ - VIRT void drop(const char *ns) { - theRecCache.drop(ns); - } - - VIRT void rename(const char *fromNs, const char *toNs) { - massert( 10378 , "rename not yet implemented for CachedBasicRecStore", false ); - } - - /* close datafiles associated with the db specified. */ - VIRT void closeFiles(string dbname, string path) { - theRecCache.closeFiles(dbname, dbpath); - } -}; - -/* see concurrency.h - note on a lock reset from read->write we don't - call dbunlocking_read, we just wait for the final dbunlocking_write - call -*/ - -//inline void dbunlocking_read() { - /* - Client *c = currentClient.get(); - if ( c ) - c->top.clientStop(); - */ -//} - -//inline void dbunlocking_write() { - //theRecCache.ejectOld(); -// dbunlocking_read(); -//} - -} /*namespace*/ diff --git a/db/reci.h b/db/reci.h deleted file mode 100644 index a22f1f1..0000000 --- a/db/reci.h +++ /dev/null @@ -1,64 +0,0 @@ -// reci.h -/* - * Copyright (C) 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - - -#pragma once - -#include "diskloc.h" - -namespace mongo { - -// #define VIRT virtual -#define VIRT - -/* Subclass this and implement your real storage interface. -*/ -class RecStoreInterface { -public: - //VIRT ~RecStoreInterface() {} - - /* Get a pointer to the data at diskloc d. 
Pointer guaranteed to stay in - scope through the current database operation's life. - */ - //VIRT char* get(DiskLoc d, unsigned len) = 0; - - /* indicate that the diskloc specified has been updated. note that as-is today, the modification may come AFTER this - call -- we handle that currently -- until the dblock finishes. - */ - //VIRT void modified(DiskLoc d) = 0; - - /* insert specified data as a record */ - //VIRT DiskLoc insert(const char *ns, const void *obuf, int len, bool god) = 0; - - //VIRT void deleteRecord(const char *ns, DiskLoc d) { massert( 10379 , "not implemented RecStoreInterface::deleteRecord", false); } - - /* drop the collection */ - //VIRT void drop(const char *ns) = 0; - - /* rename collection */ - //VIRT void rename(const char *fromNs, const char *toNs) = 0; - - /* close datafiles associated with the db specified. */ - //VIRT void closeFiles(string dbname, string path) = 0; - - /* todo add: - closeFiles(dbname) - eraseFiles(dbname) - */ -}; - -} diff --git a/db/recstore.h b/db/recstore.h deleted file mode 100644 index 913070f..0000000 --- a/db/recstore.h +++ /dev/null @@ -1,126 +0,0 @@ -// recstore.h -/* - * Copyright (C) 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - - -#pragma once - -#error deprecated - -#include "../util/file.h" - -namespace mongo { - -using boost::uint32_t; -using boost::uint64_t; - -/* Current version supports only consistent record sizes within a store. */ - -class BasicRecStore { - struct RecStoreHeader { - uint32_t version; - uint32_t recsize; - uint64_t leof; // logical eof, actual file might be prealloc'd further - uint64_t firstDeleted; // 0 = no deleted recs - uint32_t cleanShutdown; // 0 = clean - char reserved[8192-8-8-4-4-4]; // we want our records page-aligned in the file if they are a multiple of a page's size -- so we make this 8KB with that goal - RecStoreHeader() { - version = 65; - recsize = 0; - leof = sizeof(RecStoreHeader); - firstDeleted = 0; - cleanShutdown = 1; - memset(reserved, 0, sizeof(reserved)); - } - }; - -public: - BasicRecStore(int _fileNumber) : fileNumber(_fileNumber) { } - ~BasicRecStore(); - void init(const char *fn, unsigned recsize); - fileofs insert(const char *buf, unsigned len); - void update(fileofs o, const char *buf, unsigned len); - void remove(fileofs o, unsigned len); - void get(fileofs o, char *buf, unsigned len); - - int fileNumber; // this goes in DiskLoc::a - - string filename; - -private: - - void writeHeader(); - File f; - fileofs len; - RecStoreHeader h; // h.reserved is wasteful here; fix later. 
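// Aside (illustrative standalone sketch, not part of this patch): the reserved
// array above pads the fixed header fields out to exactly one 8 KB page so that
// 8 KB records stay page-aligned; the logical EOF starts at sizeof(header), so
// the first allocated record is number 1 and begins right after the header.
// Typical LP64 field layout is assumed for the size check.
#include <cstdint>
#include <cstdio>

struct HeaderSketch {
    uint32_t version;
    uint32_t recsize;
    uint64_t leof;           // logical end of file
    uint64_t firstDeleted;   // free-list head, 0 = none
    uint32_t cleanShutdown;  // 0 = clean
    char     reserved[8192 - 8 - 8 - 4 - 4 - 4];
};
static_assert(sizeof(HeaderSketch) == 8192, "header should fill one 8 KB page");

int main() {
    const uint64_t recsize = 8192;
    for (uint64_t recnum = 1; recnum <= 3; ++recnum)  // record n sits at n * recsize
        std::printf("record %llu at byte offset %llu\n",
                    (unsigned long long)recnum, (unsigned long long)(recnum * recsize));
    return 0;
}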
- void write(fileofs ofs, const char *data, unsigned len) { - f.write(ofs, data, len); - massert( 10380 , "basicrecstore write io error", !f.bad()); - } -}; - -/* --- implementation --- */ - -inline BasicRecStore::~BasicRecStore() { - h.cleanShutdown = 0; - if( f.is_open() ) { - writeHeader(); - f.fsync(); - } -} - -inline void BasicRecStore::writeHeader() { - write(0, (const char *) &h, 28); // update header in file for new leof - uassert( 10115 , "file io error in BasicRecStore [1]", !f.bad()); -} - -inline fileofs BasicRecStore::insert(const char *buf, unsigned reclen) { - if( h.firstDeleted ) { - uasserted(11500, "deleted not yet implemented recstoreinsert"); - } - massert( 10381 , "bad len", reclen == h.recsize); - fileofs ofs = h.leof; - h.leof += reclen; - if( h.leof > len ) { - // grow the file. we grow quite a bit to avoid excessive file system fragmentations - len += (len / 8) + h.recsize; - uassert( 10116 , "recstore file too big for 32 bit", len <= 0x7fffffff || sizeof(std::streamoff) > 4 ); - write(len, "", 0); - } - writeHeader(); - write(ofs, buf, reclen); - uassert( 10117 , "file io error in BasicRecStore [2]", !f.bad()); - return ofs; -} - -/* so far, it's ok to read or update a subset of a record */ - -inline void BasicRecStore::update(fileofs o, const char *buf, unsigned len) { - assert(o <= h.leof && o >= sizeof(RecStoreHeader)); - write(o, buf, len); -} - -inline void BasicRecStore::get(fileofs o, char *buf, unsigned len) { - assert(o <= h.leof && o >= sizeof(RecStoreHeader)); - f.read(o, buf, len); - massert( 10382 , "basicrestore::get I/O error", !f.bad()); -} - -inline void BasicRecStore::remove(fileofs o, unsigned len) { - uasserted(11501, "not yet implemented recstoreremove"); -} - -} diff --git a/db/repl.cpp b/db/repl.cpp index ea0eab9..b14034d 100644 --- a/db/repl.cpp +++ b/db/repl.cpp @@ -25,7 +25,7 @@ local.sources - indicates what sources we pull from as a "slave", and the last update of each local.oplog.$main - our op log as "master" - local.dbinfo. + local.dbinfo. - no longer used??? local.pair.startup - can contain a special value indicating for a pair that we have the master copy. used when replacing other half of the pair which has permanently failed. local.pair.sync - { initialsynccomplete: 1 } @@ -49,13 +49,13 @@ #include "repl/rs.h" namespace mongo { - + // our config from command line etc. ReplSettings replSettings; /* if 1 sync() is running */ volatile int syncing = 0; - static volatile int relinquishSyncingSome = 0; + static volatile int relinquishSyncingSome = 0; /* if true replace our peer in a replication pair -- don't worry about if his local.oplog.$main is empty. @@ -68,9 +68,9 @@ namespace mongo { const char *replAllDead = 0; time_t lastForcedResync = 0; - + IdTracker &idTracker = *( new IdTracker() ); - + } // namespace mongo #include "replpair.h" @@ -159,8 +159,8 @@ namespace mongo { break; { dbtemprelease t; - relinquishSyncingSome = 1; - sleepmillis(1); + relinquishSyncingSome = 1; + sleepmillis(1); } } if ( syncing ) { @@ -206,7 +206,7 @@ namespace mongo { return true; } } cmdForceDead; - + /* operator requested resynchronization of replication (on the slave). 
{ resync : 1 } */ class CmdResync : public Command { public: @@ -221,22 +221,28 @@ namespace mongo { void help(stringstream&h) const { h << "resync (from scratch) an out of date replica slave.\nhttp://www.mongodb.org/display/DOCS/Master+Slave"; } CmdResync() : Command("resync") { } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if( cmdLine.usingReplSets() ) { + errmsg = "resync command not currently supported with replica sets. See RS102 info in the mongodb documentations"; + result.append("info", "http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member"); + return false; + } + if ( cmdObj.getBoolField( "force" ) ) { if ( !waitForSyncToFinish( errmsg ) ) return false; replAllDead = "resync forced"; - } + } if ( !replAllDead ) { errmsg = "not dead, no need to resync"; return false; } if ( !waitForSyncToFinish( errmsg ) ) return false; - + ReplSource::forceResyncDead( "client" ); result.append( "info", "triggered resync for all sources" ); - return true; - } + return true; + } bool waitForSyncToFinish( string &errmsg ) const { // Wait for slave thread to finish syncing, so sources will be be // reloaded with new saved state on next pass. @@ -246,7 +252,7 @@ namespace mongo { break; { dbtemprelease t; - relinquishSyncingSome = 1; + relinquishSyncingSome = 1; sleepmillis(1); } } @@ -257,16 +263,31 @@ namespace mongo { return true; } } cmdResync; - - bool anyReplEnabled(){ - return replPair || replSettings.slave || replSettings.master; + + bool anyReplEnabled() { + return replPair || replSettings.slave || replSettings.master || theReplSet; } - void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ){ - + bool replAuthenticate(DBClientBase *conn); + + void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ) { + + if ( replSet ) { + if( theReplSet == 0 ) { + result.append("ismaster", false); + result.append("secondary", false); + result.append("info", ReplSet::startupStatusMsg); + result.append( "isreplicaset" , true ); + return; + } + + theReplSet->fillIsMaster(result); + return; + } + if ( replAllDead ) { result.append("ismaster", 0); - if( authed ) { + if( authed ) { if ( replPair ) result.append("remote", replPair->remote); } @@ -285,25 +306,25 @@ namespace mongo { result.appendBool("ismaster", _isMaster() ); } - if ( level && replSet ){ + if ( level && replSet ) { result.append( "info" , "is replica set" ); } - else if ( level ){ + else if ( level ) { BSONObjBuilder sources( result.subarrayStart( "sources" ) ); - + readlock lk( "local.sources" ); - Client::Context ctx( "local.sources" ); + Client::Context ctx( "local.sources", dbpath, 0, authed ); shared_ptr c = findTableScan("local.sources", BSONObj()); int n = 0; - while ( c->ok() ){ + while ( c->ok() ) { BSONObj s = c->current(); - + BSONObjBuilder bb; bb.append( s["host"] ); string sourcename = s["source"].valuestr(); if ( sourcename != "main" ) bb.append( s["source"] ); - + { BSONElement e = s["syncedTo"]; BSONObjBuilder t( bb.subobjStart( "syncedTo" ) ); @@ -311,23 +332,27 @@ namespace mongo { t.append( "inc" , e.timestampInc() ); t.done(); } - - if ( level > 1 ){ + + if ( level > 1 ) { dbtemprelease unlock; + // note: there is no so-style timeout on this connection; perhaps we should have one. 
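// Aside (illustrative sketch, not the driver API): the hunk below reports slave
// lag as the newest op time seen on the master minus the slave's syncedTo
// marker, divided by 1000 to convert milliseconds to seconds.
#include <cstdint>
#include <iostream>

double lagSeconds(int64_t masterLastMillis, int64_t syncedToMillis) {
    // A negative result just means the saved syncedTo is ahead of the sampled op.
    return (double)(masterLastMillis - syncedToMillis) / 1000.0;
}

int main() {
    std::cout << lagSeconds(1700000123000LL, 1700000120500LL) << '\n';  // prints 2.5
    return 0;
}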
ScopedDbConnection conn( s["host"].valuestr() ); - BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) ); - BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) ); - bb.appendDate( "masterFirst" , first["ts"].timestampTime() ); - bb.appendDate( "masterLast" , last["ts"].timestampTime() ); - double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime()); - bb.append( "lagSeconds" , lag / 1000 ); + DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() ); + if ( cliConn && replAuthenticate( cliConn ) ) { + BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) ); + BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) ); + bb.appendDate( "masterFirst" , first["ts"].timestampTime() ); + bb.appendDate( "masterLast" , last["ts"].timestampTime() ); + double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime()); + bb.append( "lagSeconds" , lag / 1000 ); + } conn.done(); } sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() ); c->advance(); } - + sources.done(); } } @@ -345,26 +370,15 @@ namespace mongo { virtual LockType locktype() const { return NONE; } CmdIsMaster() : Command("isMaster", true, "ismaster") { } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { - /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not - authenticated. - we allow unauthenticated ismaster but we aren't as verbose informationally if - one is not authenticated for admin db to be safe. - */ - - if( replSet ) { - if( theReplSet == 0 ) { - result.append("ismaster", false); - result.append("secondary", false); - errmsg = "replSet still trying to initialize"; - result.append("info", ReplSet::startupStatusMsg); - return true; - } - theReplSet->fillIsMaster(result); - return true; - } - - bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); + /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not + authenticated. + we allow unauthenticated ismaster but we aren't as verbose informationally if + one is not authenticated for admin db to be safe. + */ + bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); appendReplicationInfo( result , authed ); + + result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize); return true; } } cmdismaster; @@ -375,14 +389,14 @@ namespace mongo { virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return WRITE; } + virtual LockType locktype() const { return NONE; } CmdIsInitialSyncComplete() : Command( "isinitialsynccomplete" ) {} virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { result.appendBool( "initialsynccomplete", getInitialSyncCompleted() ); return true; } } cmdisinitialsynccomplete; - + /* negotiate who is master -1=not set (probably means we just booted) @@ -482,7 +496,7 @@ namespace mongo { return true; } } cmdnegotiatemaster; - + int ReplPair::negotiate(DBClientConnection *conn, string method) { BSONObjBuilder b; b.append("negotiatemaster",1); @@ -491,7 +505,7 @@ namespace mongo { b.append("your_port", remotePort); BSONObj cmd = b.done(); BSONObj res = conn->findOne("admin.$cmd", cmd); - if ( ! res["ok"].trueValue() ){ + if ( ! 
res["ok"].trueValue() ) { string message = method + " negotiate failed"; problem() << message << ": " << res.toString() << '\n'; setMasterLocked(State_Confused, message.c_str()); @@ -503,7 +517,8 @@ namespace mongo { // choose who is master. if ( x != State_Slave && x != State_Master && x != State_Negotiating ) { problem() << method << " negotiate: bad you_are value " << res.toString() << endl; - } else if ( x != State_Negotiating ) { + } + else if ( x != State_Negotiating ) { string message = method + " negotiation"; setMasterLocked(x, message.c_str()); } @@ -542,8 +557,8 @@ namespace mongo { break; addDbNextPass.insert( e.fieldName() ); } - } - + } + dbsObj = o.getObjectField("incompleteCloneDbs"); if ( !dbsObj.isEmpty() ) { BSONObjIterator i(dbsObj); @@ -553,7 +568,7 @@ namespace mongo { break; incompleteCloneDbs.insert( e.fieldName() ); } - } + } _lastSavedLocalTs = OpTime( o.getField( "localLogTs" ).date() ); } @@ -569,7 +584,7 @@ namespace mongo { b.appendTimestamp("syncedTo", syncedTo.asDate()); b.appendTimestamp("localLogTs", _lastSavedLocalTs.asDate()); - + BSONObjBuilder dbsNextPassBuilder; int n = 0; for ( set::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) { @@ -622,7 +637,7 @@ namespace mongo { } } - static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, const BSONObj &spec, ReplSource::SourceVector &old) { + static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, ReplSource::SourceVector &old) { if ( !s.syncedTo.isNull() ) { // Don't reuse old ReplSource if there was a forced resync. for ( ReplSource::SourceVector::iterator i = old.begin(); i != old.end(); ) { if ( s == **i ) { @@ -684,11 +699,12 @@ namespace mongo { else { try { massert( 10384 , "--only requires use of --source", cmdLine.only.empty()); - } catch ( ... ) { + } + catch ( ... ) { dbexit( EXIT_BADOPTIONS ); } } - + if ( replPair ) { const string &remote = replPair->remote; // --pairwith host specified. 
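// Aside on the addSourceToList() change earlier in this hunk (illustrative
// standalone sketch, not part of this patch): when a freshly parsed source has
// already synced (syncedTo non-null), the visible loop searches the previously
// loaded sources for a match so the existing object and its state can be kept.
// How the loop completes is an assumption here; the types are simplified.
#include <memory>
#include <string>
#include <vector>

struct Source {                       // simplified stand-in for ReplSource
    std::string host, name;
    long long syncedTo = 0;           // 0 = never synced
    bool operator==(const Source& o) const { return host == o.host && name == o.name; }
};

void addSourceToList(std::vector<std::shared_ptr<Source>>& v, const Source& s,
                     std::vector<std::shared_ptr<Source>>& old) {
    if (s.syncedTo != 0) {            // don't reuse old state after a forced resync
        for (auto i = old.begin(); i != old.end(); ++i) {
            if (**i == s) { v.push_back(*i); old.erase(i); return; }
        }
    }
    v.push_back(std::make_shared<Source>(s));   // otherwise start from the new spec
}

int main() { return 0; }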
@@ -730,9 +746,9 @@ namespace mongo { tmp.syncedTo = OpTime(); tmp.replacing = true; } - } + } if ( ( !replPair && tmp.syncedTo.isNull() ) || - ( replPair && replSettings.fastsync ) ) { + ( replPair && replSettings.fastsync ) ) { DBDirectClient c; if ( c.exists( "local.oplog.$main" ) ) { BSONObj op = c.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) ); @@ -742,7 +758,7 @@ namespace mongo { } } } - addSourceToList(v, tmp, c->current(), old); + addSourceToList(v, tmp, old); c->advance(); } @@ -766,7 +782,7 @@ namespace mongo { } return false; } - + void ReplSource::forceResyncDead( const char *requester ) { if ( !replAllDead ) return; @@ -775,9 +791,9 @@ namespace mongo { for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) { (*i)->forceResync( requester ); } - replAllDead = 0; + replAllDead = 0; } - + void ReplSource::forceResync( const char *requester ) { BSONObj info; { @@ -800,7 +816,7 @@ namespace mongo { } } } - } + } syncedTo = OpTime(); addDbNextPass.clear(); save(); @@ -812,7 +828,7 @@ namespace mongo { dropDatabase(db); return db; } - + /* grab initial copy of a database from the master */ bool ReplSource::resync(string db) { string dummyNs = resyncDrop( db.c_str(), "internal" ); @@ -841,7 +857,7 @@ namespace mongo { log() << "sync: caught user assertion " << e << " while applying op: " << op << endl;; } catch ( DBException& e ) { - log() << "sync: caught db exception " << e << " while applying op: " << op << endl;; + log() << "sync: caught db exception " << e << " while applying op: " << op << endl;; } } @@ -850,15 +866,17 @@ namespace mongo { { ts: ..., op: , ns: ..., o: , o2: , b: } ... see logOp() comments. + + @param alreadyLocked caller already put us in write lock if true */ - void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail) { + void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail, bool alreadyLocked) { if( logLevel >= 6 ) // op.tostring is expensive so doing this check explicitly log(6) << "processing op: " << op << endl; if( op.getStringField("op")[0] == 'n' ) return; - char clientName[MaxDatabaseLen]; + char clientName[MaxDatabaseNameLen]; const char *ns = op.getStringField("ns"); nsToDatabase(ns, clientName); @@ -867,22 +885,27 @@ namespace mongo { return; } else if ( *ns == 0 ) { - problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl; - replAllDead = "bad object in oplog"; - throw SyncException(); + /*if( op.getStringField("op")[0] != 'n' )*/ { + problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl; + replAllDead = "bad object in oplog"; + throw SyncException(); + } + //ns = "local.system.x"; + //nsToDatabase(ns, clientName); } if ( !only.empty() && only != clientName ) return; - if( cmdLine.pretouch ) { + if( cmdLine.pretouch && !alreadyLocked/*doesn't make sense if in write lock already*/ ) { if( cmdLine.pretouch > 1 ) { /* note: this is bad - should be put in ReplSource. but this is first test... */ static int countdown; + assert( countdown >= 0 ); if( countdown > 0 ) { countdown--; // was pretouched on a prev pass - assert( countdown >= 0 ); - } else { + } + else { const int m = 4; if( tp.get() == 0 ) { int nthr = min(8, cmdLine.pretouch); @@ -911,7 +934,7 @@ namespace mongo { } } - dblock lk; + scoped_ptr lk( alreadyLocked ? 
0 : new writelock() ); if ( localLogTail && replPair && replPair->state == ReplPair::State_Master ) { updateSetsWithLocalOps( *localLogTail, true ); // allow unlocking @@ -923,7 +946,7 @@ namespace mongo { log() << "replAllDead, throwing SyncException: " << replAllDead << endl; throw SyncException(); } - + Client::Context ctx( ns ); ctx.getClient()->curop()->reset(); @@ -932,14 +955,14 @@ namespace mongo { if( logLevel >= 6 ) log(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl; - + // always apply admin command command // this is a bit hacky -- the semantics of replication/commands aren't well specified if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) { applyOperation( op ); return; } - + if ( ctx.justCreated() || empty || incompleteClone ) { // we must add to incomplete list now that setClient has been called incompleteCloneDbs.insert( clientName ); @@ -950,7 +973,8 @@ namespace mongo { clone 100 databases in one pass.) */ addDbNextPass.insert( clientName ); - } else { + } + else { if ( incompleteClone ) { log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl; } @@ -962,21 +986,25 @@ namespace mongo { incompleteCloneDbs.erase( clientName ); } save(); - } else { + } + else { bool mod; if ( replPair && replPair->state == ReplPair::State_Master ) { BSONObj id = idForOp( op, mod ); if ( !idTracker.haveId( ns, id ) ) { - applyOperation( op ); - } else if ( idTracker.haveModId( ns, id ) ) { + applyOperation( op ); + } + else if ( idTracker.haveModId( ns, id ) ) { log( 6 ) << "skipping operation matching mod id object " << op << endl; BSONObj existing; if ( Helpers::findOne( ns, id, existing ) ) logOp( "i", ns, existing ); - } else { + } + else { log( 6 ) << "skipping operation matching changed id object " << op << endl; } - } else { + } + else { applyOperation( op ); } addDbNextPass.erase( clientName ); @@ -988,33 +1016,33 @@ namespace mongo { const char *opType = op.getStringField( "op" ); BSONObj o = op.getObjectField( "o" ); switch( opType[ 0 ] ) { - case 'i': { - BSONObjBuilder idBuilder; - BSONElement id; - if ( !o.getObjectID( id ) ) - return BSONObj(); - idBuilder.append( id ); - return idBuilder.obj(); - } - case 'u': { - BSONObj o2 = op.getObjectField( "o2" ); - if ( strcmp( o2.firstElement().fieldName(), "_id" ) != 0 ) - return BSONObj(); - if ( o.firstElement().fieldName()[ 0 ] == '$' ) - mod = true; - return o2; - } - case 'd': { - if ( opType[ 1 ] != '\0' ) - return BSONObj(); // skip "db" op type - return o; - } - default: - break; - } + case 'i': { + BSONObjBuilder idBuilder; + BSONElement id; + if ( !o.getObjectID( id ) ) + return BSONObj(); + idBuilder.append( id ); + return idBuilder.obj(); + } + case 'u': { + BSONObj o2 = op.getObjectField( "o2" ); + if ( strcmp( o2.firstElement().fieldName(), "_id" ) != 0 ) + return BSONObj(); + if ( o.firstElement().fieldName()[ 0 ] == '$' ) + mod = true; + return o2; + } + case 'd': { + if ( opType[ 1 ] != '\0' ) + return BSONObj(); // skip "db" op type + return o; + } + default: + break; + } return BSONObj(); } - + void ReplSource::updateSetsWithOp( const BSONObj &op, bool mayUnlock ) { if ( mayUnlock ) { idTracker.mayUpgradeStorage(); @@ -1029,42 +1057,42 @@ namespace mongo { if ( mod ) idTracker.haveModId( ns, id, true ); idTracker.haveId( ns, id, true ); - } + } } - + void ReplSource::syncToTailOfRemoteLog() { string _ns = ns(); BSONObjBuilder b; if ( !only.empty() ) { 
b.appendRegex("ns", string("^") + only); - } + } BSONObj last = oplogReader.findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) ); if ( !last.isEmpty() ) { BSONElement ts = last.getField( "ts" ); massert( 10386 , "non Date ts found: " + last.toString(), ts.type() == Date || ts.type() == Timestamp ); syncedTo = OpTime( ts.date() ); - } + } } - + OpTime ReplSource::nextLastSavedLocalTs() const { Client::Context ctx( "local.oplog.$main" ); shared_ptr c = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) ); if ( c->ok() ) - return OpTime( c->current().getField( "ts" ).date() ); + return OpTime( c->current().getField( "ts" ).date() ); return OpTime(); } - + void ReplSource::setLastSavedLocalTs( const OpTime &nextLocalTs ) { _lastSavedLocalTs = nextLocalTs; log( 3 ) << "updated _lastSavedLocalTs to: " << _lastSavedLocalTs << endl; } - + void ReplSource::resetSlave() { log() << "**********************************************************\n"; log() << "Sending forcedead command to slave to stop its replication\n"; log() << "Host: " << hostName << " paired: " << paired << endl; massert( 10387 , "request to kill slave replication failed", - oplogReader.conn()->simpleCommand( "admin", 0, "forcedead" ) ); + oplogReader.conn()->simpleCommand( "admin", 0, "forcedead" ) ); syncToTailOfRemoteLog(); { dblock lk; @@ -1073,7 +1101,7 @@ namespace mongo { oplogReader.resetCursor(); } } - + bool ReplSource::updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ) { Client::Context ctx( "local.oplog.$main" ); shared_ptr localLog = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) ); @@ -1099,14 +1127,16 @@ namespace mongo { dbtemprelease t; resetSlave(); massert( 10388 , "local master log filled, forcing slave resync", false ); - } + } if ( !newTail.isNull() ) localLogTail = newTail; return true; } - + + extern unsigned replApplyBatchSize; + /* slave: pull some data from the master's oplog - note: not yet in db mutex at this point. + note: not yet in db mutex at this point. @return -1 error 0 ok, don't sleep 1 ok, sleep @@ -1126,7 +1156,7 @@ namespace mongo { OpTime localLogTail = _lastSavedLocalTs; bool initial = syncedTo.isNull(); - + if ( !oplogReader.haveCursor() || initial ) { if ( initial ) { // Important to grab last oplog timestamp before listing databases. @@ -1152,13 +1182,13 @@ namespace mongo { dblock lk; save(); } - + BSONObjBuilder q; q.appendDate("$gte", syncedTo.asDate()); BSONObjBuilder query; query.append("ts", q.done()); if ( !only.empty() ) { - // note we may here skip a LOT of data table scanning, a lot of work for the master. + // note we may here skip a LOT of data table scanning, a lot of work for the master. query.appendRegex("ns", string("^") + only); // maybe append "\\." here? 
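// Aside on the "maybe append \\." question above (illustrative helper, not part
// of this patch): the '^' + only regex is a bare prefix test, so only == "foo"
// would also match namespaces in a database named "foobar"; requiring a '.'
// immediately after the database name is the stricter check.
#include <iostream>
#include <string>

// True when namespace ns ("db.collection", or "db." for database-level ops)
// belongs to database db.
bool nsInDatabase(const std::string& ns, const std::string& db) {
    return ns.size() > db.size() &&
           ns.compare(0, db.size(), db) == 0 &&
           ns[db.size()] == '.';
}

int main() {
    std::cout << nsInDatabase("foo.bar", "foo")       // 1
              << nsInDatabase("foobar.baz", "foo")    // 0
              << '\n';
    return 0;
}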
} BSONObj queryObj = query.done(); @@ -1185,7 +1215,7 @@ namespace mongo { b.append("ns", *i + '.'); b.append("op", "db"); BSONObj op = b.done(); - sync_pullOpLog_applyOperation(op, 0); + sync_pullOpLog_applyOperation(op, 0, false); } } @@ -1195,7 +1225,8 @@ namespace mongo { if( oplogReader.awaitCapable() ) okResultCode = 0; // don't sleep - } else { + } + else { log() << "repl: " << ns << " oplog is empty\n"; } { @@ -1207,11 +1238,11 @@ namespace mongo { setLastSavedLocalTs( nextLastSaved ); } } - save(); + save(); } return okResultCode; } - + OpTime nextOpTime; { BSONObj op = oplogReader.next(); @@ -1234,32 +1265,31 @@ namespace mongo { massert( 10391 , "repl: bad object read from remote oplog", false); } } - + if ( replPair && replPair->state == ReplPair::State_Master ) { - + OpTime next( ts.date() ); if ( !tailing && !initial && next != syncedTo ) { log() << "remote slave log filled, forcing slave resync" << endl; resetSlave(); return 1; - } - + } + dblock lk; updateSetsWithLocalOps( localLogTail, true ); } - + nextOpTime = OpTime( ts.date() ); log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; - if ( tailing || initial ) { - if ( initial ) - log(1) << "repl: initial run\n"; - else { - if( !( syncedTo <= nextOpTime ) ) { - log() << "repl ASSERTION failed : syncedTo <= nextOpTime" << endl; - log() << "repl syncTo: " << syncedTo.toStringLong() << endl; - log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl; - assert(false); - } + if ( initial ) { + log(1) << "repl: initial run\n"; + } + if( tailing ) { + if( !( syncedTo < nextOpTime ) ) { + log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl; + log() << "repl syncTo: " << syncedTo.toStringLong() << endl; + log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl; + assert(false); } oplogReader.putBack( op ); // op will be processed in the loop below nextOpTime = OpTime(); // will reread the op below @@ -1281,14 +1311,14 @@ namespace mongo { throw SyncException(); } else { - /* t == syncedTo, so the first op was applied previously. */ + /* t == syncedTo, so the first op was applied previously or it is the first op of initial query and need not be applied. 
*/ } } // apply operations { int n = 0; - time_t saveLast = time(0); + time_t saveLast = time(0); while ( 1 ) { /* from a.s.: I think the idea here is that we can establish a sync point between the local op log and the remote log with the following steps: @@ -1316,7 +1346,8 @@ namespace mongo { if ( getInitialSyncCompleted() ) { // if initial sync hasn't completed, break out of loop so we can set to completed or clone more dbs continue; } - } else { + } + else { setLastSavedLocalTs( nextLastSaved ); } } @@ -1332,109 +1363,132 @@ namespace mongo { else { } - OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { - // periodically note our progress, in case we are doing a lot of work and crash - dblock lk; + OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { + // periodically note our progress, in case we are doing a lot of work and crash + dblock lk; syncedTo = nextOpTime; // can't update local log ts since there are pending operations from our peer - save(); + save(); log() << "repl: checkpoint applied " << n << " operations" << endl; log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; - saveLast = time(0); - n = 0; - } + saveLast = time(0); + n = 0; + } BSONObj op = oplogReader.next(); - BSONElement ts = op.getField("ts"); - if( !( ts.type() == Date || ts.type() == Timestamp ) ) { - log() << "sync error: problem querying remote oplog record\n"; - log() << "op: " << op.toString() << '\n'; - log() << "halting replication" << endl; - replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; - throw SyncException(); - } - OpTime last = nextOpTime; - nextOpTime = OpTime( ts.date() ); - if ( !( last < nextOpTime ) ) { - log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; - log() << " last: " << last.toStringLong() << '\n'; - log() << " nextOpTime: " << nextOpTime.toStringLong() << '\n'; - log() << " halting replication" << endl; - replInfo = replAllDead = "sync error last >= nextOpTime"; - uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); - } - if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { - oplogReader.putBack( op ); - _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; - dblock lk; - if ( n > 0 ) { - syncedTo = last; - save(); + + unsigned b = replApplyBatchSize; + bool justOne = b == 1; + scoped_ptr lk( justOne ? 
0 : new writelock() ); + while( 1 ) { + + BSONElement ts = op.getField("ts"); + if( !( ts.type() == Date || ts.type() == Timestamp ) ) { + log() << "sync error: problem querying remote oplog record" << endl; + log() << "op: " << op.toString() << endl; + log() << "halting replication" << endl; + replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; + throw SyncException(); + } + OpTime last = nextOpTime; + nextOpTime = OpTime( ts.date() ); + if ( !( last < nextOpTime ) ) { + log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; + log() << " last: " << last.toStringLong() << endl; + log() << " nextOpTime: " << nextOpTime.toStringLong() << endl; + log() << " halting replication" << endl; + replInfo = replAllDead = "sync error last >= nextOpTime"; + uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); + } + if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { + assert( justOne ); + oplogReader.putBack( op ); + _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; + dblock lk; + if ( n > 0 ) { + syncedTo = last; + save(); + } + log() << "repl: applied " << n << " operations" << endl; + log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; + log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; + return okResultCode; } - log() << "repl: applied " << n << " operations" << endl; - log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; - log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; - break; - } - sync_pullOpLog_applyOperation(op, &localLogTail); - n++; + sync_pullOpLog_applyOperation(op, &localLogTail, !justOne); + n++; + + if( --b == 0 ) + break; + // if to here, we are doing mulpile applications in a singel write lock acquisition + if( !oplogReader.moreInCurrentBatch() ) { + // break if no more in batch so we release lock while reading from the master + break; + } + op = oplogReader.next(); + + getDur().commitIfNeeded(); + } } } return okResultCode; } - BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); - - bool replAuthenticate(DBClientConnection *conn) { - if( ! cc().isAdmin() ){ - log() << "replauthenticate: requires admin permissions, failing\n"; - return false; - } - - BSONObj user; - { - dblock lk; - Client::Context ctxt("local."); - if( !Helpers::findOne("local.system.users", userReplQuery, user) ) { - // try the first user is local - if( !Helpers::getSingleton("local.system.users", user) ) { - if( noauth ) - return true; // presumably we are running a --noauth setup all around. - - log() << "replauthenticate: no user in local.system.users to use for authentication\n"; - return false; - } - } - - } - - string u = user.getStringField("user"); - string p = user.getStringField("pwd"); - massert( 10392 , "bad user object? [1]", !u.empty()); - massert( 10393 , "bad user object? [2]", !p.empty()); - string err; - if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) { - log() << "replauthenticate: can't authenticate to master server, user:" << u << endl; - return false; - } - return true; - } + BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); + + bool replAuthenticate(DBClientBase *conn) { + if( ! 
cc().isAdmin() ) { + log() << "replauthenticate: requires admin permissions, failing\n"; + return false; + } + + string u; + string p; + if (internalSecurity.pwd.length() > 0) { + u = internalSecurity.user; + p = internalSecurity.pwd; + } + else { + BSONObj user; + { + dblock lk; + Client::Context ctxt("local."); + if( !Helpers::findOne("local.system.users", userReplQuery, user) || + // try the first user in local + !Helpers::getSingleton("local.system.users", user) ) { + log() << "replauthenticate: no user in local.system.users to use for authentication\n"; + return noauth; + } + } + u = user.getStringField("user"); + p = user.getStringField("pwd"); + massert( 10392 , "bad user object? [1]", !u.empty()); + massert( 10393 , "bad user object? [2]", !p.empty()); + } + + string err; + if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) { + log() << "replauthenticate: can't authenticate to master server, user:" << u << endl; + return false; + } + return true; + } bool replHandshake(DBClientConnection *conn) { - + BSONObj me; { dblock l; - if ( ! Helpers::getSingleton( "local.me" , me ) ){ + // local.me is an identifier for a server for getLastError w:2+ + if ( ! Helpers::getSingleton( "local.me" , me ) ) { BSONObjBuilder b; b.appendOID( "_id" , 0 , true ); me = b.obj(); Helpers::putSingleton( "local.me" , me ); } } - + BSONObjBuilder cmd; cmd.appendAs( me["_id"] , "handshake" ); @@ -1450,9 +1504,9 @@ namespace mongo { _conn = auto_ptr(new DBClientConnection( false, 0, replPair ? 20 : 0 /* tcp timeout */)); string errmsg; ReplInfo r("trying to connect to sync source"); - if ( !_conn->connect(hostName.c_str(), errmsg) || - !replAuthenticate(_conn.get()) || - !replHandshake(_conn.get()) ) { + if ( !_conn->connect(hostName.c_str(), errmsg) || + (!noauth && !replAuthenticate(_conn.get())) || + !replHandshake(_conn.get()) ) { resetConnection(); log() << "repl: " << errmsg << endl; return false; @@ -1460,7 +1514,7 @@ namespace mongo { } return true; } - + /* note: not yet in mutex at this point. returns >= 0 if ok. return -1 if you want to reconnect. return value of zero indicates no sleep necessary before next call @@ -1486,14 +1540,14 @@ namespace mongo { } if ( !oplogReader.connect(hostName) ) { - log(4) << "repl: can't connect to sync source" << endl; + log(4) << "repl: can't connect to sync source" << endl; if ( replPair && paired ) { assert( startsWith(hostName.c_str(), replPair->remoteHost.c_str()) ); replPair->arbitrate(); } return -1; } - + if ( paired ) { int remote = replPair->negotiate(oplogReader.conn(), "direct"); int nMasters = ( remote == ReplPair::State_Master ) + ( replPair->state == ReplPair::State_Master ); @@ -1504,17 +1558,17 @@ namespace mongo { } /* - // get current mtime at the server. - BSONObj o = conn->findOne("admin.$cmd", opTimeQuery); - BSONElement e = o.getField("optime"); - if( e.eoo() ) { - log() << "repl: failed to get cur optime from master" << endl; - log() << " " << o.toString() << endl; - return false; - } - uassert( 10124 , e.type() == Date ); - OpTime serverCurTime; - serverCurTime.asDate() = e.date(); + // get current mtime at the server. 
+ BSONObj o = conn->findOne("admin.$cmd", opTimeQuery); + BSONElement e = o.getField("optime"); + if( e.eoo() ) { + log() << "repl: failed to get cur optime from master" << endl; + log() << " " << o.toString() << endl; + return false; + } + uassert( 10124 , e.type() == Date ); + OpTime serverCurTime; + serverCurTime.asDate() = e.date(); */ return sync_pullOpLog(nApplied); } @@ -1527,7 +1581,7 @@ namespace mongo { _ reuse that cursor when we can */ - /* returns: # of seconds to sleep before next pass + /* returns: # of seconds to sleep before next pass 0 = no sleep recommended 1 = special sentinel indicating adaptive sleep recommended */ @@ -1543,6 +1597,7 @@ namespace mongo { /* replication is not configured yet (for --slave) in local.sources. Poll for config it every 20 seconds. */ + log() << "no source given, add a master to local.sources to start replication" << endl; return 20; } @@ -1553,7 +1608,7 @@ namespace mongo { try { res = s->sync(nApplied); bool moreToSync = s->haveMoreDbsToSync(); - if( res < 0 ) { + if( res < 0 ) { sleepAdvice = 3; } else if( moreToSync ) { @@ -1562,7 +1617,7 @@ namespace mongo { else if ( s->sleepAdvice() ) { sleepAdvice = s->sleepAdvice(); } - else + else sleepAdvice = res; if ( res >= 0 && !moreToSync /*&& !s->syncedTo.isNull()*/ ) { pairSync->setInitialSyncCompletedLocking(); @@ -1588,9 +1643,9 @@ namespace mongo { } catch ( const std::exception &e ) { log() << "repl: std::exception " << e.what() << endl; - replInfo = "replMain caught std::exception"; + replInfo = "replMain caught std::exception"; } - catch ( ... ) { + catch ( ... ) { log() << "unexpected exception during replication. replication will halt" << endl; replAllDead = "caught unexpected exception during replication"; } @@ -1616,15 +1671,16 @@ namespace mongo { try { int nApplied = 0; s = _replMain(sources, nApplied); - if( s == 1 ) { + if( s == 1 ) { if( nApplied == 0 ) s = 2; - else if( nApplied > 100 ) { + else if( nApplied > 100 ) { // sleep very little - just enought that we aren't truly hammering master sleepmillis(75); s = 0; } } - } catch (...) { + } + catch (...) { out() << "caught exception in _replMain" << endl; s = 4; } @@ -1634,10 +1690,10 @@ namespace mongo { syncing--; } - if( relinquishSyncingSome ) { - relinquishSyncingSome = 0; - s = 1; // sleep before going back in to syncing=1 - } + if( relinquishSyncingSome ) { + relinquishSyncingSome = 0; + s = 1; // sleep before going back in to syncing=1 + } if ( s ) { stringstream ss; @@ -1660,21 +1716,21 @@ namespace mongo { while( 1 ) { sleepsecs( toSleep ); - /* write a keep-alive like entry to the log. this will make things like + /* write a keep-alive like entry to the log. this will make things like printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date even when things are idle. */ { writelocktry lk("",1); - if ( lk.got() ){ + if ( lk.got() ) { toSleep = 10; - - cc().getAuthenticationInfo()->authorize("admin"); - - try { + + cc().getAuthenticationInfo()->authorize("admin"); + + try { logKeepalive(); } - catch(...) { + catch(...) 
{ log() << "caught exception in replMasterThread()" << endl; } } @@ -1690,11 +1746,11 @@ namespace mongo { sleepsecs(1); Client::initThread("replslave"); cc().iAmSyncThread(); - + { dblock lk; cc().getAuthenticationInfo()->authorize("admin"); - + BSONObj obj; if ( Helpers::getSingleton("local.pair.startup", obj) ) { // should be: {replacepeer:1} @@ -1730,12 +1786,11 @@ namespace mongo { void startReplication() { /* if we are going to be a replica set, we aren't doing other forms of replication. */ if( !cmdLine._replSet.empty() ) { - if( replSettings.slave || replSettings.master || replPair ) { + if( replSettings.slave || replSettings.master || replPair ) { log() << "***" << endl; log() << "ERROR: can't use --slave or --master replication options with --replSet" << endl; log() << "***" << endl; } - createOplog(); newRepl(); return; } @@ -1773,7 +1828,7 @@ namespace mongo { createOplog(); boost::thread t(replMasterThread); } - + while( replSettings.fastsync ) // don't allow writes until we've set up from log sleepmillis( 50 ); } @@ -1807,5 +1862,29 @@ namespace mongo { } tp.join(); } - + + class ReplApplyBatchSizeValidator : public ParameterValidator { + public: + ReplApplyBatchSizeValidator() : ParameterValidator( "replApplyBatchSize" ) {} + + virtual bool isValid( BSONElement e , string& errmsg ) { + int b = e.numberInt(); + if( b < 1 || b > 1024 ) { + errmsg = "replApplyBatchSize has to be >= 1 and < 1024"; + return false; + } + + if ( replSettings.slavedelay != 0 && b > 1 ) { + errmsg = "can't use a batch size > 1 with slavedelay"; + return false; + } + if ( ! replSettings.slave ) { + errmsg = "can't set replApplyBatchSize on a non-slave machine"; + return false; + } + + return true; + } + } replApplyBatchSizeValidator; + } // namespace mongo diff --git a/db/repl.h b/db/repl.h index f33acad..45036fa 100644 --- a/db/repl.h +++ b/db/repl.h @@ -40,16 +40,16 @@ namespace mongo { - /* replication slave? (possibly with slave or repl pair nonmaster) + /* replication slave? (possibly with slave or repl pair nonmaster) --slave cmd line setting -> SimpleSlave - */ - typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes; + */ + typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes; class ReplSettings { public: SlaveTypes slave; - /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing), + /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing), this won't be true. */ bool master; @@ -57,9 +57,9 @@ namespace mongo { int opIdMem; bool fastsync; - + bool autoresync; - + int slavedelay; ReplSettings() @@ -69,14 +69,14 @@ namespace mongo { }; extern ReplSettings replSettings; - - bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, - bool slaveOk, bool useReplAuth, bool snapshot); + + bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, + bool slaveOk, bool useReplAuth, bool snapshot); /* A replication exception */ class SyncException : public DBException { public: - SyncException() : DBException( "sync exception" , 10001 ){} + SyncException() : DBException( "sync exception" , 10001 ) {} }; /* A Source is a source from which we can pull (replicate) data. @@ -94,11 +94,14 @@ namespace mongo { bool resync(string db); - /* pull some operations from the master's oplog, and apply them. 
*/ + /** @param alreadyLocked caller already put us in write lock if true */ + void sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail, bool alreadyLocked); + + /* pull some operations from the master's oplog, and apply them. + calls sync_pullOpLog_applyOperation + */ int sync_pullOpLog(int& nApplied); - void sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail); - /* we only clone one database per pass, even if a lot need done. This helps us avoid overflowing the master's transaction log by doing too much work before going back to read more transactions. (Imagine a scenario of slave startup where we try to @@ -109,7 +112,7 @@ namespace mongo { set incompleteCloneDbs; ReplSource(); - + // returns the dummy ns used to do the drop string resyncDrop( const char *db, const char *requester ); // returns possibly unowned id spec for the operation. @@ -127,7 +130,7 @@ namespace mongo { bool updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ); string ns() const { return string( "local.oplog.$" ) + sourceName(); } unsigned _sleepAdviceTime; - + public: OplogReader oplogReader; @@ -136,9 +139,7 @@ namespace mongo { bool paired; // --pair in use string hostName; // ip addr or hostname plus optionally, ":" string _sourceName; // a logical source name. - string sourceName() const { - return _sourceName.empty() ? "main" : _sourceName; - } + string sourceName() const { return _sourceName.empty() ? "main" : _sourceName; } string only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating. /* the last time point we have already synced up to (in the remote/master's oplog). */ @@ -146,8 +147,8 @@ namespace mongo { /* This is for repl pairs. _lastSavedLocalTs is the most recent point in the local log that we know is consistent - with the remote log ( ie say the local op log has entries ABCDE and the remote op log - has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled + with the remote log ( ie say the local op log has entries ABCDE and the remote op log + has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled the DE-XY difference.) */ OpTime _lastSavedLocalTs; @@ -171,15 +172,15 @@ namespace mongo { return hostName == r.hostName && sourceName() == r.sourceName(); } string toString() const { return sourceName() + "@" + hostName; } - - bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); } + + bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); } int sleepAdvice() const { if ( !_sleepAdviceTime ) return 0; int wait = _sleepAdviceTime - unsigned( time( 0 ) ); return wait > 0 ? wait : 0; } - + static bool throttledForceResyncDead( const char *requester ); static void forceResyncDead( const char *requester ); void forceResync( const char *requester ); @@ -200,7 +201,8 @@ namespace mongo { if ( imp_[ ns ].insert( id.getOwned() ).second ) { size_ += id.objsize() + sizeof( BSONObj ); } - } else { + } + else { if ( imp_[ ns ].erase( id ) == 1 ) { size_ -= id.objsize() + sizeof( BSONObj ); } @@ -236,7 +238,7 @@ namespace mongo { // rename _id to id since there may be duplicates b.appendAs( id.firstElement(), "id" ); return b.obj(); - } + } DbSet impl_; }; @@ -244,14 +246,14 @@ namespace mongo { // All functions must be called with db mutex held // Kind of sloppy class structure, for now just want to keep the in mem // version speedy. 
- // see http://www.mongodb.org/display/DOCS/Pairing+Internals + // see http://www.mongodb.org/display/DOCS/Pairing+Internals class IdTracker { public: IdTracker() : - dbIds_( "local.temp.replIds" ), - dbModIds_( "local.temp.replModIds" ), - inMem_( true ), - maxMem_( replSettings.opIdMem ) { + dbIds_( "local.temp.replIds" ), + dbModIds_( "local.temp.replModIds" ), + inMem_( true ), + maxMem_( replSettings.opIdMem ) { } void reset( int maxMem = replSettings.opIdMem ) { memIds_.reset(); @@ -309,7 +311,7 @@ namespace mongo { void upgrade( MemIds &a, DbIds &b ) { for( MemIds::IdSets::const_iterator i = a.imp_.begin(); i != a.imp_.end(); ++i ) { for( BSONObjSetDefaultOrder::const_iterator j = i->second.begin(); j != i->second.end(); ++j ) { - set( b, i->first.c_str(), *j, true ); + set( b, i->first.c_str(), *j, true ); RARELY { dbtemprelease t; } @@ -323,9 +325,9 @@ namespace mongo { bool inMem_; int maxMem_; }; - + bool anyReplEnabled(); void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 ); - - + + } // namespace mongo diff --git a/db/repl/connections.h b/db/repl/connections.h index cdf2fad..7e7bfe5 100644 --- a/db/repl/connections.h +++ b/db/repl/connections.h @@ -1,4 +1,4 @@ -// @file +// @file /* * Copyright (C) 2010 10gen Inc. @@ -20,11 +20,12 @@ #include #include "../../client/dbclient.h" +#include "../security_key.h" -namespace mongo { +namespace mongo { - /** here we keep a single connection (with reconnect) for a set of hosts, - one each, and allow one user at a time per host. if in use already for that + /** here we keep a single connection (with reconnect) for a set of hosts, + one each, and allow one user at a time per host. if in use already for that host, we block. so this is an easy way to keep a 1-deep pool of connections that many threads can share. @@ -39,35 +40,37 @@ namespace mongo { throws exception on connect error (but fine to try again later with a new scopedconn object for same host). */ - class ScopedConn { + class ScopedConn { public: /** throws assertions if connect failure etc. */ ScopedConn(string hostport); ~ScopedConn(); /* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic. - So here what we do is wrapper known safe methods and not allow cursor-style queries at all. This makes + So here what we do is wrapper known safe methods and not allow cursor-style queries at all. This makes ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed. */ bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0) { return conn()->runCommand(dbname, cmd, info, options); } - unsigned long long count(const string &ns) { - return conn()->count(ns); + unsigned long long count(const string &ns) { + return conn()->count(ns); } - BSONObj findOne(const string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) { + BSONObj findOne(const string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) { return conn()->findOne(ns, q, fieldsToReturn, queryOptions); } + void setTimeout(double to) { + conn()->setSoTimeout(to); + } private: auto_ptr connLock; - static mutex mapMutex; - struct X { - mutex z; + static mongo::mutex mapMutex; + struct X { + mongo::mutex z; DBClientConnection cc; - X() : z("X"), cc(/*reconnect*/ true, 0, - /*timeout*/ theReplSet ? 
theReplSet->config().ho.heartbeatTimeoutMillis/1000.0 : 10.0) { + X() : z("X"), cc(/*reconnect*/ true, 0, /*timeout*/ 10.0) { cc._logLevel = 2; } } *x; @@ -87,22 +90,30 @@ namespace mongo { connLock.reset( new scoped_lock(x->z) ); } } - if( !first ) { + if( !first ) { connLock.reset( new scoped_lock(x->z) ); return; } // we already locked above... string err; - x->cc.connect(hostport, err); + if (!x->cc.connect(hostport, err)) { + log() << "couldn't connect to " << hostport << ": " << err << rsLog; + return; + } + + if (!noauth && !x->cc.auth("local", internalSecurity.user, internalSecurity.pwd, err, false)) { + log() << "could not authenticate against " << conn()->toString() << ", " << err << rsLog; + return; + } } - inline ScopedConn::~ScopedConn() { + inline ScopedConn::~ScopedConn() { // conLock releases... } - /*inline DBClientConnection* ScopedConn::operator->() { - return &x->cc; + /*inline DBClientConnection* ScopedConn::operator->() { + return &x->cc; }*/ } diff --git a/db/repl/consensus.cpp b/db/repl/consensus.cpp index 1519c26..f764abe 100644 --- a/db/repl/consensus.cpp +++ b/db/repl/consensus.cpp @@ -19,9 +19,9 @@ #include "rs.h" #include "multicmd.h" -namespace mongo { +namespace mongo { - class CmdReplSetFresh : public ReplSetCommand { + class CmdReplSetFresh : public ReplSetCommand { public: CmdReplSetFresh() : ReplSetCommand("replSetFresh") { } private: @@ -29,23 +29,23 @@ namespace mongo { if( !check(errmsg, result) ) return false; - if( cmdObj["set"].String() != theReplSet->name() ) { + if( cmdObj["set"].String() != theReplSet->name() ) { errmsg = "wrong repl set name"; return false; } string who = cmdObj["who"].String(); int cfgver = cmdObj["cfgver"].Int(); - OpTime opTime(cmdObj["opTime"].Date()); + OpTime opTime(cmdObj["opTime"].Date()); bool weAreFresher = false; - if( theReplSet->config().version > cfgver ) { + if( theReplSet->config().version > cfgver ) { log() << "replSet member " << who << " is not yet aware its cfg version " << cfgver << " is stale" << rsLog; - result.append("info", "config version stale"); + result.append("info", "config version stale"); + weAreFresher = true; + } + else if( opTime < theReplSet->lastOpTimeWritten ) { weAreFresher = true; } - else if( opTime < theReplSet->lastOpTimeWritten ) { - weAreFresher = true; - } result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate()); result.append("fresher", weAreFresher); return true; @@ -66,19 +66,19 @@ namespace mongo { } } cmdReplSetElect; - int Consensus::totalVotes() const { + int Consensus::totalVotes() const { static int complain = 0; int vTot = rs._self->config().votes; - for( Member *m = rs.head(); m; m=m->next() ) + for( Member *m = rs.head(); m; m=m->next() ) vTot += m->config().votes; if( vTot % 2 == 0 && vTot && complain++ == 0 ) - log() << "replSet warning total number of votes is even - considering giving one member an extra vote" << rsLog; + log() << "replSet " /*buildbot! warning */ "total number of votes is even - add arbiter or give one member an extra vote" << rsLog; return vTot; } bool Consensus::aMajoritySeemsToBeUp() const { int vUp = rs._self->config().votes; - for( Member *m = rs.head(); m; m=m->next() ) + for( Member *m = rs.head(); m; m=m->next() ) vUp += m->hbinfo().up() ? 
m->config().votes : 0; return vUp * 2 > totalVotes(); } @@ -98,13 +98,13 @@ namespace mongo { const time_t LeaseTime = 30; - unsigned Consensus::yea(unsigned memberId) /* throws VoteException */ { + unsigned Consensus::yea(unsigned memberId) { /* throws VoteException */ Atomic::tran t(ly); LastYea &ly = t.ref(); time_t now = time(0); if( ly.when + LeaseTime >= now && ly.who != memberId ) { log(1) << "replSet not voting yea for " << memberId << - " voted for " << ly.who << ' ' << now-ly.when << " secs ago" << rsLog; + " voted for " << ly.who << ' ' << now-ly.when << " secs ago" << rsLog; throw VoteException(); } ly.when = now; @@ -112,7 +112,7 @@ namespace mongo { return rs._self->config().votes; } - /* we vote for ourself at start of election. once it fails, we can cancel the lease we had in + /* we vote for ourself at start of election. once it fails, we can cancel the lease we had in place instead of leaving it for a long time. */ void Consensus::electionFailed(unsigned meid) { @@ -124,7 +124,7 @@ namespace mongo { } /* todo: threading **************** !!!!!!!!!!!!!!!! */ - void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) { + void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) { BSONObjBuilder& b = *_b; DEV log() << "replSet received elect msg " << cmd.toString() << rsLog; else log(2) << "replSet received elect msg " << cmd.toString() << rsLog; @@ -138,14 +138,14 @@ namespace mongo { const Member* hopeful = rs.findById(whoid); int vote = 0; - if( set != rs.name() ) { + if( set != rs.name() ) { log() << "replSet error received an elect request for '" << set << "' but our set name is '" << rs.name() << "'" << rsLog; } - else if( myver < cfgver ) { + else if( myver < cfgver ) { // we are stale. don't vote } - else if( myver > cfgver ) { + else if( myver > cfgver ) { // they are stale! log() << "replSet info got stale version # during election" << rsLog; vote = -10000; @@ -154,10 +154,10 @@ namespace mongo { log() << "couldn't find member with id " << whoid << rsLog; vote = -10000; } - else if( primary && primary->hbinfo().opTime > hopeful->hbinfo().opTime ) { + else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) { // other members might be aware of more up-to-date nodes log() << hopeful->fullName() << " is trying to elect itself but " << - primary->fullName() << " is already primary and more up-to-date" << rsLog; + primary->fullName() << " is already primary and more up-to-date" << rsLog; vote = -10000; } else { @@ -166,7 +166,7 @@ namespace mongo { rs.relinquish(); log() << "replSet info voting yea for " << whoid << rsLog; } - catch(VoteException&) { + catch(VoteException&) { log() << "replSet voting no already voted for another" << rsLog; } } @@ -182,10 +182,10 @@ namespace mongo { L.push_back( Target(m->fullName()) ); } - /* config version is returned as it is ok to use this unlocked. BUT, if unlocked, you would need + /* config version is returned as it is ok to use this unlocked. BUT, if unlocked, you would need to check later that the config didn't change. 
*/ void ReplSetImpl::getTargets(list& L, int& configVersion) { - if( lockedByMe() ) { + if( lockedByMe() ) { _getTargets(L, configVersion); return; } @@ -200,15 +200,21 @@ namespace mongo { bool Consensus::weAreFreshest(bool& allUp, int& nTies) { const OpTime ord = theReplSet->lastOpTimeWritten; nTies = 0; - assert( !ord.isNull() ); + assert( !ord.isNull() ); BSONObj cmd = BSON( - "replSetFresh" << 1 << - "set" << rs.name() << - "opTime" << Date_t(ord.asDate()) << - "who" << rs._self->fullName() << - "cfgver" << rs._cfg->version ); + "replSetFresh" << 1 << + "set" << rs.name() << + "opTime" << Date_t(ord.asDate()) << + "who" << rs._self->fullName() << + "cfgver" << rs._cfg->version ); list L; int ver; + /* the following queries arbiters, even though they are never fresh. wonder if that makes sense. + it doesn't, but it could, if they "know" what freshness it one day. so consider removing + arbiters from getTargets() here. although getTargets is used elsewhere for elections; there + arbiters are certainly targets - so a "includeArbs" bool would be necessary if we want to make + not fetching them herein happen. + */ rs.getTargets(L, ver); multiCommand(cmd, L); int nok = 0; @@ -228,25 +234,25 @@ namespace mongo { allUp = false; } } - DEV log() << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog; + log(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog; assert( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working... return true; } extern time_t started; - void Consensus::multiCommand(BSONObj cmd, list& L) { + void Consensus::multiCommand(BSONObj cmd, list& L) { assert( !rs.lockedByMe() ); mongo::multiCommand(cmd, L); } void Consensus::_electSelf() { - if( time(0) < steppedDown ) + if( time(0) < steppedDown ) return; { const OpTime ord = theReplSet->lastOpTimeWritten; - if( ord == 0 ) { + if( ord == 0 ) { log() << "replSet info not trying to elect self, do not yet have a complete set of data from any point in time" << rsLog; return; } @@ -254,16 +260,16 @@ namespace mongo { bool allUp; int nTies; - if( !weAreFreshest(allUp, nTies) ) { + if( !weAreFreshest(allUp, nTies) ) { log() << "replSet info not electing self, we are not freshest" << rsLog; return; } rs.sethbmsg("",9); - if( !allUp && time(0) - started < 60 * 5 ) { - /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data - if we don't have to -- we'd rather be offline and wait a little longer instead + if( !allUp && time(0) - started < 60 * 5 ) { + /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data + if we don't have to -- we'd rather be offline and wait a little longer instead todo: make this configurable. */ rs.sethbmsg("not electing self, not all members up and we have been up less than 5 minutes"); @@ -276,9 +282,10 @@ namespace mongo { /* tie? we then randomly sleep to try to not collide on our voting. */ /* todo: smarter. 
*/ if( me.id() == 0 || sleptLast ) { - // would be fine for one node not to sleep + // would be fine for one node not to sleep // todo: biggest / highest priority nodes should be the ones that get to not sleep - } else { + } + else { assert( !rs.lockedByMe() ); // bad to go to sleep locked unsigned ms = ((unsigned) rand()) % 1000 + 50; DEV log() << "replSet tie " << nTies << " sleeping a little " << ms << "ms" << rsLog; @@ -297,13 +304,13 @@ namespace mongo { log() << "replSet info electSelf " << meid << rsLog; BSONObj electCmd = BSON( - "replSetElect" << 1 << - "set" << rs.name() << - "who" << me.fullName() << - "whoid" << me.hbinfo().id() << - "cfgver" << rs._cfg->version << - "round" << OID::gen() /* this is just for diagnostics */ - ); + "replSetElect" << 1 << + "set" << rs.name() << + "who" << me.fullName() << + "whoid" << me.hbinfo().id() << + "cfgver" << rs._cfg->version << + "round" << OID::gen() /* this is just for diagnostics */ + ); int configVersion; list L; @@ -326,7 +333,7 @@ namespace mongo { // defensive; should never happen as we have timeouts on connection and operation for our conn log() << "replSet too much time passed during our election, ignoring result" << rsLog; } - else if( configVersion != rs.config().version ) { + else if( configVersion != rs.config().version ) { log() << "replSet config version changed during our election, ignoring result" << rsLog; } else { @@ -334,9 +341,10 @@ namespace mongo { log(1) << "replSet election succeeded, assuming primary role" << rsLog; success = true; rs.assumePrimary(); - } + } } - } catch( std::exception& ) { + } + catch( std::exception& ) { if( !success ) electionFailed(meid); throw; } @@ -347,19 +355,19 @@ namespace mongo { assert( !rs.lockedByMe() ); assert( !rs.myConfig().arbiterOnly ); assert( rs.myConfig().slaveDelay == 0 ); - try { - _electSelf(); - } - catch(RetryAfterSleepException&) { + try { + _electSelf(); + } + catch(RetryAfterSleepException&) { throw; } - catch(VoteException& ) { + catch(VoteException& ) { log() << "replSet not trying to elect self as responded yea to someone else recently" << rsLog; } - catch(DBException& e) { + catch(DBException& e) { log() << "replSet warning caught unexpected exception in electSelf() " << e.toString() << rsLog; } - catch(...) { + catch(...) 
{ log() << "replSet warning caught unexpected exception in electSelf()" << rsLog; } } diff --git a/db/repl/health.cpp b/db/repl/health.cpp index c75221c..762ca90 100644 --- a/db/repl/health.cpp +++ b/db/repl/health.cpp @@ -32,20 +32,22 @@ #include "../dbhelpers.h" namespace mongo { + /* decls for connections.h */ - ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M()); + ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M()); mutex ScopedConn::mapMutex("ScopedConn::mapMutex"); } -namespace mongo { +namespace mongo { using namespace mongoutils::html; using namespace bson; static RamLog _rsLog; Tee *rsLog = &_rsLog; + extern bool replSetBlind; - string ago(time_t t) { + string ago(time_t t) { if( t == 0 ) return ""; time_t x = time(0) - t; @@ -58,14 +60,14 @@ namespace mongo { s.precision(2); s << x / 60.0 << " mins"; } - else { + else { s.precision(2); s << x / 3600.0 << " hrs"; } return s.str(); } - void Member::summarizeMember(stringstream& s) const { + void Member::summarizeMember(stringstream& s) const { s << tr(); { stringstream u; @@ -89,27 +91,29 @@ namespace mongo { s << td(h); } s << td(config().votes); - { + s << td(config().priority); + { string stateText = state().toString(); if( _config.hidden ) stateText += " (hidden)"; - if( ok || stateText.empty() ) + if( ok || stateText.empty() ) s << td(stateText); // text blank if we've never connected else s << td( grey(str::stream() << "(was " << state().toString() << ')', true) ); } s << td( grey(hbinfo().lastHeartbeatMsg,!ok) ); stringstream q; - q << "/_replSetOplog?" << id(); + q << "/_replSetOplog?_id=" << id(); s << td( a(q.str(), "", never ? "?" : hbinfo().opTime.toString()) ); if( hbinfo().skew > INT_MIN ) { s << td( grey(str::stream() << hbinfo().skew,!ok) ); - } else + } + else s << td(""); s << _tr(); } - - string ReplSetImpl::stateAsHtml(MemberState s) { + + string ReplSetImpl::stateAsHtml(MemberState s) { if( s.s == MemberState::RS_STARTUP ) return a("", "serving still starting up, or still trying to initiate the set", "STARTUP"); if( s.s == MemberState::RS_PRIMARY ) return a("", "this server thinks it is primary", "PRIMARY"); if( s.s == MemberState::RS_SECONDARY ) return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY"); @@ -122,7 +126,7 @@ namespace mongo { return ""; } - string MemberState::toString() const { + string MemberState::toString() const { if( s == MemberState::RS_STARTUP ) return "STARTUP"; if( s == MemberState::RS_PRIMARY ) return "PRIMARY"; if( s == MemberState::RS_SECONDARY ) return "SECONDARY"; @@ -143,9 +147,9 @@ namespace mongo { set skip; be e = op["ts"]; - if( e.type() == Date || e.type() == Timestamp ) { + if( e.type() == Date || e.type() == Timestamp ) { OpTime ot = e._opTime(); - ss << td( time_t_to_String_short( ot.getSecs() ) ); + ss << td( time_t_to_String_short( ot.getSecs() ) ); ss << td( ot.toString() ); skip.insert("ts"); } @@ -155,7 +159,8 @@ namespace mongo { if( e.type() == NumberLong ) { ss << "" << hex << e.Long() << "\n"; skip.insert("h"); - } else + } + else ss << td("?"); ss << td(op["op"].valuestrsafe()); @@ -164,20 +169,17 @@ namespace mongo { skip.insert("ns"); ss << ""; - for( bo::iterator i(op); i.more(); ) { + for( bo::iterator i(op); i.more(); ) { be e = i.next(); if( skip.count(e.fieldName()) ) continue; ss << e.toString() << ' '; } - ss << ""; - - ss << ""; - ss << '\n'; + ss << "\n"; } - void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { + void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) 
const { const Member *m = findById(server_id); - if( m == 0 ) { + if( m == 0 ) { ss << "Error : can't find a member with id: " << server_id << '\n'; return; } @@ -187,21 +189,29 @@ namespace mongo { //const bo fields = BSON( "o" << false << "o2" << false ); const bo fields; - ScopedDbConnection conn(m->fullName()); + /** todo fix we might want an so timeout here */ + DBClientConnection conn(false, 0, /*timeout*/ 20); + { + string errmsg; + if( !conn.connect(m->fullName(), errmsg) ) { + ss << "couldn't connect to " << m->fullName() << ' ' << errmsg; + return; + } + } - auto_ptr c = conn->query(rsoplog, Query().sort("$natural",1), 20, 0, &fields); - if( c.get() == 0 ) { + auto_ptr c = conn.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields); + if( c.get() == 0 ) { ss << "couldn't query " << rsoplog; return; } static const char *h[] = {"ts","optime", "h","op","ns","rest",0}; ss << "\n"; - + "\n"; + ss << table(h, true); //ss << "
\n";
         int n = 0;
@@ -211,17 +221,17 @@ namespace mongo {
         while( c->more() ) {
             bo o = c->next();
             otLast = o["ts"]._opTime();
-            if( otFirst.isNull() ) 
+            if( otFirst.isNull() )
                 otFirst = otLast;
             say(ss, o);
-            n++;            
+            n++;
         }
         if( n == 0 ) {
             ss << rsoplog << " is empty\n";
         }
-        else { 
-            auto_ptr<DBClientCursor> c = conn->query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
-            if( c.get() == 0 ) { 
+        else {
+            auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
+            if( c.get() == 0 ) {
                 ss << "couldn't query [2] " << rsoplog;
                 return;
             }
@@ -230,7 +240,7 @@ namespace mongo {
             otEnd = o["ts"]._opTime();
             while( 1 ) {
                 stringstream z;
-                if( o["ts"]._opTime() == otLast ) 
+                if( o["ts"]._opTime() == otLast )
                     break;
                 say(z, o);
                 x = z.str() + x;
@@ -253,32 +263,31 @@ namespace mongo {
             ss.precision(3);
             if( h < 72 )
                 ss << h << " hours";
-            else 
+            else
                 ss << h / 24.0 << " days";
            ss << "</p>
\n"; } - - conn.done(); } - void ReplSetImpl::_summarizeAsHtml(stringstream& s) const { + void ReplSetImpl::_summarizeAsHtml(stringstream& s) const { s << table(0, false); s << tr("Set name:", _name); s << tr("Majority up:", elect.aMajoritySeemsToBeUp()?"yes":"no" ); s << _table(); - const char *h[] = {"Member", - "id", - "Up", - "cctime", - "Last heartbeat", - "Votes", "State", "Status", - "optime", - "skew", - 0}; + const char *h[] = {"Member", + "id", + "Up", + "cctime", + "Last heartbeat", + "Votes", "Priority", "State", "Messages", + "optime", + "skew", + 0 + }; s << table(h); - /* this is to sort the member rows by their ordinal _id, so they show up in the same + /* this is to sort the member rows by their ordinal _id, so they show up in the same order on all the different web ui's; that is less confusing for the operator. */ map mp; @@ -287,13 +296,13 @@ namespace mongo { readlocktry lk("local.replset.minvalid", 300); if( lk.got() ) { BSONObj mv; - if( Helpers::getSingleton("local.replset.minvalid", mv) ) { + if( Helpers::getSingleton("local.replset.minvalid", mv) ) { myMinValid = "minvalid:" + mv["ts"]._opTime().toString(); } } else myMinValid = "."; } - catch(...) { + catch(...) { myMinValid = "exception fetching minvalid"; } @@ -301,25 +310,26 @@ namespace mongo { stringstream s; /* self row */ s << tr() << td(_self->fullName() + " (me)") << - td(_self->id()) << - td("1") << //up - td(ago(started)) << - td("") << // last heartbeat - td(ToString(_self->config().votes)) << - td( stateAsHtml(box.getState()) + (_self->config().hidden?" (hidden)":"") ); + td(_self->id()) << + td("1") << //up + td(ago(started)) << + td("") << // last heartbeat + td(ToString(_self->config().votes)) << + td(ToString(_self->config().priority)) << + td( stateAsHtml(box.getState()) + (_self->config().hidden?" (hidden)":"") ); s << td( _hbmsg ); stringstream q; - q << "/_replSetOplog?" 
<< _self->id(); + q << "/_replSetOplog?_id=" << _self->id(); s << td( a(q.str(), myMinValid, theReplSet->lastOpTimeWritten.toString()) ); s << td(""); // skew s << _tr(); - mp[_self->hbinfo().id()] = s.str(); + mp[_self->hbinfo().id()] = s.str(); } Member *m = head(); while( m ) { - stringstream s; + stringstream s; m->summarizeMember(s); - mp[m->hbinfo().id()] = s.str(); + mp[m->hbinfo().id()] = s.str(); m = m->next(); } @@ -333,26 +343,27 @@ namespace mongo { _rsLog.toHTML( s ); } - const Member* ReplSetImpl::findById(unsigned id) const { + const Member* ReplSetImpl::findById(unsigned id) const { if( id == _self->id() ) return _self; for( Member *m = head(); m; m = m->next() ) - if( m->id() == id ) + if( m->id() == id ) return m; return 0; } - void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const { + void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const { vector v; // add self { - HostAndPort h(getHostName(), cmdLine.port); - BSONObjBuilder bb; bb.append("_id", (int) _self->id()); - bb.append("name", h.toString()); + bb.append("name", _self->fullName()); bb.append("health", 1.0); bb.append("state", (int) box.getState().s); + bb.append("stateStr", box.getState().toString()); + bb.appendTimestamp("optime", lastOpTimeWritten.asDate()); + bb.appendDate("optimeDate", lastOpTimeWritten.getSecs() * 1000LL); string s = _self->lhb(); if( !s.empty() ) bb.append("errmsg", s); @@ -365,9 +376,19 @@ namespace mongo { BSONObjBuilder bb; bb.append("_id", (int) m->id()); bb.append("name", m->fullName()); - bb.append("health", m->hbinfo().health); + double h = m->hbinfo().health; + bb.append("health", h); bb.append("state", (int) m->state().s); + if( h == 0 ) { + // if we can't connect the state info is from the past and could be confusing to show + bb.append("stateStr", "(not reachable/healthy)"); + } + else { + bb.append("stateStr", m->state().toString()); + } bb.append("uptime", (unsigned) (m->hbinfo().upSince ? (time(0)-m->hbinfo().upSince) : 0)); + bb.appendTimestamp("optime", m->hbinfo().opTime.asDate()); + bb.appendDate("optimeDate", m->hbinfo().opTime.getSecs() * 1000LL); bb.appendTimeT("lastHeartbeat", m->hbinfo().lastHeartbeat); string s = m->lhb(); if( !s.empty() ) @@ -380,10 +401,12 @@ namespace mongo { b.appendTimeT("date", time(0)); b.append("myState", box.getState().s); b.append("members", v); + if( replSetBlind ) + b.append("blind",true); // to avoid confusion if set...normally never set except for testing. 
} - static struct Test : public UnitTest { - void run() { + static struct Test : public UnitTest { + void run() { HealthOptions a,b; assert( a == b ); assert( a.isDefault() ); diff --git a/db/repl/health.h b/db/repl/health.h index 645a3b5..a32db00 100644 --- a/db/repl/health.h +++ b/db/repl/health.h @@ -23,8 +23,8 @@ namespace mongo { /* throws */ bool requestHeartbeat(string setname, string fromHost, string memberFullName, BSONObj& result, int myConfigVersion, int& theirConfigVersion, bool checkEmpty = false); - struct HealthOptions { - HealthOptions() { + struct HealthOptions { + HealthOptions() { heartbeatSleepMillis = 2000; heartbeatTimeoutMillis = 10000; heartbeatConnRetries = 2; @@ -42,8 +42,8 @@ namespace mongo { uassert(13113, "bad replset heartbeat option", heartbeatTimeoutMillis >= 10); } - bool operator==(const HealthOptions& r) const { - return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==heartbeatConnRetries; + bool operator==(const HealthOptions& r) const { + return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==heartbeatConnRetries; } }; diff --git a/db/repl/heartbeat.cpp b/db/repl/heartbeat.cpp index b39fad7..3972466 100644 --- a/db/repl/heartbeat.cpp +++ b/db/repl/heartbeat.cpp @@ -31,7 +31,7 @@ #include "../../util/unittest.h" #include "../instance.h" -namespace mongo { +namespace mongo { using namespace bson; @@ -42,7 +42,7 @@ namespace mongo { long long HeartbeatInfo::timeDown() const { if( up() ) return 0; - if( downSince == 0 ) + if( downSince == 0 ) return 0; // still waiting on first heartbeat return jsTime() - downSince; } @@ -53,10 +53,10 @@ namespace mongo { virtual bool adminOnly() const { return false; } CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - if( replSetBlind ) + if( replSetBlind ) return false; - /* we don't call ReplSetCommand::check() here because heartbeat + /* we don't call ReplSetCommand::check() here because heartbeat checks many things that are pre-initialization. */ if( !replSet ) { errmsg = "not running with --replSet"; @@ -65,12 +65,12 @@ namespace mongo { /* we want to keep heartbeat connections open when relinquishing primary. tag them here. 
*/ { - MessagingPort *mp = cc()._mp; - if( mp ) + MessagingPort *mp = cc().port(); + if( mp ) mp->tag |= 1; } - if( cmdObj["pv"].Int() != 1 ) { + if( cmdObj["pv"].Int() != 1 ) { errmsg = "incompatible replset protocol version"; return false; } @@ -86,7 +86,7 @@ namespace mongo { } result.append("rs", true); - if( cmdObj["checkEmpty"].trueValue() ) { + if( cmdObj["checkEmpty"].trueValue() ) { result.append("hasData", replHasDatabases()); } if( theReplSet == 0 ) { @@ -98,7 +98,7 @@ namespace mongo { return false; } - if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) { + if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) { errmsg = "repl set names do not match (2)"; result.append("mismatch", true); return false; @@ -118,8 +118,8 @@ namespace mongo { } cmdReplSetHeartbeat; /* throws dbexception */ - bool requestHeartbeat(string setName, string from, string memberFullName, BSONObj& result, int myCfgVersion, int& theirCfgVersion, bool checkEmpty) { - if( replSetBlind ) { + bool requestHeartbeat(string setName, string from, string memberFullName, BSONObj& result, int myCfgVersion, int& theirCfgVersion, bool checkEmpty) { + if( replSetBlind ) { //sleepmillis( rand() ); return false; } @@ -144,8 +144,8 @@ namespace mongo { public: ReplSetHealthPollTask(const HostAndPort& hh, const HeartbeatInfo& mm) : h(hh), m(mm) { } - string name() { return "ReplSetHealthPollTask"; } - void doWork() { + string name() const { return "ReplSetHealthPollTask"; } + void doWork() { if ( !theReplSet ) { log(2) << "theReplSet not initialized yet, skipping health poll this round" << rsLog; return; @@ -153,7 +153,7 @@ namespace mongo { HeartbeatInfo mem = m; HeartbeatInfo old = mem; - try { + try { BSONObj info; int theirConfigVersion = -10000; @@ -163,15 +163,17 @@ namespace mongo { time_t after = mem.lastHeartbeat = time(0); // we set this on any response - we don't get this far if couldn't connect because exception is thrown - try { - mem.skew = 0; - long long t = info["time"].Long(); - if( t > after ) + if ( info["time"].isNumber() ) { + long long t = info["time"].numberLong(); + if( t > after ) mem.skew = (int) (t - after); - else if( t < before ) + else if( t < before ) mem.skew = (int) (t - before); // negative } - catch(...) { + else { + // it won't be there if remote hasn't initialized yet + if( info.hasElement("time") ) + warning() << "heatbeat.time isn't a number: " << info << endl; mem.skew = INT_MIN; } @@ -182,7 +184,7 @@ namespace mongo { } if( ok ) { if( mem.upSince == 0 ) { - log() << "replSet info " << h.toString() << " is now up" << rsLog; + log() << "replSet info " << h.toString() << " is up" << rsLog; mem.upSince = mem.lastHeartbeat; } mem.health = 1.0; @@ -193,17 +195,20 @@ namespace mongo { be cfg = info["config"]; if( cfg.ok() ) { // received a new config - boost::function f = + boost::function f = boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy()); theReplSet->mgr->send(f); } } - else { + else { down(mem, info.getStringField("errmsg")); } } - catch(...) { - down(mem, "connect/transport error"); + catch(DBException& e) { + down(mem, e.what()); + } + catch(...) 
{ + down(mem, "something unusual went wrong"); } m = mem; @@ -212,9 +217,9 @@ namespace mongo { static time_t last = 0; time_t now = time(0); bool changed = mem.changed(old); - if( changed ) { - if( old.hbstate != mem.hbstate ) - log() << "replSet " << h.toString() << ' ' << mem.hbstate.toString() << rsLog; + if( changed ) { + if( old.hbstate != mem.hbstate ) + log() << "replSet member " << h.toString() << ' ' << mem.hbstate.toString() << rsLog; } if( changed || now-last>4 ) { last = now; @@ -228,18 +233,18 @@ namespace mongo { if( mem.upSince || mem.downSince == 0 ) { mem.upSince = 0; mem.downSince = jsTime(); - log() << "replSet info " << h.toString() << " is now down (or slow to respond)" << rsLog; + log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog; } mem.lastHeartbeatMsg = msg; } }; - void ReplSetImpl::endOldHealthTasks() { + void ReplSetImpl::endOldHealthTasks() { unsigned sz = healthTasks.size(); for( set::iterator i = healthTasks.begin(); i != healthTasks.end(); i++ ) (*i)->halt(); healthTasks.clear(); - if( sz ) + if( sz ) DEV log() << "replSet debug: cleared old tasks " << sz << endl; } @@ -251,8 +256,8 @@ namespace mongo { void startSyncThread(); - /** called during repl set startup. caller expects it to return fairly quickly. - note ReplSet object is only created once we get a config - so this won't run + /** called during repl set startup. caller expects it to return fairly quickly. + note ReplSet object is only created once we get a config - so this won't run until the initiation. */ void ReplSetImpl::startThreads() { diff --git a/db/repl/manager.cpp b/db/repl/manager.cpp index 862ac46..ed39c31 100644 --- a/db/repl/manager.cpp +++ b/db/repl/manager.cpp @@ -1,4 +1,4 @@ -/* @file manager.cpp +/* @file manager.cpp */ /** @@ -23,20 +23,20 @@ namespace mongo { - enum { + enum { NOPRIMARY = -2, SELFPRIMARY = -1 }; /* check members OTHER THAN US to see if they think they are primary */ - const Member * Manager::findOtherPrimary(bool& two) { + const Member * Manager::findOtherPrimary(bool& two) { two = false; Member *m = rs->head(); Member *p = 0; while( m ) { DEV assert( m != rs->_self ); if( m->state().primary() && m->hbinfo().up() ) { - if( p ) { + if( p ) { two = true; return 0; } @@ -44,33 +44,36 @@ namespace mongo { } m = m->next(); } - if( p ) + if( p ) noteARemoteIsPrimary(p); return p; } - Manager::Manager(ReplSetImpl *_rs) : - task::Server("rs Manager"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) - { + Manager::Manager(ReplSetImpl *_rs) : + task::Server("rs Manager"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) { } - - Manager::~Manager() { - log() << "ERROR: ~Manager should never be called" << rsLog; + + Manager::~Manager() { + /* we don't destroy the replset object we sit in; however, the destructor could have thrown on init. + the log message below is just a reminder to come back one day and review this code more, and to + make it cleaner. 
+ */ + log() << "info: ~Manager called" << rsLog; rs->mgr = 0; - assert(false); } - void Manager::starting() { + void Manager::starting() { Client::initThread("rs Manager"); } - void Manager::noteARemoteIsPrimary(const Member *m) { + void Manager::noteARemoteIsPrimary(const Member *m) { if( rs->box.getPrimary() == m ) return; rs->_self->lhb() = ""; if( rs->iAmArbiterOnly() ) { rs->box.set(MemberState::RS_ARBITER, m); - } else { + } + else { rs->box.noteRemoteIsPrimary(m); } } @@ -87,9 +90,8 @@ namespace mongo { const Member *p = rs->box.getPrimary(); if( p && p != rs->_self ) { - if( !p->hbinfo().up() || - !p->hbinfo().hbstate.primary() ) - { + if( !p->hbinfo().up() || + !p->hbinfo().hbstate.primary() ) { p = 0; rs->box.setOtherPrimary(0); } @@ -101,36 +103,36 @@ namespace mongo { p2 = findOtherPrimary(two); if( two ) { /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */ - log() << "replSet warning DIAG two primaries (transiently)" << rsLog; + log() << "replSet info two primaries (transiently)" << rsLog; return; } } if( p2 ) { /* someone else thinks they are primary. */ - if( p == p2 ) { + if( p == p2 ) { // we thought the same; all set. return; } if( p == 0 ) { - noteARemoteIsPrimary(p2); + noteARemoteIsPrimary(p2); return; } // todo xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx if( p != rs->_self ) { // switch primary from oldremotep->newremotep2 - noteARemoteIsPrimary(p2); + noteARemoteIsPrimary(p2); return; } /* we thought we were primary, yet now someone else thinks they are. */ if( !rs->elect.aMajoritySeemsToBeUp() ) { /* we can't see a majority. so the other node is probably the right choice. */ - noteARemoteIsPrimary(p2); + noteARemoteIsPrimary(p2); return; } - /* ignore for now, keep thinking we are master. - this could just be timing (we poll every couple seconds) or could indicate - a problem? if it happens consistently for a duration of time we should + /* ignore for now, keep thinking we are master. + this could just be timing (we poll every couple seconds) or could indicate + a problem? if it happens consistently for a duration of time we should alert the sysadmin. */ return; @@ -138,17 +140,17 @@ namespace mongo { /* didn't find anyone who wants to be primary */ - if( p ) { + if( p ) { /* we are already primary */ - if( p != rs->_self ) { + if( p != rs->_self ) { rs->sethbmsg("error p != rs->self in checkNewState"); log() << "replSet " << p->fullName() << rsLog; log() << "replSet " << rs->_self->fullName() << rsLog; return; } - if( rs->elect.shouldRelinquish() ) { + if( rs->elect.shouldRelinquish() ) { log() << "replSet can't see a majority of the set, relinquishing primary" << rsLog; rs->relinquish(); } @@ -162,7 +164,7 @@ namespace mongo { /* TODO : CHECK PRIORITY HERE. can't be elected if priority zero. */ /* no one seems to be primary. shall we try to elect ourself? */ - if( !rs->elect.aMajoritySeemsToBeUp() ) { + if( !rs->elect.aMajoritySeemsToBeUp() ) { static time_t last; static int n; int ll = 0; @@ -175,15 +177,15 @@ namespace mongo { busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one. } - try { - rs->elect.electSelf(); + try { + rs->elect.electSelf(); } catch(RetryAfterSleepException&) { /* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */ requeue(); } - catch(...) { - log() << "replSet error unexpected assertion in rs manager" << rsLog; + catch(...) 
{ + log() << "replSet error unexpected assertion in rs manager" << rsLog; } busyWithElectSelf = false; } diff --git a/db/repl/multicmd.h b/db/repl/multicmd.h index 9eb9a17..df7c4e5 100644 --- a/db/repl/multicmd.h +++ b/db/repl/multicmd.h @@ -21,7 +21,7 @@ #include "../../util/background.h" #include "connections.h" -namespace mongo { +namespace mongo { struct Target { Target(string hostport) : toHost(hostport), ok(false) { } @@ -33,38 +33,37 @@ namespace mongo { /* -- implementation ------------- */ - class _MultiCommandJob : public BackgroundJob { + class _MultiCommandJob : public BackgroundJob { public: BSONObj& cmd; Target& d; _MultiCommandJob(BSONObj& _cmd, Target& _d) : cmd(_cmd), d(_d) { } + private: - string name() { return "MultiCommandJob"; } + string name() const { return "MultiCommandJob"; } void run() { - try { + try { ScopedConn c(d.toHost); d.ok = c.runCommand("admin", cmd, d.result); } - catch(DBException&) { + catch(DBException&) { DEV log() << "dev caught dbexception on multiCommand " << d.toHost << rsLog; } } }; - inline void multiCommand(BSONObj cmd, list& L) { - typedef shared_ptr<_MultiCommandJob> P; - list
<P>
jobs; - list _jobs; + inline void multiCommand(BSONObj cmd, list& L) { + list jobs; - for( list::iterator i = L.begin(); i != L.end(); i++ ) { + for( list::iterator i = L.begin(); i != L.end(); i++ ) { Target& d = *i; _MultiCommandJob *j = new _MultiCommandJob(cmd, d); - jobs.push_back(P(j)); - _jobs.push_back(j); + j->go(); + jobs.push_back(j); } - BackgroundJob::go(_jobs); - BackgroundJob::wait(_jobs,5); + for( list::iterator i = jobs.begin(); i != jobs.end(); i++ ) { + (*i)->wait(); + } } - } diff --git a/db/repl/replset_commands.cpp b/db/repl/replset_commands.cpp index 328b0ab..dc8567a 100644 --- a/db/repl/replset_commands.cpp +++ b/db/repl/replset_commands.cpp @@ -24,7 +24,9 @@ #include "../../util/mongoutils/html.h" #include "../../client/dbclient.h" -namespace mongo { +using namespace bson; + +namespace mongo { void checkMembersUpForConfigChange(const ReplSetConfig& cfg, bool initial); @@ -50,7 +52,7 @@ namespace mongo { } // may not need this, but if removed check all tests still work: - if( !check(errmsg, result) ) + if( !check(errmsg, result) ) return false; if( cmdObj.hasElement("blind") ) { @@ -61,6 +63,7 @@ namespace mongo { } } cmdReplSetTest; + /** get rollback id */ class CmdReplSetGetRBID : public ReplSetCommand { public: /* todo: ideally this should only change on rollbacks NOT on mongod restarts also. fix... */ @@ -68,26 +71,28 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "internal"; } - CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") { + CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") { rbid = (int) curTimeMillis(); } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - if( !check(errmsg, result) ) + if( !check(errmsg, result) ) return false; result.append("rbid",rbid); return true; } } cmdReplSetRBID; - using namespace bson; - void incRBID() { + /** we increment the rollback id on every rollback event. */ + void incRBID() { cmdReplSetRBID.rbid++; } - int getRBID(DBClientConnection *c) { + + /** helper to get rollback id from another server. */ + int getRBID(DBClientConnection *c) { bo info; c->simpleCommand("admin", &info, "replSetGetRBID"); return info["rbid"].numberInt(); - } + } class CmdReplSetGetStatus : public ReplSetCommand { public: @@ -98,7 +103,10 @@ namespace mongo { } CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - if( !check(errmsg, result) ) + if ( cmdObj["forShell"].trueValue() ) + lastError.disableForCommand(); + + if( !check(errmsg, result) ) return false; theReplSet->summarizeStatus(result); return true; @@ -115,7 +123,7 @@ namespace mongo { } CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { } virtual bool run(const string& a, BSONObj& b, string& errmsg, BSONObjBuilder& c, bool d) { - try { + try { rwlock_try_write lk(mutex); return _run(a,b,errmsg,c,d); } @@ -125,16 +133,16 @@ namespace mongo { } private: bool _run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - if( !check(errmsg, result) ) + if( !check(errmsg, result) ) return false; - if( !theReplSet->box.getState().primary() ) { + if( !theReplSet->box.getState().primary() ) { errmsg = "replSetReconfig command must be sent to the current replica set primary."; return false; } { - // just make sure we can get a write lock before doing anything else. we'll reacquire one - // later. 
of course it could be stuck then, but this check lowers the risk if weird things + // just make sure we can get a write lock before doing anything else. we'll reacquire one + // later. of course it could be stuck then, but this check lowers the risk if weird things // are up - we probably don't want a change to apply 30 minutes after the initial attempt. time_t t = time(0); writelock lk(""); @@ -159,7 +167,7 @@ namespace mongo { log() << "replSet replSetReconfig config object parses ok, " << newConfig.members.size() << " members specified" << rsLog; - if( !ReplSetConfig::legalChange(theReplSet->getConfig(), newConfig, errmsg) ) { + if( !ReplSetConfig::legalChange(theReplSet->getConfig(), newConfig, errmsg) ) { return false; } @@ -170,7 +178,7 @@ namespace mongo { theReplSet->haveNewConfig(newConfig, true); ReplSet::startupStatusMsg = "replSetReconfig'd"; } - catch( DBException& e ) { + catch( DBException& e ) { log() << "replSet replSetReconfig exception: " << e.what() << rsLog; throw; } @@ -182,8 +190,11 @@ namespace mongo { class CmdReplSetFreeze : public ReplSetCommand { public: virtual void help( stringstream &help ) const { - help << "Enable / disable failover for the set - locks current primary as primary even if issues occur.\nFor use during system maintenance.\n"; - help << "{ replSetFreeze : }"; + help << "{ replSetFreeze : }"; + help << "'freeze' state of member to the extent we can do that. What this really means is that\n"; + help << "this node will not attempt to become primary until the time period specified expires.\n"; + help << "You can call again with {replSetFreeze:0} to unfreeze sooner.\n"; + help << "A process restart unfreezes the member also.\n"; help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; } @@ -191,15 +202,22 @@ namespace mongo { virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( !check(errmsg, result) ) return false; - errmsg = "not yet implemented"; /*TODO*/ - return false; + int secs = (int) cmdObj.firstElement().numberInt(); + if( theReplSet->freeze(secs) ) { + if( secs == 0 ) + result.append("info","unfreezing"); + } + if( secs == 1 ) + result.append("warning", "you really want to freeze for only 1 second?"); + return true; } } cmdReplSetFreeze; class CmdReplSetStepDown: public ReplSetCommand { public: virtual void help( stringstream &help ) const { - help << "Step down as primary. Will not try to reelect self or 1 minute.\n"; + help << "{ replSetStepDown : }\n"; + help << "Step down as primary. 
Will not try to reelect self for the specified time period (1 minute if no numeric secs value specified).\n"; help << "(If another member with same priority takes over in the meantime, it will stay primary.)\n"; help << "http://www.mongodb.org/display/DOCS/Replica+Set+Commands"; } @@ -212,7 +230,10 @@ namespace mongo { errmsg = "not primary so can't step down"; return false; } - return theReplSet->stepDown(); + int secs = (int) cmdObj.firstElement().numberInt(); + if( secs == 0 ) + secs = 60; + return theReplSet->stepDown(secs); } } cmdReplSetStepDown; @@ -222,45 +243,46 @@ namespace mongo { class ReplSetHandler : public DbWebHandler { public: - ReplSetHandler() : DbWebHandler( "_replSet" , 1 , true ){} + ReplSetHandler() : DbWebHandler( "_replSet" , 1 , true ) {} virtual bool handles( const string& url ) const { return startsWith( url , "/_replSet" ); } - virtual void handle( const char *rq, string url, + virtual void handle( const char *rq, string url, BSONObj params, string& responseMsg, int& responseCode, - vector& headers, const SockAddr &from ){ - - string s = str::after(url, "/_replSetOplog?"); - if( !s.empty() ) - responseMsg = _replSetOplog(s); + vector& headers, const SockAddr &from ) { + + if( url == "/_replSetOplog" ) { + responseMsg = _replSetOplog(params); + } else responseMsg = _replSet(); responseCode = 200; } + string _replSetOplog(bo parms) { + int _id = (int) str::toUnsigned( parms["_id"].String() ); - string _replSetOplog(string parms) { stringstream s; string t = "Replication oplog"; s << start(t); s << p(t); - if( theReplSet == 0 ) { - if( cmdLine._replSet.empty() ) + if( theReplSet == 0 ) { + if( cmdLine._replSet.empty() ) s << p("Not using --replSet"); else { - s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated") + s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated") + ".
" + ReplSet::startupStatusMsg); } } else { try { - theReplSet->getOplogDiagsAsHtml(stringToNum(parms.c_str()), s); + theReplSet->getOplogDiagsAsHtml(_id, s); } - catch(std::exception& e) { - s << "error querying oplog: " << e.what() << '\n'; + catch(std::exception& e) { + s << "error querying oplog: " << e.what() << '\n'; } } @@ -269,20 +291,20 @@ namespace mongo { } /* /_replSet show replica set status in html format */ - string _replSet() { + string _replSet() { stringstream s; s << start("Replica Set Status " + prettyHostName()); - s << p( a("/", "back", "Home") + " | " + + s << p( a("/", "back", "Home") + " | " + a("/local/system.replset/?html=1", "", "View Replset Config") + " | " + - a("/replSetGetStatus?text", "", "replSetGetStatus") + " | " + + a("/replSetGetStatus?text=1", "", "replSetGetStatus") + " | " + a("http://www.mongodb.org/display/DOCS/Replica+Sets", "", "Docs") ); - if( theReplSet == 0 ) { - if( cmdLine._replSet.empty() ) + if( theReplSet == 0 ) { + if( cmdLine._replSet.empty() ) s << p("Not using --replSet"); else { - s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated") + s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated") + ".
" + ReplSet::startupStatusMsg); } } diff --git a/db/repl/rs.cpp b/db/repl/rs.cpp index 1c0444a..90ed9f4 100644 --- a/db/repl/rs.cpp +++ b/db/repl/rs.cpp @@ -20,9 +20,12 @@ #include "../client.h" #include "../../client/dbclient.h" #include "../dbhelpers.h" +#include "../../s/d_logic.h" #include "rs.h" +#include "connections.h" +#include "../repl.h" -namespace mongo { +namespace mongo { using namespace bson; @@ -30,18 +33,18 @@ namespace mongo { ReplSet *theReplSet = 0; extern string *discoveredSeed; - void ReplSetImpl::sethbmsg(string s, int logLevel) { + void ReplSetImpl::sethbmsg(string s, int logLevel) { static time_t lastLogged; _hbmsgTime = time(0); - if( s == _hbmsg ) { + if( s == _hbmsg ) { // unchanged if( _hbmsgTime - lastLogged < 60 ) return; } unsigned sz = s.size(); - if( sz >= 256 ) + if( sz >= 256 ) memcpy(_hbmsg, s.c_str(), 255); else { _hbmsg[sz] = 0; @@ -53,7 +56,7 @@ namespace mongo { } } - void ReplSetImpl::assumePrimary() { + void ReplSetImpl::assumePrimary() { assert( iAmPotentiallyHot() ); writelock lk("admin."); // so we are synchronized with _logOp() box.setSelfPrimary(_self); @@ -62,17 +65,26 @@ namespace mongo { void ReplSetImpl::changeState(MemberState s) { box.change(s, _self); } - void ReplSetImpl::relinquish() { + const bool closeOnRelinquish = true; + + void ReplSetImpl::relinquish() { if( box.getState().primary() ) { log() << "replSet relinquishing primary state" << rsLog; - changeState(MemberState::RS_RECOVERING); - - /* close sockets that were talking to us */ - /*log() << "replSet closing sockets after reqlinquishing primary" << rsLog; - MessagingPort::closeAllSockets(1);*/ + changeState(MemberState::RS_SECONDARY); + + if( closeOnRelinquish ) { + /* close sockets that were talking to us so they don't blithly send many writes that will fail + with "not master" (of course client could check result code, but in case they are not) + */ + log() << "replSet closing client sockets after reqlinquishing primary" << rsLog; + MessagingPort::closeAllSockets(1); + } + + // now that all connections were closed, strip this mongod from all sharding details + // if and when it gets promoted to a primary again, only then it should reload the sharding state + // the rationale here is that this mongod won't bring stale state when it regains primaryhood + shardingState.resetShardingState(); - // todo: > - //changeState(MemberState::RS_SECONDARY); } else if( box.getState().startup2() ) { // ? add comment @@ -81,26 +93,48 @@ namespace mongo { } /* look freshly for who is primary - includes relinquishing ourself. */ - void ReplSetImpl::forgetPrimary() { - if( box.getState().primary() ) + void ReplSetImpl::forgetPrimary() { + if( box.getState().primary() ) relinquish(); else { box.setOtherPrimary(0); } } - bool ReplSetImpl::_stepDown() { + // for the replSetStepDown command + bool ReplSetImpl::_stepDown(int secs) { lock lk(this); - if( box.getState().primary() ) { - changeState(MemberState::RS_RECOVERING); - elect.steppedDown = time(0) + 60; - log() << "replSet info stepped down as primary" << rsLog; + if( box.getState().primary() ) { + elect.steppedDown = time(0) + secs; + log() << "replSet info stepping down as primary secs=" << secs << rsLog; + relinquish(); return true; } return false; } - void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) { + bool ReplSetImpl::_freeze(int secs) { + lock lk(this); + /* note if we are primary we remain primary but won't try to elect ourself again until + this time period expires. 
+ */ + if( secs == 0 ) { + elect.steppedDown = 0; + log() << "replSet info 'unfreezing'" << rsLog; + } + else { + if( !box.getState().primary() ) { + elect.steppedDown = time(0) + secs; + log() << "replSet info 'freezing' for " << secs << " seconds" << rsLog; + } + else { + log() << "replSet info received freeze command but we are primary" << rsLog; + } + } + return true; + } + + void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) { for( Member *m = _members.head(); m; m=m->next() ) { if( m->id() == h.id() ) { m->_hbinfo = h; @@ -109,7 +143,7 @@ namespace mongo { } } - list ReplSetImpl::memberHostnames() const { + list ReplSetImpl::memberHostnames() const { list L; L.push_back(_self->h()); for( Member *m = _members.head(); m; m = m->next() ) @@ -118,6 +152,7 @@ namespace mongo { } void ReplSetImpl::_fillIsMasterHost(const Member *m, vector& hosts, vector& passives, vector& arbiters) { + assert( m ); if( m->config().hidden ) return; @@ -126,8 +161,9 @@ namespace mongo { } else if( !m->config().arbiterOnly ) { if( m->config().slaveDelay ) { - /* hmmm - we don't list these as they are stale. */ - } else { + /* hmmm - we don't list these as they are stale. */ + } + else { passives.push_back(m->h().toString()); } } @@ -147,6 +183,7 @@ namespace mongo { _fillIsMasterHost(_self, hosts, passives, arbiters); for( Member *m = _members.head(); m; m = m->next() ) { + assert( m ); _fillIsMasterHost(m, hosts, passives, arbiters); } @@ -161,23 +198,27 @@ namespace mongo { } } - if( !isp ) { + if( !isp ) { const Member *m = sp.primary; if( m ) b.append("primary", m->h().toString()); } if( myConfig().arbiterOnly ) b.append("arbiterOnly", true); + if( myConfig().priority == 0 ) + b.append("passive", true); if( myConfig().slaveDelay ) b.append("slaveDelay", myConfig().slaveDelay); if( myConfig().hidden ) b.append("hidden", true); + if( !myConfig().buildIndexes ) + b.append("buildIndexes", false); } /** @param cfgString /, */ - void parseReplsetCmdLine(string cfgString, string& setname, vector& seeds, set& seedSet ) { - const char *p = cfgString.c_str(); + void parseReplsetCmdLine(string cfgString, string& setname, vector& seeds, set& seedSet ) { + const char *p = cfgString.c_str(); const char *slash = strchr(p, '/'); if( slash ) setname = string(p, slash-p); @@ -207,7 +248,8 @@ namespace mongo { //uassert(13101, "can't use localhost in replset host list", !m.isLocalHost()); if( m.isSelf() ) { log(1) << "replSet ignoring seed " << m.toString() << " (=self)" << rsLog; - } else + } + else seeds.push_back(m); if( *comma == 0 ) break; @@ -216,10 +258,9 @@ namespace mongo { } } - ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this), - _self(0), - mgr( new Manager(this) ) - { + ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this), + _self(0), + mgr( new Manager(this) ) { _cfg = 0; memset(_hbmsg, 0, sizeof(_hbmsg)); *_hbmsg = '.'; // temp...just to see @@ -240,20 +281,21 @@ namespace mongo { } for( set::iterator i = replSetCmdline.seedSet.begin(); i != replSetCmdline.seedSet.end(); i++ ) { if( i->isSelf() ) { - if( sss == 1 ) + if( sss == 1 ) log(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" 
<< rsLog; - } else + } + else log() << "replSet warning command line seed " << i->toString() << " is not present in the current repl set config" << rsLog; } } void newReplUp(); - void ReplSetImpl::loadLastOpTimeWritten() { + void ReplSetImpl::loadLastOpTimeWritten() { //assert( lastOpTimeWritten.isNull() ); readlock lk(rsoplog); BSONObj o; - if( Helpers::getLast(rsoplog, o) ) { + if( Helpers::getLast(rsoplog, o) ) { lastH = o["h"].numberLong(); lastOpTimeWritten = o["ts"]._opTime(); uassert(13290, "bad replSet oplog entry?", !lastOpTimeWritten.isNull()); @@ -261,11 +303,11 @@ namespace mongo { } /* call after constructing to start - returns fairly quickly after launching its threads */ - void ReplSetImpl::_go() { - try { + void ReplSetImpl::_go() { + try { loadLastOpTimeWritten(); } - catch(std::exception& e) { + catch(std::exception& e) { log() << "replSet error fatal couldn't query the local " << rsoplog << " collection. Terminating mongod after 30 seconds." << rsLog; log() << e.what() << rsLog; sleepsecs(30); @@ -283,11 +325,17 @@ namespace mongo { extern BSONObj *getLastErrorDefault; + void ReplSetImpl::setSelfTo(Member *m) { + _self = m; + if( m ) _buildIndexes = m->config().buildIndexes; + else _buildIndexes = true; + } + /** @param reconf true if this is a reconfiguration and not an initial load of the configuration. @return true if ok; throws if config really bad; false if config doesn't include self */ bool ReplSetImpl::initFromConfig(ReplSetConfig& c, bool reconf) { - /* NOTE: haveNewConfig() writes the new config to disk before we get here. So + /* NOTE: haveNewConfig() writes the new config to disk before we get here. So we cannot error out at this point, except fatally. Check errors earlier. */ lock lk(this); @@ -302,25 +350,24 @@ namespace mongo { { unsigned nfound = 0; int me = 0; - for( vector::iterator i = c.members.begin(); i != c.members.end(); i++ ) { + for( vector::iterator i = c.members.begin(); i != c.members.end(); i++ ) { const ReplSetConfig::MemberCfg& m = *i; if( m.h.isSelf() ) { nfound++; me++; - if( !reconf || (_self && _self->id() == (unsigned) m._id) ) ; - else { + else { log() << "replSet " << _self->id() << ' ' << m._id << rsLog; assert(false); } } - else if( reconf ) { + else if( reconf ) { const Member *old = findById(m._id); - if( old ) { + if( old ) { nfound++; assert( (int) old->id() == m._id ); - if( old->config() == m ) { + if( old->config() == m ) { additive = false; } } @@ -328,16 +375,24 @@ namespace mongo { newOnes.push_back(&m); } } + + // change timeout settings, if necessary + ScopedConn conn(m.h.toString()); + conn.setTimeout(c.ho.heartbeatTimeoutMillis/1000.0); } if( me == 0 ) { + // initial startup with fastsync + if (!reconf && replSettings.fastsync) { + return false; + } // log() << "replSet config : " << _cfg->toString() << rsLog; - log() << "replSet error can't find self in the repl set configuration:" << rsLog; + log() << "replSet error self not present in the repl set configuration:" << rsLog; log() << c.toString() << rsLog; - assert(false); + uasserted(13497, "replSet error self not present in the configuration"); } uassert( 13302, "replSet error self appears twice in the repl set configuration", me<=1 ); - if( reconf && config().members.size() != nfound ) + if( reconf && config().members.size() != nfound ) additive = false; } @@ -347,14 +402,14 @@ namespace mongo { _name = _cfg->_id; assert( !_name.empty() ); - if( additive ) { + if( additive ) { log() << "replSet info : additive change to configuration" << rsLog; for( 
list::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) { const ReplSetConfig::MemberCfg* m = *i; Member *mi = new Member(m->h, m->_id, m, false); - /** we will indicate that new members are up() initially so that we don't relinquish our - primary state because we can't (transiently) see a majority. they should be up as we + /** we will indicate that new members are up() initially so that we don't relinquish our + primary state because we can't (transiently) see a majority. they should be up as we check that new members are up before getting here on reconfig anyway. */ mi->get_hbinfo().health = 0.1; @@ -373,20 +428,30 @@ namespace mongo { int oldPrimaryId = -1; { const Member *p = box.getPrimary(); - if( p ) + if( p ) oldPrimaryId = p->id(); } forgetPrimary(); - _self = 0; - for( vector::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) { + + bool iWasArbiterOnly = _self ? iAmArbiterOnly() : false; + setSelfTo(0); + for( vector::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) { const ReplSetConfig::MemberCfg& m = *i; Member *mi; if( m.h.isSelf() ) { assert( _self == 0 ); - mi = _self = new Member(m.h, m._id, &m, true); + mi = new Member(m.h, m._id, &m, true); + setSelfTo(mi); + + // if the arbiter status changed + if (iWasArbiterOnly ^ iAmArbiterOnly()) { + _changeArbiterState(); + } + if( (int)mi->id() == oldPrimaryId ) box.setSelfPrimary(mi); - } else { + } + else { mi = new Member(m.h, m._id, &m, false); _members.push(mi); startHealthTaskFor(mi); @@ -397,26 +462,57 @@ namespace mongo { return true; } + void startSyncThread(); + + void ReplSetImpl::_changeArbiterState() { + if (iAmArbiterOnly()) { + changeState(MemberState::RS_ARBITER); + + // if there is an oplog, free it + // not sure if this is necessary, maybe just leave the oplog and let + // the user delete it if they want the space? + writelock lk(rsoplog); + Client::Context c(rsoplog); + NamespaceDetails *d = nsdetails(rsoplog); + if (d) { + string errmsg; + bob res; + dropCollection(rsoplog, errmsg, res); + + // clear last op time to force initial sync (if the arbiter + // becomes a "normal" server again) + lastOpTimeWritten = OpTime(); + } + } + else { + changeState(MemberState::RS_RECOVERING); + + // oplog will be allocated when sync begins + /* TODO : could this cause two sync threads to exist (race condition)? */ + boost::thread t(startSyncThread); + } + } + // Our own config must be the first one. 
- bool ReplSetImpl::_loadConfigFinish(vector& cfgs) { + bool ReplSetImpl::_loadConfigFinish(vector& cfgs) { int v = -1; ReplSetConfig *highest = 0; int myVersion = -2000; int n = 0; - for( vector::iterator i = cfgs.begin(); i != cfgs.end(); i++ ) { + for( vector::iterator i = cfgs.begin(); i != cfgs.end(); i++ ) { ReplSetConfig& cfg = *i; if( ++n == 1 ) myVersion = cfg.version; - if( cfg.ok() && cfg.version > v ) { + if( cfg.ok() && cfg.version > v ) { highest = &cfg; v = cfg.version; } } assert( highest ); - if( !initFromConfig(*highest) ) + if( !initFromConfig(*highest) ) return false; - if( highest->version > myVersion && highest->version >= 0 ) { + if( highest->version > myVersion && highest->version >= 0 ) { log() << "replSet got config version " << highest->version << " from a remote, saving locally" << rsLog; writelock lk("admin."); highest->saveConfigLocally(BSONObj()); @@ -430,7 +526,7 @@ namespace mongo { startupStatusMsg = "loading " + rsConfigNs + " config (LOADINGCONFIG)"; try { vector configs; - try { + try { configs.push_back( ReplSetConfig(HostAndPort::me()) ); } catch(DBException& e) { @@ -438,26 +534,26 @@ namespace mongo { throw; } for( vector::const_iterator i = _seeds->begin(); i != _seeds->end(); i++ ) { - try { + try { configs.push_back( ReplSetConfig(*i) ); } - catch( DBException& e ) { + catch( DBException& e ) { log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog; } } - if( discoveredSeed ) { + if( discoveredSeed ) { try { configs.push_back( ReplSetConfig(HostAndPort(*discoveredSeed)) ); } - catch( DBException& ) { + catch( DBException& ) { log(1) << "replSet exception trying to load config from discovered seed " << *discoveredSeed << rsLog; } } int nok = 0; int nempty = 0; - for( vector::iterator i = configs.begin(); i != configs.end(); i++ ) { + for( vector::iterator i = configs.begin(); i != configs.end(); i++ ) { if( i->ok() ) nok++; if( i->empty() ) @@ -469,7 +565,9 @@ namespace mongo { startupStatus = EMPTYCONFIG; startupStatusMsg = "can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)"; log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog; - log(1) << "replSet have you ran replSetInitiate yet?" << rsLog; + static unsigned once; + if( ++once == 1 ) + log() << "replSet info you may need to run replSetInitiate -- rs.initiate() in the shell -- if that is not already done" << rsLog; if( _seeds->size() == 0 ) log(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog; } @@ -483,13 +581,13 @@ namespace mongo { continue; } - if( !_loadConfigFinish(configs) ) { + if( !_loadConfigFinish(configs) ) { log() << "replSet info Couldn't load config yet. Sleeping 20sec and will try again." 
<< rsLog; sleepsecs(20); continue; } } - catch(DBException& e) { + catch(DBException& e) { startupStatus = BADCONFIG; startupStatusMsg = "replSet error loading set config (BADCONFIG)"; log() << "replSet error loading configurations " << e.toString() << rsLog; @@ -504,30 +602,34 @@ namespace mongo { startupStatus = STARTED; } - void ReplSetImpl::_fatal() - { + void ReplSetImpl::_fatal() { //lock l(this); box.set(MemberState::RS_FATAL, 0); //sethbmsg("fatal error"); - log() << "replSet error fatal, stopping replication" << rsLog; + log() << "replSet error fatal, stopping replication" << rsLog; } - void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) { + void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) { lock l(this); // convention is to lock replset before taking the db rwlock writelock lk(""); bo comment; if( addComment ) comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version ); newConfig.saveConfigLocally(comment); - try { + try { initFromConfig(newConfig, true); log() << "replSet replSetReconfig new config saved locally" << rsLog; } - catch(DBException& e) { + catch(DBException& e) { + if( e.getCode() == 13497 /* removed from set */ ) { + cc().shutdown(); + dbexit( EXIT_CLEAN , "removed from replica set" ); // never returns + assert(0); + } log() << "replSet error unexpected exception in haveNewConfig() : " << e.toString() << rsLog; _fatal(); } - catch(...) { + catch(...) { log() << "replSet error unexpected exception in haveNewConfig()" << rsLog; _fatal(); } @@ -538,30 +640,33 @@ namespace mongo { ReplSetConfig c(o); if( c.version > rs->config().version ) theReplSet->haveNewConfig(c, false); - else { - log() << "replSet info msgReceivedNewConfig but version isn't higher " << - c.version << ' ' << rs->config().version << rsLog; + else { + log() << "replSet info msgReceivedNewConfig but version isn't higher " << + c.version << ' ' << rs->config().version << rsLog; } } - /* forked as a thread during startup - it can run quite a while looking for config. but once found, + /* forked as a thread during startup + it can run quite a while looking for config. but once found, a separate thread takes over as ReplSetImpl::Manager, and this thread terminates. 
*/ void startReplSets(ReplSetCmdline *replSetCmdline) { Client::initThread("startReplSets"); - try { + try { assert( theReplSet == 0 ); if( replSetCmdline == 0 ) { assert(!replSet); return; } + if( !noauth ) { + cc().getAuthenticationInfo()->authorize("local"); + } (theReplSet = new ReplSet(*replSetCmdline))->go(); } - catch(std::exception& e) { + catch(std::exception& e) { log() << "replSet caught exception in startReplSets thread: " << e.what() << rsLog; - if( theReplSet ) + if( theReplSet ) theReplSet->fatal(); } cc().shutdown(); @@ -569,10 +674,9 @@ namespace mongo { } -namespace boost { +namespace boost { - void assertion_failed(char const * expr, char const * function, char const * file, long line) - { + void assertion_failed(char const * expr, char const * function, char const * file, long line) { mongo::log() << "boost assertion failure " << expr << ' ' << function << ' ' << file << ' ' << line << endl; } diff --git a/db/repl/rs.h b/db/repl/rs.h index 6c4d9a8..1419ad6 100644 --- a/db/repl/rs.h +++ b/db/repl/rs.h @@ -43,6 +43,7 @@ namespace mongo { class Member : public List1::Base { public: Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self); + string fullName() const { return h().toString(); } const ReplSetConfig::MemberCfg& config() const { return _config; } const HeartbeatInfo& hbinfo() const { return _hbinfo; } @@ -51,10 +52,12 @@ namespace mongo { MemberState state() const { return _hbinfo.hbstate; } const HostAndPort& h() const { return _h; } unsigned id() const { return _hbinfo.id(); } + bool potentiallyHot() const { return _config.potentiallyHot(); } // not arbiter, not priority 0 void summarizeMember(stringstream& s) const; - friend class ReplSetImpl; + private: + friend class ReplSetImpl; const ReplSetConfig::MemberCfg _config; const HostAndPort _h; HeartbeatInfo _hbinfo; @@ -65,8 +68,8 @@ namespace mongo { bool busyWithElectSelf; int _primary; - /** @param two - if true two primaries were seen. this can happen transiently, in addition to our - polling being only occasional. in this case null is returned, but the caller should + /** @param two - if true two primaries were seen. this can happen transiently, in addition to our + polling being only occasional. in this case null is returned, but the caller should not assume primary itself in that situation. */ const Member* findOtherPrimary(bool& two); @@ -75,7 +78,7 @@ namespace mongo { virtual void starting(); public: Manager(ReplSetImpl *rs); - ~Manager(); + virtual ~Manager(); void msgReceivedNewConfig(BSONObj); void msgCheckNewState(); }; @@ -84,7 +87,7 @@ namespace mongo { class Consensus { ReplSetImpl &rs; - struct LastYea { + struct LastYea { LastYea() : when(0), who(0xffffffff) { } time_t when; unsigned who; @@ -96,12 +99,12 @@ namespace mongo { bool weAreFreshest(bool& allUp, int& nTies); bool sleptLast; // slept last elect() pass public: - Consensus(ReplSetImpl *t) : rs(*t) { + Consensus(ReplSetImpl *t) : rs(*t) { sleptLast = false; steppedDown = 0; } - /* if we've stepped down, this is when we are allowed to try to elect ourself again. + /* if we've stepped down, this is when we are allowed to try to elect ourself again. todo: handle possible weirdnesses at clock skews etc. */ time_t steppedDown; @@ -115,40 +118,40 @@ namespace mongo { }; /** most operations on a ReplSet object should be done while locked. that logic implemented here. 
*/ - class RSBase : boost::noncopyable { + class RSBase : boost::noncopyable { public: const unsigned magic; void assertValid() { assert( magic == 0x12345677 ); } private: - mutex m; + mongo::mutex m; int _locked; ThreadLocalValue _lockedByMe; protected: RSBase() : magic(0x12345677), m("RSBase"), _locked(0) { } - ~RSBase() { + ~RSBase() { /* this can happen if we throw in the constructor; otherwise never happens. thus we log it as it is quite unusual. */ log() << "replSet ~RSBase called" << rsLog; } - class lock { + class lock { RSBase& rsbase; auto_ptr sl; public: - lock(RSBase* b) : rsbase(*b) { + lock(RSBase* b) : rsbase(*b) { if( rsbase._lockedByMe.get() ) return; // recursive is ok... sl.reset( new scoped_lock(rsbase.m) ); DEV assert(rsbase._locked == 0); - rsbase._locked++; + rsbase._locked++; rsbase._lockedByMe.set(true); } - ~lock() { + ~lock() { if( sl.get() ) { assert( rsbase._lockedByMe.get() ); DEV assert(rsbase._locked == 1); rsbase._lockedByMe.set(false); - rsbase._locked--; + rsbase._locked--; } } }; @@ -157,11 +160,11 @@ namespace mongo { /* for asserts */ bool locked() const { return _locked != 0; } - /* if true, is locked, and was locked by this thread. note if false, it could be in the lock or not for another + /* if true, is locked, and was locked by this thread. note if false, it could be in the lock or not for another just for asserts & such so we can make the contracts clear on who locks what when. we don't use these locks that frequently, so the little bit of overhead is fine. */ - bool lockedByMe() { return _lockedByMe.get(); } + bool lockedByMe() { return _lockedByMe.get(); } }; class ReplSetHealthPollTask; @@ -174,19 +177,19 @@ namespace mongo { MemberState state; const Member *primary; }; - const SP get() { + const SP get() { scoped_lock lk(m); return sp; } MemberState getState() const { return sp.state; } const Member* getPrimary() const { return sp.primary; } - void change(MemberState s, const Member *self) { + void change(MemberState s, const Member *self) { scoped_lock lk(m); - if( sp.state != s ) { + if( sp.state != s ) { log() << "replSet " << s.toString() << rsLog; } sp.state = s; - if( s.primary() ) { + if( s.primary() ) { sp.primary = self; } else { @@ -194,17 +197,17 @@ namespace mongo { sp.primary = 0; } } - void set(MemberState s, const Member *p) { + void set(MemberState s, const Member *p) { scoped_lock lk(m); sp.state = s; sp.primary = p; } void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); } - void setOtherPrimary(const Member *mem) { + void setOtherPrimary(const Member *mem) { scoped_lock lk(m); assert( !sp.state.primary() ); sp.primary = mem; } - void noteRemoteIsPrimary(const Member *remote) { + void noteRemoteIsPrimary(const Member *remote) { scoped_lock lk(m); if( !sp.state.secondary() && !sp.state.fatal() ) sp.state = MemberState::RS_RECOVERING; @@ -212,10 +215,10 @@ namespace mongo { } StateBox() : m("StateBox") { } private: - mutex m; + mongo::mutex m; SP sp; }; - + void parseReplsetCmdLine(string cfgString, string& setname, vector& seeds, set& seedSet ); /** Parameter given to the --replSet command line option (parsed). @@ -230,15 +233,15 @@ namespace mongo { }; /* information about the entire repl set, such as the various servers in the set, and their state */ - /* note: We currently do not free mem when the set goes away - it is assumed the replset is a + /* note: We currently do not free mem when the set goes away - it is assumed the replset is a singleton and long lived. 
*/ class ReplSetImpl : protected RSBase { public: /** info on our state if the replset isn't yet "up". for example, if we are pre-initiation. */ - enum StartupStatus { - PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3, - EMPTYUNREACHABLE=4, STARTED=5, SOON=6 + enum StartupStatus { + PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3, + EMPTYUNREACHABLE=4, STARTED=5, SOON=6 }; static StartupStatus startupStatus; static string startupStatusMsg; @@ -260,18 +263,21 @@ namespace mongo { void relinquish(); void forgetPrimary(); protected: - bool _stepDown(); + bool _stepDown(int secs); + bool _freeze(int secs); private: void assumePrimary(); void loadLastOpTimeWritten(); void changeState(MemberState s); + const Member* getMemberToSyncTo(); + void _changeArbiterState(); protected: // "heartbeat message" - // sent in requestHeartbeat respond in field "hbm" + // sent in requestHeartbeat respond in field "hbm" char _hbmsg[256]; // we change this unlocked, thus not an stl::string time_t _hbmsgTime; // when it was logged public: - void sethbmsg(string s, int logLevel = 0); + void sethbmsg(string s, int logLevel = 0); protected: bool initFromConfig(ReplSetConfig& c, bool reconf=false); // true if ok; throws if config really bad; false if config doesn't include self void _fillIsMaster(BSONObjBuilder&); @@ -281,7 +287,7 @@ namespace mongo { MemberState state() const { return box.getState(); } void _fatal(); void _getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const; - void _summarizeAsHtml(stringstream&) const; + void _summarizeAsHtml(stringstream&) const; void _summarizeStatus(BSONObjBuilder&) const; // for replSetGetStatus command /* throws exception if a problem initializing. */ @@ -295,7 +301,7 @@ namespace mongo { const vector *_seeds; ReplSetConfig *_cfg; - /** load our configuration from admin.replset. try seed machines too. + /** load our configuration from admin.replset. try seed machines too. @return true if ok; throws if config really bad; false if config doesn't include self */ bool _loadConfigFinish(vector& v); @@ -306,7 +312,9 @@ namespace mongo { bool iAmArbiterOnly() const { return myConfig().arbiterOnly; } bool iAmPotentiallyHot() const { return myConfig().potentiallyHot(); } protected: - Member *_self; + Member *_self; + bool _buildIndexes; // = _self->config().buildIndexes + void setSelfTo(Member *); // use this as it sets buildIndexes var private: List1 _members; /* all members of the set EXCEPT self. 
*/ @@ -330,7 +338,7 @@ namespace mongo { private: /* pulling data from primary related - see rs_sync.cpp */ - bool initialSyncOplogApplication(string hn, const Member *primary, OpTime applyGTE, OpTime minValid); + bool initialSyncOplogApplication(const Member *primary, OpTime applyGTE, OpTime minValid); void _syncDoInitialSync(); void syncDoInitialSync(); void _syncThread(); @@ -340,21 +348,29 @@ namespace mongo { unsigned _syncRollback(OplogReader& r); void syncRollback(OplogReader& r); void syncFixUp(HowToFixUp& h, OplogReader& r); + bool _getOplogReader(OplogReader& r, string& hn); + bool _isStale(OplogReader& r, const string& hn); public: void syncThread(); }; - class ReplSet : public ReplSetImpl { + class ReplSet : public ReplSetImpl { public: ReplSet(ReplSetCmdline& replSetCmdline) : ReplSetImpl(replSetCmdline) { } - bool stepDown() { return _stepDown(); } + // for the replSetStepDown command + bool stepDown(int secs) { return _stepDown(secs); } - string selfFullName() { + // for the replSetFreeze command + bool freeze(int secs) { return _freeze(secs); } + + string selfFullName() { lock lk(this); return _self->fullName(); } + bool buildIndexes() const { return _buildIndexes; } + /* call after constructing to start - returns fairly quickly after la[unching its threads */ void go() { _go(); } @@ -369,7 +385,7 @@ namespace mongo { void summarizeStatus(BSONObjBuilder& b) const { _summarizeStatus(b); } void fillIsMaster(BSONObjBuilder& b) { _fillIsMaster(b); } - /* we have a new config (reconfig) - apply it. + /* we have a new config (reconfig) - apply it. @param comment write a no-op comment to the oplog about it. only makes sense if one is primary and initiating the reconf. */ void haveNewConfig(ReplSetConfig& c, bool comment); @@ -380,16 +396,16 @@ namespace mongo { bool lockedByMe() { return RSBase::lockedByMe(); } // heartbeat msg to send to others; descriptive diagnostic info - string hbmsg() const { + string hbmsg() const { if( time(0)-_hbmsgTime > 120 ) return ""; - return _hbmsg; + return _hbmsg; } }; - /** base class for repl set commands. checks basic things such as in rs mode before the command + /** base class for repl set commands. checks basic things such as in rs mode before the command does its real work */ - class ReplSetCommand : public Command { + class ReplSetCommand : public Command { protected: ReplSetCommand(const char * s, bool show=false) : Command(s, show) { } virtual bool slaveOk() const { return true; } @@ -398,14 +414,14 @@ namespace mongo { virtual LockType locktype() const { return NONE; } virtual void help( stringstream &help ) const { help << "internal"; } bool check(string& errmsg, BSONObjBuilder& result) { - if( !replSet ) { + if( !replSet ) { errmsg = "not running with --replSet"; return false; } if( theReplSet == 0 ) { result.append("startupStatus", ReplSet::startupStatus); errmsg = ReplSet::startupStatusMsg.empty() ? "replset unknown error 2" : ReplSet::startupStatusMsg; - if( ReplSet::startupStatus == 3 ) + if( ReplSet::startupStatus == 3 ) result.append("info", "run rs.initiate(...) 
if not yet done for the set"); return false; } @@ -415,9 +431,8 @@ namespace mongo { /** inlines ----------------- */ - inline Member::Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self) : - _config(*c), _h(h), _hbinfo(ord) - { + inline Member::Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self) : + _config(*c), _h(h), _hbinfo(ord) { if( self ) _hbinfo.health = 1.0; } diff --git a/db/repl/rs_config.cpp b/db/repl/rs_config.cpp index 371507d..5998f51 100644 --- a/db/repl/rs_config.cpp +++ b/db/repl/rs_config.cpp @@ -27,11 +27,11 @@ using namespace bson; -namespace mongo { +namespace mongo { void logOpInitiate(const bo&); - void assertOnlyHas(BSONObj o, const set& fields) { + void assertOnlyHas(BSONObj o, const set& fields) { BSONObj::iterator i(o); while( i.more() ) { BSONElement e = i.next(); @@ -41,7 +41,7 @@ namespace mongo { } } - list ReplSetConfig::otherMemberHostnames() const { + list ReplSetConfig::otherMemberHostnames() const { list L; for( vector::const_iterator i = members.begin(); i != members.end(); i++ ) { if( !i->h.isSelf() ) @@ -49,12 +49,12 @@ namespace mongo { } return L; } - + /* comment MUST only be set when initiating the set by the initiator */ - void ReplSetConfig::saveConfigLocally(bo comment) { + void ReplSetConfig::saveConfigLocally(bo comment) { checkRsConfig(); log() << "replSet info saving a newer config version to local.system.replset" << rsLog; - { + { writelock lk(""); Client::Context cx( rsConfigNs ); cx.db()->flushFiles(true); @@ -70,21 +70,21 @@ namespace mongo { } DEV log() << "replSet saveConfigLocally done" << rsLog; } - - /*static*/ - /*void ReplSetConfig::receivedNewConfig(BSONObj cfg) { + + /*static*/ + /*void ReplSetConfig::receivedNewConfig(BSONObj cfg) { if( theReplSet ) return; // this is for initial setup only, so far. 
todo ReplSetConfig c(cfg); writelock lk("admin."); - if( theReplSet ) + if( theReplSet ) return; c.saveConfigLocally(bo()); }*/ - bo ReplSetConfig::MemberCfg::asBson() const { + bo ReplSetConfig::MemberCfg::asBson() const { bob b; b << "_id" << _id; b.append("host", h.toString()); @@ -93,18 +93,28 @@ namespace mongo { if( arbiterOnly ) b << "arbiterOnly" << true; if( slaveDelay ) b << "slaveDelay" << slaveDelay; if( hidden ) b << "hidden" << hidden; + if( !buildIndexes ) b << "buildIndexes" << buildIndexes; + if( !tags.empty() ) { + BSONArrayBuilder a; + for( set::const_iterator i = tags.begin(); i != tags.end(); i++ ) + a.append(*i); + b.appendArray("tags", a.done()); + } + if( !initialSync.isEmpty() ) { + b << "initialSync" << initialSync; + } return b.obj(); } - bo ReplSetConfig::asBson() const { + bo ReplSetConfig::asBson() const { bob b; b.append("_id", _id).append("version", version); if( !ho.isDefault() || !getLastErrorDefaults.isEmpty() ) { bob settings; if( !ho.isDefault() ) - settings << "heartbeatConnRetries " << ho.heartbeatConnRetries << - "heartbeatSleep" << ho.heartbeatSleepMillis / 1000 << - "heartbeatTimeout" << ho.heartbeatTimeoutMillis / 1000; + settings << "heartbeatConnRetries " << ho.heartbeatConnRetries << + "heartbeatSleep" << ho.heartbeatSleepMillis / 1000.0 << + "heartbeatTimeout" << ho.heartbeatTimeoutMillis / 1000.0; if( !getLastErrorDefaults.isEmpty() ) settings << "getLastErrorDefaults" << getLastErrorDefaults; b << "settings" << settings.obj(); @@ -122,7 +132,7 @@ namespace mongo { uassert(13126, "bad Member config", expr); } - void ReplSetConfig::MemberCfg::check() const{ + void ReplSetConfig::MemberCfg::check() const { mchk(_id >= 0 && _id <= 255); mchk(priority >= 0 && priority <= 1000); mchk(votes >= 0 && votes <= 100); @@ -130,41 +140,80 @@ namespace mongo { uassert(13437, "slaveDelay requires priority be zero", slaveDelay == 0 || priority == 0); uassert(13438, "bad slaveDelay value", slaveDelay >= 0 && slaveDelay <= 3600 * 24 * 366); uassert(13439, "priority must be 0 when hidden=true", priority == 0 || !hidden); + uassert(13477, "priority must be 0 when buildIndexes=false", buildIndexes || priority == 0); + + if (!initialSync.isEmpty()) { + static const string legal[] = {"state", "name", "_id","optime"}; + static const set legals(legal, legal + 4); + assertOnlyHas(initialSync, legals); + + if (initialSync.hasElement("state")) { + uassert(13525, "initialSync source state must be 1 or 2", + initialSync["state"].isNumber() && + (initialSync["state"].Number() == 1 || + initialSync["state"].Number() == 2)); + } + if (initialSync.hasElement("name")) { + uassert(13526, "initialSync source name must be a string", + initialSync["name"].type() == mongo::String); + } + if (initialSync.hasElement("_id")) { + uassert(13527, "initialSync source _id must be a number", + initialSync["_id"].isNumber()); + } + if (initialSync.hasElement("optime")) { + uassert(13528, "initialSync source optime must be a timestamp", + initialSync["optime"].type() == mongo::Timestamp || + initialSync["optime"].type() == mongo::Date); + } + } } /** @param o old config - @param n new config + @param n new config */ - /*static*/ bool ReplSetConfig::legalChange(const ReplSetConfig& o, const ReplSetConfig& n, string& errmsg) { + /*static*/ + bool ReplSetConfig::legalChange(const ReplSetConfig& o, const ReplSetConfig& n, string& errmsg) { assert( theReplSet ); - if( o._id != n._id ) { - errmsg = "set name may not change"; + if( o._id != n._id ) { + errmsg = "set name may not change"; return 
false; } /* TODO : wonder if we need to allow o.version < n.version only, which is more lenient. - if someone had some intermediate config this node doesnt have, that could be + if someone had some intermediate config this node doesnt have, that could be necessary. but then how did we become primary? so perhaps we are fine as-is. */ - if( o.version + 1 != n.version ) { + if( o.version + 1 != n.version ) { errmsg = "version number wrong"; return false; } map old; - for( vector::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) { + for( vector::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) { old[i->h] = &(*i); } int me = 0; - for( vector::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) { + for( vector::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) { const ReplSetConfig::MemberCfg& m = *i; - if( old.count(m.h) ) { - if( old[m.h]->_id != m._id ) { + if( old.count(m.h) ) { + const ReplSetConfig::MemberCfg& oldCfg = *old[m.h]; + if( oldCfg._id != m._id ) { log() << "replSet reconfig error with member: " << m.h.toString() << rsLog; uasserted(13432, "_id may not change for members"); } + if( oldCfg.buildIndexes != m.buildIndexes ) { + log() << "replSet reconfig error with member: " << m.h.toString() << rsLog; + uasserted(13476, "buildIndexes may not change for members"); + } + /* are transitions to and from arbiterOnly guaranteed safe? if not, we should disallow here. + there is a test at replsets/replsetarb3.js */ + if( oldCfg.arbiterOnly != m.arbiterOnly ) { + log() << "replSet reconfig error with member: " << m.h.toString() << " arbiterOnly cannot change. remove and readd the member instead " << rsLog; + uasserted(13510, "arbiterOnly may not change for members"); + } } - if( m.h.isSelf() ) + if( m.h.isSelf() ) me++; } @@ -172,24 +221,33 @@ namespace mongo { /* TODO : MORE CHECKS HERE */ - log() << "replSet TODO : don't allow removal of a node until we handle it at the removed node end?" << endl; + DEV log() << "replSet TODO : don't allow removal of a node until we handle it at the removed node end?" << endl; // we could change its votes to zero perhaps instead as a short term... return true; } - void ReplSetConfig::clear() { + void ReplSetConfig::clear() { version = -5; _ok = false; } - void ReplSetConfig::checkRsConfig() const { + void ReplSetConfig::checkRsConfig() const { uassert(13132, - "nonmatching repl set name in _id field; check --replSet command line", - _id == cmdLine.ourSetName()); + "nonmatching repl set name in _id field; check --replSet command line", + _id == cmdLine.ourSetName()); uassert(13308, "replSet bad config version #", version > 0); uassert(13133, "replSet bad config no members", members.size() >= 1); - uassert(13309, "replSet bad config maximum number of members is 7 (for now)", members.size() <= 7); + uassert(13309, "replSet bad config maximum number of members is 12", members.size() <= 12); + { + unsigned voters = 0; + for( vector::const_iterator i = members.begin(); i != members.end(); ++i ) { + if( i->votes ) + voters++; + } + uassert(13612, "replSet bad config maximum number of voting members is 7", voters <= 7); + uassert(13613, "replSet bad config no voting members", voters > 0); + } } void ReplSetConfig::from(BSONObj o) { @@ -213,7 +271,8 @@ namespace mongo { if( settings["heartbeatTimeout"].ok() ) ho.heartbeatTimeoutMillis = (unsigned) (settings["heartbeatTimeout"].Number() * 1000); ho.check(); - try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); } catch(...) 
{ } + try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); } + catch(...) { } } set hosts; @@ -231,43 +290,57 @@ namespace mongo { BSONObj mobj = members[i].Obj(); MemberCfg m; try { - static const string legal[] = {"_id","votes","priority","host","hidden","slaveDelay","arbiterOnly"}; - static const set legals(legal, legal + 7); + static const string legal[] = { + "_id","votes","priority","host", "hidden","slaveDelay", + "arbiterOnly","buildIndexes","tags","initialSync" + }; + static const set legals(legal, legal + 10); assertOnlyHas(mobj, legals); - try { + try { m._id = (int) mobj["_id"].Number(); - } catch(...) { + } + catch(...) { /* TODO: use of string exceptions may be problematic for reconfig case! */ - throw "_id must be numeric"; + throw "_id must be numeric"; } string s; try { s = mobj["host"].String(); m.h = HostAndPort(s); } - catch(...) { + catch(...) { throw string("bad or missing host field? ") + mobj.toString(); } - if( m.h.isLocalHost() ) + if( m.h.isLocalHost() ) localhosts++; m.arbiterOnly = mobj.getBoolField("arbiterOnly"); m.slaveDelay = mobj["slaveDelay"].numberInt(); if( mobj.hasElement("hidden") ) m.hidden = mobj.getBoolField("hidden"); + if( mobj.hasElement("buildIndexes") ) + m.buildIndexes = mobj.getBoolField("buildIndexes"); if( mobj.hasElement("priority") ) m.priority = mobj["priority"].Number(); if( mobj.hasElement("votes") ) m.votes = (unsigned) mobj["votes"].Number(); + if( mobj.hasElement("tags") ) { + vector v = mobj["tags"].Array(); + for( unsigned i = 0; i < v.size(); i++ ) + m.tags.insert( v[i].String() ); + } + if( mobj.hasElement("initialSync")) { + m.initialSync = mobj["initialSync"].Obj().getOwned(); + } m.check(); } - catch( const char * p ) { + catch( const char * p ) { log() << "replSet cfg parsing exception for members[" << i << "] " << p << rsLog; stringstream ss; ss << "replSet members[" << i << "] " << p; uassert(13107, ss.str(), false); } - catch(DBException& e) { + catch(DBException& e) { log() << "replSet cfg parsing exception for members[" << i << "] " << e.what() << rsLog; stringstream ss; ss << "bad config for member[" << i << "] " << e.what(); @@ -289,7 +362,7 @@ namespace mongo { uassert(13122, "bad repl set config?", expr); } - ReplSetConfig::ReplSetConfig(BSONObj cfg) { + ReplSetConfig::ReplSetConfig(BSONObj cfg) { clear(); from(cfg); configAssert( version < 0 /*unspecified*/ || (version >= 1 && version <= 5000) ); @@ -315,18 +388,19 @@ namespace mongo { BSONObj cmd = BSON( "replSetHeartbeat" << setname ); int theirVersion; BSONObj info; + log() << "trying to contact " << h.toString() << rsLog; bool ok = requestHeartbeat(setname, "", h.toString(), info, -2, theirVersion); - if( info["rs"].trueValue() ) { + if( info["rs"].trueValue() ) { // yes, it is a replicate set, although perhaps not yet initialized } else { if( !ok ) { log() << "replSet TEMP !ok heartbeating " << h.toString() << " on cfg load" << rsLog; - if( !info.isEmpty() ) + if( !info.isEmpty() ) log() << "replSet info " << h.toString() << " : " << info.toString() << rsLog; return; } - { + { stringstream ss; ss << "replSet error: member " << h.toString() << " is not in --replSet mode"; msgassertedNoTrace(13260, ss.str().c_str()); // not caught as not a user exception - we want it not caught @@ -343,7 +417,7 @@ namespace mongo { cfg = conn.findOne(rsConfigNs, Query()).getOwned(); count = conn.count(rsConfigNs); } - catch ( DBException& e) { + catch ( DBException& ) { if ( !h.isSelf() ) { throw; } @@ -356,14 +430,14 @@ namespace mongo { if( count > 1 ) 
uasserted(13109, str::stream() << "multiple rows in " << rsConfigNs << " not supported host: " << h.toString()); - + if( cfg.isEmpty() ) { version = EMPTYCONFIG; return; } version = -1; } - catch( DBException& e) { + catch( DBException& e) { version = v; log(level) << "replSet load config couldn't get from " << h.toString() << ' ' << e.what() << rsLog; return; diff --git a/db/repl/rs_config.h b/db/repl/rs_config.h index e39dad7..7d43fe6 100644 --- a/db/repl/rs_config.h +++ b/db/repl/rs_config.h @@ -23,7 +23,7 @@ #include "../../util/hostandport.h" #include "health.h" -namespace mongo { +namespace mongo { /* singleton config object is stored here */ const string rsConfigNs = "local.system.replset"; @@ -31,7 +31,7 @@ namespace mongo { class ReplSetConfig { enum { EMPTYCONFIG = -2 }; public: - /* if something is misconfigured, throws an exception. + /* if something is misconfigured, throws an exception. if couldn't be queried or is just blank, ok() will be false. */ ReplSetConfig(const HostAndPort& h); @@ -41,7 +41,7 @@ namespace mongo { bool ok() const { return _ok; } struct MemberCfg { - MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false) { } + MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false), buildIndexes(true) { } int _id; /* ordinal */ unsigned votes; /* how many votes this node gets. default 1. */ HostAndPort h; @@ -49,15 +49,17 @@ namespace mongo { bool arbiterOnly; int slaveDelay; /* seconds. int rather than unsigned for convenient to/front bson conversion. */ bool hidden; /* if set, don't advertise to drives in isMaster. for non-primaries (priority 0) */ + bool buildIndexes; /* if false, do not create any non-_id indexes */ + set tags; /* tagging for data center, rack, etc. */ + BSONObj initialSync; /* directions for initial sync source */ void check() const; /* check validity, assert if not. 
*/ BSONObj asBson() const; - bool potentiallyHot() const { - return !arbiterOnly && priority > 0; - } - bool operator==(const MemberCfg& r) const { - return _id==r._id && votes == r.votes && h == r.h && priority == r.priority && - arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden; + bool potentiallyHot() const { return !arbiterOnly && priority > 0; } + bool operator==(const MemberCfg& r) const { + return _id==r._id && votes == r.votes && h == r.h && priority == r.priority && + arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden && + buildIndexes == buildIndexes; } bool operator!=(const MemberCfg& r) const { return !(*this == r); } }; diff --git a/db/repl/rs_exception.h b/db/repl/rs_exception.h old mode 100755 new mode 100644 index e71cad2..fc372fc --- a/db/repl/rs_exception.h +++ b/db/repl/rs_exception.h @@ -1,15 +1,15 @@ -// @file rs_exception.h - -#pragma once - -namespace mongo { - - class VoteException : public std::exception { +// @file rs_exception.h + +#pragma once + +namespace mongo { + + class VoteException : public std::exception { public: - const char * what() const throw () { return "VoteException"; } + const char * what() const throw () { return "VoteException"; } }; - class RetryAfterSleepException : public std::exception { + class RetryAfterSleepException : public std::exception { public: const char * what() const throw () { return "RetryAfterSleepException"; } }; diff --git a/db/repl/rs_initialsync.cpp b/db/repl/rs_initialsync.cpp index 3851c66..5a54059 100644 --- a/db/repl/rs_initialsync.cpp +++ b/db/repl/rs_initialsync.cpp @@ -15,6 +15,7 @@ */ #include "pch.h" +#include "../repl.h" #include "../client.h" #include "../../client/dbclient.h" #include "rs.h" @@ -33,15 +34,17 @@ namespace mongo { // add try/catch with sleep - void isyncassert(const char *msg, bool expr) { - if( !expr ) { + void isyncassert(const char *msg, bool expr) { + if( !expr ) { string m = str::stream() << "initial sync " << msg; theReplSet->sethbmsg(m, 0); uasserted(13404, m); } } - void ReplSetImpl::syncDoInitialSync() { + void ReplSetImpl::syncDoInitialSync() { + createOplog(); + while( 1 ) { try { _syncDoInitialSync(); @@ -54,14 +57,14 @@ namespace mongo { } } - bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, - bool slaveOk, bool useReplAuth, bool snapshot); + bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, + bool slaveOk, bool useReplAuth, bool snapshot); /* todo : progress metering to sethbmsg. */ static bool clone(const char *master, string db) { string err; - return cloneFrom(master, err, db, false, - /*slaveok later can be true*/ false, true, false); + return cloneFrom(master, err, db, false, + /* slave_ok */ true, true, false); } void _logOpObjRS(const BSONObj& op); @@ -71,11 +74,11 @@ namespace mongo { static void emptyOplog() { writelock lk(rsoplog); Client::Context ctx(rsoplog); - NamespaceDetails *d = nsdetails(rsoplog); + NamespaceDetails *d = nsdetails(rsoplog); - // temp - if( d && d->nrecords == 0 ) - return; // already empty, ok. + // temp + if( d && d->stats.nrecords == 0 ) + return; // already empty, ok. log(1) << "replSet empty oplog" << rsLog; d->emptyCappedCollection(rsoplog); @@ -84,10 +87,10 @@ namespace mongo { string errmsg; bob res; dropCollection(rsoplog, errmsg, res); - log() << "replSet recreated oplog so it is empty. todo optimize this..." 
<< rsLog; - createOplog();*/ + log() << "replSet recreated oplog so it is empty. todo optimize this..." << rsLog; + createOplog();*/ - // TEMP: restart to recreate empty oplog + // TEMP: restart to recreate empty oplog //log() << "replSet FATAL error during initial sync. mongod restart required." << rsLog; //dbexit( EXIT_CLEAN ); @@ -100,106 +103,182 @@ namespace mongo { */ } - void ReplSetImpl::_syncDoInitialSync() { - sethbmsg("initial sync pending",0); + /** + * Choose a member to sync from. + * + * The initalSync option is an object with 1 k/v pair: + * + * "state" : 1|2 + * "name" : "host" + * "_id" : N + * "optime" : t + * + * All except optime are exact matches. "optime" will find a secondary with + * an optime >= to the optime given. + */ + const Member* ReplSetImpl::getMemberToSyncTo() { + BSONObj sync = myConfig().initialSync; + bool secondaryOnly = false, isOpTime = false; + char *name = 0; + int id = -1; + OpTime optime; StateBox::SP sp = box.get(); assert( !sp.state.primary() ); // wouldn't make sense if we were. - const Member *cp = sp.primary; - if( cp == 0 ) { - sethbmsg("initial sync need a member to be primary",0); + // if it exists, we've already checked that these fields are valid in + // rs_config.cpp + if ( !sync.isEmpty() ) { + if (sync.hasElement("state")) { + if (sync["state"].Number() == 1) { + if (sp.primary) { + sethbmsg( str::stream() << "syncing to primary: " << sp.primary->fullName(), 0); + return const_cast(sp.primary); + } + else { + sethbmsg("couldn't clone from primary"); + return NULL; + } + } + else { + secondaryOnly = true; + } + } + if (sync.hasElement("name")) { + name = (char*)sync["name"].valuestr(); + } + if (sync.hasElement("_id")) { + id = (int)sync["_id"].Number(); + } + if (sync.hasElement("optime")) { + isOpTime = true; + optime = sync["optime"]._opTime(); + } + } + + for( Member *m = head(); m; m = m->next() ) { + if (!m->hbinfo().up() || + (m->state() != MemberState::RS_SECONDARY && + m->state() != MemberState::RS_PRIMARY) || + (secondaryOnly && m->state() != MemberState::RS_SECONDARY) || + (id != -1 && (int)m->id() != id) || + (name != 0 && strcmp(name, m->fullName().c_str()) != 0) || + (isOpTime && optime >= m->hbinfo().opTime)) { + continue; + } + + sethbmsg( str::stream() << "syncing to: " << m->fullName(), 0); + return const_cast(m); + } + + sethbmsg( str::stream() << "couldn't find a member matching the sync criteria: " << + "\nstate? " << (secondaryOnly ? "2" : "none") << + "\nname? " << (name ? name : "none") << + "\n_id? " << id << + "\noptime? " << optime.toStringPretty() ); + + return NULL; + } + + /** + * Do the initial sync for this member. 
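getMemberToSyncTo() above turns the optional initialSync criteria into a filter over the member list: a candidate must be up and in a readable state, must be a secondary when "state" : 2 was requested, must match the requested "name" and "_id" exactly, and must have an optime at least as recent as the requested one. A standalone sketch of that filter under simplified types; all names here are placeholders, not MongoDB APIs.

    #include <string>
    #include <vector>

    struct Candidate {              // illustrative stand-in for a replica set member
        bool up;
        bool primary;               // readable states are primary or secondary
        bool secondary;
        int id;
        std::string name;
        long long optime;           // simplified optime
    };

    struct SyncCriteria {           // mirrors the optional initialSync fields
        bool secondaryOnly = false; // "state" : 2
        int id = -1;                // "_id"; -1 means "not specified"
        std::string name;           // "name"; empty means "not specified"
        long long minOptime = -1;   // "optime"; -1 means "not specified"
    };

    // Returns the first member satisfying all requested criteria, or nullptr.
    const Candidate* pickSyncSource(const std::vector<Candidate>& members,
                                    const SyncCriteria& c) {
        for (const Candidate& m : members) {
            if (!m.up) continue;
            if (!m.primary && !m.secondary) continue;            // must be readable
            if (c.secondaryOnly && !m.secondary) continue;
            if (c.id != -1 && m.id != c.id) continue;
            if (!c.name.empty() && m.name != c.name) continue;
            if (c.minOptime != -1 && m.optime < c.minOptime) continue;
            return &m;
        }
        return nullptr;             // caller reports "couldn't find a member matching the sync criteria"
    }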
+ */ + void ReplSetImpl::_syncDoInitialSync() { + sethbmsg("initial sync pending",0); + + const Member *source = getMemberToSyncTo(); + if (!source) { + sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0); sleepsecs(15); return; } - string masterHostname = cp->h().toString(); + string sourceHostname = source->h().toString(); OplogReader r; - if( !r.connect(masterHostname) ) { - sethbmsg( str::stream() << "initial sync couldn't connect to " << cp->h().toString() , 0); + if( !r.connect(sourceHostname) ) { + sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0); sleepsecs(15); return; } BSONObj lastOp = r.getLastOp(rsoplog); - if( lastOp.isEmpty() ) { + if( lastOp.isEmpty() ) { sethbmsg("initial sync couldn't read remote oplog", 0); sleepsecs(15); return; } OpTime startingTS = lastOp["ts"]._opTime(); - - { - /* make sure things aren't too flappy */ - sleepsecs(5); - isyncassert( "flapping?", box.getPrimary() == cp ); - BSONObj o = r.getLastOp(rsoplog); - isyncassert( "flapping [2]?", !o.isEmpty() ); - } - - sethbmsg("initial sync drop all databases", 0); - dropAllDatabasesExceptLocal(); -// sethbmsg("initial sync drop oplog", 0); -// emptyOplog(); - - list dbs = r.conn()->getDatabaseNames(); - for( list::iterator i = dbs.begin(); i != dbs.end(); i++ ) { - string db = *i; - if( db != "local" ) { - sethbmsg( str::stream() << "initial sync cloning db: " << db , 0); - bool ok; - { - writelock lk(db); - Client::Context ctx(db); - ok = clone(masterHostname.c_str(), db); - } - if( !ok ) { - sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0); - sleepsecs(300); - return; + if (replSettings.fastsync) { + log() << "fastsync: skipping database clone" << rsLog; + } + else { + sethbmsg("initial sync drop all databases", 0); + dropAllDatabasesExceptLocal(); + + sethbmsg("initial sync clone all databases", 0); + + list dbs = r.conn()->getDatabaseNames(); + for( list::iterator i = dbs.begin(); i != dbs.end(); i++ ) { + string db = *i; + if( db != "local" ) { + sethbmsg( str::stream() << "initial sync cloning db: " << db , 0); + bool ok; + { + writelock lk(db); + Client::Context ctx(db); + ok = clone(sourceHostname.c_str(), db); + } + if( !ok ) { + sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0); + sleepsecs(300); + return; + } } } } sethbmsg("initial sync query minValid",0); - /* our cloned copy will be strange until we apply oplog events that occurred + isyncassert( "initial sync source must remain readable throughout our initial sync", source->state().readable() ); + + /* our cloned copy will be strange until we apply oplog events that occurred through the process. we note that time point here. */ BSONObj minValid = r.getLastOp(rsoplog); - assert( !minValid.isEmpty() ); + isyncassert( "getLastOp is empty ", !minValid.isEmpty() ); OpTime mvoptime = minValid["ts"]._opTime(); assert( !mvoptime.isNull() ); - /* copy the oplog + /* apply relevant portion of the oplog */ { - sethbmsg("initial sync copy+apply oplog"); - if( ! initialSyncOplogApplication(masterHostname, cp, startingTS, mvoptime) ) { // note we assume here that this call does not throw + sethbmsg("initial sync initial oplog application"); + isyncassert( "initial sync source must remain readable throughout our initial sync [2]", source->state().readable() ); + if( ! 
initialSyncOplogApplication(source, /*applyGTE*/startingTS, /*minValid*/mvoptime) ) { // note we assume here that this call does not throw log() << "replSet initial sync failed during applyoplog" << rsLog; emptyOplog(); // otherwise we'll be up! - lastOpTimeWritten = OpTime(); - lastH = 0; + lastOpTimeWritten = OpTime(); + lastH = 0; log() << "replSet cleaning up [1]" << rsLog; { writelock lk("local."); Client::Context cx( "local." ); - cx.db()->flushFiles(true); + cx.db()->flushFiles(true); } log() << "replSet cleaning up [2]" << rsLog; - sleepsecs(2); + sleepsecs(5); return; } } sethbmsg("initial sync finishing up",0); - + assert( !box.getState().primary() ); // wouldn't make sense if we were. { writelock lk("local."); Client::Context cx( "local." ); - cx.db()->flushFiles(true); + cx.db()->flushFiles(true); try { log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog; } diff --git a/db/repl/rs_initiate.cpp b/db/repl/rs_initiate.cpp index 9c74be0..cf1941f 100644 --- a/db/repl/rs_initiate.cpp +++ b/db/repl/rs_initiate.cpp @@ -26,47 +26,63 @@ #include "rs.h" #include "rs_config.h" #include "../dbhelpers.h" +#include "../oplog.h" using namespace bson; using namespace mongoutils; -namespace mongo { +namespace mongo { /* called on a reconfig AND on initiate - throws + throws @param initial true when initiating */ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, bool initial) { int failures = 0; int me = 0; + stringstream selfs; for( vector::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) { if( i->h.isSelf() ) { me++; - if( !i->potentiallyHot() ) { + if( me > 1 ) + selfs << ','; + selfs << i->h.toString(); + if( !i->potentiallyHot() ) { uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary"); } } } - uassert(13278, "bad config - dups?", me <= 1); // dups? - uassert(13279, "can't find self in the replset config", me == 1); + uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups? + if( me != 1 ) { + stringstream ss; + ss << "can't find self in the replset config"; + if( !cmdLine.isDefaultPort() ) ss << " my port: " << cmdLine.port; + if( me != 0 ) ss << " found: " << me; + uasserted(13279, ss.str()); + } for( vector::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) { + // we know we're up + if (i->h.isSelf()) { + continue; + } + BSONObj res; { bool ok = false; try { int theirVersion = -1000; - ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/); - if( theirVersion >= cfg.version ) { + ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/); + if( theirVersion >= cfg.version ) { stringstream ss; ss << "replSet member " << i->h.toString() << " has too new a config version (" << theirVersion << ") to reconfigure"; uasserted(13259, ss.str()); } } - catch(DBException& e) { + catch(DBException& e) { log() << "replSet cmufcc requestHeartbeat " << i->h.toString() << " : " << e.toString() << rsLog; } - catch(...) { + catch(...) { log() << "replSet cmufcc error exception in requestHeartbeat?" << rsLog; } if( res.getBoolField("mismatch") ) @@ -96,7 +112,7 @@ namespace mongo { trying to keep change small as release is near. 
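checkMembersUpForConfigChange() above now records every host for which isSelf() returned true, so the 13278 error can name the duplicates, and the 13279 message includes the local port. A standalone restatement of the "self appears exactly once" rule, with a plain exception standing in for uasserted() and all names illustrative:

    #include <sstream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    struct CfgMember { std::string host; bool isSelf; };

    // The local node must appear in the replica set config exactly once.
    void checkSelfAppearsOnce(const std::vector<CfgMember>& members, int myPort) {
        int me = 0;
        std::stringstream selfs;
        for (const CfgMember& m : members) {
            if (!m.isSelf) continue;
            if (me++) selfs << ',';
            selfs << m.host;
        }
        if (me > 1)
            throw std::runtime_error("13278 bad config: isSelf is true for multiple hosts: " + selfs.str());
        if (me != 1) {
            std::stringstream ss;
            ss << "13279 can't find self in the replset config, my port: " << myPort;
            throw std::runtime_error(ss.str());
        }
    }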
*/ const Member* m = theReplSet->findById( i->_id ); - if( m ) { + if( m ) { // ok, so this was an existing member (wouldn't make sense to add to config a new member that is down) assert( m->h().toString() == i->h.toString() ); allowFailure = true; @@ -113,24 +129,24 @@ namespace mongo { } if( initial ) { bool hasData = res["hasData"].Bool(); - uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.", - !hasData || i->h.isSelf()); + uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.", + !hasData || i->h.isSelf()); } } } - class CmdReplSetInitiate : public ReplSetCommand { + class CmdReplSetInitiate : public ReplSetCommand { public: virtual LockType locktype() const { return NONE; } CmdReplSetInitiate() : ReplSetCommand("replSetInitiate") { } - virtual void help(stringstream& h) const { - h << "Initiate/christen a replica set."; + virtual void help(stringstream& h) const { + h << "Initiate/christen a replica set."; h << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { log() << "replSet replSetInitiate admin command received from client" << rsLog; - if( !replSet ) { + if( !replSet ) { errmsg = "server is not running with --replSet"; return false; } @@ -141,12 +157,12 @@ namespace mongo { } { - // just make sure we can get a write lock before doing anything else. we'll reacquire one - // later. of course it could be stuck then, but this check lowers the risk if weird things + // just make sure we can get a write lock before doing anything else. we'll reacquire one + // later. of course it could be stuck then, but this check lowers the risk if weird things // are up. time_t t = time(0); writelock lk(""); - if( time(0)-t > 10 ) { + if( time(0)-t > 10 ) { errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?"; return false; } @@ -155,7 +171,7 @@ namespace mongo { it is ok if the initiating member has *other* data than that. */ BSONObj o; - if( Helpers::getFirst(rsoplog, o) ) { + if( Helpers::getFirst(rsoplog, o) ) { errmsg = rsoplog + string(" is not empty on the initiating member. 
cannot initiate."); return false; } @@ -194,7 +210,7 @@ namespace mongo { configObj = b.obj(); log() << "replSet created this configuration for initiation : " << configObj.toString() << rsLog; } - else { + else { configObj = cmdObj["replSetInitiate"].Obj(); } @@ -203,7 +219,7 @@ namespace mongo { ReplSetConfig newConfig(configObj); parsed = true; - if( newConfig.version > 1 ) { + if( newConfig.version > 1 ) { errmsg = "can't initiate with a version number greater than 1"; return false; } @@ -214,6 +230,8 @@ namespace mongo { log() << "replSet replSetInitiate all members seem up" << rsLog; + createOplog(); + writelock lk(""); bo comment = BSON( "msg" << "initiating set"); newConfig.saveConfigLocally(comment); @@ -222,9 +240,9 @@ namespace mongo { ReplSet::startupStatus = ReplSet::SOON; ReplSet::startupStatusMsg = "Received replSetInitiate - should come online shortly."; } - catch( DBException& e ) { + catch( DBException& e ) { log() << "replSet replSetInitiate exception: " << e.what() << rsLog; - if( !parsed ) + if( !parsed ) errmsg = string("couldn't parse cfg object ") + e.what(); else errmsg = string("couldn't initiate : ") + e.what(); diff --git a/db/repl/rs_member.h b/db/repl/rs_member.h index 099cb22..017b6ea 100644 --- a/db/repl/rs_member.h +++ b/db/repl/rs_member.h @@ -30,18 +30,18 @@ namespace mongo { RS_FATAL something bad has occurred and server is not completely offline with regard to the replica set. fatal error. RS_STARTUP2 loaded config, still determining who is primary */ - struct MemberState { - enum MS { - RS_STARTUP, - RS_PRIMARY, - RS_SECONDARY, - RS_RECOVERING, - RS_FATAL, - RS_STARTUP2, - RS_UNKNOWN, /* remote node not yet reached */ - RS_ARBITER, - RS_DOWN, /* node not reachable for a report */ - RS_ROLLBACK + struct MemberState { + enum MS { + RS_STARTUP = 0, + RS_PRIMARY = 1, + RS_SECONDARY = 2, + RS_RECOVERING = 3, + RS_FATAL = 4, + RS_STARTUP2 = 5, + RS_UNKNOWN = 6, /* remote node not yet reached */ + RS_ARBITER = 7, + RS_DOWN = 8, /* node not reachable for a report */ + RS_ROLLBACK = 9 } s; MemberState(MS ms = RS_UNKNOWN) : s(ms) { } @@ -53,6 +53,7 @@ namespace mongo { bool startup2() const { return s == RS_STARTUP2; } bool fatal() const { return s == RS_FATAL; } bool rollback() const { return s == RS_ROLLBACK; } + bool readable() const { return s == RS_PRIMARY || s == RS_SECONDARY; } string toString() const; @@ -60,9 +61,9 @@ namespace mongo { bool operator!=(const MemberState& r) const { return s != r.s; } }; - /* this is supposed to be just basic information on a member, + /* this is supposed to be just basic information on a member, and copy constructable. 
*/ - class HeartbeatInfo { + class HeartbeatInfo { unsigned _id; public: HeartbeatInfo() : _id(0xffffffff),hbstate(MemberState::RS_UNKNOWN),health(-1.0),downSince(0),skew(INT_MIN) { } @@ -88,15 +89,15 @@ namespace mongo { bool changed(const HeartbeatInfo& old) const; }; - inline HeartbeatInfo::HeartbeatInfo(unsigned id) : _id(id) { + inline HeartbeatInfo::HeartbeatInfo(unsigned id) : _id(id) { hbstate = MemberState::RS_UNKNOWN; health = -1.0; downSince = 0; - lastHeartbeat = upSince = 0; + lastHeartbeat = upSince = 0; skew = INT_MIN; } - inline bool HeartbeatInfo::changed(const HeartbeatInfo& old) const { + inline bool HeartbeatInfo::changed(const HeartbeatInfo& old) const { return health != old.health || hbstate != old.hbstate; } diff --git a/db/repl/rs_optime.h b/db/repl/rs_optime.h index b3607fa..f0ca569 100644 --- a/db/repl/rs_optime.h +++ b/db/repl/rs_optime.h @@ -1,58 +1,58 @@ -// @file rs_optime.h - -/* - * Copyright (C) 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#pragma once - -#include "../../util/optime.h" - -namespace mongo { - +// @file rs_optime.h + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#pragma once + +#include "../../util/optime.h" + +namespace mongo { + const char rsoplog[] = "local.oplog.rs"; - - /* - class RSOpTime : public OpTime { - public: - bool initiated() const { return getSecs() != 0; } - };*/ - - /*struct RSOpTime { - unsigned long long ord; - - RSOpTime() : ord(0) { } - - bool initiated() const { return ord > 0; } - - void initiate() { - assert( !initiated() ); - ord = 1000000; - } - - ReplTime inc() { - DEV assertInWriteLock(); - return ++ord; - } - - string toString() const { return str::stream() << ord; } - - // query the oplog and set the highest value herein. acquires a db read lock. throws. - void load(); - }; - - extern RSOpTime rsOpTime;*/ - -} + + /* + class RSOpTime : public OpTime { + public: + bool initiated() const { return getSecs() != 0; } + };*/ + + /*struct RSOpTime { + unsigned long long ord; + + RSOpTime() : ord(0) { } + + bool initiated() const { return ord > 0; } + + void initiate() { + assert( !initiated() ); + ord = 1000000; + } + + ReplTime inc() { + DEV assertInWriteLock(); + return ++ord; + } + + string toString() const { return str::stream() << ord; } + + // query the oplog and set the highest value herein. acquires a db read lock. throws. 
+ void load(); + }; + + extern RSOpTime rsOpTime;*/ + +} diff --git a/db/repl/rs_rollback.cpp b/db/repl/rs_rollback.cpp index 6b2544c..0b4cc28 100644 --- a/db/repl/rs_rollback.cpp +++ b/db/repl/rs_rollback.cpp @@ -1,5 +1,5 @@ /* @file rs_rollback.cpp -* +* * Copyright (C) 2008 10gen Inc. * * This program is free software: you can redistribute it and/or modify @@ -25,7 +25,7 @@ /* Scenarios We went offline with ops not replicated out. - + F = node that failed and coming back. P = node that took over, new primary @@ -33,11 +33,11 @@ F : a b c d e f g P : a b c d q - The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P - will have significantly more data. Also note that P may have a proper subset of F's stream if there were + The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P + will have significantly more data. Also note that P may have a proper subset of F's stream if there were no subsequent writes. - For now the model is simply : get F back in sync with P. If P was really behind or something, we should have + For now the model is simply : get F back in sync with P. If P was really behind or something, we should have just chosen not to fail over anyway. #2: @@ -50,9 +50,9 @@ Steps find an event in common. 'd'. - undo our events beyond that by: + undo our events beyond that by: (1) taking copy from other server of those objects - (2) do not consider copy valid until we pass reach an optime after when we fetched the new version of object + (2) do not consider copy valid until we pass reach an optime after when we fetched the new version of object -- i.e., reset minvalid. (3) we could skip operations on objects that are previous in time to our capture of the object as an optimization. @@ -65,15 +65,15 @@ namespace mongo { bool copyCollectionFromRemote(const string& host, const string& ns, const BSONObj& query, string& errmsg, bool logforrepl); void incRBID(); - class rsfatal : public std::exception { + class rsfatal : public std::exception { public: - virtual const char* what() const throw(){ return "replica set fatal exception"; } + virtual const char* what() const throw() { return "replica set fatal exception"; } }; struct DocID { const char *ns; be _id; - bool operator<(const DocID& d) const { + bool operator<(const DocID& d) const { int c = strcmp(ns, d.ns); if( c < 0 ) return true; if( c > 0 ) return false; @@ -82,7 +82,7 @@ namespace mongo { }; struct HowToFixUp { - /* note this is a set -- if there are many $inc's on a single document we need to rollback, we only + /* note this is a set -- if there are many $inc's on a single document we need to rollback, we only need to refetch it once. */ set toRefetch; @@ -97,9 +97,9 @@ namespace mongo { int rbid; // remote server's current rollback sequence # }; - static void refetch(HowToFixUp& h, const BSONObj& ourObj) { + static void refetch(HowToFixUp& h, const BSONObj& ourObj) { const char *op = ourObj.getStringField("op"); - if( *op == 'n' ) + if( *op == 'n' ) return; unsigned long long totSize = 0; @@ -108,53 +108,54 @@ namespace mongo { throw "rollback too large"; DocID d; + // NOTE The assigned ns value may become invalid if we yield. d.ns = ourObj.getStringField("ns"); - if( *d.ns == 0 ) { + if( *d.ns == 0 ) { log() << "replSet WARNING ignoring op on rollback no ns TODO : " << ourObj.toString() << rsLog; return; } bo o = ourObj.getObjectField(*op=='u' ? 
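HowToFixUp::toRefetch above is a set keyed by (ns, _id), so a document touched by many rolled-back ops (say repeated $inc's) is re-fetched from the source only once. A toy illustration of that dedup, with std::string standing in for the BSONElement _id:

    #include <iostream>
    #include <set>
    #include <string>
    #include <utility>

    int main() {
        // (namespace, _id) pairs; a set collapses repeated ops on the same doc.
        std::set<std::pair<std::string, std::string>> toRefetch;
        toRefetch.insert({"test.foo", "42"});   // $inc on test.foo _id 42
        toRefetch.insert({"test.foo", "42"});   // another $inc on the same doc
        toRefetch.insert({"test.bar", "7"});
        std::cout << toRefetch.size() << " documents to refetch\n";  // prints 2
    }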
"o2" : "o"); - if( o.isEmpty() ) { + if( o.isEmpty() ) { log() << "replSet warning ignoring op on rollback : " << ourObj.toString() << rsLog; return; } - if( *op == 'c' ) { + if( *op == 'c' ) { be first = o.firstElement(); NamespaceString s(d.ns); // foo.$cmd string cmdname = first.fieldName(); Command *cmd = Command::findCommand(cmdname.c_str()); - if( cmd == 0 ) { + if( cmd == 0 ) { log() << "replSet warning rollback no suchcommand " << first.fieldName() << " - different mongod versions perhaps?" << rsLog; return; } else { /* findandmodify - tranlated? - godinsert?, + godinsert?, renamecollection a->b. just resync a & b */ if( cmdname == "create" ) { - /* Create collection operation - { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } } + /* Create collection operation + { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } } */ string ns = s.db + '.' + o["create"].String(); // -> foo.abc h.toDrop.insert(ns); return; } - else if( cmdname == "drop" ) { + else if( cmdname == "drop" ) { string ns = s.db + '.' + first.valuestr(); h.collectionsToResync.insert(ns); return; } - else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) { + else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) { /* TODO: this is bad. we simply full resync the collection here, which could be very slow. */ log() << "replSet info rollback of dropIndexes is slow in this version of mongod" << rsLog; string ns = s.db + '.' + first.valuestr(); h.collectionsToResync.insert(ns); return; } - else if( cmdname == "renameCollection" ) { + else if( cmdname == "renameCollection" ) { /* TODO: slow. */ log() << "replSet info rollback of renameCollection is slow in this version of mongod" << rsLog; string from = first.valuestr(); @@ -163,15 +164,15 @@ namespace mongo { h.collectionsToResync.insert(to); return; } - else if( cmdname == "reIndex" ) { + else if( cmdname == "reIndex" ) { return; } - else if( cmdname == "dropDatabase" ) { + else if( cmdname == "dropDatabase" ) { log() << "replSet error rollback : can't rollback drop database full resync will be required" << rsLog; log() << "replSet " << o.toString() << rsLog; throw rsfatal(); } - else { + else { log() << "replSet error can't rollback this command yet: " << o.toString() << rsLog; log() << "replSet cmdname=" << cmdname << rsLog; throw rsfatal(); @@ -190,15 +191,15 @@ namespace mongo { int getRBID(DBClientConnection*); - static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) { + static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) { static time_t last; - if( time(0)-last < 60 ) { + if( time(0)-last < 60 ) { throw "findcommonpoint waiting a while before trying again"; } last = time(0); assert( dbMutex.atLeastReadLocked() ); - Client::Context c(rsoplog, dbpath, 0, false); + Client::Context c(rsoplog); NamespaceDetails *nsd = nsdetails(rsoplog); assert(nsd); ReverseCappedCursor u(nsd); @@ -226,7 +227,7 @@ namespace mongo { log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog; log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog; log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog; - if( diff > 3600 ) { + if( diff > 3600 ) { log() << "replSet rollback too long a time period for a rollback." 
<< rsLog; throw "error not willing to roll back more than one hour of data"; } @@ -236,8 +237,8 @@ namespace mongo { while( 1 ) { scanned++; /* todo add code to assure no excessive scanning for too long */ - if( ourTime == theirTime ) { - if( ourObj["h"].Long() == theirObj["h"].Long() ) { + if( ourTime == theirTime ) { + if( ourObj["h"].Long() == theirObj["h"].Long() ) { // found the point back in time where we match. // todo : check a few more just to be careful about hash collisions. log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog; @@ -249,7 +250,7 @@ namespace mongo { refetch(h, ourObj); - if( !t->more() ) { + if( !t->more() ) { log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; @@ -270,8 +271,8 @@ namespace mongo { ourObj = u.current(); ourTime = ourObj["ts"]._opTime(); } - else if( theirTime > ourTime ) { - if( !t->more() ) { + else if( theirTime > ourTime ) { + if( !t->more() ) { log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; @@ -281,11 +282,11 @@ namespace mongo { theirObj = t->nextSafe(); theirTime = theirObj["ts"]._opTime(); } - else { + else { // theirTime < ourTime refetch(h, ourObj); u.advance(); - if( !u.ok() ) { + if( !u.ok() ) { log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; @@ -298,299 +299,303 @@ namespace mongo { } } - struct X { + struct X { const bson::bo *op; bson::bo goodVersionOfObject; }; - static void setMinValid(bo newMinValid) { - try { - log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog; - } - catch(...) { } - { - Helpers::putSingleton("local.replset.minvalid", newMinValid); - Client::Context cx( "local." ); - cx.db()->flushFiles(true); - } + static void setMinValid(bo newMinValid) { + try { + log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog; + } + catch(...) { } + { + Helpers::putSingleton("local.replset.minvalid", newMinValid); + Client::Context cx( "local." ); + cx.db()->flushFiles(true); + } } void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) { - DBClientConnection *them = r.conn(); - - // fetch all first so we needn't handle interruption in a fancy way - - unsigned long long totSize = 0; - - list< pair > goodVersions; - - bo newMinValid; - - /* fetch all the goodVersions of each document from current primary */ - DocID d; - unsigned long long n = 0; - try { - for( set::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) { - d = *i; - - assert( !d._id.eoo() ); - - { - /* TODO : slow. lots of round trips. 
*/ - n++; - bo good= them->findOne(d.ns, d._id.wrap()).getOwned(); - totSize += good.objsize(); - uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 ); - - // note good might be eoo, indicating we should delete it - goodVersions.push_back(pair(d,good)); - } - } - newMinValid = r.getLastOp(rsoplog); - if( newMinValid.isEmpty() ) { - sethbmsg("rollback error newMinValid empty?"); - return; - } - } - catch(DBException& e) { - sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0); - log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog; - throw e; - } - - MemoryMappedFile::flushAll(true); - - sethbmsg("rollback 3.5"); - if( h.rbid != getRBID(r.conn()) ) { - // our source rolled back itself. so the data we received isn't necessarily consistent. - sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt"); - return; - } - - // update them - sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size()); - - bool warn = false; - - assert( !h.commonPointOurDiskloc.isNull() ); - - dbMutex.assertWriteLocked(); - - /* we have items we are writing that aren't from a point-in-time. thus best not to come online - until we get to that point in freshness. */ - setMinValid(newMinValid); - - /** any full collection resyncs required? */ - if( !h.collectionsToResync.empty() ) { - for( set::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) { - string ns = *i; - sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns); - Client::Context c(*i, dbpath, 0, /*doauth*/false); - try { - bob res; - string errmsg; - dropCollection(ns, errmsg, res); - { - dbtemprelease r; - bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false); - if( !ok ) { - log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog; - throw "rollback error resyncing rollection [1]"; - } - } - } - catch(...) { - log() << "replset rollback error resyncing collection " << ns << rsLog; - throw "rollback error resyncing rollection [2]"; - } - } - - /* we did more reading from primary, so check it again for a rollback (which would mess us up), and - make minValid newer. - */ - sethbmsg("rollback 4.2"); - { - string err; - try { - newMinValid = r.getLastOp(rsoplog); - if( newMinValid.isEmpty() ) { - err = "can't get minvalid from primary"; - } else { - setMinValid(newMinValid); - } - } - catch(...) { - err = "can't get/set minvalid"; - } - if( h.rbid != getRBID(r.conn()) ) { - // our source rolled back itself. so the data we received isn't necessarily consistent. - // however, we've now done writes. thus we have a problem. - err += "rbid at primary changed during resync/rollback"; - } - if( !err.empty() ) { - log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog; - /* todo: reset minvalid so that we are permanently in fatal state */ - /* todo: don't be fatal, but rather, get all the data first. 
*/ - sethbmsg("rollback error"); - throw rsfatal(); - } - } - sethbmsg("rollback 4.3"); - } - - sethbmsg("rollback 4.6"); - /** drop collections to drop before doing individual fixups - that might make things faster below actually if there were subsequent inserts to rollback */ - for( set::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) { - Client::Context c(*i, dbpath, 0, /*doauth*/false); - try { - bob res; - string errmsg; - log(1) << "replSet rollback drop: " << *i << rsLog; - dropCollection(*i, errmsg, res); - } - catch(...) { - log() << "replset rollback error dropping collection " << *i << rsLog; - } - } - - sethbmsg("rollback 4.7"); - Client::Context c(rsoplog, dbpath, 0, /*doauth*/false); - NamespaceDetails *oplogDetails = nsdetails(rsoplog); - uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails); - - map > removeSavers; - - unsigned deletes = 0, updates = 0; - for( list >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) { - const DocID& d = i->first; - bo pattern = d._id.wrap(); // { _id : ... } - try { - assert( d.ns && *d.ns ); - if( h.collectionsToResync.count(d.ns) ) { - /* we just synced this entire collection */ - continue; - } - - /* keep an archive of items rolled back */ - shared_ptr& rs = removeSavers[d.ns]; - if ( ! rs ) - rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) ); - - // todo: lots of overhead in context, this can be faster - Client::Context c(d.ns, dbpath, 0, /*doauth*/false); - if( i->second.isEmpty() ) { - // wasn't on the primary; delete. - /* TODO1.6 : can't delete from a capped collection. need to handle that here. */ - deletes++; - - NamespaceDetails *nsd = nsdetails(d.ns); - if( nsd ) { - if( nsd->capped ) { - /* can't delete from a capped collection - so we truncate instead. if this item must go, - so must all successors!!! */ - try { - /** todo: IIRC cappedTrunateAfter does not handle completely empty. todo. */ - // this will crazy slow if no _id index. - long long start = Listener::getElapsedTimeMillis(); - DiskLoc loc = Helpers::findOne(d.ns, pattern, false); - if( Listener::getElapsedTimeMillis() - start > 200 ) - log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog; - //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern); - if( !loc.isNull() ) { - try { - nsd->cappedTruncateAfter(d.ns, loc, true); - } - catch(DBException& e) { - if( e.getCode() == 13415 ) { - // hack: need to just make cappedTruncate do this... - nsd->emptyCappedCollection(d.ns); - } else { - throw; - } - } - } - } - catch(DBException& e) { - log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog; - } - } - else { - try { - deletes++; - deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() ); - } - catch(...) { - log() << "replSet error rollback delete failed ns:" << d.ns << rsLog; - } - } - // did we just empty the collection? if so let's check if it even exists on the source. 
- if( nsd->nrecords == 0 ) { - try { - string sys = cc().database()->name + ".system.namespaces"; - bo o = them->findOne(sys, QUERY("name"<second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() ); - } - } - catch(DBException& e) { - log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog; - warn = true; - } - } - - removeSavers.clear(); // this effectively closes all of them - - sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates); - MemoryMappedFile::flushAll(true); - sethbmsg("rollback 6"); - - // clean up oplog - log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog; - // todo: fatal error if this throws? - oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false); - - /* reset cached lastoptimewritten and h value */ - loadLastOpTimeWritten(); - - sethbmsg("rollback 7"); - MemoryMappedFile::flushAll(true); - - // done - if( warn ) - sethbmsg("issues during syncRollback, see log"); - else - sethbmsg("rollback done"); - } - - void ReplSetImpl::syncRollback(OplogReader&r) { + DBClientConnection *them = r.conn(); + + // fetch all first so we needn't handle interruption in a fancy way + + unsigned long long totSize = 0; + + list< pair > goodVersions; + + bo newMinValid; + + /* fetch all the goodVersions of each document from current primary */ + DocID d; + unsigned long long n = 0; + try { + for( set::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) { + d = *i; + + assert( !d._id.eoo() ); + + { + /* TODO : slow. lots of round trips. */ + n++; + bo good= them->findOne(d.ns, d._id.wrap()).getOwned(); + totSize += good.objsize(); + uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 ); + + // note good might be eoo, indicating we should delete it + goodVersions.push_back(pair(d,good)); + } + } + newMinValid = r.getLastOp(rsoplog); + if( newMinValid.isEmpty() ) { + sethbmsg("rollback error newMinValid empty?"); + return; + } + } + catch(DBException& e) { + sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0); + log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog; + throw e; + } + + MemoryMappedFile::flushAll(true); + + sethbmsg("rollback 3.5"); + if( h.rbid != getRBID(r.conn()) ) { + // our source rolled back itself. so the data we received isn't necessarily consistent. + sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt"); + return; + } + + // update them + sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size()); + + bool warn = false; + + assert( !h.commonPointOurDiskloc.isNull() ); + + dbMutex.assertWriteLocked(); + + /* we have items we are writing that aren't from a point-in-time. thus best not to come online + until we get to that point in freshness. */ + setMinValid(newMinValid); + + /** any full collection resyncs required? 
*/ + if( !h.collectionsToResync.empty() ) { + for( set::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) { + string ns = *i; + sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns); + Client::Context c(*i); + try { + bob res; + string errmsg; + dropCollection(ns, errmsg, res); + { + dbtemprelease r; + bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false); + if( !ok ) { + log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog; + throw "rollback error resyncing rollection [1]"; + } + } + } + catch(...) { + log() << "replset rollback error resyncing collection " << ns << rsLog; + throw "rollback error resyncing rollection [2]"; + } + } + + /* we did more reading from primary, so check it again for a rollback (which would mess us up), and + make minValid newer. + */ + sethbmsg("rollback 4.2"); + { + string err; + try { + newMinValid = r.getLastOp(rsoplog); + if( newMinValid.isEmpty() ) { + err = "can't get minvalid from primary"; + } + else { + setMinValid(newMinValid); + } + } + catch(...) { + err = "can't get/set minvalid"; + } + if( h.rbid != getRBID(r.conn()) ) { + // our source rolled back itself. so the data we received isn't necessarily consistent. + // however, we've now done writes. thus we have a problem. + err += "rbid at primary changed during resync/rollback"; + } + if( !err.empty() ) { + log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog; + /* todo: reset minvalid so that we are permanently in fatal state */ + /* todo: don't be fatal, but rather, get all the data first. */ + sethbmsg("rollback error"); + throw rsfatal(); + } + } + sethbmsg("rollback 4.3"); + } + + sethbmsg("rollback 4.6"); + /** drop collections to drop before doing individual fixups - that might make things faster below actually if there were subsequent inserts to rollback */ + for( set::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) { + Client::Context c(*i); + try { + bob res; + string errmsg; + log(1) << "replSet rollback drop: " << *i << rsLog; + dropCollection(*i, errmsg, res); + } + catch(...) { + log() << "replset rollback error dropping collection " << *i << rsLog; + } + } + + sethbmsg("rollback 4.7"); + Client::Context c(rsoplog); + NamespaceDetails *oplogDetails = nsdetails(rsoplog); + uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails); + + map > removeSavers; + + unsigned deletes = 0, updates = 0; + for( list >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) { + const DocID& d = i->first; + bo pattern = d._id.wrap(); // { _id : ... } + try { + assert( d.ns && *d.ns ); + if( h.collectionsToResync.count(d.ns) ) { + /* we just synced this entire collection */ + continue; + } + + getDur().commitIfNeeded(); + + /* keep an archive of items rolled back */ + shared_ptr& rs = removeSavers[d.ns]; + if ( ! rs ) + rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) ); + + // todo: lots of overhead in context, this can be faster + Client::Context c(d.ns); + if( i->second.isEmpty() ) { + // wasn't on the primary; delete. + /* TODO1.6 : can't delete from a capped collection. need to handle that here. */ + deletes++; + + NamespaceDetails *nsd = nsdetails(d.ns); + if( nsd ) { + if( nsd->capped ) { + /* can't delete from a capped collection - so we truncate instead. if this item must go, + so must all successors!!! 
*/ + try { + /** todo: IIRC cappedTrunateAfter does not handle completely empty. todo. */ + // this will crazy slow if no _id index. + long long start = Listener::getElapsedTimeMillis(); + DiskLoc loc = Helpers::findOne(d.ns, pattern, false); + if( Listener::getElapsedTimeMillis() - start > 200 ) + log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog; + //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern); + if( !loc.isNull() ) { + try { + nsd->cappedTruncateAfter(d.ns, loc, true); + } + catch(DBException& e) { + if( e.getCode() == 13415 ) { + // hack: need to just make cappedTruncate do this... + nsd->emptyCappedCollection(d.ns); + } + else { + throw; + } + } + } + } + catch(DBException& e) { + log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog; + } + } + else { + try { + deletes++; + deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() ); + } + catch(...) { + log() << "replSet error rollback delete failed ns:" << d.ns << rsLog; + } + } + // did we just empty the collection? if so let's check if it even exists on the source. + if( nsd->stats.nrecords == 0 ) { + try { + string sys = cc().database()->name + ".system.namespaces"; + bo o = them->findOne(sys, QUERY("name"<second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() ); + } + } + catch(DBException& e) { + log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog; + warn = true; + } + } + + removeSavers.clear(); // this effectively closes all of them + + sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates); + MemoryMappedFile::flushAll(true); + sethbmsg("rollback 6"); + + // clean up oplog + log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog; + // todo: fatal error if this throws? + oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false); + + /* reset cached lastoptimewritten and h value */ + loadLastOpTimeWritten(); + + sethbmsg("rollback 7"); + MemoryMappedFile::flushAll(true); + + // done + if( warn ) + sethbmsg("issues during syncRollback, see log"); + else + sethbmsg("rollback done"); + } + + void ReplSetImpl::syncRollback(OplogReader&r) { unsigned s = _syncRollback(r); - if( s ) + if( s ) sleepsecs(s); } - unsigned ReplSetImpl::_syncRollback(OplogReader&r) { + unsigned ReplSetImpl::_syncRollback(OplogReader&r) { assert( !lockedByMe() ); assert( !dbMutex.atLeastReadLocked() ); @@ -604,7 +609,7 @@ namespace mongo { if( box.getState().secondary() ) { /* by doing this, we will not service reads (return an error as we aren't in secondary staate. - that perhaps is moot becasue of the write lock above, but that write lock probably gets deferred + that perhaps is moot becasue of the write lock above, but that write lock probably gets deferred or removed or yielded later anyway. also, this is better for status reporting - we know what is happening. 
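For each refetched document, syncFixUp above applies one rule: an empty "good version" means the document no longer exists on the source, so it is deleted locally (capped collections are truncated instead, and a collection that becomes empty and is absent on the source is dropped); a non-empty good version is upserted by _id. The core decision as a standalone toy, with a map standing in for the collection:

    #include <iostream>
    #include <map>
    #include <string>

    int main() {
        std::map<std::string, std::string> coll = {{"1", "old-a"}, {"2", "old-b"}};

        // _id -> good version fetched from the source; "" means "not found there"
        std::map<std::string, std::string> goodVersions =
            {{"1", ""}, {"2", "new-b"}, {"3", "new-c"}};

        unsigned deletes = 0, updates = 0;
        for (const auto& g : goodVersions) {
            if (g.second.empty()) { coll.erase(g.first); ++deletes; }     // delete locally
            else                  { coll[g.first] = g.second; ++updates; } // upsert by _id
        }
        std::cout << "rollback d:" << deletes << " u:" << updates << "\n";  // d:1 u:2
    }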
@@ -618,7 +623,7 @@ namespace mongo { r.resetCursor(); /*DBClientConnection us(false, 0, 0); string errmsg; - if( !us.connect(HostAndPort::me().toString(),errmsg) ) { + if( !us.connect(HostAndPort::me().toString(),errmsg) ) { sethbmsg("rollback connect to self failure" + errmsg); return; }*/ @@ -627,15 +632,15 @@ namespace mongo { try { syncRollbackFindCommonPoint(r.conn(), how); } - catch( const char *p ) { + catch( const char *p ) { sethbmsg(string("rollback 2 error ") + p); return 10; } - catch( rsfatal& ) { + catch( rsfatal& ) { _fatal(); return 2; } - catch( DBException& e ) { + catch( DBException& e ) { sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min"); dbtemprelease r; sleepsecs(60); @@ -647,20 +652,20 @@ namespace mongo { { incRBID(); - try { + try { syncFixUp(how, r); } - catch( rsfatal& ) { + catch( rsfatal& ) { sethbmsg("rollback fixup error"); _fatal(); return 2; } - catch(...) { + catch(...) { incRBID(); throw; } incRBID(); - /* success - leave "ROLLBACK" state + /* success - leave "ROLLBACK" state can go to SECONDARY once minvalid is achieved */ box.change(MemberState::RS_RECOVERING, _self); diff --git a/db/repl/rs_sync.cpp b/db/repl/rs_sync.cpp index 9de3f60..8d06fcc 100644 --- a/db/repl/rs_sync.cpp +++ b/db/repl/rs_sync.cpp @@ -19,30 +19,21 @@ #include "../../client/dbclient.h" #include "rs.h" #include "../repl.h" - +#include "connections.h" namespace mongo { using namespace bson; - extern unsigned replSetForceInitialSyncFailure; - void startSyncThread() { - Client::initThread("rs_sync"); - cc().iAmSyncThread(); - theReplSet->syncThread(); - cc().shutdown(); - } - + /* apply the log op that is in param o */ void ReplSetImpl::syncApply(const BSONObj &o) { - //const char *op = o.getStringField("op"); - - char db[MaxDatabaseLen]; + char db[MaxDatabaseNameLen]; const char *ns = o.getStringField("ns"); nsToDatabase(ns, db); if ( *ns == '.' || *ns == 0 ) { - if( *o.getStringField("op") == 'n' ) - return; + if( *o.getStringField("op") == 'n' ) + return; log() << "replSet skipping bad op in oplog: " << o.toString() << endl; return; } @@ -54,19 +45,21 @@ namespace mongo { applyOperation_inlock(o); } + /* initial oplog application, during initial sync, after cloning. + @return false on failure. + this method returns an error and doesn't throw exceptions (i think). + */ bool ReplSetImpl::initialSyncOplogApplication( - string hn, - const Member *primary, + const Member *source, OpTime applyGTE, - OpTime minValid) - { - if( primary == 0 ) return false; + OpTime minValid) { + if( source == 0 ) return false; - OpTime ts; + const string hn = source->h().toString(); + OplogReader r; try { - OplogReader r; - if( !r.connect(hn) ) { - log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog; + if( !r.connect(hn) ) { + log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog; return false; } @@ -80,48 +73,63 @@ namespace mongo { } assert( r.haveCursor() ); - /* we lock outside the loop to avoid the overhead of locking on every operation. server isn't usable yet anyway! */ - writelock lk(""); - { - if( !r.more() ) { + if( !r.more() ) { sethbmsg("replSet initial sync error reading remote oplog"); + log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" 
<< rsLog; return false; } bo op = r.next(); OpTime t = op["ts"]._opTime(); r.putBack(op); - assert( !t.isNull() ); + + if( op.firstElement().fieldName() == string("$err") ) { + log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog; + return false; + } + + uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() ); if( t > applyGTE ) { sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync"); + log() << "replSet initial sync expected first optime of " << applyGTE << rsLog; + log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog; return false; } } + } + catch(DBException& e) { + log() << "replSet initial sync failing: " << e.toString() << rsLog; + return false; + } - // todo : use exhaust - unsigned long long n = 0; - while( 1 ) { + /* we lock outside the loop to avoid the overhead of locking on every operation. */ + writelock lk(""); + // todo : use exhaust + OpTime ts; + unsigned long long n = 0; + while( 1 ) { + try { if( !r.more() ) break; BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */ { - //writelock lk(""); - ts = o["ts"]._opTime(); /* if we have become primary, we dont' want to apply things from elsewhere - anymore. assumePrimary is in the db lock so we are safe as long as + anymore. assumePrimary is in the db lock so we are safe as long as we check after we locked above. */ - const Member *p1 = box.getPrimary(); - if( p1 != primary || replSetForceInitialSyncFailure ) { + if( (source->state() != MemberState::RS_PRIMARY && + source->state() != MemberState::RS_SECONDARY) || + replSetForceInitialSyncFailure ) { + int f = replSetForceInitialSyncFailure; if( f > 0 ) { replSetForceInitialSyncFailure = f-1; log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog; + throw DBException("forced error",0); } - log() << "replSet primary was:" << primary->fullName() << " now:" << - (p1 != 0 ? p1->fullName() : "none") << rsLog; + log() << "replSet we are now primary" << rsLog; throw DBException("primary changed",0); } @@ -131,38 +139,48 @@ namespace mongo { } _logOpObjRS(o); /* with repl sets we write the ops to our oplog too */ } - if( ++n % 100000 == 0 ) { + if( ++n % 100000 == 0 ) { // simple progress metering log() << "replSet initialSyncOplogApplication " << n << rsLog; } + + getDur().commitIfNeeded(); } - } - catch(DBException& e) { - if( ts <= minValid ) { - // didn't make it far enough - log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog; - return false; + catch (DBException& e) { + if( e.getCode() == 11000 || e.getCode() == 11001 ) { + // skip duplicate key exceptions + continue; + } + + if( ts <= minValid ) { + // didn't make it far enough + log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog; + return false; + } + + // otherwise, whatever + break; } } return true; } - /* should be in RECOVERING state on arrival here. + /* should be in RECOVERING state on arrival here. 
readlocks @return true if transitioned to SECONDARY */ - bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) { - bool golive = false; + bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) { + bool golive = false; { readlock lk("local.replset.minvalid"); BSONObj mv; - if( Helpers::getSingleton("local.replset.minvalid", mv) ) { + if( Helpers::getSingleton("local.replset.minvalid", mv) ) { minvalid = mv["ts"]._opTime(); - if( minvalid <= lastOpTimeWritten ) { + if( minvalid <= lastOpTimeWritten ) { golive=true; } } - else + else golive = true; /* must have been the original member */ } if( golive ) { @@ -172,44 +190,104 @@ namespace mongo { return golive; } - /* tail the primary's oplog. ok to return, will be re-called. */ - void ReplSetImpl::syncTail() { - // todo : locking vis a vis the mgr... + /** + * Checks if the oplog given is too far ahead to read from. + * + * @param r the oplog + * @param hn the hostname (for log messages) + * + * @return if we are stale compared to the oplog on hn + */ + bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) { + BSONObj remoteOldestOp = r.findOne(rsoplog, Query()); + OpTime ts = remoteOldestOp["ts"]._opTime(); + DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog; + else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog; + DEV { + // debugging sync1.js... + log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog; + log() << "replSet our state: " << state().toString() << rsLog; + } + if( lastOpTimeWritten < ts ) { + log() << "replSet error RS102 too stale to catch up, at least from " << hn << rsLog; + log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog; + log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog; + log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog; + sethbmsg("error RS102 too stale to catch up"); + changeState(MemberState::RS_RECOVERING); + sleepsecs(120); + return true; + } + return false; + } - const Member *primary = box.getPrimary(); - if( primary == 0 ) return; - string hn = primary->h().toString(); - OplogReader r; - if( !r.connect(primary->h().toString()) ) { + /** + * Tries to connect the oplog reader to a potential sync source. If + * successful, it checks that we are not stale compared to this source. + * + * @param r reader to populate + * @param hn hostname to try + * + * @return if both checks pass, it returns true, otherwise false. + */ + bool ReplSetImpl::_getOplogReader(OplogReader& r, string& hn) { + assert(r.conn() == 0); + + if( !r.connect(hn) ) { log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog; - return; + r.resetConnection(); + return false; + } + if( _isStale(r, hn)) { + r.resetConnection(); + return false; } + return true; + } - /* first make sure we are not hopelessly out of sync by being very stale. */ - { - BSONObj remoteOldestOp = r.findOne(rsoplog, Query()); - OpTime ts = remoteOldestOp["ts"]._opTime(); - DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog; - else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog; - DEV { - // debugging sync1.js... - log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog; - log() << "replSet our state: " << state().toString() << rsLog; + /* tail an oplog. ok to return, will be re-called. */ + void ReplSetImpl::syncTail() { + // todo : locking vis a vis the mgr... 
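_isStale() above declares this member too stale to catch up when its last applied optime is older than the oldest entry still present in the source's oplog: everything it would need to replay has already rolled off. The check itself is a single comparison, shown standalone with simplified optimes:

    #include <iostream>

    // Too stale if everything we still need has already fallen off the source's oplog.
    bool isStale(long long ourLastApplied, long long sourceOldestInOplog) {
        return ourLastApplied < sourceOldestInOplog;
    }

    int main() {
        std::cout << std::boolalpha
                  << isStale(/*ours*/  95, /*their oldest*/ 100) << "\n"   // true  -> RS102
                  << isStale(/*ours*/ 105, /*their oldest*/ 100) << "\n";  // false -> can catch up
    }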
+ OplogReader r; + string hn; + + const Member *target = box.getPrimary(); + if (target != 0) { + hn = target->h().toString(); + if (!_getOplogReader(r, hn)) { + // we might be stale wrt the primary, but could still sync from + // a secondary + target = 0; + } + } + + // if we cannot reach the master but someone else is more up-to-date + // than we are, sync from them. + if( target == 0 ) { + for(Member *m = head(); m; m=m->next()) { + hn = m->h().toString(); + if (m->hbinfo().up() && m->state().readable() && + (m->hbinfo().opTime > lastOpTimeWritten) && + m->config().slaveDelay == 0 && + _getOplogReader(r, hn)) { + target = m; + break; + } } - if( lastOpTimeWritten < ts ) { - log() << "replSet error RS102 too stale to catch up, at least from primary: " << hn << rsLog; - log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog; - log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog; - log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog; - sethbmsg("error RS102 too stale to catch up"); - sleepsecs(120); + + // no server found + if (target == 0) { + // if there is no one to sync from + OpTime minvalid; + tryToGoLiveAsASecondary(minvalid); return; } } r.tailingQueryGTE(rsoplog, lastOpTimeWritten); assert( r.haveCursor() ); - assert( r.awaitCapable() ); + + uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() ); { if( !r.more() ) { @@ -222,7 +300,7 @@ namespace mongo { return; } OpTime theirTS = theirLastOp["ts"]._opTime(); - if( theirTS < lastOpTimeWritten ) { + if( theirTS < lastOpTimeWritten ) { log() << "replSet we are ahead of the primary, will try to roll back" << rsLog; syncRollback(r); return; @@ -231,7 +309,7 @@ namespace mongo { log() << "replSet syncTail condition 1" << rsLog; sleepsecs(1); } - catch(DBException& e) { + catch(DBException& e) { log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog; sleepsecs(2); } @@ -249,12 +327,9 @@ namespace mongo { BSONObj o = r.nextSafe(); OpTime ts = o["ts"]._opTime(); long long h = o["h"].numberLong(); - if( ts != lastOpTimeWritten || h != lastH ) { - log(1) << "TEMP our last op time written: " << lastOpTimeWritten.toStringPretty() << endl; - log(1) << "TEMP primary's GTE: " << ts.toStringPretty() << endl; - /* - }*/ - + if( ts != lastOpTimeWritten || h != lastH ) { + log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << endl; + log() << "replset source's GTE: " << ts.toStringPretty() << endl; syncRollback(r); return; } @@ -268,49 +343,45 @@ namespace mongo { while( 1 ) { while( 1 ) { - if( !r.moreInCurrentBatch() ) { - /* we need to occasionally check some things. between + if( !r.moreInCurrentBatch() ) { + /* we need to occasionally check some things. between batches is probably a good time. */ /* perhaps we should check this earlier? but not before the rollback checks. */ - if( state().recovering() ) { + if( state().recovering() ) { /* can we go to RS_SECONDARY state? 
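When the primary is unreachable (or we are stale relative to it), syncTail() above falls back to any member that is up, in a readable state, strictly ahead of our last written optime, and not configured with a slaveDelay. A standalone restatement of that fallback filter; the types are placeholders, not the patch's Member and HeartbeatInfo:

    #include <string>
    #include <vector>

    struct TailCandidate {          // illustrative stand-in for a member plus heartbeat info
        bool up;
        bool readable;              // primary or secondary
        long long optime;
        int slaveDelay;             // seconds
        std::string host;
    };

    // Pick someone to tail when the primary is unavailable: they must have ops we lack.
    // (The real loop additionally requires a successful, non-stale connection to them.)
    const TailCandidate* pickTailSource(const std::vector<TailCandidate>& members,
                                        long long ourLastOpTime) {
        for (const TailCandidate& m : members) {
            if (m.up && m.readable && m.optime > ourLastOpTime && m.slaveDelay == 0)
                return &m;
        }
        return nullptr;             // no one to sync from; try to go live as a secondary
    }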
we can if not too old and if minvalid achieved */ OpTime minvalid; bool golive = ReplSetImpl::tryToGoLiveAsASecondary(minvalid); if( golive ) { ; } - else { + else { sethbmsg(str::stream() << "still syncing, not yet to minValid optime" << minvalid.toString()); } /* todo: too stale capability */ } - if( box.getPrimary() != primary ) - return; + { + const Member *primary = box.getPrimary(); + + if( !target->hbinfo().hbstate.readable() || + // if we are not syncing from the primary, return (if + // it's up) so that we can try accessing it again + (target != primary && primary != 0)) { + return; + } + } } if( !r.more() ) break; - { + { BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */ - { - writelock lk(""); - /* if we have become primary, we dont' want to apply things from elsewhere - anymore. assumePrimary is in the db lock so we are safe as long as - we check after we locked above. */ - if( box.getPrimary() != primary ) { - if( box.getState().primary() ) - log(0) << "replSet stopping syncTail we are now primary" << rsLog; - return; - } - - syncApply(o); - _logOpObjRS(o); /* with repl sets we write the ops to our oplog too: */ - } int sd = myConfig().slaveDelay; - if( sd ) { + // ignore slaveDelay if the box is still initializing. once + // it becomes secondary we can worry about it. + if( sd && box.getState().secondary() ) { const OpTime ts = o["ts"]._opTime(); long long a = ts.getSecs(); long long b = time(0); @@ -329,13 +400,30 @@ namespace mongo { sleepsecs(6); if( time(0) >= waitUntil ) break; - if( box.getPrimary() != primary ) + if( !target->hbinfo().hbstate.readable() ) { break; + } if( myConfig().slaveDelay != sd ) // reconf break; } } } + + } + + { + writelock lk(""); + + /* if we have become primary, we dont' want to apply things from elsewhere + anymore. assumePrimary is in the db lock so we are safe as long as + we check after we locked above. */ + if( box.getState().primary() ) { + log(0) << "replSet stopping syncTail we are now primary" << rsLog; + return; + } + + syncApply(o); + _logOpObjRS(o); /* with repl sets we write the ops to our oplog too: */ } } } @@ -345,8 +433,9 @@ namespace mongo { // TODO : reuse our connection to the primary. return; } - if( box.getPrimary() != primary ) + if( !target->hbinfo().hbstate.readable() ) { return; + } // looping back is ok because this is a tailable cursor } } @@ -357,15 +446,11 @@ namespace mongo { sleepsecs(1); return; } - if( sp.state.fatal() ) { + if( sp.state.fatal() ) { sleepsecs(5); return; } - /* later, we can sync from up secondaries if we want. tbd. */ - if( sp.primary == 0 ) - return; - /* do we have anything at all? */ if( lastOpTimeWritten.isNull() ) { syncDoInitialSync(); @@ -377,23 +462,64 @@ namespace mongo { } void ReplSetImpl::syncThread() { - if( myConfig().arbiterOnly ) - return; - while( 1 ) { + /* test here was to force a receive timeout + ScopedConn c("localhost"); + bo info; + try { + log() << "this is temp" << endl; + c.runCommand("admin", BSON("sleep"<<120), info); + log() << info.toString() << endl; + c.runCommand("admin", BSON("sleep"<<120), info); + log() << "temp" << endl; + } + catch( DBException& e ) { + log() << e.toString() << endl; + c.runCommand("admin", BSON("sleep"<<120), info); + log() << "temp" << endl; + } + */ + + while( 1 ) { + if( myConfig().arbiterOnly ) + return; + try { _syncThread(); } - catch(DBException& e) { + catch(DBException& e) { sethbmsg("syncThread: " + e.toString()); sleepsecs(10); } - catch(...) { + catch(...) 
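The slaveDelay handling above only takes effect once the node is SECONDARY: each op is held until it is at least slaveDelay seconds old, sleeping in short increments so a config change or an unreadable sync target can cut the wait short. The wait computation, standalone and with the sleeping stubbed out:

    #include <ctime>
    #include <iostream>

    // Returns how many seconds we still have to wait before applying an op whose
    // timestamp (in seconds) is opSecs, given a slaveDelay in seconds.
    long long secondsToWait(long long opSecs, int slaveDelay, long long nowSecs) {
        long long waitUntil = opSecs + slaveDelay;
        return waitUntil > nowSecs ? waitUntil - nowSecs : 0;
    }

    int main() {
        long long now = std::time(nullptr);
        // op written 10 seconds ago, delay of 60 -> roughly 50 seconds left to wait
        std::cout << secondsToWait(now - 10, 60, now) << "\n";
        // op written 2 minutes ago, delay of 60 -> apply immediately
        std::cout << secondsToWait(now - 120, 60, now) << "\n";
    }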
{ sethbmsg("unexpected exception in syncThread()"); - // TODO : SET NOT SECONDARY here. + // TODO : SET NOT SECONDARY here? sleepsecs(60); } sleepsecs(1); + + /* normally msgCheckNewState gets called periodically, but in a single node repl set there + are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton + member has done a stepDown() and needs to come back up. + */ + OCCASIONALLY mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) ); + } + } + + void startSyncThread() { + static int n; + if( n != 0 ) { + log() << "replSet ERROR : more than one sync thread?" << rsLog; + assert( n == 0 ); + } + n++; + + Client::initThread("replica set sync"); + cc().iAmSyncThread(); + if (!noauth) { + cc().getAuthenticationInfo()->authorize("local"); } + theReplSet->syncThread(); + cc().shutdown(); } } diff --git a/db/repl_block.cpp b/db/repl_block.cpp index 9cff24f..05be343 100644 --- a/db/repl_block.cpp +++ b/db/repl_block.cpp @@ -35,13 +35,13 @@ namespace mongo { class SlaveTracking : public BackgroundJob { public: - string name() { return "SlaveTracking"; } + string name() const { return "SlaveTracking"; } static const char * NS; struct Ident { - - Ident(BSONObj r,string h,string n){ + + Ident(BSONObj r,string h,string n) { BSONObjBuilder b; b.appendElements( r ); b.append( "host" , h ); @@ -52,18 +52,18 @@ namespace mongo { bool operator<( const Ident& other ) const { return obj.woCompare( other.obj ) < 0; } - + BSONObj obj; }; struct Info { - Info() : loc(0){} - ~Info(){ - if ( loc && owned ){ + Info() : loc(0) {} + ~Info() { + if ( loc && owned ) { delete loc; } } - bool owned; + bool owned; // true if loc is a pointer of our creation (and not a pointer into a MMF) OpTime * loc; }; @@ -72,33 +72,33 @@ namespace mongo { _started = false; } - void run(){ + void run() { Client::initThread( "slaveTracking" ); DBDirectClient db; - while ( ! inShutdown() ){ + while ( ! inShutdown() ) { sleepsecs( 1 ); if ( ! 
_dirty ) continue; - + writelock lk(NS); list< pair > todo; - + { scoped_lock mylk(_mutex); - - for ( map::iterator i=_slaves.begin(); i!=_slaves.end(); i++ ){ + + for ( map::iterator i=_slaves.begin(); i!=_slaves.end(); i++ ) { BSONObjBuilder temp; temp.appendTimestamp( "syncedTo" , i->second.loc[0].asDate() ); - todo.push_back( pair( i->first.obj.getOwned() , + todo.push_back( pair( i->first.obj.getOwned() , BSON( "$set" << temp.obj() ).getOwned() ) ); } - + _slaves.clear(); } - for ( list< pair >::iterator i=todo.begin(); i!=todo.end(); i++ ){ + for ( list< pair >::iterator i=todo.begin(); i!=todo.end(); i++ ) { db.update( NS , i->first , i->second , true ); } @@ -106,52 +106,54 @@ namespace mongo { } } - void reset(){ + void reset() { scoped_lock mylk(_mutex); _slaves.clear(); } - void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ){ + void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ) { REPLDEBUG( host << " " << rid << " " << ns << " " << last ); scoped_lock mylk(_mutex); - + #ifdef _DEBUG MongoFileAllowWrites allowWrites; #endif Ident ident(rid,host,ns); Info& i = _slaves[ ident ]; - if ( i.loc ){ - i.loc[0] = last; + if ( i.loc ) { + if( i.owned ) + i.loc[0] = last; + else + getDur().setNoJournal(i.loc, &last, sizeof(last)); return; } - + dbMutex.assertAtLeastReadLocked(); BSONObj res; - if ( Helpers::findOne( NS , ident.obj , res ) ){ + if ( Helpers::findOne( NS , ident.obj , res ) ) { assert( res["syncedTo"].type() ); i.owned = false; i.loc = (OpTime*)res["syncedTo"].value(); - i.loc[0] = last; + getDur().setNoJournal(i.loc, &last, sizeof(last)); return; } - + i.owned = true; - i.loc = new OpTime[1]; - i.loc[0] = last; + i.loc = new OpTime(last); _dirty = true; - if ( ! _started ){ + if ( ! 
_started ) { // start background thread here since we definitely need it _started = true; go(); } } - - bool opReplicatedEnough( OpTime op , int w ){ + + bool opReplicatedEnough( OpTime op , int w ) { RARELY { REPLDEBUG( "looking for : " << op << " w=" << w ); } @@ -161,9 +163,9 @@ namespace mongo { w--; // now this is the # of slaves i need scoped_lock mylk(_mutex); - for ( map::iterator i=_slaves.begin(); i!=_slaves.end(); i++){ + for ( map::iterator i=_slaves.begin(); i!=_slaves.end(); i++) { OpTime s = *(i->second.loc); - if ( s < op ){ + if ( s < op ) { continue; } if ( --w == 0 ) @@ -171,9 +173,15 @@ namespace mongo { } return w <= 0; } - + + unsigned getSlaveCount() const { + scoped_lock mylk(_mutex); + + return _slaves.size(); + } + // need to be careful not to deadlock with this - mongo::mutex _mutex; + mutable mongo::mutex _mutex; map _slaves; bool _dirty; bool _started; @@ -182,12 +190,12 @@ namespace mongo { const char * SlaveTracking::NS = "local.slaves"; - void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ){ + void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ) { if ( lastOp.isNull() ) return; - + assert( str::startsWith(ns, "local.oplog.") ); - + Client * c = curop.getClient(); assert(c); BSONObj rid = c->getRemoteID(); @@ -197,11 +205,15 @@ namespace mongo { slaveTracking.update( rid , curop.getRemoteString( false ) , ns , lastOp ); } - bool opReplicatedEnough( OpTime op , int w ){ + bool opReplicatedEnough( OpTime op , int w ) { return slaveTracking.opReplicatedEnough( op , w ); } - void resetSlaveCache(){ + void resetSlaveCache() { slaveTracking.reset(); } + + unsigned getSlaveCount() { + return slaveTracking.getSlaveCount(); + } } diff --git a/db/repl_block.h b/db/repl_block.h index e9a990a..978932d 100644 --- a/db/repl_block.h +++ b/db/repl_block.h @@ -24,11 +24,15 @@ /** local.slaves - current location for all slaves - + */ namespace mongo { - - void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ); + + void updateSlaveLocation( CurOp& curop, const char * oplog_ns , OpTime lastOp ); + + /** @return true if op has made it to w servers */ bool opReplicatedEnough( OpTime op , int w ); + void resetSlaveCache(); + unsigned getSlaveCount(); } diff --git a/db/replpair.h b/db/replpair.h index 1da8b78..a551308 100644 --- a/db/replpair.h +++ b/db/replpair.h @@ -55,8 +55,8 @@ namespace mongo { int remotePort; string remoteHost; string remote; // host:port if port specified. - // int date; // -1 not yet set; 0=slave; 1=master - + // int date; // -1 not yet set; 0=slave; 1=master + string getInfo() { stringstream ss; ss << " state: "; @@ -113,12 +113,12 @@ namespace mongo { */ inline bool _isMaster() { if( replSet ) { - if( theReplSet ) + if( theReplSet ) return theReplSet->isPrimary(); return false; } - if( ! replSettings.slave ) + if( ! replSettings.slave ) return true; if ( replAllDead ) @@ -128,17 +128,17 @@ namespace mongo { if( replPair->state == ReplPair::State_Master ) return true; } - else { + else { if( replSettings.master ) { - // if running with --master --slave, allow. note that master is also true + // if running with --master --slave, allow. note that master is also true // for repl pairs so the check for replPair above is important. 
return true; } } - + if ( cc().isGod() ) return true; - + return false; } inline bool isMaster(const char *client = 0) { @@ -152,20 +152,22 @@ namespace mongo { return strcmp( client, "local" ) == 0; } - inline void notMasterUnless(bool expr) { + inline void notMasterUnless(bool expr) { uassert( 10107 , "not master" , expr ); } - /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair - so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to + /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair + so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to query the nonmaster member of a replica pair. */ inline void replVerifyReadsOk(ParsedQuery& pq) { if( replSet ) { - /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can be 1. */ + /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can b 1. */ if( isMaster() ) return; - notMasterUnless( pq.hasOption(QueryOption_SlaveOk) && theReplSet && theReplSet->isSecondary() ); - } else { + uassert(13435, "not master and slaveok=false", pq.hasOption(QueryOption_SlaveOk)); + uassert(13436, "not master or secondary, can't read", theReplSet && theReplSet->isSecondary() ); + } + else { notMasterUnless(isMaster() || pq.hasOption(QueryOption_SlaveOk) || replSettings.slave == SimpleSlave ); } } diff --git a/db/resource.h b/db/resource.h old mode 100755 new mode 100644 index bee8d30..9ba1ed2 --- a/db/resource.h +++ b/db/resource.h @@ -1,16 +1,16 @@ -//{{NO_DEPENDENCIES}} -// Microsoft Visual C++ generated include file. -// Used by db.rc -// -#define IDI_ICON2 102 - -// Next default values for new objects -// -#ifdef APSTUDIO_INVOKED -#ifndef APSTUDIO_READONLY_SYMBOLS -#define _APS_NEXT_RESOURCE_VALUE 104 -#define _APS_NEXT_COMMAND_VALUE 40001 -#define _APS_NEXT_CONTROL_VALUE 1001 -#define _APS_NEXT_SYMED_VALUE 101 -#endif -#endif +//{{NO_DEPENDENCIES}} +// Microsoft Visual C++ generated include file. +// Used by db.rc +// +#define IDI_ICON2 102 + +// Next default values for new objects +// +#ifdef APSTUDIO_INVOKED +#ifndef APSTUDIO_READONLY_SYMBOLS +#define _APS_NEXT_RESOURCE_VALUE 104 +#define _APS_NEXT_COMMAND_VALUE 40001 +#define _APS_NEXT_CONTROL_VALUE 1001 +#define _APS_NEXT_SYMED_VALUE 101 +#endif +#endif diff --git a/db/restapi.cpp b/db/restapi.cpp index e9a7ae2..7460c94 100644 --- a/db/restapi.cpp +++ b/db/restapi.cpp @@ -29,6 +29,8 @@ #include "clientcursor.h" #include "background.h" +#include "restapi.h" + namespace mongo { extern const char *replInfo; @@ -39,17 +41,17 @@ namespace mongo { class RESTHandler : public DbWebHandler { public: - RESTHandler() : DbWebHandler( "DUMMY REST" , 1000 , true ){} + RESTHandler() : DbWebHandler( "DUMMY REST" , 1000 , true ) {} - virtual bool handles( const string& url ) const { - return + virtual bool handles( const string& url ) const { + return url[0] == '/' && url.find_last_of( '/' ) > 0; } - virtual void handle( const char *rq, string url, + virtual void handle( const char *rq, string url, BSONObj params, string& responseMsg, int& responseCode, - vector& headers, const SockAddr &from ){ + vector& headers, const SockAddr &from ) { string::size_type first = url.find( "/" , 1 ); if ( first == string::npos ) { @@ -62,12 +64,6 @@ namespace mongo { string coll = url.substr( first + 1 ); string action = ""; - BSONObj params; - if ( coll.find( "?" 
) != string::npos ) { - MiniWebServer::parseParams( params , coll.substr( coll.find( "?" ) + 1 ) ); - coll = coll.substr( 0 , coll.find( "?" ) ); - } - string::size_type last = coll.find_last_of( "/" ); if ( last == string::npos ) { action = coll; @@ -107,7 +103,7 @@ namespace mongo { out() << "don't know how to handle a [" << method << "]" << endl; } - if( html ) + if( html ) headers.push_back("Content-Type: text/html;charset=utf-8"); else headers.push_back("Content-Type: text/plain;charset=utf-8"); @@ -118,7 +114,7 @@ namespace mongo { bool handleRESTQuery( string ns , string action , BSONObj & params , int & responseCode , stringstream & out ) { Timer t; - int html = _getOption( params["html"] , 0 ); + int html = _getOption( params["html"] , 0 ); int skip = _getOption( params["skip"] , 0 ); int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new @@ -131,7 +127,7 @@ namespace mongo { BSONObjBuilder queryBuilder; BSONObjIterator i(params); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); string name = e.fieldName(); if ( ! name.find( "filter_" ) == 0 ) @@ -167,10 +163,11 @@ namespace mongo { if( html ) { string title = string("query ") + ns; - out << start(title) + out << start(title) << p(title) << "
<pre>";
-            } else {
+            }
+            else {
                 out << "{\n";
                 out << "  \"offset\" : " << skip << ",\n";
                 out << "  \"rows\": [\n";
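The handleRESTQuery hunks around this point read the URL parameters skip and limit (count is the older spelling; 1000 is the default when neither is given) and turn every filter_<field> parameter into a query term, then emit either an HTML table or the JSON envelope built above. The following is only a rough, self-contained C++ sketch of that parameter handling, not the patched code; HttpParams, RestQuery and extractRestQuery are names invented for the illustration.

    // Sketch only: mirrors the skip/limit/count and filter_* handling of handleRESTQuery,
    // using plain std::map in place of the BSON parameter object.
    #include <cstdlib>
    #include <iostream>
    #include <map>
    #include <string>

    typedef std::map<std::string, std::string> HttpParams;

    struct RestQuery {
        std::map<std::string, std::string> filter; // field -> value taken from filter_<field>
        int skip;
        int limit;
    };

    static int getOption(const HttpParams& p, const std::string& name, int def) {
        HttpParams::const_iterator it = p.find(name);
        return it == p.end() ? def : std::atoi(it->second.c_str());
    }

    static RestQuery extractRestQuery(const HttpParams& params) {
        RestQuery q;
        q.skip = getOption(params, "skip", 0);
        // "count" is the old name, "limit" the new one; fall back to 1000 if both are absent
        q.limit = getOption(params, "limit", getOption(params, "count", 1000));
        for (HttpParams::const_iterator i = params.begin(); i != params.end(); ++i) {
            const std::string& name = i->first;
            if (name.compare(0, 7, "filter_") == 0)       // only filter_* params become query terms
                q.filter[name.substr(7)] = i->second;
        }
        return q;
    }

    int main() {
        HttpParams params;
        params["filter_name"] = "mongo";
        params["count"] = "50";                           // legacy alias for limit
        RestQuery q = extractRestQuery(params);
        std::cout << "skip=" << q.skip << " limit=" << q.limit
                  << " name filter=" << q.filter["name"] << std::endl; // skip=0 limit=50 name filter=mongo
        return 0;
    }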
@@ -195,7 +192,7 @@ namespace mongo {
                 }
             }
 
-            if( html ) { 
+            if( html ) {
                 out << "
\n"; if( howMany == 0 ) out << p("Collection is empty"); out << _end(); @@ -216,7 +213,8 @@ namespace mongo { try { BSONObj obj = fromjson( body ); db.insert( ns.c_str(), obj ); - } catch ( ... ) { + } + catch ( ... ) { responseCode = 400; // Bad Request. Seems reasonable for now. out << "{ \"ok\" : false }"; return; @@ -233,18 +231,18 @@ namespace mongo { return atoi( e.valuestr() ); return def; } - + DBDirectClient db; } restHandler; - bool webHaveAdminUsers(){ + bool RestAdminAccess::haveAdminUsers() const { readlocktryassert rl("admin.system.users", 10000); - Client::Context cx( "admin.system.users" ); - return ! Helpers::isEmpty("admin.system.users"); + Client::Context cx( "admin.system.users", dbpath, NULL, false ); + return ! Helpers::isEmpty("admin.system.users", false); } - BSONObj webGetAdminUser( const string& username ){ + BSONObj RestAdminAccess::getAdminUser( const string& username ) const { Client::GodScope gs; readlocktryassert rl("admin.system.users", 10000); Client::Context cx( "admin.system.users" ); @@ -256,19 +254,19 @@ namespace mongo { class LowLevelMongodStatus : public WebStatusPlugin { public: - LowLevelMongodStatus() : WebStatusPlugin( "low level" , 5 , "requires read lock" ){} + LowLevelMongodStatus() : WebStatusPlugin( "low level" , 5 , "requires read lock" ) {} - virtual void init(){} + virtual void init() {} - void _gotLock( int millis , stringstream& ss ){ + void _gotLock( int millis , stringstream& ss ) { ss << "
<pre>\n";
             ss << "time to get readlock: " << millis << "ms\n";
-            
+
             ss << "# databases: " << dbHolder.size() << '\n';
-            
+
             if( ClientCursor::numCursors()>500 )
                 ss << "# Cursors: " << ClientCursor::numCursors() << '\n';
-            
+
             ss << "\nreplication: ";
             if( *replInfo )
                 ss << "\nreplInfo:  " << replInfo << "\n\n";
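The run() hunk that follows only renders this status section when readlocktry lk("", 300) actually obtains the read lock within 300 milliseconds, so a long-held write lock degrades the admin page instead of hanging it. Below is a rough standalone analogue of that bounded-wait pattern using the standard timed mutex rather than MongoDB's readlocktry; statusSection and the database count are illustrative only.

    // Sketch only: try to take a lock for at most 300ms and report a fallback message
    // when it cannot be obtained, the same shape as readlocktry + lk.got() below.
    #include <chrono>
    #include <iostream>
    #include <mutex>
    #include <sstream>

    std::timed_mutex dbLock;   // stand-in for the server-wide lock, not the real dbMutex

    void statusSection(std::ostream& out) {
        std::ostringstream ss;
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        if (dbLock.try_lock_for(std::chrono::milliseconds(300))) {
            long long ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                               std::chrono::steady_clock::now() - start).count();
            ss << "time to get lock: " << ms << "ms\n";
            ss << "# databases: " << 3 << '\n';            // placeholder figure for the demo
            dbLock.unlock();
        }
        else {
            ss << "could not get lock within 300ms, not printing details\n";
        }
        out << ss.str();
    }

    int main() {
        statusSection(std::cout);
        return 0;
    }

The actual plugin measures the wait with a Timer and calls _gotLock only when lk.got() returns true, as the hunk below shows.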
@@ -296,10 +294,10 @@ namespace mongo {
ss << "</pre>
\n"; } - virtual void run( stringstream& ss ){ + virtual void run( stringstream& ss ) { Timer t; readlocktry lk( "" , 300 ); - if ( lk.got() ){ + if ( lk.got() ) { _gotLock( t.millis() , ss ); } else { diff --git a/db/restapi.h b/db/restapi.h new file mode 100644 index 0000000..e5ac520 --- /dev/null +++ b/db/restapi.h @@ -0,0 +1,34 @@ +/** @file restapi.h + */ + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "../util/admin_access.h" + +namespace mongo { + + class RestAdminAccess : public AdminAccess { + public: + virtual ~RestAdminAccess() { } + + virtual bool haveAdminUsers() const; + virtual BSONObj getAdminUser( const string& username ) const; + }; + +} // namespace mongo diff --git a/db/scanandorder.h b/db/scanandorder.h index 8d63b9a..4c491fa 100644 --- a/db/scanandorder.h +++ b/db/scanandorder.h @@ -50,34 +50,25 @@ namespace mongo { _ response size limit from runquery; push it up a bit. */ - inline void fillQueryResultFromObj(BufBuilder& bb, FieldMatcher *filter, BSONObj& js, DiskLoc* loc=NULL) { + inline void fillQueryResultFromObj(BufBuilder& bb, Projection *filter, const BSONObj& js, DiskLoc* loc=NULL) { if ( filter ) { BSONObjBuilder b( bb ); - BSONObjIterator i( js ); - while ( i.more() ){ - BSONElement e = i.next(); - const char * fname = e.fieldName(); - - if ( strcmp( fname , "_id" ) == 0 ){ - if (filter->includeID()) - b.append( e ); - } else { - filter->append( b , e ); - } - } + filter->transform( js , b ); if (loc) b.append("$diskLoc", loc->toBSONObj()); b.done(); - } else if (loc) { + } + else if (loc) { BSONObjBuilder b( bb ); b.appendElements(js); b.append("$diskLoc", loc->toBSONObj()); b.done(); - } else { + } + else { bb.appendBuf((void*) js.objdata(), js.objsize()); } } - + typedef multimap BestMap; class ScanAndOrder { BestMap best; // key -> full object @@ -87,9 +78,10 @@ namespace mongo { unsigned approxSize; void _add(BSONObj& k, BSONObj o, DiskLoc* loc) { - if (!loc){ + if (!loc) { best.insert(make_pair(k.getOwned(),o.getOwned())); - } else { + } + else { BSONObjBuilder b; b.appendElements(o); b.append("$diskLoc", loc->toBSONObj()); @@ -110,8 +102,8 @@ namespace mongo { public: ScanAndOrder(int _startFrom, int _limit, BSONObj _order) : - best( BSONObjCmp( _order ) ), - startFrom(_startFrom), order(_order) { + best( BSONObjCmp( _order ) ), + startFrom(_startFrom), order(_order) { limit = _limit > 0 ? _limit + startFrom : 0x7fffffff; approxSize = 0; } @@ -140,7 +132,7 @@ namespace mongo { _addIfBetter(k, o, i, loc); } - void _fill(BufBuilder& b, FieldMatcher *filter, int& nout, BestMap::iterator begin, BestMap::iterator end) { + void _fill(BufBuilder& b, Projection *filter, int& nout, BestMap::iterator begin, BestMap::iterator end) { int n = 0; int nFilled = 0; for ( BestMap::iterator i = begin; i != end; i++ ) { @@ -158,7 +150,7 @@ namespace mongo { } /* scanning complete. stick the query result in b for n objects. 
*/ - void fill(BufBuilder& b, FieldMatcher *filter, int& nout) { + void fill(BufBuilder& b, Projection *filter, int& nout) { _fill(b, filter, nout, best.begin(), best.end()); } diff --git a/db/security.cpp b/db/security.cpp index c552b53..1ec4218 100644 --- a/db/security.cpp +++ b/db/security.cpp @@ -20,19 +20,17 @@ #include "security.h" #include "instance.h" #include "client.h" -#include "curop.h" +#include "curop-inl.h" #include "db.h" #include "dbhelpers.h" namespace mongo { - bool noauth = true; - - int AuthenticationInfo::warned = 0; + int AuthenticationInfo::warned = 0; - void AuthenticationInfo::print(){ + void AuthenticationInfo::print() { cout << "AuthenticationInfo: " << this << '\n'; - for ( map::iterator i=m.begin(); i!=m.end(); i++ ){ + for ( map::iterator i=m.begin(); i!=m.end(); i++ ) { cout << "\t" << i->first << "\t" << i->second.level << '\n'; } cout << "END" << endl; @@ -40,16 +38,16 @@ namespace mongo { bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) { - if ( cc().isGod() ){ + if ( cc().isGod() ) { return true; } - - if ( isLocalHost ){ - atleastreadlock l(""); + + if ( isLocalHost ) { + atleastreadlock l(""); Client::GodScope gs; Client::Context c("admin.system.users"); BSONObj result; - if( ! Helpers::getSingleton("admin.system.users", result) ){ + if( ! Helpers::getSingleton("admin.system.users", result) ) { if( warned == 0 ) { warned++; log() << "note: no users configured in admin.system.users, allowing localhost access" << endl; diff --git a/db/security.h b/db/security.h index a6a9103..2b947c1 100644 --- a/db/security.h +++ b/db/security.h @@ -20,12 +20,10 @@ #include "nonce.h" #include "concurrency.h" +#include "security_key.h" namespace mongo { - // --noauth cmd line option - extern bool noauth; - /* for a particular db */ struct Auth { Auth() { level = 0; } @@ -35,36 +33,36 @@ namespace mongo { class AuthenticationInfo : boost::noncopyable { mongo::mutex _lock; map m; // dbname -> auth - static int warned; + static int warned; public: - bool isLocalHost; + bool isLocalHost; AuthenticationInfo() : _lock("AuthenticationInfo") { isLocalHost = false; } ~AuthenticationInfo() { } - void logout(const string& dbname ) { + void logout(const string& dbname ) { scoped_lock lk(_lock); - m.erase(dbname); - } - void authorize(const string& dbname ) { + m.erase(dbname); + } + void authorize(const string& dbname ) { scoped_lock lk(_lock); m[dbname].level = 2; } void authorizeReadOnly(const string& dbname) { scoped_lock lk(_lock); - m[dbname].level = 1; + m[dbname].level = 1; } bool isAuthorized(const string& dbname) { return _isAuthorized( dbname, 2 ); } bool isAuthorizedReads(const string& dbname) { return _isAuthorized( dbname, 1 ); } bool isAuthorizedForLock(const string& dbname, int lockType ) { return _isAuthorized( dbname , lockType > 0 ? 
2 : 1 ); } - + void print(); protected: - bool _isAuthorized(const string& dbname, int level) { + bool _isAuthorized(const string& dbname, int level) { if( m[dbname].level >= level ) return true; - if( noauth ) return true; - if( m["admin"].level >= level ) return true; - if( m["local"].level >= level ) return true; + if( noauth ) return true; + if( m["admin"].level >= level ) return true; + if( m["local"].level >= level ) return true; return _isAuthorizedSpecialChecks( dbname ); } diff --git a/db/security_commands.cpp b/db/security_commands.cpp index 7bf2813..67605aa 100644 --- a/db/security_commands.cpp +++ b/db/security_commands.cpp @@ -22,7 +22,7 @@ #include "pch.h" #include "security.h" #include "../util/md5.hpp" -#include "json.h" +#include "json.h" #include "pdfile.h" #include "db.h" #include "dbhelpers.h" @@ -32,17 +32,17 @@ namespace mongo { -/* authentication + /* authentication - system.users contains - { user : , pwd : , ... } + system.users contains + { user : , pwd : , ... } - getnonce sends nonce to client + getnonce sends nonce to client - client then sends { authenticate:1, nonce:, user:, key: } + client then sends { authenticate:1, nonce:, user:, key: } - where is md5() as a string -*/ + where is md5() as a string + */ boost::thread_specific_ptr lastNonce; @@ -83,7 +83,7 @@ namespace mongo { return true; } } cmdLogout; - + class CmdAuthenticate : public Command { public: virtual bool requiresAuth() { return false; } @@ -93,7 +93,7 @@ namespace mongo { virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return WRITE; } // TODO: make this READ + virtual LockType locktype() const { return WRITE; } virtual void help(stringstream& ss) const { ss << "internal"; } CmdAuthenticate() : Command("authenticate") {} bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { @@ -102,16 +102,16 @@ namespace mongo { string user = cmdObj.getStringField("user"); string key = cmdObj.getStringField("key"); string received_nonce = cmdObj.getStringField("nonce"); - - if( user.empty() || key.empty() || received_nonce.empty() ) { - log() << "field missing/wrong type in received authenticate command " - << dbname - << endl; + + if( user.empty() || key.empty() || received_nonce.empty() ) { + log() << "field missing/wrong type in received authenticate command " + << dbname + << endl; errmsg = "auth fails"; sleepmillis(10); return false; } - + stringstream digestBuilder; { @@ -120,12 +120,13 @@ namespace mongo { if ( ln == 0 ) { reject = true; log(1) << "auth: no lastNonce" << endl; - } else { + } + else { digestBuilder << hex << *ln; reject = digestBuilder.str() != received_nonce; if ( reject ) log(1) << "auth: different lastNonce" << endl; } - + if ( reject ) { log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. 
db:" << cc().database()->name << endl; errmsg = "auth fails"; @@ -134,52 +135,60 @@ namespace mongo { } } - static BSONObj userPattern = fromjson("{\"user\":1}"); - string systemUsers = dbname + ".system.users"; - OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1"); - BSONObj userObj; - { - BSONObjBuilder b; - b << "user" << user; - BSONObj query = b.done(); - if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) { - log() << "auth: couldn't find user " << user << ", " << systemUsers << endl; - errmsg = "auth fails"; - return false; + string pwd; + + if (user == internalSecurity.user) { + pwd = internalSecurity.pwd; + } + else { + static BSONObj userPattern = fromjson("{\"user\":1}"); + string systemUsers = dbname + ".system.users"; + OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1"); + { + BSONObjBuilder b; + b << "user" << user; + BSONObj query = b.done(); + if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) { + log() << "auth: couldn't find user " << user << ", " << systemUsers << endl; + errmsg = "auth fails"; + return false; + } } + + pwd = userObj.getStringField("pwd"); } - + + md5digest d; { - - string pwd = userObj.getStringField("pwd"); digestBuilder << user << pwd; string done = digestBuilder.str(); - + md5_state_t st; md5_init(&st); md5_append(&st, (const md5_byte_t *) done.c_str(), done.size()); md5_finish(&st, d); } - + string computed = digestToString( d ); - - if ( key != computed ){ + + if ( key != computed ) { log() << "auth: key mismatch " << user << ", ns:" << dbname << endl; errmsg = "auth fails"; return false; } AuthenticationInfo *ai = cc().getAuthenticationInfo(); - + if ( userObj[ "readOnly" ].isBoolean() && userObj[ "readOnly" ].boolean() ) { ai->authorizeReadOnly( cc().database()->name.c_str() ); - } else { + } + else { ai->authorize( cc().database()->name.c_str() ); } return true; } } cmdAuthenticate; - + } // namespace mongo diff --git a/db/security_key.cpp b/db/security_key.cpp new file mode 100644 index 0000000..1ea7021 --- /dev/null +++ b/db/security_key.cpp @@ -0,0 +1,105 @@ +// security_key.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +/** + * This file contains inter-mongo instance security helpers. Due to the + * requirement that it be possible to compile this into mongos and mongod, it + * should not depend on much external stuff. 
+ */ + +#include "pch.h" +#include "security_key.h" +#include "../client/dbclient.h" + +#include + +namespace mongo { + + bool noauth = true; + AuthInfo internalSecurity; + + bool setUpSecurityKey(const string& filename) { + struct stat stats; + + // check obvious file errors + if (stat(filename.c_str(), &stats) == -1) { + log() << "error getting file " << filename << ": " << strerror(errno) << endl; + return false; + } + +#if !defined(WIN32) + // check permissions: must be X00, where X is >= 4 + if ((stats.st_mode & (S_IRWXG|S_IRWXO)) != 0) { + log() << "permissions on " << filename << " are too open" << endl; + return false; + } +#endif + + const unsigned long long fileLength = stats.st_size; + if (fileLength < 6 || fileLength > 1024) { + log() << " key file " << filename << " has length " << stats.st_size + << ", must be between 6 and 1024 chars" << endl; + return false; + } + + FILE* file = fopen( filename.c_str(), "rb" ); + if (!file) { + log() << "error opening file: " << filename << ": " << strerror(errno) << endl; + return false; + } + + string str = ""; + + // strip key file + unsigned long long read = 0; + while (read < fileLength) { + char buf; + int readLength = fread(&buf, 1, 1, file); + if (readLength < 1) { + log() << "error reading file " << filename << endl; + return false; + } + read++; + + // check for whitespace + if ((buf >= '\x09' && buf <= '\x0D') || buf == ' ') { + continue; + } + + // check valid base64 + if ((buf < 'A' || buf > 'Z') && (buf < 'a' || buf > 'z') && (buf < '0' || buf > '9') && buf != '+' && buf != '/') { + log() << "invalid char in key file " << filename << ": " << buf << endl; + return false; + } + + str += buf; + } + + if (str.size() < 6) { + log() << "security key must be at least 6 characters" << endl; + return false; + } + + log(1) << "security key: " << str << endl; + + // createPWDigest should really not be a member func + DBClientConnection conn; + internalSecurity.pwd = conn.createPasswordDigest(internalSecurity.user, str); + + return true; + } +} // namespace mongo diff --git a/db/security_key.h b/db/security_key.h new file mode 100644 index 0000000..86f1307 --- /dev/null +++ b/db/security_key.h @@ -0,0 +1,47 @@ +// security_key.h + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +namespace mongo { + + /** + * Internal secret key info. + */ + struct AuthInfo { + AuthInfo() { + user = "__system"; + } + string user; + string pwd; + }; + + // --noauth cmd line option + extern bool noauth; + extern AuthInfo internalSecurity; + + /** + * This method checks the validity of filename as a security key, hashes its + * contents, and stores it in the internalSecurity variable. Prints an + * error message to the logs if there's an error. 
+ * @param filename the file containing the key + * @return if the key was successfully stored + */ + bool setUpSecurityKey(const string& filename); + +} // namespace mongo diff --git a/db/stats/counters.cpp b/db/stats/counters.cpp index a2d4cfb..889e8a8 100644 --- a/db/stats/counters.cpp +++ b/db/stats/counters.cpp @@ -22,7 +22,7 @@ namespace mongo { - OpCounters::OpCounters(){ + OpCounters::OpCounters() { int zero = 0; BSONObjBuilder b; @@ -42,16 +42,16 @@ namespace mongo { _command = (AtomicUInt*)_obj["command"].value(); } - void OpCounters::gotOp( int op , bool isCommand ){ - switch ( op ){ + void OpCounters::gotOp( int op , bool isCommand ) { + switch ( op ) { case dbInsert: /*gotInsert();*/ break; // need to handle multi-insert - case dbQuery: + case dbQuery: if ( isCommand ) gotCommand(); - else - gotQuery(); + else + gotQuery(); break; - + case dbUpdate: gotUpdate(); break; case dbDelete: gotDelete(); break; case dbGetMore: gotGetMore(); break; @@ -62,24 +62,48 @@ namespace mongo { default: log() << "OpCounters::gotOp unknown op: " << op << endl; } } - - IndexCounters::IndexCounters(){ + + BSONObj& OpCounters::getObj() { + const unsigned MAX = 1 << 30; + RARELY { + bool wrap = + _insert->get() > MAX || + _query->get() > MAX || + _update->get() > MAX || + _delete->get() > MAX || + _getmore->get() > MAX || + _command->get() > MAX; + + if ( wrap ) { + _insert->zero(); + _query->zero(); + _update->zero(); + _delete->zero(); + _getmore->zero(); + _command->zero(); + } + + } + return _obj; + } + + IndexCounters::IndexCounters() { _memSupported = _pi.blockCheckSupported(); - + _btreeMemHits = 0; _btreeMemMisses = 0; _btreeAccesses = 0; - - + + _maxAllowed = ( numeric_limits< long long >::max() ) / 2; _resets = 0; _sampling = 0; _samplingrate = 100; } - - void IndexCounters::append( BSONObjBuilder& b ){ - if ( ! _memSupported ){ + + void IndexCounters::append( BSONObjBuilder& b ) { + if ( ! _memSupported ) { b.append( "note" , "not supported on this platform" ); return; } @@ -90,33 +114,33 @@ namespace mongo { bb.appendNumber( "misses" , _btreeMemMisses ); bb.append( "resets" , _resets ); - + bb.append( "missRatio" , (_btreeAccesses ? (_btreeMemMisses / (double)_btreeAccesses) : 0) ); - + bb.done(); - - if ( _btreeAccesses > _maxAllowed ){ + + if ( _btreeAccesses > _maxAllowed ) { _btreeAccesses = 0; _btreeMemMisses = 0; _btreeMemHits = 0; _resets++; } } - + FlushCounters::FlushCounters() : _total_time(0) , _flushes(0) , _last() {} - void FlushCounters::flushed(int ms){ + void FlushCounters::flushed(int ms) { _flushes++; _total_time += ms; _last_time = ms; _last = jsTime(); } - void FlushCounters::append( BSONObjBuilder& b ){ + void FlushCounters::append( BSONObjBuilder& b ) { b.appendNumber( "flushes" , _flushes ); b.appendNumber( "total_ms" , _total_time ); b.appendNumber( "average_ms" , (_flushes ? 
(_total_time / double(_flushes)) : 0.0) ); @@ -125,25 +149,59 @@ namespace mongo { } - void GenericCounter::hit( const string& name , int count ){ + void GenericCounter::hit( const string& name , int count ) { scoped_lock lk( _mutex ); _counts[name]++; } - + BSONObj GenericCounter::getObj() { BSONObjBuilder b(128); { mongo::mutex::scoped_lock lk( _mutex ); - for ( map::iterator i=_counts.begin(); i!=_counts.end(); i++ ){ + for ( map::iterator i=_counts.begin(); i!=_counts.end(); i++ ) { b.appendNumber( i->first , i->second ); } } return b.obj(); } - + + void NetworkCounter::hit( long long bytesIn , long long bytesOut ) { + const long long MAX = 1ULL << 60; + + // don't care about the race as its just a counter + bool overflow = _bytesIn > MAX || _bytesOut > MAX; + + if ( overflow ) { + _lock.lock(); + _overflows++; + _bytesIn = bytesIn; + _bytesOut = bytesOut; + _requests = 1; + _lock.unlock(); + } + else { + _lock.lock(); + _bytesIn += bytesIn; + _bytesOut += bytesOut; + _requests++; + _lock.unlock(); + } + } + + void NetworkCounter::append( BSONObjBuilder& b ) { + _lock.lock(); + b.appendNumber( "bytesIn" , _bytesIn ); + b.appendNumber( "bytesOut" , _bytesOut ); + b.appendNumber( "numRequests" , _requests ); + _lock.unlock(); + } + OpCounters globalOpCounters; + OpCounters replOpCounters; IndexCounters globalIndexCounters; FlushCounters globalFlushCounters; + NetworkCounter networkCounter; + } diff --git a/db/stats/counters.h b/db/stats/counters.h index 2704464..b5cad85 100644 --- a/db/stats/counters.h +++ b/db/stats/counters.h @@ -21,6 +21,7 @@ #include "../jsobj.h" #include "../../util/message.h" #include "../../util/processinfo.h" +#include "../../util/concurrency/spin_lock.h" namespace mongo { @@ -30,28 +31,33 @@ namespace mongo { */ class OpCounters { public: - + OpCounters(); - AtomicUInt * getInsert(){ return _insert; } - AtomicUInt * getQuery(){ return _query; } - AtomicUInt * getUpdate(){ return _update; } - AtomicUInt * getDelete(){ return _delete; } - AtomicUInt * getGetMore(){ return _getmore; } - AtomicUInt * getCommand(){ return _command; } - - void gotInsert(){ _insert[0]++; } - void gotQuery(){ _query[0]++; } - void gotUpdate(){ _update[0]++; } - void gotDelete(){ _delete[0]++; } - void gotGetMore(){ _getmore[0]++; } - void gotCommand(){ _command[0]++; } + AtomicUInt * getInsert() { return _insert; } + AtomicUInt * getQuery() { return _query; } + AtomicUInt * getUpdate() { return _update; } + AtomicUInt * getDelete() { return _delete; } + AtomicUInt * getGetMore() { return _getmore; } + AtomicUInt * getCommand() { return _command; } + + void incInsertInWriteLock(int n) { _insert->x += n; } + void gotInsert() { _insert[0]++; } + void gotQuery() { _query[0]++; } + void gotUpdate() { _update[0]++; } + void gotDelete() { _delete[0]++; } + void gotGetMore() { _getmore[0]++; } + void gotCommand() { _command[0]++; } void gotOp( int op , bool isCommand ); - BSONObj& getObj(){ return _obj; } + BSONObj& getObj(); + private: BSONObj _obj; + + // todo: there will be a lot of cache line contention on these. need to do something + // else eventually. AtomicUInt * _insert; AtomicUInt * _query; AtomicUInt * _update; @@ -59,14 +65,16 @@ namespace mongo { AtomicUInt * _getmore; AtomicUInt * _command; }; - + extern OpCounters globalOpCounters; + extern OpCounters replOpCounters; + class IndexCounters { public: IndexCounters(); - - void btree( char * node ){ + + void btree( char * node ) { if ( ! 
_memSupported ) return; if ( _sampling++ % _samplingrate ) @@ -74,28 +82,28 @@ namespace mongo { btree( _pi.blockInMemory( node ) ); } - void btree( bool memHit ){ + void btree( bool memHit ) { if ( memHit ) _btreeMemHits++; else _btreeMemMisses++; _btreeAccesses++; } - void btreeHit(){ _btreeMemHits++; _btreeAccesses++; } - void btreeMiss(){ _btreeMemMisses++; _btreeAccesses++; } - + void btreeHit() { _btreeMemHits++; _btreeAccesses++; } + void btreeMiss() { _btreeMemMisses++; _btreeAccesses++; } + void append( BSONObjBuilder& b ); - + private: ProcessInfo _pi; bool _memSupported; int _sampling; int _samplingrate; - + int _resets; long long _maxAllowed; - + long long _btreeMemMisses; long long _btreeMemHits; long long _btreeAccesses; @@ -108,7 +116,7 @@ namespace mongo { FlushCounters(); void flushed(int ms); - + void append( BSONObjBuilder& b ); private: @@ -130,4 +138,21 @@ namespace mongo { map _counts; // TODO: replace with thread safe map mongo::mutex _mutex; }; + + class NetworkCounter { + public: + NetworkCounter() : _bytesIn(0), _bytesOut(0), _requests(0), _overflows(0) {} + void hit( long long bytesIn , long long bytesOut ); + void append( BSONObjBuilder& b ); + private: + long long _bytesIn; + long long _bytesOut; + long long _requests; + + long long _overflows; + + SpinLock _lock; + }; + + extern NetworkCounter networkCounter; } diff --git a/db/stats/fine_clock.h b/db/stats/fine_clock.h index 1f23175..02600e7 100644 --- a/db/stats/fine_clock.h +++ b/db/stats/fine_clock.h @@ -36,29 +36,30 @@ namespace mongo { * Really, you shouldn't be using this class in hot code paths for * platforms you're not sure whether the overhead is low. */ - class FineClock{ + class FineClock { public: typedef timespec WallTime; - static WallTime now(){ + static WallTime now() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts; } - static uint64_t diffInNanos( WallTime end, WallTime start ){ + static uint64_t diffInNanos( WallTime end, WallTime start ) { uint64_t diff; - if ( end.tv_nsec < start.tv_nsec ){ + if ( end.tv_nsec < start.tv_nsec ) { diff = 1000000000 * ( end.tv_sec - start.tv_sec - 1); diff += 1000000000 + end.tv_nsec - start.tv_nsec; - } else { + } + else { diff = 1000000000 * ( end.tv_sec - start.tv_sec ); diff += end.tv_nsec - start.tv_nsec; } return diff; } - + }; } diff --git a/db/stats/service_stats.cpp b/db/stats/service_stats.cpp index 5574ecb..d69147f 100644 --- a/db/stats/service_stats.cpp +++ b/db/stats/service_stats.cpp @@ -25,7 +25,7 @@ namespace mongo { using std::ostringstream; - ServiceStats::ServiceStats(){ + ServiceStats::ServiceStats() { // Time histogram covers up to 128msec in exponential intervals // starting at 125usec. 
Histogram::Options timeOpts; @@ -43,12 +43,12 @@ namespace mongo { _spaceHistogram = new Histogram( spaceOpts ); } - ServiceStats::~ServiceStats(){ + ServiceStats::~ServiceStats() { delete _timeHistogram; delete _spaceHistogram; } - void ServiceStats::logResponse( uint64_t duration, uint64_t bytes ){ + void ServiceStats::logResponse( uint64_t duration, uint64_t bytes ) { _spinLock.lock(); _timeHistogram->insert( duration / 1000 /* in usecs */ ); _spaceHistogram->insert( bytes ); diff --git a/db/stats/snapshots.cpp b/db/stats/snapshots.cpp index 3ce80ca..a81568d 100644 --- a/db/stats/snapshots.cpp +++ b/db/stats/snapshots.cpp @@ -27,28 +27,27 @@ handles snapshotting performance metrics and other such things */ namespace mongo { - void SnapshotData::takeSnapshot(){ - _created = curTimeMicros64(); - _globalUsage = Top::global.getGlobalData(); + void SnapshotData::takeSnapshot() { + _created = curTimeMicros64(); + _globalUsage = Top::global.getGlobalData(); _totalWriteLockedTime = dbMutex.info().getTimeLocked(); Top::global.cloneMap(_usage); } SnapshotDelta::SnapshotDelta( const SnapshotData& older , const SnapshotData& newer ) - : _older( older ) , _newer( newer ) - { + : _older( older ) , _newer( newer ) { assert( _newer._created > _older._created ); _elapsed = _newer._created - _older._created; - + } - - Top::CollectionData SnapshotDelta::globalUsageDiff(){ + + Top::CollectionData SnapshotDelta::globalUsageDiff() { return Top::CollectionData( _older._globalUsage , _newer._globalUsage ); } - Top::UsageMap SnapshotDelta::collectionUsageDiff(){ + Top::UsageMap SnapshotDelta::collectionUsageDiff() { Top::UsageMap u; - - for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ){ + + for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ) { Top::UsageMap::const_iterator j = _older._usage.find(i->first); if (j != _older._usage.end()) u[i->first] = Top::CollectionData( j->second , i->second ); @@ -62,8 +61,8 @@ namespace mongo { , _loc(0) , _stored(0) {} - - const SnapshotData* Snapshots::takeSnapshot(){ + + const SnapshotData* Snapshots::takeSnapshot() { scoped_lock lk(_lock); _loc = ( _loc + 1 ) % _n; _snapshots[_loc].takeSnapshot(); @@ -72,7 +71,7 @@ namespace mongo { return &_snapshots[_loc]; } - auto_ptr Snapshots::computeDelta( int numBack ){ + auto_ptr Snapshots::computeDelta( int numBack ) { scoped_lock lk(_lock); auto_ptr p; if ( numBack < numDeltas() ) @@ -80,43 +79,43 @@ namespace mongo { return p; } - const SnapshotData& Snapshots::getPrev( int numBack ){ + const SnapshotData& Snapshots::getPrev( int numBack ) { int x = _loc - numBack; if ( x < 0 ) x += _n; return _snapshots[x]; } - void Snapshots::outputLockInfoHTML( stringstream& ss ){ + void Snapshots::outputLockInfoHTML( stringstream& ss ) { scoped_lock lk(_lock); ss << "\n
"; - for ( int i=0; i 4100 ) + if( e < 3900 || e > 4100 ) ss << '(' << e / 1000.0 << "s)"; ss << ' '; } ss << "
\n"; } - void SnapshotThread::run(){ + void SnapshotThread::run() { Client::initThread("snapshotthread"); Client& client = cc(); long long numLoops = 0; - + const SnapshotData* prev = 0; - while ( ! inShutdown() ){ + while ( ! inShutdown() ) { try { const SnapshotData* s = statsSnapshots.takeSnapshot(); - - if ( prev ){ + + if ( prev ) { unsigned long long elapsed = s->_created - prev->_created; - if ( cmdLine.cpu ){ + if ( cmdLine.cpu ) { SnapshotDelta d( *prev , *s ); log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl; } @@ -125,14 +124,14 @@ namespace mongo { prev = s; } - catch ( std::exception& e ){ + catch ( std::exception& e ) { log() << "ERROR in SnapshotThread: " << e.what() << endl; } - + numLoops++; sleepsecs(4); } - + client.shutdown(); } @@ -140,15 +139,15 @@ namespace mongo { class WriteLockStatus : public WebStatusPlugin { public: - WriteLockStatus() : WebStatusPlugin( "write lock" , 51 , "% time in write lock, by 4 sec periods" ){} - virtual void init(){} + WriteLockStatus() : WebStatusPlugin( "write lock" , 51 , "% time in write lock, by 4 sec periods" ) {} + virtual void init() {} - virtual void run( stringstream& ss ){ + virtual void run( stringstream& ss ) { statsSnapshots.outputLockInfoHTML( ss ); ss << ""; + "href=\"http://www.mongodb.org/pages/viewpage.action?pageId=7209296\" " + "title=\"snapshot: was the db in the write lock when this page was generated?\">"; ss << "write locked now: " << (dbMutex.info().isLocked() ? "true" : "false") << "\n"; } @@ -156,22 +155,26 @@ namespace mongo { class DBTopStatus : public WebStatusPlugin { public: - DBTopStatus() : WebStatusPlugin( "dbtop" , 50 , "(occurences|percent of elapsed)" ){} + DBTopStatus() : WebStatusPlugin( "dbtop" , 50 , "(occurences|percent of elapsed)" ) {} - void display( stringstream& ss , double elapsed , const Top::UsageData& usage ){ + void display( stringstream& ss , double elapsed , const Top::UsageData& usage ) { ss << ""; ss << usage.count; ss << ""; double per = 100 * ((double)usage.time)/elapsed; - ss << setprecision(1) << fixed << per << "%"; + if( per == (int) per ) + ss << (int) per; + else + ss << setprecision(1) << fixed << per; + ss << '%'; ss << ""; } - void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ){ - if ( ns != "GLOBAL" && data.total.count == 0 ) + void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ) { + if ( ns != "TOTAL" && data.total.count == 0 ) return; ss << "" << ns << ""; - + display( ss , elapsed , data.total ); display( ss , elapsed , data.readLock ); @@ -182,43 +185,43 @@ namespace mongo { display( ss , elapsed , data.insert ); display( ss , elapsed , data.update ); display( ss , elapsed , data.remove ); - + ss << "\n"; } - void run( stringstream& ss ){ + void run( stringstream& ss ) { auto_ptr delta = statsSnapshots.computeDelta(); if ( ! 
delta.get() ) return; - + ss << ""; ss << "" - "" - "" - "" - "" - "" - "" - "" - ""; + ss << a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "namespace") << + "NS" + "" + "" + "" + "" + "" + "" + "" + ""; ss << "\n"; - - display( ss , (double) delta->elapsed() , "GLOBAL" , delta->globalUsageDiff() ); - + + display( ss , (double) delta->elapsed() , "TOTAL" , delta->globalUsageDiff() ); + Top::UsageMap usage = delta->collectionUsageDiff(); - for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ){ + for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ) { display( ss , (double) delta->elapsed() , i->first , i->second ); } - + ss << "
"; - ss << a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "namespace") << - "NStotalReadsWritesQueriesGetMoresInsertsUpdatesRemovestotalReadsWritesQueriesGetMoresInsertsUpdatesRemoves
"; - + } - virtual void init(){} + virtual void init() {} } dbtopStatus; Snapshots statsSnapshots; - SnapshotThread snapshotThread; + SnapshotThread snapshotThread; } diff --git a/db/stats/snapshots.h b/db/stats/snapshots.h index 6d8e23d..d9b8e5e 100644 --- a/db/stats/snapshots.h +++ b/db/stats/snapshots.h @@ -28,7 +28,7 @@ namespace mongo { class SnapshotThread; - + /** * stores a point in time snapshot * i.e. all counters at a given time @@ -45,14 +45,14 @@ namespace mongo { friend class SnapshotDelta; friend class Snapshots; }; - + /** * contains performance information for a time period */ class SnapshotDelta { public: SnapshotDelta( const SnapshotData& older , const SnapshotData& newer ); - + unsigned long long start() const { return _older._created; } @@ -60,7 +60,7 @@ namespace mongo { unsigned long long elapsed() const { return _elapsed; } - + unsigned long long timeInWriteLock() const { return _newer._totalWriteLockedTime - _older._totalWriteLockedTime; } @@ -83,15 +83,15 @@ namespace mongo { class Snapshots { public: Snapshots(int n=100); - + const SnapshotData* takeSnapshot(); - + int numDeltas() const { return _stored-1; } const SnapshotData& getPrev( int numBack = 0 ); auto_ptr computeDelta( int numBack = 0 ); - - + + void outputLockInfoHTML( stringstream& ss ); private: mongo::mutex _lock; @@ -103,10 +103,10 @@ namespace mongo { class SnapshotThread : public BackgroundJob { public: - string name() { return "snapshot"; } + virtual string name() const { return "snapshot"; } void run(); }; - + extern Snapshots statsSnapshots; extern SnapshotThread snapshotThread; diff --git a/db/stats/top.cpp b/db/stats/top.cpp index 3e65261..77aef0d 100644 --- a/db/stats/top.cpp +++ b/db/stats/top.cpp @@ -22,16 +22,16 @@ #include "../commands.h" namespace mongo { - - Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) - : time(newer.time-older.time) , - count(newer.count-older.count) - { - + + Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) { + // this won't be 100% accurate on rollovers and drop(), but at least it won't be negative + time = (newer.time > older.time) ? (newer.time - older.time) : newer.time; + count = (newer.count > older.count) ? (newer.count - older.count) : newer.count; + } Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer ) - : total( older.total , newer.total ) , + : total( older.total , newer.total ) , readLock( older.readLock , newer.readLock ) , writeLock( older.writeLock , newer.writeLock ) , queries( older.queries , newer.queries ) , @@ -39,17 +39,18 @@ namespace mongo { insert( older.insert , newer.insert ) , update( older.update , newer.update ) , remove( older.remove , newer.remove ), - commands( older.commands , newer.commands ) - { - + commands( older.commands , newer.commands ) { + } - - void Top::record( const string& ns , int op , int lockType , long long micros , bool command ){ + void Top::record( const string& ns , int op , int lockType , long long micros , bool command ) { + if ( ns[0] == '?' 
) + return; + //cout << "record: " << ns << "\t" << op << "\t" << command << endl; scoped_lock lk(_lock); - - if ( ( command || op == dbQuery ) && ns == _lastDropped ){ + + if ( ( command || op == dbQuery ) && ns == _lastDropped ) { _lastDropped = ""; return; } @@ -59,22 +60,15 @@ namespace mongo { _record( _global , op , lockType , micros , command ); } - void Top::collectionDropped( const string& ns ){ - //cout << "collectionDropped: " << ns << endl; - scoped_lock lk(_lock); - _usage.erase(ns); - _lastDropped = ns; - } - - void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ){ + void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ) { c.total.inc( micros ); - + if ( lockType > 0 ) c.writeLock.inc( micros ); else if ( lockType < 0 ) c.readLock.inc( micros ); - - switch ( op ){ + + switch ( op ) { case 0: // use 0 for unknown, non-specific break; @@ -98,7 +92,7 @@ namespace mongo { break; case dbKillCursors: break; - case opReply: + case opReply: case dbMsg: log() << "unexpected op in Top::record: " << op << endl; break; @@ -108,55 +102,62 @@ namespace mongo { } - void Top::cloneMap(Top::UsageMap& out){ + void Top::collectionDropped( const string& ns ) { + //cout << "collectionDropped: " << ns << endl; + scoped_lock lk(_lock); + _usage.erase(ns); + _lastDropped = ns; + } + + void Top::cloneMap(Top::UsageMap& out) const { scoped_lock lk(_lock); out = _usage; } - void Top::append( BSONObjBuilder& b ){ + void Top::append( BSONObjBuilder& b ) { scoped_lock lk( _lock ); - append( b , _usage ); + _appendToUsageMap( b , _usage ); } - void Top::append( BSONObjBuilder& b , const char * name , const UsageData& map ){ - BSONObjBuilder bb( b.subobjStart( name ) ); - bb.appendNumber( "time" , map.time ); - bb.appendNumber( "count" , map.count ); - bb.done(); - } + void Top::_appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const { + for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ) { + BSONObjBuilder bb( b.subobjStart( i->first ) ); - void Top::append( BSONObjBuilder& b , const UsageMap& map ){ - for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ){ - BSONObjBuilder bb( b.subobjStart( i->first.c_str() ) ); - const CollectionData& coll = i->second; - - append( b , "total" , coll.total ); - - append( b , "readLock" , coll.readLock ); - append( b , "writeLock" , coll.writeLock ); - - append( b , "queries" , coll.queries ); - append( b , "getmore" , coll.getmore ); - append( b , "insert" , coll.insert ); - append( b , "update" , coll.update ); - append( b , "remove" , coll.remove ); - append( b , "commands" , coll.commands ); - + + _appendStatsEntry( b , "total" , coll.total ); + + _appendStatsEntry( b , "readLock" , coll.readLock ); + _appendStatsEntry( b , "writeLock" , coll.writeLock ); + + _appendStatsEntry( b , "queries" , coll.queries ); + _appendStatsEntry( b , "getmore" , coll.getmore ); + _appendStatsEntry( b , "insert" , coll.insert ); + _appendStatsEntry( b , "update" , coll.update ); + _appendStatsEntry( b , "remove" , coll.remove ); + _appendStatsEntry( b , "commands" , coll.commands ); + bb.done(); } } + void Top::_appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& map ) const { + BSONObjBuilder bb( b.subobjStart( statsName ) ); + bb.appendNumber( "time" , map.time ); + bb.appendNumber( "count" , map.count ); + bb.done(); + } + class TopCmd : public Command { public: - TopCmd() : Command( "top", true ){} + TopCmd() : Command( "top", true ) 
{} virtual bool slaveOk() const { return true; } virtual bool adminOnly() const { return true; } - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } virtual void help( stringstream& help ) const { help << "usage by collection"; } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { { BSONObjBuilder b( result.subobjStart( "totals" ) ); Top::global.append( b ); @@ -164,11 +165,11 @@ namespace mongo { } return true; } - + } topCmd; Top Top::global; - + TopOld::T TopOld::_snapshotStart = TopOld::currentTime(); TopOld::D TopOld::_snapshotDuration; TopOld::UsageMap TopOld::_totalUsage; diff --git a/db/stats/top.h b/db/stats/top.h index 135e8f8..9645ed1 100644 --- a/db/stats/top.h +++ b/db/stats/top.h @@ -31,29 +31,27 @@ namespace mongo { public: Top() : _lock("Top") { } - class UsageData { - public: - UsageData() : time(0) , count(0){} + struct UsageData { + UsageData() : time(0) , count(0) {} UsageData( const UsageData& older , const UsageData& newer ); long long time; long long count; - void inc( long long micros ){ + void inc( long long micros ) { count++; time += micros; } }; - class CollectionData { - public: + struct CollectionData { /** * constructs a diff */ - CollectionData(){} + CollectionData() {} CollectionData( const CollectionData& older , const CollectionData& newer ); - + UsageData total; - + UsageData readLock; UsageData writeLock; @@ -66,25 +64,23 @@ namespace mongo { }; typedef map UsageMap; - + public: void record( const string& ns , int op , int lockType , long long micros , bool command ); void append( BSONObjBuilder& b ); - void cloneMap(UsageMap& out); - CollectionData getGlobalData(){ return _global; } + void cloneMap(UsageMap& out) const; + CollectionData getGlobalData() const { return _global; } void collectionDropped( const string& ns ); public: // static stuff static Top global; - - void append( BSONObjBuilder& b , const char * name , const UsageData& map ); - void append( BSONObjBuilder& b , const UsageMap& map ); - + private: - + void _appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const; + void _appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& map ) const; void _record( CollectionData& c , int op , int lockType , long long micros , bool command ); - mongo::mutex _lock; + mutable mongo::mutex _lock; CollectionData _global; UsageMap _usage; string _lastDropped; @@ -99,9 +95,9 @@ namespace mongo { typedef boost::tuple< D, int, int, int > UsageData; public: TopOld() : _read(false), _write(false) { } - + /* these are used to record activity: */ - + void clientStart( const char *client ) { clientStop(); _currentStart = currentTime(); @@ -130,11 +126,11 @@ namespace mongo { /* these are used to fetch the stats: */ - struct Usage { - string ns; - D time; - double pct; - int reads, writes, calls; + struct Usage { + string ns; + D time; + double pct; + int reads, writes, calls; }; static void usage( vector< Usage > &res ) { @@ -145,7 +141,7 @@ namespace mongo { UsageMap totalUsage; fillParentNamespaces( snapshot, _snapshot ); fillParentNamespaces( totalUsage, _totalUsage ); - + multimap< D, string, more > sorted; for( UsageMap::iterator i = snapshot.begin(); i != snapshot.end(); ++i ) sorted.insert( make_pair( i->second.get<0>(), i->first ) ); @@ -181,7 +177,8 @@ namespace mongo { if ( &_snapshot == &_snapshotA ) { 
_snapshot = _snapshotB; _nextSnapshot = _snapshotA; - } else { + } + else { _snapshot = _snapshotA; _nextSnapshot = _snapshotB; } @@ -211,7 +208,7 @@ namespace mongo { g.get< 1 >()++; else if ( !_read && _write ) g.get< 2 >()++; - g.get< 3 >()++; + g.get< 3 >()++; } static void fillParentNamespaces( UsageMap &to, const UsageMap &from ) { for( UsageMap::const_iterator i = from.begin(); i != from.end(); ++i ) { @@ -224,8 +221,8 @@ namespace mongo { current = current.substr( 0, dot ); inc( to[ current ], i->second ); dot = current.rfind( "." ); - } - } + } + } } static void inc( UsageData &to, const UsageData &from ) { to.get<0>() += from.get<0>(); diff --git a/db/storage.cpp b/db/storage.cpp deleted file mode 100644 index 63e7639..0000000 --- a/db/storage.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// storage.cpp -/* - * Copyright (C) 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - - -#include "pch.h" -#include "pdfile.h" -//#include "reccache.h" -#include "rec.h" -#include "db.h" - -namespace mongo { - -// pick your store for indexes by setting this typedef -// this doesn't need to be an ifdef, we can make it dynamic -#if defined(_RECSTORE) -RecStoreInterface *btreeStore = new CachedBasicRecStore(); -#else -MongoMemMapped_RecStore *btreeStore = new MongoMemMapped_RecStore(); -#endif - -#if 0 - -#if defined(_RECSTORE) - static int inited; -#endif - -void writerThread(); - -void BasicRecStore::init(const char *fn, unsigned recsize) -{ - massert( 10394 , "compile packing problem recstore?", sizeof(RecStoreHeader) == 8192); - filename = fn; - f.open(fn); - uassert( 10130 , string("couldn't open file:")+fn, f.is_open() ); - len = f.len(); - if( len == 0 ) { - log() << "creating recstore file " << fn << '\n'; - h.recsize = recsize; - len = sizeof(RecStoreHeader); - f.write(0, (const char *) &h, sizeof(RecStoreHeader)); - } - else { - f.read(0, (char *) &h, sizeof(RecStoreHeader)); - massert( 10395 , string("recstore was not closed cleanly: ")+fn, h.cleanShutdown==0); - massert( 10396 , string("recstore recsize mismatch, file:")+fn, h.recsize == recsize); - massert( 10397 , string("bad recstore [1], file:")+fn, (h.leof-sizeof(RecStoreHeader)) % recsize == 0); - if( h.leof > len ) { - stringstream ss; - ss << "bad recstore, file:" << fn << " leof:" << h.leof << " len:" << len; - massert( 10398 , ss.str(), false); - } - if( h.cleanShutdown ) - log() << "warning: non-clean shutdown for file " << fn << '\n'; - h.cleanShutdown = 2; - writeHeader(); - f.fsync(); - } -#if defined(_RECSTORE) - if( inited++ == 0 ) { - boost::thread t(writerThread); - } -#endif -} - -#endif - -} diff --git a/db/taskqueue.h b/db/taskqueue.h new file mode 100644 index 0000000..c6a5667 --- /dev/null +++ b/db/taskqueue.h @@ -0,0 +1,106 @@ +// @file deferredinvoker.h + +/** + * Copyright (C) 2008 10gen Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "mongomutex.h" + +namespace mongo { + + /** defer work items by queueing them for invocation by another thread. presumption is that + consumer thread is outside of locks more than the source thread. Additional presumption + is that several objects or micro-tasks will be queued and that having a single thread + processing them in batch is helpful as they (in the first use case) use a common data + structure that can then be in local cpu caches. + + this class is in db/ as it is dbMutex (mongomutex) specific (so far). + + using a functor instead of go() might be more elegant too, once again, would like to test any + performance differential. also worry that operator() hides things? + + MT - copyable "micro task" object we can queue + must have a static method void MT::go(const MT&) + + see DefInvoke in dbtests/ for an example. + */ + template< class MT > + class TaskQueue { + public: + TaskQueue() : _which(0), _invokeMutex("deferredinvoker") { } + + void defer(MT mt) { + // only one writer allowed. however the invoke processing below can occur concurrently with + // writes (for the most part) + DEV dbMutex.assertWriteLocked(); + + _queues[_which].push_back(mt); + } + + /** call to process deferrals. + + concurrency: handled herein. multiple threads could call invoke(), but their efforts will be + serialized. the common case is that there is a single processor calling invoke(). + + normally, you call this outside of any lock. but if you want to fully drain the queue, + call from within a read lock. for example: + { + // drain with minimal time in lock + d.invoke(); + readlock lk; + d.invoke(); + ... + } + you can also call invoke periodically to do some work and then pick up later on more.
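      As an illustrative aside, not part of the original header: the MT parameter is expected to be
      a copyable micro-task type exposing a static void go(const MT&); the type name LogIntent below
      is hypothetical, and the class comment above points at DefInvoke in dbtests/ for the real example.

          struct LogIntent {
              const void* src;
              unsigned len;
              static void go(const LogIntent& it) { }   // consume one queued item on the draining thread
          };

          TaskQueue<LogIntent> q;        // shared between the producing and draining threads

          // producer side, inside the dbMutex write lock:
          LogIntent it = { 0, 0 };
          q.defer(it);

          // consumer side, normally outside the write lock:
          q.invoke();                    // flips the double buffer, then drains the retired queue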
+ */ + void invoke() { + mutex::scoped_lock lk2(_invokeMutex); + int toDrain = 0; + { + // flip queueing to the other queue (we are double buffered) + readlocktry lk("", 5); + if( !lk.got() ) + return; + toDrain = _which; + _which = _which ^ 1; + wassert( _queues[_which].empty() ); // we are in dbMutex, so it should be/stay empty til we exit dbMutex + } + + _drain( _queues[toDrain] ); + assert( _queues[toDrain].empty() ); + } + + private: + int _which; // 0 or 1 + typedef vector< MT > Queue; + Queue _queues[2]; + + // lock order when multiple locks: dbMutex, _invokeMutex + mongo::mutex _invokeMutex; + + void _drain(Queue& queue) { + unsigned oldCap = queue.capacity(); + for( typename Queue::iterator i = queue.begin(); i != queue.end(); i++ ) { + const MT& v = *i; + MT::go(v); + } + queue.clear(); + DEV assert( queue.capacity() == oldCap ); // just checking that clear() doesn't deallocate, we don't want that + } + }; + +} diff --git a/db/tests.cpp b/db/tests.cpp index 1218f1b..00f299e 100644 --- a/db/tests.cpp +++ b/db/tests.cpp @@ -32,7 +32,7 @@ namespace mongo { MemoryMappedFile f; - long len = 64*1024*1024; + unsigned long long len = 64*1024*1024; char *p = (char *) f.map("/tmp/test.dat", len); char *start = p; char *end = p + 64*1024*1024-2; diff --git a/db/update.cpp b/db/update.cpp index e178e0f..7de9bb1 100644 --- a/db/update.cpp +++ b/db/update.cpp @@ -31,21 +31,25 @@ namespace mongo { const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" , - "$bitand" , "$bitor" , "$bit" , "$addToSet" }; + "$bitand" , "$bitor" , "$bit" , "$addToSet", "$rename", "$rename" + }; unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*); bool Mod::_pullElementMatch( BSONElement& toMatch ) const { - - if ( elt.type() != Object ){ + + if ( elt.type() != Object ) { // if elt isn't an object, then comparison will work return toMatch.woCompare( elt , false ) == 0; } - if ( toMatch.type() != Object ){ + if ( matcherOnPrimitive ) + return matcher->matches( toMatch.wrap( "" ) ); + + if ( toMatch.type() != Object ) { // looking for an object, so this can't match return false; } - + // now we have an object on both sides return matcher->matches( toMatch.embeddedObject() ); } @@ -54,41 +58,53 @@ namespace mongo { void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const { BSONType a = in.type(); BSONType b = elt.type(); - - if ( a == NumberDouble || b == NumberDouble ){ + + if ( a == NumberDouble || b == NumberDouble ) { ms.incType = NumberDouble; ms.incdouble = elt.numberDouble() + in.numberDouble(); } - else if ( a == NumberLong || b == NumberLong ){ + else if ( a == NumberLong || b == NumberLong ) { ms.incType = NumberLong; ms.inclong = elt.numberLong() + in.numberLong(); } else { - ms.incType = NumberInt; - ms.incint = elt.numberInt() + in.numberInt(); + int x = elt.numberInt() + in.numberInt(); + if ( x < 0 && elt.numberInt() > 0 && in.numberInt() > 0 ) { + // overflow + ms.incType = NumberLong; + ms.inclong = elt.numberLong() + in.numberLong(); + } + else { + ms.incType = NumberInt; + ms.incint = elt.numberInt() + in.numberInt(); + } } - + ms.appendIncValue( bb , false ); } template< class Builder > void appendUnset( Builder &b ) { } - + template<> void appendUnset( BSONArrayBuilder &b ) { b.appendNull(); } - + template< class Builder > void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const { - switch ( op ){ - + if ( ms.dontApply ) { + return; + } + + switch ( op ) { + case INC: { appendIncremented( b , in 
, ms ); break; } - + case SET: { _checkForAppending( elt ); b.appendAs( elt , shortFieldName ); @@ -99,13 +115,13 @@ namespace mongo { appendUnset( b ); break; } - + case PUSH: { uassert( 10131 , "$push can only be applied to an array" , in.type() == Array ); BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); BSONObjIterator i( in.embeddedObject() ); int n=0; - while ( i.more() ){ + while ( i.more() ) { bb.append( i.next() ); n++; } @@ -116,28 +132,35 @@ namespace mongo { bb.done(); break; } - + case ADDTOSET: { uassert( 12592 , "$addToSet can only be applied to an array" , in.type() == Array ); BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); - + BSONObjIterator i( in.embeddedObject() ); - int n=0; + int n=0; + + if ( isEach() ) { - if ( isEach() ){ - BSONElementSet toadd; parseEach( toadd ); - - while ( i.more() ){ + + while ( i.more() ) { BSONElement cur = i.next(); bb.append( cur ); - n++; + n++; toadd.erase( cur ); } - - for ( BSONElementSet::iterator j=toadd.begin(); j!=toadd.end(); j++ ){ - bb.appendAs( *j , BSONObjBuilder::numStr( n++ ) ); + + { + BSONObjIterator i( getEach() ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( toadd.count(e) ) { + bb.appendAs( e , BSONObjBuilder::numStr( n++ ) ); + toadd.erase( e ); + } + } } } @@ -145,34 +168,34 @@ namespace mongo { bool found = false; - while ( i.more() ){ + while ( i.more() ) { BSONElement cur = i.next(); bb.append( cur ); n++; if ( elt.woCompare( cur , false ) == 0 ) found = true; } - + if ( ! found ) bb.appendAs( elt , bb.numStr( n ) ); - + } - + bb.done(); break; } - + case PUSH_ALL: { uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array ); uassert( 10133 , "$pushAll has to be passed an array" , elt.type() ); BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); - + BSONObjIterator i( in.embeddedObject() ); int n=0; - while ( i.more() ){ + while ( i.more() ) { bb.append( i.next() ); n++; } @@ -180,34 +203,34 @@ namespace mongo { ms.pushStartSize = n; i = BSONObjIterator( elt.embeddedObject() ); - while ( i.more() ){ + while ( i.more() ) { bb.appendAs( i.next() , bb.numStr( n++ ) ); } bb.done(); break; } - + case PULL: case PULL_ALL: { uassert( 10134 , "$pull/$pullAll can only be applied to an array" , in.type() == Array ); BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); - + int n = 0; BSONObjIterator i( in.embeddedObject() ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); bool allowed = true; - if ( op == PULL ){ + if ( op == PULL ) { allowed = ! 
_pullElementMatch( e ); } else { BSONObjIterator j( elt.embeddedObject() ); while( j.more() ) { BSONElement arrJ = j.next(); - if ( e.woCompare( arrJ, false ) == 0 ){ + if ( e.woCompare( arrJ, false ) == 0 ) { allowed = false; break; } @@ -217,7 +240,7 @@ namespace mongo { if ( allowed ) bb.appendAs( e , bb.numStr( n++ ) ); } - + bb.done(); break; } @@ -225,13 +248,13 @@ namespace mongo { case POP: { uassert( 10135 , "$pop can only be applied to an array" , in.type() == Array ); BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); - + int n = 0; BSONObjIterator i( in.embeddedObject() ); - if ( elt.isNumber() && elt.number() < 0 ){ + if ( elt.isNumber() && elt.number() < 0 ) { // pop from front - if ( i.more() ){ + if ( i.more() ) { i.next(); n++; } @@ -246,7 +269,7 @@ namespace mongo { while( i.more() ) { n++; BSONElement arrI = i.next(); - if ( i.more() ){ + if ( i.more() ) { bb.append( arrI ); } } @@ -262,23 +285,23 @@ namespace mongo { uassert( 10136 , "$bit needs an array" , elt.type() == Object ); uassert( 10137 , "$bit can only be applied to numbers" , in.isNumber() ); uassert( 10138 , "$bit can't use a double" , in.type() != NumberDouble ); - + int x = in.numberInt(); long long y = in.numberLong(); BSONObjIterator it( elt.embeddedObject() ); - while ( it.more() ){ + while ( it.more() ) { BSONElement e = it.next(); uassert( 10139 , "$bit field must be number" , e.isNumber() ); - if ( strcmp( e.fieldName() , "and" ) == 0 ){ - switch( in.type() ){ + if ( strcmp( e.fieldName() , "and" ) == 0 ) { + switch( in.type() ) { case NumberInt: x = x&e.numberInt(); break; case NumberLong: y = y&e.numberLong(); break; default: assert( 0 ); } } - else if ( strcmp( e.fieldName() , "or" ) == 0 ){ - switch( in.type() ){ + else if ( strcmp( e.fieldName() , "or" ) == 0 ) { + switch( in.type() ) { case NumberInt: x = x|e.numberInt(); break; case NumberLong: y = y|e.numberLong(); break; default: assert( 0 ); @@ -289,8 +312,8 @@ namespace mongo { throw UserException( 9016, (string)"unknown bit mod:" + e.fieldName() ); } } - - switch( in.type() ){ + + switch( in.type() ) { case NumberInt: b.append( shortFieldName , x ); break; case NumberLong: b.append( shortFieldName , y ); break; default: assert( 0 ); @@ -299,6 +322,15 @@ namespace mongo { break; } + case RENAME_FROM: { + break; + } + + case RENAME_TO: { + ms.handleRename( b, shortFieldName ); + break; + } + default: stringstream ss; ss << "Mod::apply can't handle type: " << op; @@ -306,11 +338,30 @@ namespace mongo { } } + // -1 inside a non-object (non-object could be array) + // 0 missing + // 1 found + int validRenamePath( BSONObj obj, const char *path ) { + while( const char *p = strchr( path, '.' ) ) { + string left( path, p - path ); + BSONElement e = obj.getField( left ); + if ( e.eoo() ) { + return 0; + } + if ( e.type() != Object ) { + return -1; + } + obj = e.embeddedObject(); + path = p + 1; + } + return !obj.getField( path ).eoo(); + } + auto_ptr ModSet::prepare(const BSONObj &obj) const { DEBUGUPDATE( "\t start prepare" ); - ModSetState * mss = new ModSetState( obj ); - - + auto_ptr mss( new ModSetState( obj ) ); + + // Perform this check first, so that we don't leave a partially modified object on uassert. 
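A hedged illustration of validRenamePath()'s return convention, not part of the patch; the function and variable names below are for demonstration only, and the snippet assumes the BSON() builder and the existing includes of db/update.cpp:

    // 1 = leaf exists, 0 = a path component is missing, -1 = the path descends into a non-object
    void validRenamePathExample() {
        BSONObj doc = BSON( "a" << BSON( "b" << 1 ) << "c" << 5 );
        int found   = validRenamePath( doc, "a.b" );   // 1  : "a" is an object and contains "b"
        int missing = validRenamePath( doc, "a.x" );   // 0  : "a" is an object but has no "x"
        int invalid = validRenamePath( doc, "c.d" );   // -1 : "c" is a number, so the path cannot descend
        // prepare() uses these codes for $rename: 1 applies the rename, 0 marks it dontApply,
        // and -1 trips the uasserts below (13489/13490).
    }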
for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) { DEBUGUPDATE( "\t\t prepare : " << i->first ); @@ -318,23 +369,51 @@ namespace mongo { const Mod& m = i->second; BSONElement e = obj.getFieldDotted(m.fieldName); - + ms.m = &m; ms.old = e; + if ( m.op == Mod::RENAME_FROM ) { + int source = validRenamePath( obj, m.fieldName ); + uassert( 13489, "$rename source field invalid", source != -1 ); + if ( source != 1 ) { + ms.dontApply = true; + } + continue; + } + + if ( m.op == Mod::RENAME_TO ) { + int source = validRenamePath( obj, m.renameFrom() ); + if ( source == 1 ) { + int target = validRenamePath( obj, m.fieldName ); + uassert( 13490, "$rename target field invalid", target != -1 ); + ms.newVal = obj.getFieldDotted( m.renameFrom() ); + mss->amIInPlacePossible( false ); + } + else { + ms.dontApply = true; + } + continue; + } + if ( e.eoo() ) { mss->amIInPlacePossible( m.op == Mod::UNSET ); continue; - } - + } + switch( m.op ) { case Mod::INC: uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() ); - if ( mss->amIInPlacePossible( e.isNumber() ) ){ + if ( mss->amIInPlacePossible( e.isNumber() ) ) { // check more typing info here - if ( m.elt.type() != e.type() ){ + if ( m.elt.type() != e.type() ) { // if i'm incrememnting with a double, then the storage has to be a double - mss->amIInPlacePossible( m.elt.type() != NumberDouble ); + mss->amIInPlacePossible( m.elt.type() != NumberDouble ); + } + + // check for overflow + if ( e.type() == NumberInt && e.numberLong() + m.elt.numberLong() > numeric_limits::max() ) { + mss->amIInPlacePossible( false ); } } break; @@ -343,7 +422,7 @@ namespace mongo { mss->amIInPlacePossible( m.elt.type() == e.type() && m.elt.valuesize() == e.valuesize() ); break; - + case Mod::PUSH: case Mod::PUSH_ALL: uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() ); @@ -358,7 +437,7 @@ namespace mongo { BSONElement arrI = i.next(); if ( m.op == Mod::PULL ) { mss->amIInPlacePossible( ! 
m._pullElementMatch( arrI ) ); - } + } else if ( m.op == Mod::PULL_ALL ) { BSONObjIterator j( m.elt.embeddedObject() ); while( mss->_inPlacePossible && j.moreWithEOO() ) { @@ -377,12 +456,12 @@ namespace mongo { mss->amIInPlacePossible( e.embeddedObject().isEmpty() ); break; } - + case Mod::ADDTOSET: { uassert( 12591 , "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() ); - + BSONObjIterator i( e.embeddedObject() ); - if ( m.isEach() ){ + if ( m.isEach() ) { BSONElementSet toadd; m.parseEach( toadd ); while( i.more() ) { @@ -395,7 +474,7 @@ namespace mongo { bool found = false; while( i.more() ) { BSONElement arrI = i.next(); - if ( arrI.woCompare( m.elt , false ) == 0 ){ + if ( arrI.woCompare( m.elt , false ) == 0 ) { found = true; break; } @@ -404,7 +483,7 @@ namespace mongo { } break; } - + default: // mods we don't know about shouldn't be done in place mss->amIInPlacePossible( false ); @@ -412,28 +491,49 @@ namespace mongo { } DEBUGUPDATE( "\t mss\n" << mss->toString() << "\t--" ); - - return auto_ptr( mss ); + + return mss; } void ModState::appendForOpLog( BSONObjBuilder& b ) const { - if ( incType ){ + if ( dontApply ) { + return; + } + + if ( incType ) { DEBUGUPDATE( "\t\t\t\t\t appendForOpLog inc fieldname: " << m->fieldName << " short:" << m->shortFieldName ); BSONObjBuilder bb( b.subobjStart( "$set" ) ); appendIncValue( bb , true ); bb.done(); return; } - + + if ( m->op == Mod::RENAME_FROM ) { + DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fielName:" << m->fieldName ); + BSONObjBuilder bb( b.subobjStart( "$unset" ) ); + bb.append( m->fieldName, 1 ); + bb.done(); + return; + } + + if ( m->op == Mod::RENAME_TO ) { + DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fielName:" << m->fieldName ); + BSONObjBuilder bb( b.subobjStart( "$set" ) ); + bb.appendAs( newVal, m->fieldName ); + return; + } + const char * name = fixedOpName ? fixedOpName : Mod::modNames[op()]; DEBUGUPDATE( "\t\t\t\t\t appendForOpLog name:" << name << " fixed: " << fixed << " fn: " << m->fieldName ); BSONObjBuilder bb( b.subobjStart( name ) ); - if ( fixed ) + if ( fixed ) { bb.appendAs( *fixed , m->fieldName ); - else + } + else { bb.appendAs( m->elt , m->fieldName ); + } bb.done(); } @@ -445,30 +545,55 @@ namespace mongo { ss << " fixed: " << fixed; return ss.str(); } - - void ModSetState::applyModsInPlace() { + + template< class Builder > + void ModState::handleRename( Builder &newObjBuilder, const char *shortFieldName ) { + newObjBuilder.appendAs( newVal , shortFieldName ); + BSONObjBuilder b; + b.appendAs( newVal, shortFieldName ); + assert( _objData.isEmpty() ); + _objData = b.obj(); + newVal = _objData.firstElement(); + } + + void ModSetState::applyModsInPlace( bool isOnDisk ) { + // TODO i think this assert means that we can get rid of the isOnDisk param + // and just use isOwned as the determination + DEV assert( isOnDisk == ! _obj.isOwned() ); + for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) { ModState& m = i->second; - - switch ( m.m->op ){ + + if ( m.dontApply ) { + continue; + } + + switch ( m.m->op ) { case Mod::UNSET: case Mod::PULL: case Mod::PULL_ALL: case Mod::ADDTOSET: + case Mod::RENAME_FROM: + case Mod::RENAME_TO: // this should have been handled by prepare break; - - // [dm] the BSONElementManipulator statements below are for replication (correct?) + // [dm] the BSONElementManipulator statements below are for replication (correct?) 
case Mod::INC: - m.m->incrementMe( m.old ); + if ( isOnDisk ) + m.m->IncrementMe( m.old ); + else + m.m->incrementMe( m.old ); m.fixedOpName = "$set"; m.fixed = &(m.old); break; case Mod::SET: - BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt ); + if ( isOnDisk ) + BSONElementManipulator( m.old ).ReplaceTypeAndValue( m.m->elt ); + else + BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt ); break; default: - uassert( 10144 , "can't apply mod in place - shouldn't have gotten here" , 0 ); + uassert( 13478 , "can't apply mod in place - shouldn't have gotten here" , 0 ); } } } @@ -488,61 +613,62 @@ namespace mongo { empty = false; } if ( empty ) - fields[ base + top.fieldName() ] = top; + fields[ base + top.fieldName() ] = top; } - + template< class Builder > - void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set& onedownseen ){ + void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set& onedownseen ) { const char * temp = m.fieldName(); temp += root.size(); const char * dot = strchr( temp , '.' ); - if ( dot ){ + if ( dot ) { string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) ); string nf( temp , 0 , dot - temp ); if ( onedownseen.count( nf ) ) return; onedownseen.insert( nf ); - BSONObjBuilder bb ( b.subobjStart( nf.c_str() ) ); + BSONObjBuilder bb ( b.subobjStart( nf ) ); createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name bb.done(); } else { appendNewFromMod( m , b ); } - + } - + template< class Builder > - void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ){ + void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ) { DEBUGUPDATE( "\t\t createNewFromMods root: " << root ); BSONObjIteratorSorted es( obj ); BSONElement e = es.next(); - + ModStateHolder::iterator m = _mods.lower_bound( root ); StringBuilder buf(root.size() + 2 ); buf << root << (char)255; ModStateHolder::iterator mend = _mods.lower_bound( buf.str() ); - + set onedownseen; - - while ( e.type() && m != mend ){ + + while ( e.type() && m != mend ) { string field = root + e.fieldName(); FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field ); DEBUGUPDATE( "\t\t\t field:" << field << "\t mod:" << m->second.m->fieldName << "\t cmp:" << cmp << "\t short: " << e.fieldName() ); - - switch ( cmp ){ - + + switch ( cmp ) { + case LEFT_SUBFIELD: { // Mod is embeddeed under this element - uassert( 10145 , "LEFT_SUBFIELD only supports Object" , e.type() == Object || e.type() == Array ); - if ( onedownseen.count( e.fieldName() ) == 0 ){ + uassert( 10145 , str::stream() << "LEFT_SUBFIELD only supports Object: " << field << " not: " << e.type() , e.type() == Object || e.type() == Array ); + if ( onedownseen.count( e.fieldName() ) == 0 ) { onedownseen.insert( e.fieldName() ); if ( e.type() == Object ) { BSONObjBuilder bb( b.subobjStart( e.fieldName() ) ); stringstream nr; nr << root << e.fieldName() << "."; createNewFromMods( nr.str() , bb , e.embeddedObject() ); - bb.done(); - } else { + bb.done(); + } + else { BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) ); stringstream nr; nr << root << e.fieldName() << "."; createNewFromMods( nr.str() , ba , e.embeddedObject() ); @@ -578,22 +704,22 @@ namespace mongo { e = es.next(); continue; case RIGHT_SUBFIELD: - massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 ); + massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should 
be impossible" , 0 ); break; default: massert( 10400 , "unhandled case" , 0 ); } } - + // finished looping the mods, just adding the rest of the elements - while ( e.type() ){ + while ( e.type() ) { DEBUGUPDATE( "\t\t\t copying: " << e.fieldName() ); b.append( e ); // if array, ignore field name e = es.next(); } - + // do mods that don't have fields already - for ( ; m != mend; m++ ){ + for ( ; m != mend; m++ ) { DEBUGUPDATE( "\t\t\t\t appending from mod at end: " << m->second.m->fieldName ); _appendNewFromMods( root , m->second , b , onedownseen ); } @@ -602,30 +728,30 @@ namespace mongo { BSONObj ModSetState::createNewFromMods() { BSONObjBuilder b( (int)(_obj.objsize() * 1.1) ); createNewFromMods( "" , b , _obj ); - return b.obj(); + return _newFromMods = b.obj(); } string ModSetState::toString() const { stringstream ss; - for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ){ + for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ) { ss << "\t\t" << i->first << "\t" << i->second.toString() << "\n"; } return ss.str(); } - BSONObj ModSet::createNewFromQuery( const BSONObj& query ){ + BSONObj ModSet::createNewFromQuery( const BSONObj& query ) { BSONObj newObj; { BSONObjBuilder bb; EmbeddedBuilder eb( &bb ); BSONObjIteratorSorted i( query ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); if ( e.fieldName()[0] == '$' ) // for $atomic and anything else we add continue; - if ( e.type() == Object && e.embeddedObject().firstElement().fieldName()[0] == '$' ){ + if ( e.type() == Object && e.embeddedObject().firstElement().fieldName()[0] == '$' ) { // this means this is a $gt type filter, so don't make part of the new object continue; } @@ -635,17 +761,17 @@ namespace mongo { eb.done(); newObj = bb.obj(); } - + auto_ptr mss = prepare( newObj ); if ( mss->canApplyInPlace() ) - mss->applyModsInPlace(); + mss->applyModsInPlace( false ); else newObj = mss->createNewFromMods(); - + return newObj; } - + /* get special operations like $inc { $inc: { a:1, b:1 } } { $set: { a:77 } } @@ -656,21 +782,21 @@ namespace mongo { NOTE: MODIFIES source from object! */ ModSet::ModSet( - const BSONObj &from , + const BSONObj &from , const set& idxKeys, const set *backgroundKeys) : _isIndexed(0) , _hasDynamicArray( false ) { - + BSONObjIterator it(from); - + while ( it.more() ) { BSONElement e = it.next(); const char *fn = e.fieldName(); - + uassert( 10147 , "Invalid modifier specified" + string( fn ), e.type() == Object ); BSONObj j = e.embeddedObject(); DEBUGUPDATE( "\t" << j ); - + BSONObjIterator jt(j); Mod::Op op = opFromStr( fn ); @@ -685,18 +811,45 @@ namespace mongo { uassert( 10151 , "have conflicting mods in update" , ! haveConflictingMod( fieldName ) ); uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC ); uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) ); - + + if ( op == Mod::RENAME_TO ) { + uassert( 13494, "$rename target must be a string", f.type() == String ); + const char *target = f.valuestr(); + uassert( 13495, "$rename source must differ from target", strcmp( fieldName, target ) != 0 ); + uassert( 13496, "invalid mod field name, source may not be empty", fieldName[0] ); + uassert( 13479, "invalid mod field name, target may not be empty", target[0] ); + uassert( 13480, "invalid mod field name, source may not begin or end in period", fieldName[0] != '.' && fieldName[ strlen( fieldName ) - 1 ] != '.' 
); + uassert( 13481, "invalid mod field name, target may not begin or end in period", target[0] != '.' && target[ strlen( target ) - 1 ] != '.' ); + uassert( 13482, "$rename affecting _id not allowed", !( fieldName[0] == '_' && fieldName[1] == 'i' && fieldName[2] == 'd' && ( !fieldName[3] || fieldName[3] == '.' ) ) ); + uassert( 13483, "$rename affecting _id not allowed", !( target[0] == '_' && target[1] == 'i' && target[2] == 'd' && ( !target[3] || target[3] == '.' ) ) ); + uassert( 13484, "field name duplication not allowed with $rename target", !haveModForField( target ) ); + uassert( 13485, "conflicting mods not allowed with $rename target", !haveConflictingMod( target ) ); + uassert( 13486, "$rename target may not be a parent of source", !( strncmp( fieldName, target, strlen( target ) ) == 0 && fieldName[ strlen( target ) ] == '.' ) ); + uassert( 13487, "$rename source may not be dynamic array", strstr( fieldName , ".$" ) == 0 ); + uassert( 13488, "$rename target may not be dynamic array", strstr( target , ".$" ) == 0 ); + + Mod from; + from.init( Mod::RENAME_FROM, f ); + from.setFieldName( fieldName ); + updateIsIndexed( from, idxKeys, backgroundKeys ); + _mods[ from.fieldName ] = from; + + Mod to; + to.init( Mod::RENAME_TO, f ); + to.setFieldName( target ); + updateIsIndexed( to, idxKeys, backgroundKeys ); + _mods[ to.fieldName ] = to; + + DEBUGUPDATE( "\t\t " << fieldName << "\t" << from.fieldName << "\t" << to.fieldName ); + continue; + } + _hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) > 0; - + Mod m; m.init( op , f ); m.setFieldName( f.fieldName() ); - - if ( m.isIndexed( idxKeys ) || - (backgroundKeys && m.isIndexed(*backgroundKeys)) ) { - _isIndexed++; - } - + updateIsIndexed( m, idxKeys, backgroundKeys ); _mods[m.fieldName] = m; DEBUGUPDATE( "\t\t " << fieldName << "\t" << m.fieldName << "\t" << _hasDynamicArray ); @@ -709,10 +862,10 @@ namespace mongo { ModSet * n = new ModSet(); n->_isIndexed = _isIndexed; n->_hasDynamicArray = _hasDynamicArray; - for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ){ + for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ) { string s = i->first; size_t idx = s.find( ".$" ); - if ( idx == string::npos ){ + if ( idx == string::npos ) { n->_mods[s] = i->second; continue; } @@ -726,7 +879,7 @@ namespace mongo { } return n; } - + void checkNoMods( BSONObj o ) { BSONObjIterator i( o ); while( i.moreWithEOO() ) { @@ -736,10 +889,10 @@ namespace mongo { uassert( 10154 , "Modifiers and non-modifiers cannot be mixed", e.fieldName()[ 0 ] != '$' ); } } - + class UpdateOp : public MultiCursor::CursorOp { public: - UpdateOp( bool hasPositionalField ) : _nscanned(), _hasPositionalField( hasPositionalField ){} + UpdateOp( bool hasPositionalField ) : _nscanned(), _hasPositionalField( hasPositionalField ) {} virtual void _init() { _c = qp().newCursor(); if ( ! _c->ok() ) { @@ -751,14 +904,18 @@ namespace mongo { _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) ); } return _cc->prepareToYield( _yieldData ); - } + } virtual void recoverFromYield() { if ( !ClientCursor::recoverFromYield( _yieldData ) ) { _c.reset(); _cc.reset(); massert( 13339, "cursor dropped during update", false ); } - } + } + virtual long long nscanned() { + assert( _c.get() ); + return _c->nscanned(); + } virtual void next() { if ( ! 
_c->ok() ) { setComplete(); @@ -789,64 +946,62 @@ namespace mongo { }; static void checkTooLarge(const BSONObj& newObj) { - uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= ( 4 * 1024 * 1024 ) ); + uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= BSONObjMaxUserSize ); } - /* note: this is only (as-is) called for + /* note: this is only (as-is) called for - not multi - not mods is indexed - not upsert */ - static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d, + static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d, NamespaceDetailsTransient *nsdt, - bool god, const char *ns, - const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) - { + bool god, const char *ns, + const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) { DiskLoc loc; { IndexDetails& i = d->idx(idIdxNo); BSONObj key = i.getKeyFromQuery( patternOrig ); loc = i.head.btree()->findSingle(i, i.head, key); - if( loc.isNull() ) { + if( loc.isNull() ) { // no upsert support in _updateById yet, so we are done. return UpdateResult(0, 0, 0); } } Record *r = loc.rec(); - + /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some regular ones at the moment. */ - if ( isOperatorUpdate ) { - const BSONObj& onDisk = loc.obj(); + if ( isOperatorUpdate ) { + const BSONObj& onDisk = loc.obj(); auto_ptr mss = mods->prepare( onDisk ); - + if( mss->canApplyInPlace() ) { - mss->applyModsInPlace(); + mss->applyModsInPlace(true); DEBUGUPDATE( "\t\t\t updateById doing in place update" ); /*if ( profile ) ss << " fastmod "; */ - } + } else { BSONObj newObj = mss->createNewFromMods(); checkTooLarge(newObj); - bool changedId; assert(nsdt); - DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug, changedId); + DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug); } - + if ( logop ) { DEV assert( mods->size() ); - + BSONObj pattern = patternOrig; if ( mss->haveArrayDepMod() ) { BSONObjBuilder patternBuilder; patternBuilder.appendElements( pattern ); mss->appendSizeSpecForArrayDepMods( patternBuilder ); - pattern = patternBuilder.obj(); + pattern = patternBuilder.obj(); } - + if( mss->needOpLogRewrite() ) { DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() ); logOp("u", ns, mss->getOpLogRewrite() , &pattern ); @@ -857,24 +1012,18 @@ namespace mongo { } return UpdateResult( 1 , 1 , 1); } // end $operator update - + // regular update BSONElementManipulator::lookForTimestamps( updateobj ); checkNoMods( updateobj ); - bool changedId = false; assert(nsdt); - theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, changedId); + theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug ); if ( logop ) { - if ( !changedId ) { - logOp("u", ns, updateobj, &patternOrig ); - } else { - logOp("d", ns, patternOrig ); - logOp("i", ns, updateobj ); - } + logOp("u", ns, updateobj, &patternOrig ); } return UpdateResult( 1 , 0 , 1 ); } - + UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug, RemoveSaver* rs ) { DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << 
" multi: " << multi ); Client& client = cc(); @@ -883,20 +1032,20 @@ namespace mongo { if ( logLevel > 2 ) ss << " update: " << updateobj.toString(); - + /* idea with these here it to make them loop invariant for multi updates, and thus be a bit faster for that case */ /* NOTE: when yield() is added herein, these must be refreshed after each call to yield! */ NamespaceDetails *d = nsdetails(ns); // can be null if an upsert... NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get_w(ns); /* end note */ - + auto_ptr mods; bool isOperatorUpdate = updateobj.firstElement().fieldName()[0] == '$'; int modsIsIndexed = false; // really the # of indexes - if ( isOperatorUpdate ){ - if( d && d->backgroundIndexBuildInProgress ) { + if ( isOperatorUpdate ) { + if( d && d->indexBuildInProgress ) { set bgKeys; - d->backgroundIdx().keyPattern().getFieldNames(bgKeys); + d->inProgIdx().keyPattern().getFieldNames(bgKeys); mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) ); } else { @@ -914,30 +1063,30 @@ namespace mongo { } set seenObjects; - + int numModded = 0; long long nscanned = 0; MatchDetails details; shared_ptr< MultiCursor::CursorOp > opPtr( new UpdateOp( mods.get() && mods->hasDynamicArray() ) ); shared_ptr< MultiCursor > c( new MultiCursor( ns, patternOrig, BSONObj(), opPtr, true ) ); - + auto_ptr cc; - + while ( c->ok() ) { nscanned++; bool atomic = c->matcher()->docMatcher().atomic(); - + // May have already matched in UpdateOp, but do again to get details set correctly - if ( ! c->matcher()->matches( c->currKey(), c->currLoc(), &details ) ){ + if ( ! c->matcher()->matches( c->currKey(), c->currLoc(), &details ) ) { c->advance(); - - if ( nscanned % 256 == 0 && ! atomic ){ + + if ( nscanned % 256 == 0 && ! atomic ) { if ( cc.get() == 0 ) { shared_ptr< Cursor > cPtr = c; cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); } - if ( ! cc->yield() ){ + if ( ! cc->yield() ) { cc.release(); // TODO should we assert or something? break; @@ -948,20 +1097,20 @@ namespace mongo { } continue; } - + Record *r = c->_current(); DiskLoc loc = c->currLoc(); - + // TODO Maybe this is unnecessary since we have seenObjects - if ( c->getsetdup( loc ) ){ + if ( c->getsetdup( loc ) ) { c->advance(); continue; } - + BSONObj js(r); - + BSONObj pattern = patternOrig; - + if ( logop ) { BSONObjBuilder idPattern; BSONElement id; @@ -977,80 +1126,79 @@ namespace mongo { uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi ); } } - + if ( profile ) ss << " nscanned:" << nscanned; - + /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some regular ones at the moment. */ if ( isOperatorUpdate ) { - - if ( multi ){ + + if ( multi ) { c->advance(); // go to next record in case this one moves if ( seenObjects.count( loc ) ) continue; } - + const BSONObj& onDisk = loc.obj(); - + ModSet * useMods = mods.get(); bool forceRewrite = false; - + auto_ptr mymodset; - if ( details.elemMatchKey && mods->hasDynamicArray() ){ + if ( details.elemMatchKey && mods->hasDynamicArray() ) { useMods = mods->fixDynamicArray( details.elemMatchKey ); mymodset.reset( useMods ); forceRewrite = true; } - + auto_ptr mss = useMods->prepare( onDisk ); - + bool indexHack = multi && ( modsIsIndexed || ! 
mss->canApplyInPlace() ); - - if ( indexHack ){ + + if ( indexHack ) { if ( cc.get() ) cc->updateLocation(); else c->noteLocation(); } - - if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ){ - mss->applyModsInPlace();// const_cast(onDisk) ); - + + if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) { + mss->applyModsInPlace( true );// const_cast(onDisk) ); + DEBUGUPDATE( "\t\t\t doing in place update" ); if ( profile ) ss << " fastmod "; - - if ( modsIsIndexed ){ + + if ( modsIsIndexed ) { seenObjects.insert( loc ); } - } + } else { if ( rs ) rs->goingToDelete( onDisk ); BSONObj newObj = mss->createNewFromMods(); checkTooLarge(newObj); - bool changedId; - DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug, changedId); + DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug); if ( newLoc != loc || modsIsIndexed ) { // object moved, need to make sure we don' get again seenObjects.insert( newLoc ); } - + } - + if ( logop ) { DEV assert( mods->size() ); - + if ( mss->haveArrayDepMod() ) { BSONObjBuilder patternBuilder; patternBuilder.appendElements( pattern ); mss->appendSizeSpecForArrayDepMods( patternBuilder ); - pattern = patternBuilder.obj(); + pattern = patternBuilder.obj(); } - - if ( forceRewrite || mss->needOpLogRewrite() ){ + + if ( forceRewrite || mss->needOpLogRewrite() ) { DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() ); logOp("u", ns, mss->getOpLogRewrite() , &pattern ); } @@ -1063,13 +1211,13 @@ namespace mongo { return UpdateResult( 1 , 1 , numModded ); if ( indexHack ) c->checkLocation(); - - if ( nscanned % 64 == 0 && ! atomic ){ + + if ( nscanned % 64 == 0 && ! atomic ) { if ( cc.get() == 0 ) { shared_ptr< Cursor > cPtr = c; cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); } - if ( ! cc->yield() ){ + if ( ! cc->yield() ) { cc.release(); break; } @@ -1077,35 +1225,32 @@ namespace mongo { break; } } - + + if (atomic) + getDur().commitIfNeeded(); + continue; - } - + } + uassert( 10158 , "multi update only works with $ operators" , ! multi ); - + BSONElementManipulator::lookForTimestamps( updateobj ); checkNoMods( updateobj ); - bool changedId = false; - theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, changedId, god); + theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god); if ( logop ) { DEV if( god ) log() << "REALLY??" << endl; // god doesn't get logged, this would be bad. - if ( !changedId ) { - logOp("u", ns, updateobj, &pattern ); - } else { - logOp("d", ns, pattern ); - logOp("i", ns, updateobj ); - } + logOp("u", ns, updateobj, &pattern ); } return UpdateResult( 1 , 0 , 1 ); } - + if ( numModded ) return UpdateResult( 1 , 1 , numModded ); - + if ( profile ) ss << " nscanned:" << nscanned; - + if ( upsert ) { if ( updateobj.firstElement().fieldName()[0] == '$' ) { /* upsert of an $inc. build a default */ @@ -1115,7 +1260,7 @@ namespace mongo { theDataFileMgr.insertWithObjMod(ns, newObj, god); if ( logop ) logOp( "i", ns, newObj ); - + return UpdateResult( 0 , 1 , 1 , newObj ); } uassert( 10159 , "multi update only works with $ operators" , ! 
multi ); @@ -1130,14 +1275,14 @@ namespace mongo { } return UpdateResult( 0 , 0 , 0 ); } - + UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) { uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 ); if ( strstr(ns, ".system.") ) { /* dm: it's very important that system.indexes is never updated as IndexDetails has pointers into it */ - uassert( 10156 , "cannot update system collection", legalClientSystemNS( ns , true ) ); + uassert( 10156 , str::stream() << "cannot update system collection: " << ns << " q: " << patternOrig << " u: " << updateobj , legalClientSystemNS( ns , true ) ); } return _updateObjects(false, ns, updateobj, patternOrig, upsert, multi, logop, debug); } - + } diff --git a/db/update.h b/db/update.h index b7950de..d8396b5 100644 --- a/db/update.h +++ b/db/update.h @@ -26,32 +26,42 @@ namespace mongo { class ModState; class ModSetState; - /* Used for modifiers such as $inc, $set, $push, ... + /* Used for modifiers such as $inc, $set, $push, ... * stores the info about a single operation * once created should never be modified */ struct Mod { // See opFromStr below - // 0 1 2 3 4 5 6 7 8 9 10 11 - enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET } op; - + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET, RENAME_FROM, RENAME_TO } op; + static const char* modNames[]; static unsigned modNamesNum; const char *fieldName; const char *shortFieldName; - + BSONElement elt; // x:5 note: this is the actual element from the updateobj boost::shared_ptr matcher; + bool matcherOnPrimitive; - void init( Op o , BSONElement& e ){ + void init( Op o , BSONElement& e ) { op = o; elt = e; - if ( op == PULL && e.type() == Object ) - matcher.reset( new Matcher( e.embeddedObject() ) ); + if ( op == PULL && e.type() == Object ) { + BSONObj t = e.embeddedObject(); + if ( t.firstElement().getGtLtOp() == 0 ) { + matcher.reset( new Matcher( t ) ); + matcherOnPrimitive = false; + } + else { + matcher.reset( new Matcher( BSON( "" << t ) ) ); + matcherOnPrimitive = true; + } + } } - void setFieldName( const char * s ){ + void setFieldName( const char * s ) { fieldName = s; shortFieldName = strrchr( fieldName , '.' 
); if ( shortFieldName ) @@ -59,14 +69,13 @@ namespace mongo { else shortFieldName = fieldName; } - + /** * @param in incrememnts the actual value inside in */ void incrementMe( BSONElement& in ) const { BSONElementManipulator manip( in ); - - switch ( in.type() ){ + switch ( in.type() ) { case NumberDouble: manip.setNumber( elt.numberDouble() + in.numberDouble() ); break; @@ -79,18 +88,33 @@ namespace mongo { default: assert(0); } - } - + void IncrementMe( BSONElement& in ) const { + BSONElementManipulator manip( in ); + switch ( in.type() ) { + case NumberDouble: + manip.SetNumber( elt.numberDouble() + in.numberDouble() ); + break; + case NumberLong: + manip.SetLong( elt.numberLong() + in.numberLong() ); + break; + case NumberInt: + manip.SetInt( elt.numberInt() + in.numberInt() ); + break; + default: + assert(0); + } + } + template< class Builder > void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const; - + bool operator<( const Mod &other ) const { return strcmp( fieldName, other.fieldName ) < 0; } - + bool arrayDep() const { - switch (op){ + switch (op) { case PUSH: case PUSH_ALL: case POP: @@ -99,8 +123,8 @@ namespace mongo { return false; } } - - static bool isIndexed( const string& fullName , const set& idxKeys ){ + + static bool isIndexed( const string& fullName , const set& idxKeys ) { const char * fieldName = fullName.c_str(); // check if there is an index key that is a parent of mod for( const char *dot = strchr( fieldName, '.' ); dot; dot = strchr( dot + 1, '.' ) ) @@ -117,23 +141,23 @@ namespace mongo { return false; } - + bool isIndexed( const set& idxKeys ) const { string fullName = fieldName; - + if ( isIndexed( fullName , idxKeys ) ) return true; - - if ( strstr( fieldName , "." ) ){ + + if ( strstr( fieldName , "." ) ) { // check for a.0.1 StringBuilder buf( fullName.size() + 1 ); - for ( size_t i=0; i 0 && fullName[i-1] == '.' && - i+1 0 && fullName[i-1] == '.' 
&& + i+1 void apply( Builder& b , BSONElement in , ModState& ms ) const; - + /** * @return true iff toMatch should be removed from the array */ bool _pullElementMatch( BSONElement& toMatch ) const; void _checkForAppending( const BSONElement& e ) const { - if ( e.type() == Object ){ + if ( e.type() == Object ) { // this is a tiny bit slow, but rare and important // only when setting something TO an object, not setting something in an object - // and it checks for { $set : { x : { 'a.b' : 1 } } } + // and it checks for { $set : { x : { 'a.b' : 1 } } } // which is feel has been common uassert( 12527 , "not okForStorage" , e.embeddedObject().okForStorage() ); } } - + bool isEach() const { if ( elt.type() != Object ) return false; @@ -199,14 +223,18 @@ namespace mongo { BSONObj getEach() const { return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck(); } - + void parseEach( BSONElementSet& s ) const { BSONObjIterator i(getEach()); - while ( i.more() ){ + while ( i.more() ) { s.insert( i.next() ); } } - + + const char *renameFrom() const { + massert( 13492, "mod must be RENAME_TO type", op == Mod::RENAME_TO ); + return elt.fieldName(); + } }; /** @@ -220,7 +248,7 @@ namespace mongo { bool _hasDynamicArray; static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ); - + FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const { bool mDone = ( m == _mods.end() ); bool pDone = ( p == pEnd ); @@ -236,11 +264,11 @@ namespace mongo { return compareDottedFieldNames( m->first, p->first.c_str() ); } - + bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) { for( string left = EmbeddedBuilder::splitDot( right ); - left.length() > 0 && left[ left.length() - 1 ] != '.'; - left += "." + EmbeddedBuilder::splitDot( right ) ) { + left.length() > 0 && left[ left.length() - 1 ] != '.'; + left += "." 
+ EmbeddedBuilder::splitDot( right ) ) { if ( existing.count( left ) > 0 && existing[ left ].type() != Object ) return false; if ( haveModForField( left.c_str() ) ) @@ -250,7 +278,7 @@ namespace mongo { } static Mod::Op opFromStr( const char *fn ) { assert( fn[0] == '$' ); - switch( fn[1] ){ + switch( fn[1] ) { case 'i': { if ( fn[2] == 'n' && fn[3] == 'c' && fn[4] == 0 ) return Mod::INC; @@ -262,14 +290,14 @@ namespace mongo { break; } case 'p': { - if ( fn[2] == 'u' ){ - if ( fn[3] == 's' && fn[4] == 'h' ){ + if ( fn[2] == 'u' ) { + if ( fn[3] == 's' && fn[4] == 'h' ) { if ( fn[5] == 0 ) return Mod::PUSH; if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 ) return Mod::PUSH_ALL; } - else if ( fn[3] == 'l' && fn[4] == 'l' ){ + else if ( fn[3] == 'l' && fn[4] == 'l' ) { if ( fn[5] == 0 ) return Mod::PULL; if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 ) @@ -286,7 +314,7 @@ namespace mongo { break; } case 'b': { - if ( fn[2] == 'i' && fn[3] == 't' ){ + if ( fn[2] == 'i' && fn[3] == 't' ) { if ( fn[4] == 0 ) return Mod::BIT; if ( fn[4] == 'a' && fn[5] == 'n' && fn[6] == 'd' && fn[7] == 0 ) @@ -297,27 +325,41 @@ namespace mongo { break; } case 'a': { - if ( fn[2] == 'd' && fn[3] == 'd' ){ + if ( fn[2] == 'd' && fn[3] == 'd' ) { // add if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 ) return Mod::ADDTOSET; - + + } + break; + } + case 'r': { + if ( fn[2] == 'e' && fn[3] == 'n' && fn[4] == 'a' && fn[5] == 'm' && fn[6] =='e' ) { + return Mod::RENAME_TO; // with this return code we handle both RENAME_TO and RENAME_FROM } + break; } default: break; } uassert( 10161 , "Invalid modifier specified " + string( fn ), false ); return Mod::INC; } - - ModSet(){} + + ModSet() {} + + void updateIsIndexed( const Mod &m, const set &idxKeys, const set *backgroundKeys ) { + if ( m.isIndexed( idxKeys ) || + (backgroundKeys && m.isIndexed(*backgroundKeys)) ) { + _isIndexed++; + } + } public: - - ModSet( const BSONObj &from , - const set& idxKeys = set(), - const set* backgroundKeys = 0 - ); + + ModSet( const BSONObj &from , + const set& idxKeys = set(), + const set* backgroundKeys = 0 + ); // TODO: this is inefficient - should probably just handle when iterating ModSet * fixDynamicArray( const char * elemMatchKey ) const; @@ -329,7 +371,7 @@ namespace mongo { * doesn't change or modify this ModSet or any underying Mod */ auto_ptr prepare( const BSONObj& obj ) const; - + /** * given a query pattern, builds an object suitable for an upsert * will take the query spec and combine all $ operators @@ -349,15 +391,15 @@ namespace mongo { return _mods.find( fieldName ) != _mods.end(); } - bool haveConflictingMod( const string& fieldName ){ + bool haveConflictingMod( const string& fieldName ) { size_t idx = fieldName.find( '.' 
); if ( idx == string::npos ) idx = fieldName.size(); - + ModHolder::const_iterator start = _mods.lower_bound(fieldName.substr(0,idx)); - for ( ; start != _mods.end(); start++ ){ + for ( ; start != _mods.end(); start++ ) { FieldCompareResult r = compareDottedFieldNames( fieldName , start->first ); - switch ( r ){ + switch ( r ) { case LEFT_SUBFIELD: return true; case LEFT_BEFORE: return false; case SAME: return true; @@ -367,9 +409,9 @@ namespace mongo { } return false; - + } - + }; /** @@ -379,23 +421,28 @@ namespace mongo { public: const Mod * m; BSONElement old; - + BSONElement newVal; + BSONObj _objData; + const char * fixedOpName; BSONElement * fixed; int pushStartSize; - + BSONType incType; int incint; double incdouble; long long inclong; - - ModState(){ + + bool dontApply; + + ModState() { fixedOpName = 0; fixed = 0; pushStartSize = -1; incType = EOO; + dontApply = false; } - + Mod::Op op() const { return m->op; } @@ -403,12 +450,18 @@ namespace mongo { const char * fieldName() const { return m->fieldName; } - + bool needOpLogRewrite() const { + if ( dontApply ) + return false; + if ( fixed || fixedOpName || incType ) return true; - - switch( op() ){ + + switch( op() ) { + case Mod::RENAME_FROM: + case Mod::RENAME_TO: + return true; case Mod::BIT: case Mod::BITAND: case Mod::BITOR: @@ -418,19 +471,19 @@ namespace mongo { return false; } } - + void appendForOpLog( BSONObjBuilder& b ) const; template< class Builder > - void apply( Builder& b , BSONElement in ){ + void apply( Builder& b , BSONElement in ) { m->apply( b , in , *this ); } - + template< class Builder > void appendIncValue( Builder& b , bool useFullName ) const { const char * n = useFullName ? m->fieldName : m->shortFieldName; - switch ( incType ){ + switch ( incType ) { case NumberDouble: b.append( n , incdouble ); break; case NumberLong: @@ -443,8 +496,11 @@ namespace mongo { } string toString() const; + + template< class Builder > + void handleRename( Builder &newObjBuilder, const char *shortFieldName ); }; - + /** * this is used to hold state, meta data while applying a ModSet to a BSONObj * the goal is to make ModSet const so its re-usable @@ -459,15 +515,16 @@ namespace mongo { const BSONObj& _obj; ModStateHolder _mods; bool _inPlacePossible; - - ModSetState( const BSONObj& obj ) - : _obj( obj ) , _inPlacePossible(true){ + BSONObj _newFromMods; // keep this data alive, as oplog generation may depend on it + + ModSetState( const BSONObj& obj ) + : _obj( obj ) , _inPlacePossible(true) { } - + /** * @return if in place is still possible */ - bool amIInPlacePossible( bool inPlacePossible ){ + bool amIInPlacePossible( bool inPlacePossible ) { if ( ! 
inPlacePossible ) _inPlacePossible = false; return _inPlacePossible; @@ -478,17 +535,21 @@ namespace mongo { template< class Builder > void _appendNewFromMods( const string& root , ModState& m , Builder& b , set& onedownseen ); - + template< class Builder > - void appendNewFromMod( ModState& ms , Builder& b ){ + void appendNewFromMod( ModState& ms , Builder& b ) { + if ( ms.dontApply ) { + return; + } + //const Mod& m = *(ms.m); // HACK Mod& m = *((Mod*)(ms.m)); // HACK - - switch ( m.op ){ - - case Mod::PUSH: - case Mod::ADDTOSET: { - if ( m.isEach() ){ + + switch ( m.op ) { + + case Mod::PUSH: + case Mod::ADDTOSET: { + if ( m.isEach() ) { b.appendArray( m.shortFieldName , m.getEach() ); } else { @@ -497,19 +558,19 @@ namespace mongo { arr.done(); } break; - } - + } + case Mod::PUSH_ALL: { b.appendAs( m.elt, m.shortFieldName ); break; - } - + } + case Mod::UNSET: case Mod::PULL: case Mod::PULL_ALL: // no-op b/c unset/pull of nothing does nothing break; - + case Mod::INC: ms.fixedOpName = "$set"; case Mod::SET: { @@ -517,24 +578,29 @@ namespace mongo { b.appendAs( m.elt, m.shortFieldName ); break; } - default: + // shouldn't see RENAME_FROM here + case Mod::RENAME_TO: + ms.handleRename( b, m.shortFieldName ); + break; + default: stringstream ss; ss << "unknown mod in appendNewFromMod: " << m.op; throw UserException( 9015, ss.str() ); } - + } public: - + bool canApplyInPlace() const { return _inPlacePossible; } - + /** * modified underlying _obj + * @param isOnDisk - true means this is an on disk object, and this update needs to be made durable */ - void applyModsInPlace(); + void applyModsInPlace( bool isOnDisk ); BSONObj createNewFromMods(); @@ -544,9 +610,9 @@ namespace mongo { for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) if ( i->second.needOpLogRewrite() ) return true; - return false; + return false; } - + BSONObj getOpLogRewrite() const { BSONObjBuilder b; for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) @@ -564,7 +630,7 @@ namespace mongo { void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const { for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) { const ModState& m = i->second; - if ( m.m->arrayDep() ){ + if ( m.m->arrayDep() ) { if ( m.pushStartSize == -1 ) b.appendNull( m.fieldName() ); else @@ -577,6 +643,6 @@ namespace mongo { friend class ModSet; }; - + } diff --git a/dbtests/background_job_test.cpp b/dbtests/background_job_test.cpp new file mode 100644 index 0000000..f2bf7d8 --- /dev/null +++ b/dbtests/background_job_test.cpp @@ -0,0 +1,109 @@ +// @file background_job_test.cpp + +/** + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#include "../pch.h" +#include + +#include "dbtests.h" +#include "../util/time_support.h" +#include "../util/background.h" + +namespace BackgroundJobTests { + + // a global variable that can be accessed independent of the IncTester object below + // IncTester keeps it up-to-date + int GLOBAL_val; + + class IncTester : public mongo::BackgroundJob { + public: + explicit IncTester( long long millis , bool selfDelete = false ) + : BackgroundJob(selfDelete), _val(0), _millis(millis) { GLOBAL_val = 0; } + + void waitAndInc( long long millis ) { + if ( millis ) + mongo::sleepmillis( millis ); + ++_val; + ++GLOBAL_val; + } + + int getVal() { return _val; } + + /* --- BackgroundJob virtuals --- */ + + string name() const { return "IncTester"; } + + void run() { waitAndInc( _millis ); } + + private: + int _val; + long long _millis; + }; + + + class NormalCase { + public: + void run() { + IncTester tester( 0 /* inc without wait */ ); + tester.go(); + ASSERT( tester.wait() ); + ASSERT_EQUALS( tester.getVal() , 1 ); + } + }; + + class TimeOutCase { + public: + void run() { + IncTester tester( 1000 /* wait 1sec before inc-ing */ ); + tester.go(); + ASSERT( ! tester.wait( 100 /* ms */ ) ); // should time out + ASSERT_EQUALS( tester.getVal() , 0 ); + + // if we wait longer than the IncTester, we should see the increment + ASSERT( tester.wait( 1500 /* ms */ ) ); // should not time out + ASSERT_EQUALS( tester.getVal() , 1 ); + } + }; + + class SelfDeletingCase { + public: + void run() { + BackgroundJob* j = new IncTester( 0 /* inc without wait */ , true /* self delete */ ); + j->go(); + + + // the background thread should have continued running and this test should pass the + // heap-checker as well + mongo::sleepmillis( 1000 ); + ASSERT_EQUALS( GLOBAL_val, 1 ); + } + }; + + + class BackgroundJobSuite : public Suite { + public: + BackgroundJobSuite() : Suite( "background_job" ) {} + + void setupTests() { + add< NormalCase >(); + add< TimeOutCase >(); + add< SelfDeletingCase >(); + } + + } backgroundJobSuite; + +} // namespace BackgroundJobTests diff --git a/dbtests/balancer_policy_tests.cpp b/dbtests/balancer_policy_tests.cpp new file mode 100644 index 0000000..6f7c4a5 --- /dev/null +++ b/dbtests/balancer_policy_tests.cpp @@ -0,0 +1,203 @@ +// @file balancer_policy_test.cpp + +/** + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "pch.h" +#include "dbtests.h" + +// TODO SERVER-1822 +//#include "../s/config.h" // for ShardFields +//#include "../s/balancer_policy.h" + +namespace BalancerPolicyTests { + +// +// TODO SERVER-1822 +// +#if 0 + + typedef mongo::ShardFields sf; // fields from 'shards' colleciton + typedef mongo::LimitsFields lf; // fields from the balancer's limits map + + class SizeMaxedShardTest { + public: + void run() { + BSONObj shard0 = BSON( sf::maxSize(0LL) << lf::currSize(0LL) ); + ASSERT( ! 
BalancerPolicy::isSizeMaxed( shard0 ) ); + + BSONObj shard1 = BSON( sf::maxSize(100LL) << lf::currSize(80LL) ); + ASSERT( ! BalancerPolicy::isSizeMaxed( shard1 ) ); + + BSONObj shard2 = BSON( sf::maxSize(100LL) << lf::currSize(110LL) ); + ASSERT( BalancerPolicy::isSizeMaxed( shard2 ) ); + + BSONObj empty; + ASSERT( ! BalancerPolicy::isSizeMaxed( empty ) ); + } + }; + + class DrainingShardTest { + public: + void run() { + BSONObj shard0 = BSON( sf::draining(true) ); + ASSERT( BalancerPolicy::isDraining( shard0 ) ); + + BSONObj shard1 = BSON( sf::draining(false) ); + ASSERT( ! BalancerPolicy::isDraining( shard1 ) ); + + BSONObj empty; + ASSERT( ! BalancerPolicy::isDraining( empty ) ); + } + }; + + class BalanceNormalTest { + public: + void run() { + // 2 chunks and 0 chunk shards + BalancerPolicy::ShardToChunksMap chunkMap; + vector chunks; + chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) << + "max" << BSON( "x" << 49 ))); + chunks.push_back(BSON( "min" << BSON( "x" << 49 ) << + "max" << BSON( "x" << BSON( "$maxkey"<<1 )))); + chunkMap["shard0"] = chunks; + chunks.clear(); + chunkMap["shard1"] = chunks; + + // no limits + BalancerPolicy::ShardToLimitsMap limitsMap; + BSONObj limits0 = BSON( sf::maxSize(0LL) << lf::currSize(2LL) << sf::draining(false) << lf::hasOpsQueued(false) ); + BSONObj limits1 = BSON( sf::maxSize(0LL) << lf::currSize(0LL) << sf::draining(false) << lf::hasOpsQueued(false) ); + limitsMap["shard0"] = limits0; + limitsMap["shard1"] = limits1; + + BalancerPolicy::ChunkInfo* c = NULL; + c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 1 ); + ASSERT( c ); + } + }; + + class BalanceDrainingTest { + public: + void run() { + // one normal, one draining + // 2 chunks and 0 chunk shards + BalancerPolicy::ShardToChunksMap chunkMap; + vector chunks; + chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) << + "max" << BSON( "x" << 49 ))); + chunkMap["shard0"] = chunks; + chunks.clear(); + chunks.push_back(BSON( "min" << BSON( "x" << 49 ) << + "max" << BSON( "x" << BSON( "$maxkey"<<1 )))); + chunkMap["shard1"] = chunks; + + // shard0 is draining + BalancerPolicy::ShardToLimitsMap limitsMap; + BSONObj limits0 = BSON( sf::maxSize(0LL) << lf::currSize(2LL) << sf::draining(true) ); + BSONObj limits1 = BSON( sf::maxSize(0LL) << lf::currSize(0LL) << sf::draining(false) ); + limitsMap["shard0"] = limits0; + limitsMap["shard1"] = limits1; + + BalancerPolicy::ChunkInfo* c = NULL; + c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 ); + ASSERT( c ); + ASSERT_EQUALS( c->to , "shard1" ); + ASSERT_EQUALS( c->from , "shard0" ); + ASSERT( ! c->chunk.isEmpty() ); + } + }; + + class BalanceEndedDrainingTest { + public: + void run() { + // 2 chunks and 0 chunk (drain completed) shards + BalancerPolicy::ShardToChunksMap chunkMap; + vector chunks; + chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) << + "max" << BSON( "x" << 49 ))); + chunks.push_back(BSON( "min" << BSON( "x" << 49 ) << + "max" << BSON( "x" << BSON( "$maxkey"<<1 )))); + chunkMap["shard0"] = chunks; + chunks.clear(); + chunkMap["shard1"] = chunks; + + // no limits + BalancerPolicy::ShardToLimitsMap limitsMap; + BSONObj limits0 = BSON( sf::maxSize(0LL) << lf::currSize(2LL) << sf::draining(false) ); + BSONObj limits1 = BSON( sf::maxSize(0LL) << lf::currSize(0LL) << sf::draining(true) ); + limitsMap["shard0"] = limits0; + limitsMap["shard1"] = limits1; + + BalancerPolicy::ChunkInfo* c = NULL; + c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 ); + ASSERT( ! 
c ); + } + }; + + class BalanceImpasseTest { + public: + void run() { + // one maxed out, one draining + // 2 chunks and 0 chunk shards + BalancerPolicy::ShardToChunksMap chunkMap; + vector chunks; + chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) << + "max" << BSON( "x" << 49 ))); + chunkMap["shard0"] = chunks; + chunks.clear(); + chunks.push_back(BSON( "min" << BSON( "x" << 49 ) << + "max" << BSON( "x" << BSON( "$maxkey"<<1 )))); + chunkMap["shard1"] = chunks; + + // shard0 is draining, shard1 is maxed out, shard2 has writebacks pending + BalancerPolicy::ShardToLimitsMap limitsMap; + BSONObj limits0 = BSON( sf::maxSize(0LL) << lf::currSize(2LL) << sf::draining(true) ); + BSONObj limits1 = BSON( sf::maxSize(1LL) << lf::currSize(1LL) << sf::draining(false) ); + BSONObj limits2 = BSON( sf::maxSize(0LL) << lf::currSize(1LL) << lf::hasOpsQueued(true) ); + limitsMap["shard0"] = limits0; + limitsMap["shard1"] = limits1; + limitsMap["shard2"] = limits2; + + BalancerPolicy::ChunkInfo* c = NULL; + c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 ); + ASSERT( ! c ); + } + }; + +// +// TODO SERVER-1822 +// +#endif // #if 0 + + class All : public Suite { + public: + All() : Suite( "balancer_policy" ) { + } + + void setupTests() { + // TODO SERVER-1822 + // add< SizeMaxedShardTest >(); + // add< DrainingShardTest >(); + // add< BalanceNormalTest >(); + // add< BalanceDrainingTest >(); + // add< BalanceEndedDrainingTest >(); + // add< BalanceImpasseTest >(); + } + } allTests; + +} // namespace BalancerPolicyTests diff --git a/dbtests/basictests.cpp b/dbtests/basictests.cpp index f1e788a..3e0eecd 100644 --- a/dbtests/basictests.cpp +++ b/dbtests/basictests.cpp @@ -23,6 +23,8 @@ #include "../util/base64.h" #include "../util/array.h" #include "../util/text.h" +#include "../util/queue.h" +#include "../util/paths.h" namespace BasicTests { @@ -49,21 +51,21 @@ namespace BasicTests { RARELY ++c; } }; - + class Base64Tests { public: - - void roundTrip( string s ){ + + void roundTrip( string s ) { ASSERT_EQUALS( s , base64::decode( base64::encode( s ) ) ); } - - void roundTrip( const unsigned char * _data , int len ){ + + void roundTrip( const unsigned char * _data , int len ) { const char *data = (const char *) _data; string s = base64::encode( data , len ); string out = base64::decode( s ); ASSERT_EQUALS( out.size() , static_cast(len) ); bool broke = false; - for ( int i=0; i= 0 && sec <= 2 ); + t.reset(); + } + ASSERT( matches >= 2 ); - t.reset(); sleepmicros( 1527123 ); ASSERT( t.micros() > 1000000 ); ASSERT( t.micros() < 2000000 ); @@ -202,17 +211,17 @@ namespace BasicTests { sleepmillis( 1727 ); ASSERT( t.millis() >= 1000 ); ASSERT( t.millis() <= 2500 ); - + { int total = 1200; int ms = 2; t.reset(); - for ( int i=0; i<(total/ms); i++ ){ + for ( int i=0; i<(total/ms); i++ ) { sleepmillis( ms ); } { int x = t.millis(); - if ( x < 1000 || x > 2500 ){ + if ( x < 1000 || x > 2500 ) { cout << "sleeptest x: " << x << endl; ASSERT( x >= 1000 ); ASSERT( x <= 20000 ); @@ -226,12 +235,12 @@ namespace BasicTests { int micros = 100; t.reset(); int numSleeps = 1000*(total/micros); - for ( int i=0; i 2500 ){ + if ( y < 1000 || y > 2500 ) { cout << "sleeptest y: " << y << endl; ASSERT( y >= 1000 ); /* ASSERT( y <= 100000 ); */ @@ -239,9 +248,9 @@ namespace BasicTests { } } #endif - + } - + }; class AssertTests { @@ -249,15 +258,15 @@ namespace BasicTests { int x; - AssertTests(){ + AssertTests() { x = 0; } - string foo(){ + string foo() { x++; return ""; } - void run(){ + void run() { uassert( 
-1 , foo() , 1 ); if( x != 0 ) { ASSERT_EQUALS( 0 , x ); @@ -265,7 +274,7 @@ namespace BasicTests { try { uassert( -1 , foo() , 0 ); } - catch ( ... ){} + catch ( ... ) {} ASSERT_EQUALS( 1 , x ); } }; @@ -273,13 +282,13 @@ namespace BasicTests { namespace ArrayTests { class basic1 { public: - void run(){ + void run() { FastArray a(100); a.push_back( 5 ); a.push_back( 6 ); - + ASSERT_EQUALS( 2 , a.size() ); - + FastArray::iterator i = a.begin(); ASSERT( i != a.end() ); ASSERT_EQUALS( 5 , *i ); @@ -291,10 +300,10 @@ namespace BasicTests { } }; }; - + class ThreadSafeStringTest { public: - void run(){ + void run() { ThreadSafeString s; s = "eliot"; ASSERT_EQUALS( s , "eliot" ); @@ -302,8 +311,8 @@ namespace BasicTests { ThreadSafeString s2 = s; ASSERT_EQUALS( s2 , "eliot" ); - - + + { string foo; { @@ -315,11 +324,11 @@ namespace BasicTests { } } }; - + class LexNumCmp { public: void run() { - + ASSERT( ! isNumber( (char)255 ) ); ASSERT_EQUALS( 0, lexNumCmp( "a", "a" ) ); @@ -355,7 +364,7 @@ namespace BasicTests { ASSERT_EQUALS( -1, lexNumCmp( "a1{", "a1{a" ) ); ASSERT_EQUALS( 1, lexNumCmp("21", "11") ); ASSERT_EQUALS( -1, lexNumCmp("11", "21") ); - + ASSERT_EQUALS( -1 , lexNumCmp( "a.0" , "a.1" ) ); ASSERT_EQUALS( -1 , lexNumCmp( "a.0.b" , "a.1" ) ); @@ -363,52 +372,78 @@ namespace BasicTests { ASSERT_EQUALS( -1 , lexNumCmp( "b.0e" , (string("b.") + (char)255).c_str() ) ); ASSERT_EQUALS( -1 , lexNumCmp( "b." , "b.0e" ) ); - ASSERT_EQUALS( 0, lexNumCmp( "238947219478347782934718234", "238947219478347782934718234")); - ASSERT_EQUALS( 0, lexNumCmp( "000238947219478347782934718234", "238947219478347782934718234")); - ASSERT_EQUALS( 1, lexNumCmp( "000238947219478347782934718235", "238947219478347782934718234")); - ASSERT_EQUALS( -1, lexNumCmp( "238947219478347782934718234", "238947219478347782934718234.1")); - ASSERT_EQUALS( 0, lexNumCmp( "238", "000238")); - ASSERT_EQUALS( 0, lexNumCmp( "002384", "0002384")); - ASSERT_EQUALS( 0, lexNumCmp( "00002384", "0002384")); - ASSERT_EQUALS( 0, lexNumCmp( "0", "0")); - ASSERT_EQUALS( 0, lexNumCmp( "0000", "0")); + ASSERT_EQUALS( 0, lexNumCmp( "238947219478347782934718234", "238947219478347782934718234")); + ASSERT_EQUALS( 0, lexNumCmp( "000238947219478347782934718234", "238947219478347782934718234")); + ASSERT_EQUALS( 1, lexNumCmp( "000238947219478347782934718235", "238947219478347782934718234")); + ASSERT_EQUALS( -1, lexNumCmp( "238947219478347782934718234", "238947219478347782934718234.1")); + ASSERT_EQUALS( 0, lexNumCmp( "238", "000238")); + ASSERT_EQUALS( 0, lexNumCmp( "002384", "0002384")); + ASSERT_EQUALS( 0, lexNumCmp( "00002384", "0002384")); + ASSERT_EQUALS( 0, lexNumCmp( "0", "0")); + ASSERT_EQUALS( 0, lexNumCmp( "0000", "0")); ASSERT_EQUALS( 0, lexNumCmp( "0", "000")); ASSERT_EQUALS( -1, lexNumCmp( "0000", "0.0")); - ASSERT_EQUALS( 1, lexNumCmp( "2380", "238")); - ASSERT_EQUALS( 1, lexNumCmp( "2385", "2384")); - ASSERT_EQUALS( 1, lexNumCmp( "2385", "02384")); - ASSERT_EQUALS( 1, lexNumCmp( "2385", "002384")); - ASSERT_EQUALS( -1, lexNumCmp( "123.234.4567", "00238")); - ASSERT_EQUALS( 0, lexNumCmp( "123.234", "00123.234")); - ASSERT_EQUALS( 0, lexNumCmp( "a.123.b", "a.00123.b")); - ASSERT_EQUALS( 1, lexNumCmp( "a.123.b", "a.b.00123.b")); - ASSERT_EQUALS( -1, lexNumCmp( "a.00.0", "a.0.1")); - ASSERT_EQUALS( 0, lexNumCmp( "01.003.02", "1.3.2")); - ASSERT_EQUALS( -1, lexNumCmp( "1.3.2", "10.300.20")); - ASSERT_EQUALS( 0, lexNumCmp( "10.300.20", "000000000000010.0000300.000000020")); - ASSERT_EQUALS( 0, lexNumCmp( "0000a", "0a")); - ASSERT_EQUALS( 
-1, lexNumCmp( "a", "0a")); - ASSERT_EQUALS( -1, lexNumCmp( "000a", "001a")); - ASSERT_EQUALS( 0, lexNumCmp( "010a", "0010a")); + ASSERT_EQUALS( 1, lexNumCmp( "2380", "238")); + ASSERT_EQUALS( 1, lexNumCmp( "2385", "2384")); + ASSERT_EQUALS( 1, lexNumCmp( "2385", "02384")); + ASSERT_EQUALS( 1, lexNumCmp( "2385", "002384")); + ASSERT_EQUALS( -1, lexNumCmp( "123.234.4567", "00238")); + ASSERT_EQUALS( 0, lexNumCmp( "123.234", "00123.234")); + ASSERT_EQUALS( 0, lexNumCmp( "a.123.b", "a.00123.b")); + ASSERT_EQUALS( 1, lexNumCmp( "a.123.b", "a.b.00123.b")); + ASSERT_EQUALS( -1, lexNumCmp( "a.00.0", "a.0.1")); + ASSERT_EQUALS( 0, lexNumCmp( "01.003.02", "1.3.2")); + ASSERT_EQUALS( -1, lexNumCmp( "1.3.2", "10.300.20")); + ASSERT_EQUALS( 0, lexNumCmp( "10.300.20", "000000000000010.0000300.000000020")); + ASSERT_EQUALS( 0, lexNumCmp( "0000a", "0a")); + ASSERT_EQUALS( -1, lexNumCmp( "a", "0a")); + ASSERT_EQUALS( -1, lexNumCmp( "000a", "001a")); + ASSERT_EQUALS( 0, lexNumCmp( "010a", "0010a")); } }; class DatabaseValidNames { public: - void run(){ + void run() { ASSERT( Database::validDBName( "foo" ) ); ASSERT( ! Database::validDBName( "foo/bar" ) ); ASSERT( ! Database::validDBName( "foo.bar" ) ); - ASSERT( nsDollarCheck( "asdads" ) ); - ASSERT( ! nsDollarCheck( "asda$ds" ) ); - ASSERT( nsDollarCheck( "local.oplog.$main" ) ); + ASSERT( isANormalNSName( "asdads" ) ); + ASSERT( ! isANormalNSName( "asda$ds" ) ); + ASSERT( isANormalNSName( "local.oplog.$main" ) ); + } + }; + + class DatabaseOwnsNS { + public: + void run() { + + bool isNew = false; + // this leaks as ~Database is private + // if that changes, should put this on the stack + Database * db = new Database( "dbtests_basictests_ownsns" , isNew ); + assert( isNew ); + + ASSERT( db->ownsNS( "dbtests_basictests_ownsns.x" ) ); + ASSERT( db->ownsNS( "dbtests_basictests_ownsns.x.y" ) ); + ASSERT( ! db->ownsNS( "dbtests_basictests_ownsn.x.y" ) ); + ASSERT( ! db->ownsNS( "dbtests_basictests_ownsnsa.x.y" ) ); + } + }; + + class NSValidNames { + public: + void run() { + ASSERT( isValidNS( "test.foo" ) ); + ASSERT( ! isValidNS( "test." ) ); + ASSERT( ! isValidNS( "test" ) ); } }; - + class PtrTests { public: - void run(){ + void run() { scoped_ptr p1 (new int(1)); boost::shared_ptr p2 (new int(2)); scoped_ptr p3 (new int(3)); @@ -419,7 +454,7 @@ namespace BasicTests { ASSERT_EQUALS( p2.get() , ptr(p2) ); ASSERT_EQUALS( p2.get() , ptr(p2.get()) ); // T* constructor ASSERT_EQUALS( p2.get() , ptr(ptr(p2)) ); // copy constructor - ASSERT_EQUALS( *p2 , *ptr(p2)); + ASSERT_EQUALS( *p2 , *ptr(p2)); ASSERT_EQUALS( p2.get() , ptr >(&p2)->get() ); // operator-> //const @@ -431,14 +466,14 @@ namespace BasicTests { ASSERT_EQUALS( p4.get() , ptr(p4.get()) ); ASSERT_EQUALS( p2.get() , ptr(ptr(p2)) ); ASSERT_EQUALS( p2.get() , ptr(ptr(p2)) ); // constizing copy constructor - ASSERT_EQUALS( *p2 , *ptr(p2)); + ASSERT_EQUALS( *p2 , *ptr(p2)); ASSERT_EQUALS( p2.get() , ptr >(&p2)->get() ); //bool context ASSERT( ptr(p1) ); ASSERT( !ptr(NULL) ); ASSERT( !ptr() ); - + #if 0 // These shouldn't compile ASSERT_EQUALS( p3.get() , ptr(p3) ); @@ -450,12 +485,12 @@ namespace BasicTests { struct StringSplitterTest { - void test( string s ){ + void test( string s ) { vector v = StringSplitter::split( s , "," ); ASSERT_EQUALS( s , StringSplitter::join( v , "," ) ); } - void run(){ + void run() { test( "a" ); test( "a,b" ); test( "a,b,c" ); @@ -496,16 +531,68 @@ namespace BasicTests { }; + class QueueTest { + public: + void run() { + BlockingQueue q; + Timer t; + int x; + ASSERT( ! 
q.blockingPop( x , 5 ) ); + ASSERT( t.seconds() > 3 && t.seconds() < 9 ); + + } + }; + + class StrTests { + public: + + void run() { + ASSERT_EQUALS( 1u , str::count( "abc" , 'b' ) ); + ASSERT_EQUALS( 3u , str::count( "babab" , 'b' ) ); + } + + }; + + class HostAndPortTests { + public: + void run() { + HostAndPort a( "x1" , 1000 ); + HostAndPort b( "x1" , 1000 ); + HostAndPort c( "x1" , 1001 ); + HostAndPort d( "x2" , 1000 ); + + ASSERT( a == b ); + ASSERT( a != c ); + ASSERT( a != d ); + + } + }; + + class RelativePathTest { + public: + void run() { + RelativePath a = RelativePath::fromRelativePath( "a" ); + RelativePath b = RelativePath::fromRelativePath( "a" ); + RelativePath c = RelativePath::fromRelativePath( "b" ); + RelativePath d = RelativePath::fromRelativePath( "a/b" ); + + + ASSERT( a == b ); + ASSERT( a != c ); + ASSERT( a != d ); + ASSERT( c != d ); + } + }; class All : public Suite { public: - All() : Suite( "basic" ){ + All() : Suite( "basic" ) { } - - void setupTests(){ + + void setupTests() { add< Rarely >(); add< Base64Tests >(); - + add< stringbuildertests::simple1 >(); add< stringbuildertests::simple2 >(); add< stringbuildertests::reset1 >(); @@ -513,18 +600,28 @@ namespace BasicTests { add< sleeptest >(); add< AssertTests >(); - + add< ArrayTests::basic1 >(); add< LexNumCmp >(); add< DatabaseValidNames >(); + add< DatabaseOwnsNS >(); + + add< NSValidNames >(); add< PtrTests >(); add< StringSplitterTest >(); add< IsValidUTF8Test >(); + + add< QueueTest >(); + + add< StrTests >(); + + add< HostAndPortTests >(); + add< RelativePathTest >(); } } myall; - + } // namespace BasicTests diff --git a/dbtests/btreetests.cpp b/dbtests/btreetests.cpp index a90a097..4da7375 100644 --- a/dbtests/btreetests.cpp +++ b/dbtests/btreetests.cpp @@ -29,7 +29,12 @@ namespace BtreeTests { const char* ns() { return "unittests.btreetests"; } - + + // dummy, valid record loc + const DiskLoc recordLoc() { + return DiskLoc( 0, 2 ); + } + class Ensure { public: Ensure() { @@ -41,45 +46,55 @@ namespace BtreeTests { private: DBDirectClient _c; }; - + class Base : public Ensure { public: - Base() : - _context( ns() ) { + Base() : + _context( ns() ) { { bool f = false; assert( f = true ); massert( 10402 , "assert is misdefined", f); } } + virtual ~Base() {} + static string bigNumString( long long n, int len = 800 ) { + char sub[17]; + sprintf( sub, "%.16llx", n ); + string val( len, ' ' ); + for( int i = 0; i < len; ++i ) { + val[ i ] = sub[ i % 16 ]; + } + return val; + } protected: - BtreeBucket* bt() { + const BtreeBucket* bt() { return id().head.btree(); } DiskLoc dl() { return id().head; } IndexDetails& id() { - return nsdetails( ns() )->idx( 1 ); - } - // dummy, valid record loc - static DiskLoc recordLoc() { - return DiskLoc( 0, 2 ); + NamespaceDetails *nsd = nsdetails( ns() ); + assert( nsd ); + return nsd->idx( 1 ); } void checkValid( int nKeys ) { ASSERT( bt() ); ASSERT( bt()->isHead() ); bt()->assertValid( order(), true ); - ASSERT_EQUALS( nKeys, bt()->fullValidate( dl(), order() ) ); + ASSERT_EQUALS( nKeys, bt()->fullValidate( dl(), order(), 0, true ) ); } void dump() { bt()->dumpTree( dl(), order() ); } void insert( BSONObj &key ) { bt()->bt_insert( dl(), recordLoc(), key, Ordering::make(order()), true, id(), true ); + getDur().commitIfNeeded(); } - void unindex( BSONObj &key ) { - bt()->unindex( dl(), id(), key, recordLoc() ); + bool unindex( BSONObj &key ) { + getDur().commitIfNeeded(); + return bt()->unindex( dl(), id(), key, recordLoc() ); } static BSONObj simpleKey( char c, int n = 1 
) { BSONObjBuilder builder; @@ -98,9 +113,38 @@ namespace BtreeTests { ASSERT( location == expectedLocation ); ASSERT_EQUALS( expectedPos, pos ); } + bool present( BSONObj &key, int direction ) { + int pos; + bool found; + bt()->locate( id(), dl(), key, Ordering::make(order()), pos, found, recordLoc(), direction ); + return found; + } BSONObj order() { return id().keyPattern(); } + const BtreeBucket *child( const BtreeBucket *b, int i ) { + assert( i <= b->nKeys() ); + DiskLoc d; + if ( i == b->nKeys() ) { + d = b->getNextChild(); + } + else { + d = const_cast< DiskLoc& >( b->keyNode( i ).prevChildBucket ); + } + assert( !d.isNull() ); + return d.btree(); + } + void checkKey( char i ) { + stringstream ss; + ss << i; + checkKey( ss.str() ); + } + void checkKey( const string &k ) { + BSONObj key = BSON( "" << k ); +// log() << "key: " << key << endl; + ASSERT( present( key, 1 ) ); + ASSERT( present( key, -1 ) ); + } private: dblock lk_; Client::Context _context; @@ -140,6 +184,8 @@ namespace BtreeTests { insert( longKey ); } checkValid( 20 ); + ASSERT_EQUALS( 1, bt()->nKeys() ); + checkSplit(); } protected: virtual char shortToken( int i ) const = 0; @@ -150,6 +196,7 @@ namespace BtreeTests { static char rightToken( int i ) { return 'z' - i; } + virtual void checkSplit() = 0; }; class SplitRightHeavyBucket : public SplitUnevenBucketBase { @@ -160,6 +207,10 @@ namespace BtreeTests { virtual char longToken( int i ) const { return rightToken( i ); } + virtual void checkSplit() { + ASSERT_EQUALS( 15, child( bt(), 0 )->nKeys() ); + ASSERT_EQUALS( 4, child( bt(), 1 )->nKeys() ); + } }; class SplitLeftHeavyBucket : public SplitUnevenBucketBase { @@ -170,6 +221,10 @@ namespace BtreeTests { virtual char longToken( int i ) const { return leftToken( i ); } + virtual void checkSplit() { + ASSERT_EQUALS( 4, child( bt(), 0 )->nKeys() ); + ASSERT_EQUALS( 15, child( bt(), 1 )->nKeys() ); + } }; class MissingLocate : public Base { @@ -225,7 +280,7 @@ namespace BtreeTests { } void insert( int i ) { BSONObj k = key( 'b' + 2 * i ); - Base::insert( k ); + Base::insert( k ); } }; @@ -247,20 +302,21 @@ namespace BtreeTests { } void insert( int i ) { BSONObj k = key( 'b' + 2 * i ); - Base::insert( k ); - } + Base::insert( k ); + } }; - class ReuseUnused : public Base { + class DontReuseUnused : public Base { public: void run() { for ( int i = 0; i < 10; ++i ) { insert( i ); } +// dump(); BSONObj root = key( 'p' ); unindex( root ); Base::insert( root ); - locate( root, 0, true, dl(), 1 ); + locate( root, 0, true, bt()->getNextChild(), 1 ); } private: BSONObj key( char c ) { @@ -268,16 +324,17 @@ namespace BtreeTests { } void insert( int i ) { BSONObj k = key( 'b' + 2 * i ); - Base::insert( k ); - } + Base::insert( k ); + } }; - + class PackUnused : public Base { public: void run() { for ( long long i = 0; i < 1000000; i += 1000 ) { insert( i ); } +// dump(); string orig, after; { stringstream ss; @@ -294,8 +351,9 @@ namespace BtreeTests { while( c->ok() ) { if ( !c->currKeyNode().prevChildBucket.isNull() ) { toDel.push_back( c->currKey().firstElement().valuestr() ); - } else { - other.push_back( c->currKey().firstElement().valuestr() ); + } + else { + other.push_back( c->currKey().firstElement().valuestr() ); } c->advance(); } @@ -311,30 +369,25 @@ namespace BtreeTests { } int unused = 0; - ASSERT_EQUALS( 0, bt()->fullValidate( dl(), order(), &unused ) ); + ASSERT_EQUALS( 0, bt()->fullValidate( dl(), order(), &unused, true ) ); for ( long long i = 50000; i < 50100; ++i ) { insert( i ); - } + } int unused2 = 0; - 
ASSERT_EQUALS( 100, bt()->fullValidate( dl(), order(), &unused2 ) ); + ASSERT_EQUALS( 100, bt()->fullValidate( dl(), order(), &unused2, true ) ); - ASSERT( unused2 < unused ); +// log() << "old unused: " << unused << ", new unused: " << unused2 << endl; +// + ASSERT( unused2 <= unused ); } protected: void insert( long long n ) { - string val( 800, ' ' ); - for( int i = 0; i < 800; i += 8 ) { - for( int j = 0; j < 8; ++j ) { - // probably we won't get > 56 bits - unsigned char v = 0x80 | ( n >> ( ( 8 - j - 1 ) * 7 ) & 0x000000000000007f ); - val[ i + j ] = v; - } - } + string val = bigNumString( n ); BSONObj k = BSON( "a" << val ); - Base::insert( k ); - } + Base::insert( k ); + } }; class DontDropReferenceKey : public PackUnused { @@ -344,7 +397,7 @@ namespace BtreeTests { for ( long long i = 0; i < 80; i += 1 ) { insert( i ); } - + BSONObjBuilder start; start.appendMinKey( "a" ); BSONObjBuilder end; @@ -360,19 +413,1220 @@ namespace BtreeTests { c->advance(); } // too much work to try to make this happen through inserts and deletes - const_cast< DiskLoc& >( bt()->keyNode( 1 ).prevChildBucket ) = DiskLoc(); - const_cast< DiskLoc& >( bt()->keyNode( 1 ).recordLoc ).GETOFS() |= 1; // make unused + // we are intentionally manipulating the btree bucket directly here + getDur().writingDiskLoc( const_cast< DiskLoc& >( bt()->keyNode( 1 ).prevChildBucket ) ) = DiskLoc(); + getDur().writingInt( const_cast< DiskLoc& >( bt()->keyNode( 1 ).recordLoc ).GETOFS() ) |= 1; // make unused BSONObj k = BSON( "a" << toInsert ); Base::insert( k ); } }; - + + class MergeBuckets : public Base { + public: + virtual ~MergeBuckets() {} + void run() { + for ( int i = 0; i < 10; ++i ) { + insert( i ); + } +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + int expectedCount = 10 - unindexKeys(); +// dump(); + ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords ); + int unused = 0; + ASSERT_EQUALS( expectedCount, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + } + protected: + BSONObj key( char c ) { + return simpleKey( c, 800 ); + } + void insert( int i ) { + BSONObj k = key( 'b' + 2 * i ); + Base::insert( k ); + } + virtual int unindexKeys() = 0; + }; + + class MergeBucketsLeft : public MergeBuckets { + virtual int unindexKeys() { + BSONObj k = key( 'b' ); + unindex( k ); + k = key( 'b' + 2 ); + unindex( k ); + k = key( 'b' + 4 ); + unindex( k ); + k = key( 'b' + 6 ); + unindex( k ); + return 4; + } + }; + + class MergeBucketsRight : public MergeBuckets { + virtual int unindexKeys() { + BSONObj k = key( 'b' + 2 * 9 ); + unindex( k ); + return 1; + } + }; + + // deleting from head won't coalesce yet +// class MergeBucketsHead : public MergeBuckets { +// virtual BSONObj unindexKey() { return key( 'p' ); } +// }; + + class MergeBucketsDontReplaceHead : public Base { + public: + void run() { + for ( int i = 0; i < 18; ++i ) { + insert( i ); + } + // dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = key( 'a' + 17 ); + unindex( k ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + int unused = 0; + ASSERT_EQUALS( 17, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + } + private: + BSONObj key( char c ) { + return simpleKey( c, 800 ); + } + void insert( int i ) { + BSONObj k = key( 'a' + i ); + Base::insert( k ); + } + }; + + // Tool to construct custom trees for tests. 
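+    //
+    // The tree-spec strings handed to ArtificialTree::setTree()/checkStructure()
+    // below are JSON-ish descriptions of a bucket: each field name becomes a key
+    // in the bucket, a nested object describes the child bucket that precedes that
+    // key, a null value means the key has no child, and the special "_" field
+    // describes the rightmost (nextChild) bucket.  Field names of the form "$<hex>"
+    // (optionally "$<hex>$<len>", both hex) are expanded by expectedKey() and
+    // bigNumString() into long keys (800 characters by default) built from the hex
+    // value.  A rough illustrative sketch (this exact spec is not one of the trees
+    // used by the tests below):
+    //
+    //     ArtificialTree::setTree( "{b:{a:null},d:{c:null},_:{e:null}}", id() );
+    //     // root keys "b" and "d"; children {a} and {c}; nextChild bucket {e}
+    //     ArtificialTree::checkStructure( "{b:{a:null},d:{c:null},_:{e:null}}", id() );
+    //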
+ class ArtificialTree : public BtreeBucket { + public: + void push( const BSONObj &key, const DiskLoc &child ) { + pushBack( dummyDiskLoc(), key, Ordering::make( BSON( "a" << 1 ) ), child ); + } + void setNext( const DiskLoc &child ) { + nextChild = child; + } + static DiskLoc make( IndexDetails &id ) { + DiskLoc ret = addBucket( id ); + is( ret )->init(); + getDur().commitIfNeeded(); + return ret; + } + static ArtificialTree *is( const DiskLoc &l ) { + return static_cast< ArtificialTree * >( l.btreemod() ); + } + static DiskLoc makeTree( const string &spec, IndexDetails &id ) { + return makeTree( fromjson( spec ), id ); + } + static DiskLoc makeTree( const BSONObj &spec, IndexDetails &id ) { + DiskLoc node = make( id ); + ArtificialTree *n = ArtificialTree::is( node ); + BSONObjIterator i( spec ); + while( i.more() ) { + BSONElement e = i.next(); + DiskLoc child; + if ( e.type() == Object ) { + child = makeTree( e.embeddedObject(), id ); + } + if ( e.fieldName() == string( "_" ) ) { + n->setNext( child ); + } + else { + n->push( BSON( "" << expectedKey( e.fieldName() ) ), child ); + } + } + n->fixParentPtrs( node ); + return node; + } + static void setTree( const string &spec, IndexDetails &id ) { + set( makeTree( spec, id ), id ); + } + static void set( const DiskLoc &l, IndexDetails &id ) { + ArtificialTree::is( id.head )->deallocBucket( id.head, id ); + getDur().writingDiskLoc(id.head) = l; + } + static string expectedKey( const char *spec ) { + if ( spec[ 0 ] != '$' ) { + return spec; + } + char *endPtr; + // parsing a long long is a pain, so just allow shorter keys for now + unsigned long long num = strtol( spec + 1, &endPtr, 16 ); + int len = 800; + if( *endPtr == '$' ) { + len = strtol( endPtr + 1, 0, 16 ); + } + return Base::bigNumString( num, len ); + } + static void checkStructure( const BSONObj &spec, const IndexDetails &id, const DiskLoc node ) { + ArtificialTree *n = ArtificialTree::is( node ); + BSONObjIterator j( spec ); + for( int i = 0; i < n->n; ++i ) { + ASSERT( j.more() ); + BSONElement e = j.next(); + KeyNode kn = n->keyNode( i ); + string expected = expectedKey( e.fieldName() ); + ASSERT( present( id, BSON( "" << expected ), 1 ) ); + ASSERT( present( id, BSON( "" << expected ), -1 ) ); + ASSERT_EQUALS( expected, kn.key.firstElement().valuestr() ); + if ( kn.prevChildBucket.isNull() ) { + ASSERT( e.type() == jstNULL ); + } + else { + ASSERT( e.type() == Object ); + checkStructure( e.embeddedObject(), id, kn.prevChildBucket ); + } + } + if ( n->nextChild.isNull() ) { + // maybe should allow '_' field with null value? 
+ ASSERT( !j.more() ); + } + else { + BSONElement e = j.next(); + ASSERT_EQUALS( string( "_" ), e.fieldName() ); + ASSERT( e.type() == Object ); + checkStructure( e.embeddedObject(), id, n->nextChild ); + } + ASSERT( !j.more() ); + } + static void checkStructure( const string &spec, const IndexDetails &id ) { + checkStructure( fromjson( spec ), id, id.head ); + } + static bool present( const IndexDetails &id, const BSONObj &key, int direction ) { + int pos; + bool found; + id.head.btree()->locate( id, id.head, key, Ordering::make(id.keyPattern()), pos, found, recordLoc(), direction ); + return found; + } + int headerSize() const { return BtreeBucket::headerSize(); } + int packedDataSize( int pos ) const { return BtreeBucket::packedDataSize( pos ); } + void fixParentPtrs( const DiskLoc &thisLoc ) { BtreeBucket::fixParentPtrs( thisLoc ); } + void forcePack() { + topSize += emptySize; + emptySize = 0; + setNotPacked(); + } + private: + DiskLoc dummyDiskLoc() const { return DiskLoc( 0, 2 ); } + }; + + /** + * We could probably refactor the following tests, but it's easier to debug + * them in the present state. + */ + + class MergeBucketsDelInternal : public Base { + public: + void run() { + ArtificialTree::setTree( "{d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "bb" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 7, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{b:{a:null},d:{c:null},f:{e:null},_:{g:null}}", id() ); + } + }; + + class MergeBucketsRightNull : public Base { + public: + void run() { + ArtificialTree::setTree( "{d:{b:{a:null},bb:null,cc:{c:null}},_:{f:{e:null},h:{g:null}}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "bb" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}", id() ); + } + }; + + // not yet handling this case + class DontMergeSingleBucket : public Base { + public: + void run() { + ArtificialTree::setTree( "{d:{b:{a:null},c:null}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "c" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{d:{b:{a:null}}}", id() ); + } + }; + + class ParentMergeNonRightToLeft : public Base { + public: + void run() { + ArtificialTree::setTree( "{d:{b:{a:null},bb:null,cc:{c:null}},i:{f:{e:null},h:{g:null}}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "bb" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 10, bt()->fullValidate( 
dl(), order(), 0, true ) ); + // child does not currently replace parent in this case + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}", id() ); + } + }; + + class ParentMergeNonRightToRight : public Base { + public: + void run() { + ArtificialTree::setTree( "{d:{b:{a:null},cc:{c:null}},i:{f:{e:null},ff:null,h:{g:null}}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "ff" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) ); + // child does not currently replace parent in this case + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}", id() ); + } + }; + + class CantMergeRightNoMerge : public Base { + public: + void run() { + ArtificialTree::setTree( "{d:{b:{a:null},bb:null,cc:{c:null}},dd:null,_:{f:{e:null},h:{g:null}}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "bb" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{d:{b:{a:null},cc:{c:null}},dd:null,_:{f:{e:null},h:{g:null}}}", id() ); + } + }; + + class CantMergeLeftNoMerge : public Base { + public: + void run() { + ArtificialTree::setTree( "{c:{b:{a:null}},d:null,_:{f:{e:null},g:null}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 7, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "g" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 6, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{c:{b:{a:null}},d:null,_:{f:{e:null}}}", id() ); + } + }; + + class MergeOption : public Base { + public: + void run() { + ArtificialTree::setTree( "{c:{b:{a:null}},f:{e:{d:null},ee:null},_:{h:{g:null}}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "ee" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{c:{b:{a:null}},_:{e:{d:null},f:null,h:{g:null}}}", id() ); + } + }; + + class ForceMergeLeft : public Base { + public: + void run() { + ArtificialTree::setTree( "{c:{b:{a:null}},f:{e:{d:null},ee:null},ff:null,_:{h:{g:null}}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "ee" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( 
"{f:{b:{a:null},c:null,e:{d:null}},ff:null,_:{h:{g:null}}}", id() ); + } + }; + + class ForceMergeRight : public Base { + public: + void run() { + ArtificialTree::setTree( "{c:{b:{a:null}},cc:null,f:{e:{d:null},ee:null},_:{h:{g:null}}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "ee" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{c:{b:{a:null}},cc:null,_:{e:{d:null},f:null,h:{g:null}}}", id() ); + } + }; + + class RecursiveMerge : public Base { + public: + void run() { + ArtificialTree::setTree( "{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},j:{i:null}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "c" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + // height is not currently reduced in this case + ArtificialTree::checkStructure( "{j:{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}}", id() ); + } + }; + + class RecursiveMergeRightBucket : public Base { + public: + void run() { + ArtificialTree::setTree( "{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},_:{i:null}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "c" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}", id() ); + } + }; + + class RecursiveMergeDoubleRightBucket : public Base { + public: + void run() { + ArtificialTree::setTree( "{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}", id() ); +// dump(); + string ns = id().indexNamespace(); + ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + + BSONObj k = BSON( "" << "c" ); + assert( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 7, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + // no recursion currently in this case + ArtificialTree::checkStructure( "{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}", id() ); + } + }; + + class MergeSizeBase : public Base { + public: + MergeSizeBase() : _count() {} + virtual ~MergeSizeBase() {} + void run() { + typedef ArtificialTree A; + A::set( A::make( id() ), id() ); + A* root = A::is( dl() ); + DiskLoc left = A::make( id() ); + root->push( biggestKey( 'm' ), left ); + _count = 1; + A* l = A::is( left ); + DiskLoc right = A::make( id() ); + root->setNext( right ); + A* r = A::is( right ); + root->fixParentPtrs( dl() ); + + ASSERT_EQUALS( bigSize(), bigSize() / 2 * 2 ); + fillToExactSize( l, leftSize(), 'a' ); + fillToExactSize( r, rightSize(), 'n' ); + ASSERT( leftAdditional() <= 2 ); + if ( leftAdditional() >= 2 ) { + l->push( bigKey( 'k' ), DiskLoc() ); + } + if ( leftAdditional() >= 1 ) { + l->push( 
bigKey( 'l' ), DiskLoc() ); + } + ASSERT( rightAdditional() <= 2 ); + if ( rightAdditional() >= 2 ) { + r->push( bigKey( 'y' ), DiskLoc() ); + } + if ( rightAdditional() >= 1 ) { + r->push( bigKey( 'z' ), DiskLoc() ); + } + _count += leftAdditional() + rightAdditional(); + +// dump(); + + initCheck(); + string ns = id().indexNamespace(); + const char *keys = delKeys(); + for( const char *i = keys; *i; ++i ) { + int unused = 0; + ASSERT_EQUALS( _count, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = bigKey( *i ); + unindex( k ); +// dump(); + --_count; + } + +// dump(); + + int unused = 0; + ASSERT_EQUALS( _count, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + validate(); + if ( !merge() ) { + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + } + else { + ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords ); + } + } + protected: + virtual int leftAdditional() const { return 2; } + virtual int rightAdditional() const { return 2; } + virtual void initCheck() {} + virtual void validate() {} + virtual int leftSize() const = 0; + virtual int rightSize() const = 0; + virtual const char * delKeys() const { return "klyz"; } + virtual bool merge() const { return true; } + void fillToExactSize( ArtificialTree *t, int targetSize, char startKey ) { + int size = 0; + while( size < targetSize ) { + int space = targetSize - size; + int nextSize = space - sizeof( _KeyNode ); + assert( nextSize > 0 ); + BSONObj newKey = key( startKey++, nextSize ); + t->push( newKey, DiskLoc() ); + size += newKey.objsize() + sizeof( _KeyNode ); + _count += 1; + } + ASSERT_EQUALS( t->packedDataSize( 0 ), targetSize ); + } + static BSONObj key( char a, int size ) { + if ( size >= bigSize() ) { + return bigKey( a ); + } + return simpleKey( a, size - ( bigSize() - 801 ) ); + } + static BSONObj bigKey( char a ) { + return simpleKey( a, 801 ); + } + static BSONObj biggestKey( char a ) { + int size = BtreeBucket::getKeyMax() - bigSize() + 801; + return simpleKey( a, size ); + } + static int bigSize() { + return bigKey( 'a' ).objsize(); + } + static int biggestSize() { + return biggestKey( 'a' ).objsize(); + } + int _count; + }; + + class MergeSizeJustRightRight : public MergeSizeBase { + protected: + virtual int rightSize() const { return BtreeBucket::getLowWaterMark() - 1; } + virtual int leftSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ) - ( BtreeBucket::getLowWaterMark() - 1 ); } + }; + + class MergeSizeJustRightLeft : public MergeSizeBase { + protected: + virtual int leftSize() const { return BtreeBucket::getLowWaterMark() - 1; } + virtual int rightSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ) - ( BtreeBucket::getLowWaterMark() - 1 ); } + virtual const char * delKeys() const { return "yzkl"; } + }; + + class MergeSizeRight : public MergeSizeJustRightRight { + virtual int rightSize() const { return MergeSizeJustRightRight::rightSize() - 1; } + virtual int leftSize() const { return MergeSizeJustRightRight::leftSize() + 1; } + }; + + class MergeSizeLeft : public MergeSizeJustRightLeft { + virtual int rightSize() const { return MergeSizeJustRightLeft::rightSize() + 1; } + virtual int leftSize() const { return MergeSizeJustRightLeft::leftSize() - 1; } + }; + + class NoMergeBelowMarkRight : public MergeSizeJustRightRight { + virtual int rightSize() const { return 
MergeSizeJustRightRight::rightSize() + 1; } + virtual int leftSize() const { return MergeSizeJustRightRight::leftSize() - 1; } + virtual bool merge() const { return false; } + }; + + class NoMergeBelowMarkLeft : public MergeSizeJustRightLeft { + virtual int rightSize() const { return MergeSizeJustRightLeft::rightSize() - 1; } + virtual int leftSize() const { return MergeSizeJustRightLeft::leftSize() + 1; } + virtual bool merge() const { return false; } + }; + + class MergeSizeRightTooBig : public MergeSizeJustRightLeft { + virtual int rightSize() const { return MergeSizeJustRightLeft::rightSize() + 1; } + virtual bool merge() const { return false; } + }; + + class MergeSizeLeftTooBig : public MergeSizeJustRightRight { + virtual int leftSize() const { return MergeSizeJustRightRight::leftSize() + 1; } + virtual bool merge() const { return false; } + }; + + class BalanceOneLeftToRight : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},b:{$20:null,$30:null,$40:null,$50:null,a:null},_:{c:null}}", id() ); + ASSERT_EQUALS( 14, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x40 ) ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},b:{$10:null,$20:null,$30:null,$50:null,a:null},_:{c:null}}", id() ); + } + }; + + class BalanceOneRightToLeft : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null},b:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},_:{c:null}}", id() ); + ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x3 ) ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$20:{$1:null,$2:null,$4:null,$10:null},b:{$30:null,$40:null,$50:null,$60:null,$70:null},_:{c:null}}", id() ); + } + }; + + class BalanceThreeLeftToRight : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$20:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null},$9:{$8:null},$11:{$10:null},$13:{$12:null},_:{$14:null}},b:{$30:null,$40:{$35:null},$50:{$45:null}},_:{c:null}}", id() ); + ASSERT_EQUALS( 23, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 14, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x30 ) ); + // dump(); + ASSERT( unindex( k ) ); + // dump(); + ASSERT_EQUALS( 22, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 14, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$9:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null},_:{$8:null}},b:{$11:{$10:null},$13:{$12:null},$20:{$14:null},$40:{$35:null},$50:{$45:null}},_:{c:null}}", id() ); + } + }; + + class BalanceThreeRightToLeft : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( 
"{$20:{$1:{$0:null},$3:{$2:null},$5:null,_:{$14:null}},b:{$30:{$25:null},$40:{$35:null},$50:{$45:null},$60:{$55:null},$70:{$65:null},$80:{$75:null},$90:{$85:null},$100:{$95:null}},_:{c:null}}", id() ); + ASSERT_EQUALS( 25, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 15, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x5 ) ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 24, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 15, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$50:{$1:{$0:null},$3:{$2:null},$20:{$14:null},$30:{$25:null},$40:{$35:null},_:{$45:null}},b:{$60:{$55:null},$70:{$65:null},$80:{$75:null},$90:{$85:null},$100:{$95:null}},_:{c:null}}", id() ); + } + }; + + class BalanceSingleParentKey : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},_:{$20:null,$30:null,$40:null,$50:null,a:null}}", id() ); + ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x40 ) ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$10:null,$20:null,$30:null,$50:null,a:null}}", id() ); + } + }; + + class PackEmpty : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null}", id() ); + BSONObj k = BSON( "" << "a" ); + ASSERT( unindex( k ) ); + ArtificialTree *t = ArtificialTree::is( dl() ); + t->forcePack(); + Tester::checkEmpty( t, id() ); + } + class Tester : public ArtificialTree { + public: + static void checkEmpty( ArtificialTree *a, const IndexDetails &id ) { + Tester *t = static_cast< Tester * >( a ); + ASSERT_EQUALS( 0, t->n ); + ASSERT( !( t->flags & Packed ) ); + Ordering o = Ordering::make( id.keyPattern() ); + int zero = 0; + t->_packReadyForMod( o, zero ); + ASSERT_EQUALS( 0, t->n ); + ASSERT_EQUALS( 0, t->topSize ); + ASSERT_EQUALS( BtreeBucket::bodySize(), t->emptySize ); + ASSERT( t->flags & Packed ); + } + }; + }; + + class PackedDataSizeEmpty : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null}", id() ); + BSONObj k = BSON( "" << "a" ); + ASSERT( unindex( k ) ); + ArtificialTree *t = ArtificialTree::is( dl() ); + t->forcePack(); + Tester::checkEmpty( t, id() ); + } + class Tester : public ArtificialTree { + public: + static void checkEmpty( ArtificialTree *a, const IndexDetails &id ) { + Tester *t = static_cast< Tester * >( a ); + ASSERT_EQUALS( 0, t->n ); + ASSERT( !( t->flags & Packed ) ); + int zero = 0; + ASSERT_EQUALS( 0, t->packedDataSize( zero ) ); + ASSERT( !( t->flags & Packed ) ); + } + }; + }; + + class BalanceSingleParentKeyPackParent : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},_:{$20:null,$30:null,$40:null,$50:null,a:null}}", id() ); + ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + // force parent pack + ArtificialTree::is( dl() )->forcePack(); + BSONObj k = BSON( "" << bigNumString( 0x40 ) ); +// 
dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$10:null,$20:null,$30:null,$50:null,a:null}}", id() ); + } + }; + + class BalanceSplitParent : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10$10:{$1:null,$2:null,$3:null,$4:null},$100:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null},$200:null,$300:null,$400:null,$500:null,$600:null,$700:null,$800:null,$900:null,_:{c:null}}", id() ); + ASSERT_EQUALS( 22, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x3 ) ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 21, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$500:{$30:{$1:null,$2:null,$4:null,$10$10:null,$20:null},$100:{$40:null,$50:null,$60:null,$70:null,$80:null},$200:null,$300:null,$400:null},_:{$600:null,$700:null,$800:null,$900:null,_:{c:null}}}", id() ); + } + }; + + class RebalancedSeparatorBase : public Base { + public: + void run() { + ArtificialTree::setTree( treeSpec(), id() ); + modTree(); + Tester::checkSeparator( id(), expectedSeparator() ); + } + virtual string treeSpec() const = 0; + virtual int expectedSeparator() const = 0; + virtual void modTree() {} + struct Tester : public ArtificialTree { + static void checkSeparator( const IndexDetails& id, int expected ) { + ASSERT_EQUALS( expected, static_cast< Tester * >( id.head.btreemod() )->rebalancedSeparatorPos( id.head, 0 ) ); + } + }; + }; + + class EvenRebalanceLeft : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$7:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null,$6:null},_:{$8:null,$9:null,$10$31e:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + class EvenRebalanceLeftCusp : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$6:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null},_:{$7:null,$8:null,$9$31e:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + class EvenRebalanceRight : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$3:{$1:null,$2$31f:null},_:{$4$31f:null,$5:null,$6:null,$7:null,$8$31e:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + class EvenRebalanceRightCusp : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$4$31f:{$1:null,$2$31f:null,$3:null},_:{$5:null,$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + class EvenRebalanceCenter : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$5:{$1:null,$2$31f:null,$3:null,$4$31f:null},_:{$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + class OddRebalanceLeft : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$6$31f:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$7:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + class OddRebalanceRight : public RebalancedSeparatorBase { + virtual string treeSpec() const { return 
"{$4:{$1:null,$2:null,$3:null},_:{$5:null,$6:null,$7:null,$8$31f:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + class OddRebalanceCenter : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$5:{$1:null,$2:null,$3:null,$4:null},_:{$6:null,$7:null,$8:null,$9:null,$10$31f:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + class RebalanceEmptyRight : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$a:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null,$7:null,$8:null,$9:null},_:{$b:null}}"; } + virtual void modTree() { + BSONObj k = BSON( "" << bigNumString( 0xb ) ); + ASSERT( unindex( k ) ); + } + virtual int expectedSeparator() const { return 4; } + }; + + class RebalanceEmptyLeft : public RebalancedSeparatorBase { + virtual string treeSpec() const { return "{$a:{$1:null},_:{$11:null,$12:null,$13:null,$14:null,$15:null,$16:null,$17:null,$18:null,$19:null}}"; } + virtual void modTree() { + BSONObj k = BSON( "" << bigNumString( 0x1 ) ); + ASSERT( unindex( k ) ); + } + virtual int expectedSeparator() const { return 4; } + }; + + class NoMoveAtLowWaterMarkRight : public MergeSizeJustRightRight { + virtual int rightSize() const { return MergeSizeJustRightRight::rightSize() + 1; } + virtual void initCheck() { _oldTop = bt()->keyNode( 0 ).key; } + virtual void validate() { ASSERT_EQUALS( _oldTop, bt()->keyNode( 0 ).key ); } + virtual bool merge() const { return false; } + protected: + BSONObj _oldTop; + }; + + class MoveBelowLowWaterMarkRight : public NoMoveAtLowWaterMarkRight { + virtual int rightSize() const { return MergeSizeJustRightRight::rightSize(); } + virtual int leftSize() const { return MergeSizeJustRightRight::leftSize() + 1; } + // different top means we rebalanced + virtual void validate() { ASSERT( !( _oldTop == bt()->keyNode( 0 ).key ) ); } + }; + + class NoMoveAtLowWaterMarkLeft : public MergeSizeJustRightLeft { + virtual int leftSize() const { return MergeSizeJustRightLeft::leftSize() + 1; } + virtual void initCheck() { _oldTop = bt()->keyNode( 0 ).key; } + virtual void validate() { ASSERT_EQUALS( _oldTop, bt()->keyNode( 0 ).key ); } + virtual bool merge() const { return false; } + protected: + BSONObj _oldTop; + }; + + class MoveBelowLowWaterMarkLeft : public NoMoveAtLowWaterMarkLeft { + virtual int leftSize() const { return MergeSizeJustRightLeft::leftSize(); } + virtual int rightSize() const { return MergeSizeJustRightLeft::rightSize() + 1; } + // different top means we rebalanced + virtual void validate() { ASSERT( !( _oldTop == bt()->keyNode( 0 ).key ) ); } + }; + + class PreferBalanceLeft : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},$20:{$11:null,$12:null,$13:null,$14:null},_:{$30:null}}", id() ); + ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x12 ) ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$5:{$1:null,$2:null,$3:null,$4:null},$20:{$6:null,$10:null,$11:null,$13:null,$14:null},_:{$30:null}}", id() ); + } + }; + + class PreferBalanceRight : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( 
"{$10:{$1:null},$20:{$11:null,$12:null,$13:null,$14:null},_:{$31:null,$32:null,$33:null,$34:null,$35:null,$36:null}}", id() ); + ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x12 ) ); + // dump(); + ASSERT( unindex( k ) ); + // dump(); + ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$10:{$1:null},$31:{$11:null,$13:null,$14:null,$20:null},_:{$32:null,$33:null,$34:null,$35:null,$36:null}}", id() ); + } + }; + + class RecursiveMergeThenBalance : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10:{$5:{$1:null,$2:null},$8:{$6:null,$7:null}},_:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null,$90:null}}", id() ); + ASSERT_EQUALS( 15, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x7 ) ); + // dump(); + ASSERT( unindex( k ) ); + // dump(); + ASSERT_EQUALS( 14, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$40:{$8:{$1:null,$2:null,$5:null,$6:null},$10:null,$20:null,$30:null},_:{$50:null,$60:null,$70:null,$80:null,$90:null}}", id() ); + } + }; + + class MergeRightEmpty : public MergeSizeBase { + protected: + virtual int rightAdditional() const { return 1; } + virtual int leftAdditional() const { return 1; } + virtual const char * delKeys() const { return "lz"; } + virtual int rightSize() const { return 0; } + virtual int leftSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ); } + }; + + class MergeMinRightEmpty : public MergeSizeBase { + protected: + virtual int rightAdditional() const { return 1; } + virtual int leftAdditional() const { return 0; } + virtual const char * delKeys() const { return "z"; } + virtual int rightSize() const { return 0; } + virtual int leftSize() const { return bigSize() + sizeof( _KeyNode ); } + }; + + class MergeLeftEmpty : public MergeSizeBase { + protected: + virtual int rightAdditional() const { return 1; } + virtual int leftAdditional() const { return 1; } + virtual const char * delKeys() const { return "zl"; } + virtual int leftSize() const { return 0; } + virtual int rightSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ); } + }; + + class MergeMinLeftEmpty : public MergeSizeBase { + protected: + virtual int leftAdditional() const { return 1; } + virtual int rightAdditional() const { return 0; } + virtual const char * delKeys() const { return "l"; } + virtual int leftSize() const { return 0; } + virtual int rightSize() const { return bigSize() + sizeof( _KeyNode ); } + }; + + class BalanceRightEmpty : public MergeRightEmpty { + protected: + virtual int leftSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ) + 1; } + virtual bool merge() const { return false; } + virtual void initCheck() { _oldTop = bt()->keyNode( 0 ).key; } + virtual void validate() { ASSERT( !( _oldTop == bt()->keyNode( 0 ).key ) ); } + private: + BSONObj _oldTop; + }; + + class BalanceLeftEmpty : public MergeLeftEmpty { + protected: + virtual int rightSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ) + 1; } + virtual bool merge() const { return false; } + 
virtual void initCheck() { _oldTop = bt()->keyNode( 0 ).key; } + virtual void validate() { ASSERT( !( _oldTop == bt()->keyNode( 0 ).key ) ); } + private: + BSONObj _oldTop; + }; + + class DelEmptyNoNeighbors : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{b:{a:null}}", id() ); + ASSERT_EQUALS( 2, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "a" ); + // dump(); + ASSERT( unindex( k ) ); + // dump(); + ASSERT_EQUALS( 1, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{b:null}", id() ); + } + }; + + class DelEmptyEmptyNeighbors : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null,c:{b:null},d:null}", id() ); + ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "b" ); + // dump(); + ASSERT( unindex( k ) ); + // dump(); + ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), 0, true ) ); + ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{a:null,c:null,d:null}", id() ); + } + }; + + class DelInternal : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null,c:{b:null},d:null}", id() ); + int unused = 0; + ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "c" ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{a:null,b:null,d:null}", id() ); + } + }; + + class DelInternalReplaceWithUnused : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null,c:{b:null},d:null}", id() ); + getDur().writingInt( const_cast< DiskLoc& >( bt()->keyNode( 1 ).prevChildBucket.btree()->keyNode( 0 ).recordLoc ).GETOFS() ) |= 1; // make unused + int unused = 0; + ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 1, unused ); + ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "c" ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + unused = 0; + ASSERT_EQUALS( 2, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 1, unused ); + ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords ); + // doesn't discriminate between used and unused + ArtificialTree::checkStructure( "{a:null,b:null,d:null}", id() ); + } + }; + + class DelInternalReplaceRight : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null,_:{b:null}}", id() ); + int unused = 0; + ASSERT_EQUALS( 2, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "a" ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + unused = 0; + ASSERT_EQUALS( 1, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords ); + 
ArtificialTree::checkStructure( "{b:null}", id() ); + } + }; + + class DelInternalPromoteKey : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null,y:{d:{c:{b:null}},_:{e:null}},z:null}", id() ); + int unused = 0; + ASSERT_EQUALS( 7, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "y" ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + unused = 0; + ASSERT_EQUALS( 6, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{a:null,e:{c:{b:null},d:null},z:null}", id() ); + } + }; + + class DelInternalPromoteRightKey : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null,_:{e:{c:null},_:{f:null}}}", id() ); + int unused = 0; + ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "a" ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + unused = 0; + ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{c:null,_:{e:null,f:null}}", id() ); + } + }; + + class DelInternalReplacementPrevNonNull : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null,d:{c:{b:null}},e:null}", id() ); + int unused = 0; + ASSERT_EQUALS( 5, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "d" ); + // dump(); + ASSERT( unindex( k ) ); + // dump(); + ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 1, unused ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{a:null,d:{c:{b:null}},e:null}", id() ); + ASSERT( bt()->keyNode( 1 ).recordLoc.getOfs() & 1 ); // check 'unused' key + } + }; + + class DelInternalReplacementNextNonNull : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{a:null,_:{c:null,_:{d:null}}}", id() ); + int unused = 0; + ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << "a" ); + // dump(); + ASSERT( unindex( k ) ); + // dump(); + ASSERT_EQUALS( 2, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 1, unused ); + ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{a:null,_:{c:null,_:{d:null}}}", id() ); + ASSERT( bt()->keyNode( 0 ).recordLoc.getOfs() & 1 ); // check 'unused' key + } + }; + + class DelInternalSplitPromoteLeft : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10:null,$20:null,$30$10:{$25:{$23:null},_:{$27:null}},$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100:null}", id() ); + int unused = 0; + ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); 
+ BSONObj k = BSON( "" << bigNumString( 0x30, 0x10 ) ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$60:{$10:null,$20:null,$27:{$23:null,$25:null},$40:null,$50:null},_:{$70:null,$80:null,$90:null,$100:null}}", id() ); + } + }; + + class DelInternalSplitPromoteRight : public Base { + public: + void run() { + string ns = id().indexNamespace(); + ArtificialTree::setTree( "{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100$10:{$95:{$93:null},_:{$97:null}}}", id() ); + int unused = 0; + ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + BSONObj k = BSON( "" << bigNumString( 0x100, 0x10 ) ); +// dump(); + ASSERT( unindex( k ) ); +// dump(); + ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), &unused, true ) ); + ASSERT_EQUALS( 0, unused ); + ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords ); + ArtificialTree::checkStructure( "{$80:{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},_:{$90:null,$97:{$93:null,$95:null}}}", id() ); + } + }; + class All : public Suite { public: - All() : Suite( "btree" ){ + All() : Suite( "btree" ) { } - - void setupTests(){ + + void setupTests() { add< Create >(); add< SimpleInsertDelete >(); add< SplitRightHeavyBucket >(); @@ -380,9 +1634,77 @@ namespace BtreeTests { add< MissingLocate >(); add< MissingLocateMultiBucket >(); add< SERVER983 >(); - add< ReuseUnused >(); + add< DontReuseUnused >(); add< PackUnused >(); add< DontDropReferenceKey >(); + add< MergeBucketsLeft >(); + add< MergeBucketsRight >(); +// add< MergeBucketsHead >(); + add< MergeBucketsDontReplaceHead >(); + add< MergeBucketsDelInternal >(); + add< MergeBucketsRightNull >(); + add< DontMergeSingleBucket >(); + add< ParentMergeNonRightToLeft >(); + add< ParentMergeNonRightToRight >(); + add< CantMergeRightNoMerge >(); + add< CantMergeLeftNoMerge >(); + add< MergeOption >(); + add< ForceMergeLeft >(); + add< ForceMergeRight >(); + add< RecursiveMerge >(); + add< RecursiveMergeRightBucket >(); + add< RecursiveMergeDoubleRightBucket >(); + add< MergeSizeJustRightRight >(); + add< MergeSizeJustRightLeft >(); + add< MergeSizeRight >(); + add< MergeSizeLeft >(); + add< NoMergeBelowMarkRight >(); + add< NoMergeBelowMarkLeft >(); + add< MergeSizeRightTooBig >(); + add< MergeSizeLeftTooBig >(); + add< BalanceOneLeftToRight >(); + add< BalanceOneRightToLeft >(); + add< BalanceThreeLeftToRight >(); + add< BalanceThreeRightToLeft >(); + add< BalanceSingleParentKey >(); + add< PackEmpty >(); + add< PackedDataSizeEmpty >(); + add< BalanceSingleParentKeyPackParent >(); + add< BalanceSplitParent >(); + add< EvenRebalanceLeft >(); + add< EvenRebalanceLeftCusp >(); + add< EvenRebalanceRight >(); + add< EvenRebalanceRightCusp >(); + add< EvenRebalanceCenter >(); + add< OddRebalanceLeft >(); + add< OddRebalanceRight >(); + add< OddRebalanceCenter >(); + add< RebalanceEmptyRight >(); + add< RebalanceEmptyLeft >(); + add< NoMoveAtLowWaterMarkRight >(); + add< MoveBelowLowWaterMarkRight >(); + add< NoMoveAtLowWaterMarkLeft >(); + add< MoveBelowLowWaterMarkLeft >(); + add< PreferBalanceLeft >(); + add< PreferBalanceRight >(); + add< RecursiveMergeThenBalance >(); + add< MergeRightEmpty >(); + add< MergeMinRightEmpty >(); + add< 
MergeLeftEmpty >(); + add< MergeMinLeftEmpty >(); + add< BalanceRightEmpty >(); + add< BalanceLeftEmpty >(); + add< DelEmptyNoNeighbors >(); + add< DelEmptyEmptyNeighbors >(); + add< DelInternal >(); + add< DelInternalReplaceWithUnused >(); + add< DelInternalReplaceRight >(); + add< DelInternalPromoteKey >(); + add< DelInternalPromoteRightKey >(); + add< DelInternalReplacementPrevNonNull >(); + add< DelInternalReplacementNextNonNull >(); + add< DelInternalSplitPromoteLeft >(); + add< DelInternalSplitPromoteRight >(); } } myall; } diff --git a/dbtests/clienttests.cpp b/dbtests/clienttests.cpp index 58287e9..f51b765 100644 --- a/dbtests/clienttests.cpp +++ b/dbtests/clienttests.cpp @@ -20,40 +20,40 @@ #include "../client/dbclient.h" #include "dbtests.h" #include "../db/concurrency.h" - + namespace ClientTests { - + class Base { public: - - Base( string coll ){ + + Base( string coll ) { _ns = (string)"test." + coll; } - - virtual ~Base(){ + + virtual ~Base() { db.dropCollection( _ns ); } - - const char * ns(){ return _ns.c_str(); } - + + const char * ns() { return _ns.c_str(); } + string _ns; DBDirectClient db; }; - + class DropIndex : public Base { public: - DropIndex() : Base( "dropindex" ){} - void run(){ + DropIndex() : Base( "dropindex" ) {} + void run() { db.insert( ns() , BSON( "x" << 2 ) ); ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() ); - + db.ensureIndex( ns() , BSON( "x" << 1 ) ); ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() ); - + db.dropIndex( ns() , BSON( "x" << 1 ) ); ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() ); - + db.ensureIndex( ns() , BSON( "x" << 1 ) ); ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() ); @@ -61,18 +61,18 @@ namespace ClientTests { ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() ); } }; - + class ReIndex : public Base { public: - ReIndex() : Base( "reindex" ){} - void run(){ - + ReIndex() : Base( "reindex" ) {} + void run() { + db.insert( ns() , BSON( "x" << 2 ) ); ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() ); - + db.ensureIndex( ns() , BSON( "x" << 1 ) ); ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() ); - + db.reIndex( ns() ); ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() ); } @@ -81,15 +81,15 @@ namespace ClientTests { class ReIndex2 : public Base { public: - ReIndex2() : Base( "reindex2" ){} - void run(){ - + ReIndex2() : Base( "reindex2" ) {} + void run() { + db.insert( ns() , BSON( "x" << 2 ) ); ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() ); - + db.ensureIndex( ns() , BSON( "x" << 1 ) ); ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() ); - + BSONObj out; ASSERT( db.runCommand( "test" , BSON( "reIndex" << "reindex2" ) , out ) ); ASSERT_EQUALS( 2 , out["nIndexes"].number() ); @@ -106,7 +106,7 @@ namespace ClientTests { for( int i = 0; i < 1111; ++i ) db.insert( ns(), BSON( "a" << i << "b" << longs ) ); db.ensureIndex( ns(), BSON( "a" << 1 << "b" << 1 ) ); - + auto_ptr< DBClientCursor > c = db.query( ns(), Query().sort( BSON( "a" << 1 << "b" << 1 ) ) ); ASSERT_EQUALS( 1111, c->itcount() ); } @@ -161,20 +161,37 @@ namespace ClientTests { ASSERT( db.runCommand( "unittests", BSON( "collstats" << "clienttests.create" ), info ) ); } }; + + class ConnectionStringTests { + public: + void run() { + { + ConnectionString s( "a/b,c,d" , ConnectionString::SET ); + ASSERT_EQUALS( ConnectionString::SET , s.type() ); + ASSERT_EQUALS( "a" , s.getSetName() ); + vector v = s.getServers(); + ASSERT_EQUALS( 3U , v.size() ); + ASSERT_EQUALS( "b" , v[0].host() ); + ASSERT_EQUALS( "c" , v[1].host() ); + 
ASSERT_EQUALS( "d" , v[2].host() ); + } + } + }; class All : public Suite { public: - All() : Suite( "client" ){ + All() : Suite( "client" ) { } - void setupTests(){ + void setupTests() { add(); add(); add(); add(); add(); add(); + add(); } - + } all; } diff --git a/dbtests/commandtests.cpp b/dbtests/commandtests.cpp index fa0014d..fa6204d 100644 --- a/dbtests/commandtests.cpp +++ b/dbtests/commandtests.cpp @@ -23,19 +23,19 @@ using namespace mongo; namespace CommandTests { // one namespace per command - namespace FileMD5{ + namespace FileMD5 { struct Base { - Base(){ + Base() { db.dropCollection(ns()); db.ensureIndex(ns(), BSON( "files_id" << 1 << "n" << 1 )); } const char* ns() { return "test.fs.chunks"; } - + DBDirectClient db; }; struct Type0 : Base { - void run(){ + void run() { { BSONObjBuilder b; b.genOID(); @@ -58,8 +58,8 @@ namespace CommandTests { ASSERT_EQUALS( string("5eb63bbbe01eeed093cb22bb8f5acdc3") , result["md5"].valuestr() ); } }; - struct Type2 : Base{ - void run(){ + struct Type2 : Base { + void run() { { BSONObjBuilder b; b.genOID(); @@ -86,13 +86,13 @@ namespace CommandTests { class All : public Suite { public: - All() : Suite( "commands" ){ + All() : Suite( "commands" ) { } - void setupTests(){ + void setupTests() { add< FileMD5::Type0 >(); add< FileMD5::Type2 >(); } - + } all; } diff --git a/dbtests/cursortests.cpp b/dbtests/cursortests.cpp index 954c8b0..ddd7b03 100644 --- a/dbtests/cursortests.cpp +++ b/dbtests/cursortests.cpp @@ -25,12 +25,12 @@ #include "dbtests.h" namespace CursorTests { - + namespace BtreeCursorTests { // The ranges expressed in these tests are impossible given our query // syntax, so going to do them a hacky way. - + class Base { protected: FieldRangeVector *vec( int *vals, int len, int direction = 1 ) { @@ -40,7 +40,8 @@ namespace CursorTests { FieldRangeSet s2( "", _objs.back() ); if ( i == 0 ) { s.range( "a" ) = s2.range( "a" ); - } else { + } + else { s.range( "a" ) |= s2.range( "a" ); } } @@ -49,7 +50,7 @@ namespace CursorTests { private: vector< BSONObj > _objs; }; - + class MultiRange : public Base { public: void run() { @@ -103,7 +104,7 @@ namespace CursorTests { ASSERT( !c.ok() ); } }; - + class MultiRangeReverse : public Base { public: void run() { @@ -129,7 +130,7 @@ namespace CursorTests { ASSERT( !c.ok() ); } }; - + class Base2 { public: virtual ~Base2() { _c.dropCollection( ns() ); } @@ -167,7 +168,7 @@ namespace CursorTests { dblock _lk; vector< BSONObj > _objs; }; - + class EqEq : public Base2 { public: void run() { @@ -194,7 +195,7 @@ namespace CursorTests { check( BSON( "a" << 4 << "b" << BSON( "$gte" << 1 << "$lte" << 10 ) ) ); } virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); } - }; + }; class EqIn : public Base2 { public: @@ -210,7 +211,7 @@ namespace CursorTests { check( BSON( "a" << 4 << "b" << BSON( "$in" << BSON_ARRAY( 5 << 6 << 11 ) ) ) ); } virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); } - }; + }; class RangeEq : public Base2 { public: @@ -227,7 +228,7 @@ namespace CursorTests { check( BSON( "a" << BSON( "$gte" << 1 << "$lte" << 10 ) << "b" << 4 ) ); } virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); } - }; + }; class RangeIn : public Base2 { public: @@ -244,15 +245,15 @@ namespace CursorTests { check( BSON( "a" << BSON( "$gte" << 1 << "$lte" << 10 ) << "b" << BSON( "$in" << BSON_ARRAY( 4 << 6 ) ) ) ); } virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); } - }; - + }; + } // namespace BtreeCursorTests - + class All : public Suite { public: - 
All() : Suite( "cursor" ){} - - void setupTests(){ + All() : Suite( "cursor" ) {} + + void setupTests() { add< BtreeCursorTests::MultiRange >(); add< BtreeCursorTests::MultiRangeGap >(); add< BtreeCursorTests::MultiRangeReverse >(); diff --git a/dbtests/d_chunk_manager_tests.cpp b/dbtests/d_chunk_manager_tests.cpp new file mode 100644 index 0000000..bcfe9fa --- /dev/null +++ b/dbtests/d_chunk_manager_tests.cpp @@ -0,0 +1,467 @@ +//@file d_chunk_manager_tests.cpp : s/d_chunk_manager.{h,cpp} tests + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#include "pch.h" +#include "dbtests.h" + +#include "../s/d_chunk_manager.h" + +namespace { + + class BasicTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << false << + "key" << BSON( "a" << 1 ) << + "unique" << false ); + + // single-chunk collection + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey" << + "ns" << "test.foo" << + "min" << BSON( "a" << MINKEY ) << + "max" << BSON( "a" << MAXKEY ) ) ); + + ShardChunkManager s ( collection , chunks ); + + BSONObj k1 = BSON( "a" << MINKEY ); + ASSERT( s.belongsToMe( k1 ) ); + BSONObj k2 = BSON( "a" << MAXKEY ); + ASSERT( ! s.belongsToMe( k2 ) ); + BSONObj k3 = BSON( "a" << 1 << "b" << 2 ); + ASSERT( s.belongsToMe( k3 ) ); + } + }; + + class BasicCompoundTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << false << + "key" << BSON( "a" << 1 << "b" << 1) << + "unique" << false ); + + // single-chunk collection + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKeyb_MinKey" << + "ns" << "test.foo" << + "min" << BSON( "a" << MINKEY << "b" << MINKEY ) << + "max" << BSON( "a" << MAXKEY << "b" << MAXKEY ) ) ); + + ShardChunkManager s ( collection , chunks ); + + BSONObj k1 = BSON( "a" << MINKEY << "b" << MINKEY ); + ASSERT( s.belongsToMe( k1 ) ); + BSONObj k2 = BSON( "a" << MAXKEY << "b" << MAXKEY ); + ASSERT( ! 
s.belongsToMe( k2 ) ); + BSONObj k3 = BSON( "a" << MINKEY << "b" << 10 ); + ASSERT( s.belongsToMe( k3 ) ); + BSONObj k4 = BSON( "a" << 10 << "b" << 20 ); + ASSERT( s.belongsToMe( k4 ) ); + } + }; + + class RangeTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "x.y" << + "dropped" << false << + "key" << BSON( "a" << 1 ) << + "unique" << false ); + + // 3-chunk collection, 2 of them being contiguous + // [min->10) , [10->20) , , [30->max) + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "x.y-a_MinKey" << + "ns" << "x.y" << + "min" << BSON( "a" << MINKEY ) << + "max" << BSON( "a" << 10 ) ) << + BSON( "_id" << "x.y-a_10" << + "ns" << "x.y" << + "min" << BSON( "a" << 10 ) << + "max" << BSON( "a" << 20 ) ) << + BSON( "_id" << "x.y-a_30" << + "ns" << "x.y" << + "min" << BSON( "a" << 30 ) << + "max" << BSON( "a" << MAXKEY ) ) ); + + ShardChunkManager s ( collection , chunks ); + + BSONObj k1 = BSON( "a" << 5 ); + ASSERT( s.belongsToMe( k1 ) ); + BSONObj k2 = BSON( "a" << 10 ); + ASSERT( s.belongsToMe( k2 ) ); + BSONObj k3 = BSON( "a" << 25 ); + ASSERT( ! s.belongsToMe( k3 ) ); + BSONObj k4 = BSON( "a" << 30 ); + ASSERT( s.belongsToMe( k4 ) ); + BSONObj k5 = BSON( "a" << 40 ); + ASSERT( s.belongsToMe( k5 ) ); + } + }; + + class GetNextTests { + public: + void run() { + + BSONObj collection = BSON( "_id" << "x.y" << + "dropped" << false << + "key" << BSON( "a" << 1 ) << + "unique" << false ); + // empty collection + BSONArray chunks1 = BSONArray(); + ShardChunkManager s1( collection , chunks1 ); + + BSONObj empty; + BSONObj arbitraryKey = BSON( "a" << 10 ); + BSONObj foundMin, foundMax; + + ASSERT( s1.getNextChunk( empty , &foundMin , &foundMax ) ); + ASSERT( foundMin.isEmpty() ); + ASSERT( foundMax.isEmpty() ); + ASSERT( s1.getNextChunk( arbitraryKey , &foundMin , &foundMax ) ); + ASSERT( foundMin.isEmpty() ); + ASSERT( foundMax.isEmpty() ); + + // single-chunk collection + // [10->20] + BSONObj key_a10 = BSON( "a" << 10 ); + BSONObj key_a20 = BSON( "a" << 20 ); + BSONArray chunks2 = BSON_ARRAY( BSON( "_id" << "x.y-a_10" << + "ns" << "x.y" << + "min" << key_a10 << + "max" << key_a20 ) ); + ShardChunkManager s2( collection , chunks2 ); + ASSERT( s2.getNextChunk( empty , &foundMin , &foundMax ) ); + ASSERT( foundMin.woCompare( key_a10 ) == 0 ); + ASSERT( foundMax.woCompare( key_a20 ) == 0 ); + + // 3-chunk collection, 2 of them being contiguous + // [min->10) , [10->20) , , [30->max) + BSONObj key_a30 = BSON( "a" << 30 ); + BSONObj key_min = BSON( "a" << MINKEY ); + BSONObj key_max = BSON( "a" << MAXKEY ); + BSONArray chunks3 = BSON_ARRAY( BSON( "_id" << "x.y-a_MinKey" << + "ns" << "x.y" << + "min" << key_min << + "max" << key_a10 ) << + BSON( "_id" << "x.y-a_10" << + "ns" << "x.y" << + "min" << key_a10 << + "max" << key_a20 ) << + BSON( "_id" << "x.y-a_30" << + "ns" << "x.y" << + "min" << key_a30 << + "max" << key_max ) ); + ShardChunkManager s3( collection , chunks3 ); + ASSERT( ! s3.getNextChunk( empty , &foundMin , &foundMax ) ); // not eof + ASSERT( foundMin.woCompare( key_min ) == 0 ); + ASSERT( foundMax.woCompare( key_a10 ) == 0 ); + ASSERT( ! 
s3.getNextChunk( key_a10 , &foundMin , &foundMax ) ); + ASSERT( foundMin.woCompare( key_a30 ) == 0 ); + ASSERT( foundMax.woCompare( key_max ) == 0 ); + ASSERT( s3.getNextChunk( key_a30 , &foundMin , &foundMax ) ); + } + }; + + class DeletedTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << "true" ); + + BSONArray chunks = BSONArray(); + + ASSERT_EXCEPTION( ShardChunkManager s ( collection , chunks ) , UserException ); + } + }; + + class ClonePlusTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << false << + "key" << BSON( "a" << 1 << "b" << 1 ) << + "unique" << false ); + // 1-chunk collection + // [10,0-20,0) + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey" << + "ns" << "test.foo" << + "min" << BSON( "a" << 10 << "b" << 0 ) << + "max" << BSON( "a" << 20 << "b" << 0 ) ) ); + + ShardChunkManager s ( collection , chunks ); + + // new chunk [20,0-30,0) + BSONObj min = BSON( "a" << 20 << "b" << 0 ); + BSONObj max = BSON( "a" << 30 << "b" << 0 ); + ShardChunkManagerPtr cloned( s.clonePlus( min , max , 1 /* TODO test version */ ) ); + + BSONObj k1 = BSON( "a" << 5 << "b" << 0 ); + ASSERT( ! cloned->belongsToMe( k1 ) ); + BSONObj k2 = BSON( "a" << 20 << "b" << 0 ); + ASSERT( cloned->belongsToMe( k2 ) ); + BSONObj k3 = BSON( "a" << 25 << "b" << 0 ); + ASSERT( cloned->belongsToMe( k3 ) ); + BSONObj k4 = BSON( "a" << 30 << "b" << 0 ); + ASSERT( ! cloned->belongsToMe( k4 ) ); + } + }; + + class ClonePlusExceptionTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << false << + "key" << BSON( "a" << 1 << "b" << 1 ) << + "unique" << false ); + // 1-chunk collection + // [10,0-20,0) + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey" << + "ns" << "test.foo" << + "min" << BSON( "a" << 10 << "b" << 0 ) << + "max" << BSON( "a" << 20 << "b" << 0 ) ) ); + + ShardChunkManager s ( collection , chunks ); + + // [15,0-25,0) overlaps [10,0-20,0) + BSONObj min = BSON( "a" << 15 << "b" << 0 ); + BSONObj max = BSON( "a" << 25 << "b" << 0 ); + ASSERT_EXCEPTION( s.clonePlus ( min , max , 1 /* TODO test version */ ) , UserException ); + } + }; + + class CloneMinusTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "x.y" << + "dropped" << false << + "key" << BSON( "a" << 1 << "b" << 1 ) << + "unique" << false ); + + // 2-chunk collection + // [10,0->20,0) , , [30,0->40,0) + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "x.y-a_10b_0" << + "ns" << "x.y" << + "min" << BSON( "a" << 10 << "b" << 0 ) << + "max" << BSON( "a" << 20 << "b" << 0 ) ) << + BSON( "_id" << "x.y-a_30b_0" << + "ns" << "x.y" << + "min" << BSON( "a" << 30 << "b" << 0 ) << + "max" << BSON( "a" << 40 << "b" << 0 ) ) ); + + ShardChunkManager s ( collection , chunks ); + + // deleting chunk [10,0-20,0) + BSONObj min = BSON( "a" << 10 << "b" << 0 ); + BSONObj max = BSON( "a" << 20 << "b" << 0 ); + ShardChunkManagerPtr cloned( s.cloneMinus( min , max , 1 /* TODO test version */ ) ); + + BSONObj k1 = BSON( "a" << 5 << "b" << 0 ); + ASSERT( ! cloned->belongsToMe( k1 ) ); + BSONObj k2 = BSON( "a" << 15 << "b" << 0 ); + ASSERT( ! cloned->belongsToMe( k2 ) ); + BSONObj k3 = BSON( "a" << 30 << "b" << 0 ); + ASSERT( cloned->belongsToMe( k3 ) ); + BSONObj k4 = BSON( "a" << 35 << "b" << 0 ); + ASSERT( cloned->belongsToMe( k4 ) ); + BSONObj k5 = BSON( "a" << 40 << "b" << 0 ); + ASSERT( ! 
cloned->belongsToMe( k5 ) ); + } + }; + + class CloneMinusExceptionTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "x.y" << + "dropped" << false << + "key" << BSON( "a" << 1 << "b" << 1 ) << + "unique" << false ); + + // 2-chunk collection + // [10,0->20,0) , , [30,0->40,0) + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "x.y-a_10b_0" << + "ns" << "x.y" << + "min" << BSON( "a" << 10 << "b" << 0 ) << + "max" << BSON( "a" << 20 << "b" << 0 ) ) << + BSON( "_id" << "x.y-a_30b_0" << + "ns" << "x.y" << + "min" << BSON( "a" << 30 << "b" << 0 ) << + "max" << BSON( "a" << 40 << "b" << 0 ) ) ); + + ShardChunkManager s ( collection , chunks ); + + // deleting non-existing chunk [25,0-28,0) + BSONObj min1 = BSON( "a" << 25 << "b" << 0 ); + BSONObj max1 = BSON( "a" << 28 << "b" << 0 ); + ASSERT_EXCEPTION( s.cloneMinus( min1 , max1 , 1 /* TODO test version */ ) , UserException ); + + + // deletin an overlapping range (not exactly a chunk) [15,0-25,0) + BSONObj min2 = BSON( "a" << 15 << "b" << 0 ); + BSONObj max2 = BSON( "a" << 25 << "b" << 0 ); + ASSERT_EXCEPTION( s.cloneMinus( min2 , max2 , 1 /* TODO test version */ ) , UserException ); + } + }; + + class CloneSplitTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << false << + "key" << BSON( "a" << 1 << "b" << 1 ) << + "unique" << false ); + // 1-chunk collection + // [10,0-20,0) + BSONObj min = BSON( "a" << 10 << "b" << 0 ); + BSONObj max = BSON( "a" << 20 << "b" << 0 ); + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey" + << "ns" << "test.foo" + << "min" << min + << "max" << max ) ); + + ShardChunkManager s ( collection , chunks ); + + BSONObj split1 = BSON( "a" << 15 << "b" << 0 ); + BSONObj split2 = BSON( "a" << 18 << "b" << 0 ); + vector splitKeys; + splitKeys.push_back( split1 ); + splitKeys.push_back( split2 ); + ShardChunkVersion version( 1 , 99 ); // first chunk 1|99 , second 1|100 + ShardChunkManagerPtr cloned( s.cloneSplit( min , max , splitKeys , version ) ); + + version.incMinor(); /* second chunk 1|100, first split point */ + version.incMinor(); /* third chunk 1|101, second split point */ + ASSERT_EQUALS( cloned->getVersion() , version /* 1|101 */ ); + ASSERT_EQUALS( s.getNumChunks() , 1u ); + ASSERT_EQUALS( cloned->getNumChunks() , 3u ); + ASSERT( cloned->belongsToMe( min ) ); + ASSERT( cloned->belongsToMe( split1 ) ); + ASSERT( cloned->belongsToMe( split2 ) ); + ASSERT( ! 
cloned->belongsToMe( max ) ); + } + }; + + class CloneSplitExceptionTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << false << + "key" << BSON( "a" << 1 << "b" << 1 ) << + "unique" << false ); + // 1-chunk collection + // [10,0-20,0) + BSONObj min = BSON( "a" << 10 << "b" << 0 ); + BSONObj max = BSON( "a" << 20 << "b" << 0 ); + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey" + << "ns" << "test.foo" + << "min" << min + << "max" << max ) ); + + ShardChunkManager s ( collection , chunks ); + + BSONObj badSplit = BSON( "a" << 5 << "b" << 0 ); + vector splitKeys; + splitKeys.push_back( badSplit ); + ASSERT_EXCEPTION( s.cloneSplit( min , max , splitKeys , ShardChunkVersion( 1 ) ) , UserException ); + + BSONObj badMax = BSON( "a" << 25 << "b" << 0 ); + BSONObj split = BSON( "a" << 15 << "b" << 0 ); + splitKeys.clear(); + splitKeys.push_back( split ); + ASSERT_EXCEPTION( s.cloneSplit( min , badMax, splitKeys , ShardChunkVersion( 1 ) ) , UserException ); + } + }; + + class EmptyShardTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << false << + "key" << BSON( "a" << 1 ) << + "unique" << false ); + + // no chunks on this shard + BSONArray chunks; + + // shard can have zero chunks for an existing collection + // version should be 0, though + ShardChunkManager s( collection , chunks ); + ASSERT_EQUALS( s.getVersion() , ShardChunkVersion( 0 ) ); + ASSERT_EQUALS( s.getNumChunks() , 0u ); + } + }; + + class LastChunkTests { + public: + void run() { + BSONObj collection = BSON( "_id" << "test.foo" << + "dropped" << false << + "key" << BSON( "a" << 1 ) << + "unique" << false ); + + // 1-chunk collection + // [10->20) + BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_10" << + "ns" << "test.foo" << + "min" << BSON( "a" << 10 ) << + "max" << BSON( "a" << 20 ) ) ); + + ShardChunkManager s( collection , chunks ); + BSONObj min = BSON( "a" << 10 ); + BSONObj max = BSON( "a" << 20 ); + + // if we remove the only chunk, the only version accepted is 0 + ShardChunkVersion nonZero = 99; + ASSERT_EXCEPTION( s.cloneMinus( min , max , nonZero ) , UserException ); + ShardChunkManagerPtr empty( s.cloneMinus( min , max , 0 ) ); + ASSERT_EQUALS( empty->getVersion() , ShardChunkVersion( 0 ) ); + ASSERT_EQUALS( empty->getNumChunks() , 0u ); + BSONObj k = BSON( "a" << 15 << "b" << 0 ); + ASSERT( ! 
empty->belongsToMe( k ) ); + + // we can add a chunk to an empty manager + // version should be provided + ASSERT_EXCEPTION( empty->clonePlus( min , max , 0 ) , UserException ); + ShardChunkManagerPtr cloned( empty->clonePlus( min , max , nonZero ) ); + ASSERT_EQUALS( cloned->getVersion(), nonZero ); + ASSERT_EQUALS( cloned->getNumChunks() , 1u ); + ASSERT( cloned->belongsToMe( k ) ); + } + }; + + class ShardChunkManagerSuite : public Suite { + public: + ShardChunkManagerSuite() : Suite ( "shard_chunk_manager" ) {} + + void setupTests() { + add< BasicTests >(); + add< BasicCompoundTests >(); + add< RangeTests >(); + add< GetNextTests >(); + add< DeletedTests >(); + add< ClonePlusTests >(); + add< ClonePlusExceptionTests >(); + add< CloneMinusTests >(); + add< CloneMinusExceptionTests >(); + add< CloneSplitTests >(); + add< CloneSplitExceptionTests >(); + add< EmptyShardTests >(); + add< LastChunkTests >(); + } + } shardChunkManagerSuite; + +} // anonymous namespace diff --git a/dbtests/dbtests.cpp b/dbtests/dbtests.cpp index 195a1d1..8ede08d 100644 --- a/dbtests/dbtests.cpp +++ b/dbtests/dbtests.cpp @@ -1,4 +1,4 @@ -// dbtests.cpp : Runs db unit tests. +// #file dbtests.cpp : Runs db unit tests. // /** @@ -18,11 +18,9 @@ */ #include "pch.h" - #include "dbtests.h" int main( int argc, char** argv ) { static StaticObserver StaticObserver; return Suite::run(argc, argv, "/tmp/unittest"); } - diff --git a/dbtests/directclienttests.cpp b/dbtests/directclienttests.cpp new file mode 100644 index 0000000..204bf92 --- /dev/null +++ b/dbtests/directclienttests.cpp @@ -0,0 +1,80 @@ +/** @file directclienttests.cpp +*/ + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "pch.h" +#include "../db/query.h" +#include "../db/db.h" +#include "../db/instance.h" +#include "../db/json.h" +#include "../db/lasterror.h" +#include "../db/update.h" +#include "../util/timer.h" +#include "dbtests.h" + +namespace DirectClientTests { + + class ClientBase { + public: + // NOTE: Not bothering to backup the old error record. 
+ ClientBase() { mongo::lastError.reset( new LastError() ); } + virtual ~ClientBase() { } + protected: + static bool error() { + return !_client.getPrevError().getField( "err" ).isNull(); + } + DBDirectClient &client() const { return _client; } + private: + static DBDirectClient _client; + }; + DBDirectClient ClientBase::_client; + + const char *ns = "a.b"; + + class Capped : public ClientBase { + public: + virtual void run() { + for( int pass=0; pass < 3; pass++ ) { + client().createCollection(ns, 1024 * 1024, true, 999); + for( int j =0; j < pass*3; j++ ) + client().insert(ns, BSON("x" << j)); + + // test truncation of a capped collection + if( pass ) { + BSONObj info; + BSONObj cmd = BSON( "captrunc" << "b" << "n" << 1 << "inc" << true ); + cout << cmd.toString() << endl; + bool ok = client().runCommand("a", cmd, info); + cout << info.toString() << endl; + assert(ok); + } + + assert( client().dropCollection(ns) ); + } + } + }; + + class All : public Suite { + public: + All() : Suite( "directclient" ) { + } + void setupTests() { + add< Capped >(); + } + } myall; +} diff --git a/dbtests/framework.cpp b/dbtests/framework.cpp index e624211..c92c8d6 100644 --- a/dbtests/framework.cpp +++ b/dbtests/framework.cpp @@ -25,6 +25,7 @@ #include "framework.h" #include "../util/file_allocator.h" +#include "../db/dur.h" #ifndef _WIN32 #include @@ -34,7 +35,7 @@ namespace po = boost::program_options; namespace mongo { - + CmdLine cmdLine; namespace regression { @@ -46,21 +47,21 @@ namespace mongo { Result( string name ) : _name( name ) , _rc(0) , _tests(0) , _fails(0) , _asserts(0) { } - string toString(){ + string toString() { stringstream ss; char result[128]; sprintf(result, "%-20s | tests: %4d | fails: %4d | assert calls: %6d\n", _name.c_str(), _tests, _fails, _asserts); ss << result; - for ( list::iterator i=_messages.begin(); i!=_messages.end(); i++ ){ + for ( list::iterator i=_messages.begin(); i!=_messages.end(); i++ ) { ss << "\t" << *i << '\n'; } - + return ss.str(); } - int rc(){ + int rc() { return _rc; } @@ -77,7 +78,7 @@ namespace mongo { Result * Result::cur = 0; - Result * Suite::run( const string& filter ){ + Result * Suite::run( const string& filter ) { tlogLevel = -1; log(1) << "\t about to setupTests" << endl; @@ -90,53 +91,53 @@ namespace mongo { /* see note in SavedContext */ //writelock lk(""); - for ( list::iterator i=_tests.begin(); i!=_tests.end(); i++ ){ + for ( list::iterator i=_tests.begin(); i!=_tests.end(); i++ ) { TestCase * tc = *i; - if ( filter.size() && tc->getName().find( filter ) == string::npos ){ + if ( filter.size() && tc->getName().find( filter ) == string::npos ) { log(1) << "\t skipping test: " << tc->getName() << " because doesn't match filter" << endl; continue; } r->_tests++; - + bool passes = false; - + log(1) << "\t going to run test: " << tc->getName() << endl; - + stringstream err; err << tc->getName() << "\t"; - + try { tc->run(); passes = true; } - catch ( MyAssertionException * ae ){ + catch ( MyAssertionException * ae ) { err << ae->ss.str(); delete( ae ); } - catch ( std::exception& e ){ + catch ( std::exception& e ) { err << " exception: " << e.what(); } - catch ( int x ){ + catch ( int x ) { err << " caught int : " << x << endl; } - catch ( ... ){ + catch ( ... ) { cerr << "unknown exception in test: " << tc->getName() << endl; } - - if ( ! passes ){ + + if ( ! 
passes ) { string s = err.str(); log() << "FAIL: " << s << endl; r->_fails++; r->_messages.push_back( s ); - } + } } - + if ( r->_fails ) r->_rc = 17; log(1) << "\t DONE running tests" << endl; - + return r; } @@ -155,20 +156,23 @@ namespace mongo { po::positional_options_description positional_options; shell_options.add_options() - ("help,h", "show this usage information") - ("dbpath", po::value(&dbpathSpec)->default_value(default_dbpath), - "db data path for this test run. NOTE: the contents of this " - "directory will be overwritten if it already exists") - ("debug", "run tests with verbose output") - ("list,l", "list available test suites") - ("filter,f" , po::value() , "string substring filter on test name" ) - ("verbose,v", "verbose") - ("seed", po::value(&seed), "random number seed") - ; - + ("help,h", "show this usage information") + ("dbpath", po::value(&dbpathSpec)->default_value(default_dbpath), + "db data path for this test run. NOTE: the contents of this " + "directory will be overwritten if it already exists") + ("debug", "run tests with verbose output") + ("list,l", "list available test suites") + ("bigfiles", "use big datafiles instead of smallfiles which is the default") + ("filter,f" , po::value() , "string substring filter on test name" ) + ("verbose,v", "verbose") + ("dur", "enable journaling") + ("nodur", "disable journaling (currently the default)") + ("seed", po::value(&seed), "random number seed") + ; + hidden_options.add_options() - ("suites", po::value< vector >(), "test suites to run") - ; + ("suites", po::value< vector >(), "test suites to run") + ; positional_options.add("suites", -1); @@ -185,7 +189,8 @@ namespace mongo { positional(positional_options). style(command_line_style).run(), params); po::notify(params); - } catch (po::error &e) { + } + catch (po::error &e) { cout << "ERROR: " << e.what() << endl << endl; show_help_text(argv[0], shell_options); return EXIT_BADOPTIONS; @@ -196,6 +201,13 @@ namespace mongo { return EXIT_CLEAN; } + if( params.count("nodur") ) { + cmdLine.dur = false; + } + if( params.count("dur") || cmdLine.dur ) { + cmdLine.dur = true; + } + if (params.count("debug") || params.count("verbose") ) { logLevel = 1; } @@ -217,18 +229,25 @@ namespace mongo { } boost::filesystem::directory_iterator end_iter; for (boost::filesystem::directory_iterator dir_iter(p); - dir_iter != end_iter; ++dir_iter) { + dir_iter != end_iter; ++dir_iter) { boost::filesystem::remove_all(*dir_iter); } - } else { + } + else { boost::filesystem::create_directory(p); } string dbpathString = p.native_directory_string(); dbpath = dbpathString.c_str(); - + cmdLine.prealloc = false; + + // dbtest defaults to smallfiles cmdLine.smallfiles = true; + if( params.count("bigfiles") ) { + cmdLine.dur = true; + } + cmdLine.oplogSize = 10 * 1024 * 1024; Client::initThread("testsuite"); acquirePathLock(); @@ -236,32 +255,39 @@ namespace mongo { srand( (unsigned) seed ); printGitVersion(); printSysInfo(); - out() << "random seed: " << seed << endl; + log() << "random seed: " << seed << endl; - theFileAllocator().start(); + FileAllocator::get()->start(); vector suites; if (params.count("suites")) { suites = params["suites"].as< vector >(); } - + string filter = ""; - if ( params.count( "filter" ) ){ + if ( params.count( "filter" ) ) { filter = params["filter"].as(); } + dur::startup(); + + if( debug && cmdLine.dur ) { + cout << "setting cmdLine.durOptions=8" << endl; + cmdLine.durOptions = 8; + } + int ret = run(suites,filter); #if !defined(_WIN32) && !defined(__sunos__) flock( 
lockFile, LOCK_UN ); #endif - + cc().shutdown(); dbexit( (ExitCode)ret ); // so everything shuts down cleanly return ret; } - int Suite::run( vector suites , const string& filter ){ + int Suite::run( vector suites , const string& filter ) { for ( unsigned int i = 0; i < suites.size(); i++ ) { if ( _suites->find( suites[i] ) == _suites->end() ) { cout << "invalid test [" << suites[i] << "], use --list to see valid names" << endl; @@ -277,7 +303,7 @@ namespace mongo { list results; - for ( list::iterator i=torun.begin(); i!=torun.end(); i++ ){ + for ( list::iterator i=torun.begin(); i!=torun.end(); i++ ) { string name = *i; Suite * s = (*_suites)[name]; assert( s ); @@ -298,12 +324,12 @@ namespace mongo { int fails = 0; int asserts = 0; - for ( list::iterator i=results.begin(); i!=results.end(); i++ ){ + for ( list::iterator i=results.begin(); i!=results.end(); i++ ) { Result * r = *i; cout << r->toString(); if ( abs( r->rc() ) > abs( rc ) ) rc = r->rc(); - + tests += r->_tests; fails += r->_fails; asserts += r->_asserts; @@ -313,13 +339,13 @@ namespace mongo { totals._tests = tests; totals._fails = fails; totals._asserts = asserts; - + cout << totals.toString(); // includes endl return rc; } - void Suite::registerSuite( string name , Suite * s ){ + void Suite::registerSuite( string name , Suite * s ) { if ( ! _suites ) _suites = new map(); Suite*& m = (*_suites)[name]; @@ -327,37 +353,37 @@ namespace mongo { m = s; } - void assert_pass(){ + void assert_pass() { Result::cur->_asserts++; } - void assert_fail( const char * exp , const char * file , unsigned line ){ + void assert_fail( const char * exp , const char * file , unsigned line ) { Result::cur->_asserts++; - + MyAssertionException * e = new MyAssertionException(); e->ss << "ASSERT FAILED! " << file << ":" << line << endl; throw e; } - void fail( const char * exp , const char * file , unsigned line ){ + void fail( const char * exp , const char * file , unsigned line ) { assert(0); } - MyAssertionException * MyAsserts::getBase(){ + MyAssertionException * MyAsserts::getBase() { MyAssertionException * e = new MyAssertionException(); e->ss << _file << ":" << _line << " " << _aexp << " != " << _bexp << " "; return e; } - - void MyAsserts::printLocation(){ + + void MyAsserts::printLocation() { log() << _file << ":" << _line << " " << _aexp << " != " << _bexp << " "; } - void MyAsserts::_gotAssert(){ + void MyAsserts::_gotAssert() { Result::cur->_asserts++; } } - void setupSignals(){} + void setupSignals( bool inFork ) {} } diff --git a/dbtests/framework.h b/dbtests/framework.h index bec14a2..29ba58b 100644 --- a/dbtests/framework.h +++ b/dbtests/framework.h @@ -49,7 +49,7 @@ namespace mongo { class TestCase { public: - virtual ~TestCase(){} + virtual ~TestCase() {} virtual void run() = 0; virtual string getName() = 0; }; @@ -57,15 +57,15 @@ namespace mongo { template< class T > class TestHolderBase : public TestCase { public: - TestHolderBase(){} - virtual ~TestHolderBase(){} - virtual void run(){ + TestHolderBase() {} + virtual ~TestHolderBase() {} + virtual void run() { auto_ptr t; t.reset( create() ); t->run(); } virtual T * create() = 0; - virtual string getName(){ + virtual string getName() { return demangleName( typeid(T) ); } }; @@ -73,7 +73,7 @@ namespace mongo { template< class T > class TestHolder0 : public TestHolderBase { public: - virtual T * create(){ + virtual T * create() { return new T(); } }; @@ -81,8 +81,8 @@ namespace mongo { template< class T , typename A > class TestHolder1 : public TestHolderBase { public: - 
TestHolder1( const A& a ) : _a(a){} - virtual T * create(){ + TestHolder1( const A& a ) : _a(a) {} + virtual T * create() { return new T( _a ); } const A& _a; @@ -90,25 +90,25 @@ namespace mongo { class Suite { public: - Suite( string name ) : _name( name ){ + Suite( string name ) : _name( name ) { registerSuite( name , this ); _ran = 0; } virtual ~Suite() { - if ( _ran ){ + if ( _ran ) { DBDirectClient c; c.dropDatabase( "unittests" ); } } template - void add(){ + void add() { _tests.push_back( new TestHolder0() ); } template - void add( const A& a ){ + void add( const A& a ) { _tests.push_back( new TestHolder1(a) ); } @@ -137,7 +137,7 @@ namespace mongo { class MyAssertionException : boost::noncopyable { public: - MyAssertionException(){ + MyAssertionException() { ss << "assertion: "; } stringstream ss; @@ -148,32 +148,32 @@ namespace mongo { class MyAsserts { public: MyAsserts( const char * aexp , const char * bexp , const char * file , unsigned line ) - : _aexp( aexp ) , _bexp( bexp ) , _file( file ) , _line( line ){ + : _aexp( aexp ) , _bexp( bexp ) , _file( file ) , _line( line ) { } - + template - void ae( A a , B b ){ + void ae( A a , B b ) { _gotAssert(); if ( a == b ) return; - + printLocation(); - + MyAssertionException * e = getBase(); e->ss << a << " != " << b << endl; log() << e->ss.str() << endl; throw e; } - + template - void nae( A a , B b ){ + void nae( A a , B b ) { _gotAssert(); if ( a != b ) return; - + printLocation(); - + MyAssertionException * e = getBase(); e->ss << a << " == " << b << endl; log() << e->ss.str() << endl; @@ -182,13 +182,13 @@ namespace mongo { void printLocation(); - + private: - + void _gotAssert(); - + MyAssertionException * getBase(); - + string _aexp; string _bexp; string _file; diff --git a/dbtests/histogram_test.cpp b/dbtests/histogram_test.cpp index 5a8970d..e9cbb5b 100644 --- a/dbtests/histogram_test.cpp +++ b/dbtests/histogram_test.cpp @@ -25,9 +25,9 @@ namespace mongo { using mongo::Histogram; - class BoundariesInit{ + class BoundariesInit { public: - void run(){ + void run() { Histogram::Options opts; opts.numBuckets = 3; opts.bucketSize = 10; @@ -45,9 +45,9 @@ namespace mongo { } }; - class BoundariesExponential{ + class BoundariesExponential { public: - void run(){ + void run() { Histogram::Options opts; opts.numBuckets = 4; opts.bucketSize = 125; @@ -57,13 +57,13 @@ namespace mongo { ASSERT_EQUALS( h.getBoundary( 0 ), 125u ); ASSERT_EQUALS( h.getBoundary( 1 ), 250u ); ASSERT_EQUALS( h.getBoundary( 2 ), 500u ); - ASSERT_EQUALS( h.getBoundary( 3 ), numeric_limits::max() ); + ASSERT_EQUALS( h.getBoundary( 3 ), numeric_limits::max() ); } }; - class BoundariesFind{ + class BoundariesFind { public: - void run(){ + void run() { Histogram::Options opts; opts.numBuckets = 3; opts.bucketSize = 10; @@ -81,14 +81,14 @@ namespace mongo { class HistogramSuite : public Suite { public: - HistogramSuite() : Suite( "histogram" ){} + HistogramSuite() : Suite( "histogram" ) {} - void setupTests(){ + void setupTests() { add< BoundariesInit >(); add< BoundariesExponential >(); add< BoundariesFind >(); // TODO: complete the test suite - } + } } histogramSuite; } // anonymous namespace diff --git a/dbtests/jsobjtests.cpp b/dbtests/jsobjtests.cpp index ea7606f..6804d71 100644 --- a/dbtests/jsobjtests.cpp +++ b/dbtests/jsobjtests.cpp @@ -150,7 +150,7 @@ namespace JsobjTests { class MultiKeySortOrder : public Base { public: - void run(){ + void run() { ASSERT( BSON( "x" << "a" ).woCompare( BSON( "x" << "b" ) ) < 0 ); ASSERT( BSON( "x" << "b" ).woCompare( BSON( 
"x" << "a" ) ) > 0 ); @@ -255,9 +255,9 @@ namespace JsobjTests { } }; - class AsTempObj{ + class AsTempObj { public: - void run(){ + void run() { { BSONObjBuilder bb; bb << "a" << 1; @@ -267,7 +267,7 @@ namespace JsobjTests { ASSERT(tmp.hasField("a")); ASSERT(!tmp.hasField("b")); ASSERT(tmp == BSON("a" << 1)); - + bb << "b" << 2; BSONObj obj = bb.obj(); ASSERT_EQUALS(obj.objsize() , 4+(1+2+4)+(1+2+4)+1); @@ -285,7 +285,7 @@ namespace JsobjTests { ASSERT(tmp.hasField("a")); ASSERT(!tmp.hasField("b")); ASSERT(tmp == BSON("a" << BSON("$gt" << 1))); - + bb << "b" << LT << 2; BSONObj obj = bb.obj(); ASSERT(obj.objsize() == 4+(1+2+(4+1+4+4+1))+(1+2+(4+1+4+4+1))+1); @@ -293,7 +293,7 @@ namespace JsobjTests { ASSERT(obj.hasField("a")); ASSERT(obj.hasField("b")); ASSERT(obj == BSON("a" << BSON("$gt" << 1) - << "b" << BSON("$lt" << 2))); + << "b" << BSON("$lt" << 2))); } { BSONObjBuilder bb(32); @@ -304,10 +304,10 @@ namespace JsobjTests { ASSERT(tmp.hasField("a")); ASSERT(!tmp.hasField("b")); ASSERT(tmp == BSON("a" << 1)); - + //force a realloc BSONArrayBuilder arr; - for (int i=0; i < 10000; i++){ + for (int i=0; i < 10000; i++) { arr << i; } bb << "b" << arr.arr(); @@ -319,8 +319,8 @@ namespace JsobjTests { } }; - struct AppendIntOrLL{ - void run(){ + struct AppendIntOrLL { + void run() { const long long billion = 1000*1000*1000; BSONObjBuilder b; b.appendIntOrLL("i1", 1); @@ -362,16 +362,16 @@ namespace JsobjTests { }; struct AppendNumber { - void run(){ + void run() { BSONObjBuilder b; b.appendNumber( "a" , 5 ); b.appendNumber( "b" , 5.5 ); b.appendNumber( "c" , (1024LL*1024*1024)-1 ); b.appendNumber( "d" , (1024LL*1024*1024*1024)-1 ); b.appendNumber( "e" , 1024LL*1024*1024*1024*1024*1024 ); - + BSONObj o = b.obj(); - + ASSERT( o["a"].type() == NumberInt ); ASSERT( o["b"].type() == NumberDouble ); ASSERT( o["c"].type() == NumberInt ); @@ -380,7 +380,7 @@ namespace JsobjTests { } }; - + class ToStringArray { public: void run() { @@ -391,28 +391,28 @@ namespace JsobjTests { class ToStringNumber { public: - - void run(){ + + void run() { BSONObjBuilder b; b.append( "a" , (int)4 ); b.append( "b" , (double)5 ); b.append( "c" , (long long)6 ); - + b.append( "d" , 123.456789123456789123456789123456789 ); b.append( "e" , 123456789.123456789123456789123456789 ); b.append( "f" , 1234567891234567891234.56789123456789 ); b.append( "g" , -123.456 ); - + BSONObj x = b.obj(); ASSERT_EQUALS( "4", x["a"].toString( false , true ) ); ASSERT_EQUALS( "5.0", x["b"].toString( false , true ) ); - ASSERT_EQUALS( "6", x["c"].toString( false , true ) ); + ASSERT_EQUALS( "6", x["c"].toString( false , true ) ); ASSERT_EQUALS( "123.4567891234568" , x["d"].toString( false , true ) ); ASSERT_EQUALS( "123456789.1234568" , x["e"].toString( false , true ) ); // ASSERT_EQUALS( "1.234567891234568e+21" , x["f"].toString( false , true ) ); // windows and *nix are different - TODO, work around for test or not bother? 
- + ASSERT_EQUALS( "-123.456" , x["g"].toString( false , true ) ); } @@ -442,6 +442,46 @@ namespace JsobjTests { }; + class AppendAs { + public: + void run() { + BSONObjBuilder b; + { + BSONObj foo = BSON( "foo" << 1 ); + b.appendAs( foo.firstElement(), "bar" ); + } + ASSERT_EQUALS( BSON( "bar" << 1 ), b.done() ); + } + }; + + class ArrayAppendAs { + public: + void run() { + BSONArrayBuilder b; + { + BSONObj foo = BSON( "foo" << 1 ); + b.appendAs( foo.firstElement(), "3" ); + } + BSONArray a = b.arr(); + BSONObj expected = BSON( "3" << 1 ); + ASSERT_EQUALS( expected.firstElement(), a[ 3 ] ); + ASSERT_EQUALS( 4, a.nFields() ); + } + }; + + class GetField { + public: + void run(){ + BSONObj o = BSON( "a" << 1 << + "b" << BSON( "a" << 2 ) << + "c" << BSON_ARRAY( BSON( "a" << 3 ) << BSON( "a" << 4 ) ) ); + ASSERT_EQUALS( 1 , o.getFieldDotted( "a" ).numberInt() ); + ASSERT_EQUALS( 2 , o.getFieldDotted( "b.a" ).numberInt() ); + ASSERT_EQUALS( 3 , o.getFieldDotted( "c.0.a" ).numberInt() ); + ASSERT_EQUALS( 4 , o.getFieldDotted( "c.1.a" ).numberInt() ); + } + }; + namespace Validation { class Base { @@ -691,12 +731,12 @@ namespace JsobjTests { a.valid(); BSONObj b = fromjson( "{\"one\":2, \"two\":5, \"three\": {}," - "\"four\": { \"five\": { \"six\" : 11 } }," - "\"seven\": [ \"a\", \"bb\", \"ccc\", 5 ]," - "\"eight\": Dbref( \"rrr\", \"01234567890123456789aaaa\" )," - "\"_id\": ObjectId( \"deadbeefdeadbeefdeadbeef\" )," - "\"nine\": { \"$binary\": \"abc=\", \"$type\": \"00\" }," - "\"ten\": Date( 44 ), \"eleven\": /foooooo/i }" ); + "\"four\": { \"five\": { \"six\" : 11 } }," + "\"seven\": [ \"a\", \"bb\", \"ccc\", 5 ]," + "\"eight\": Dbref( \"rrr\", \"01234567890123456789aaaa\" )," + "\"_id\": ObjectId( \"deadbeefdeadbeefdeadbeef\" )," + "\"nine\": { \"$binary\": \"abc=\", \"$type\": \"00\" }," + "\"ten\": Date( 44 ), \"eleven\": /foooooo/i }" ); fuzz( b ); b.valid(); } @@ -723,7 +763,7 @@ namespace JsobjTests { class init1 { public: - void run(){ + void run() { OID a; OID b; @@ -736,7 +776,7 @@ namespace JsobjTests { class initParse1 { public: - void run(){ + void run() { OID a; OID b; @@ -750,7 +790,7 @@ namespace JsobjTests { class append { public: - void run(){ + void run() { BSONObjBuilder b; b.appendOID( "a" , 0 ); b.appendOID( "b" , 0 , false ); @@ -766,18 +806,18 @@ namespace JsobjTests { class increasing { public: - BSONObj g(){ + BSONObj g() { BSONObjBuilder b; b.appendOID( "_id" , 0 , true ); return b.obj(); } - void run(){ + void run() { BSONObj a = g(); BSONObj b = g(); - + ASSERT( a.woCompare( b ) < 0 ); - - // yes, there is a 1/1000 chance this won't increase time(0) + + // yes, there is a 1/1000 chance this won't increase time(0) // and therefore inaccurately say the function is behaving // buf if its broken, it will fail 999/1000, so i think that's good enough sleepsecs( 1 ); @@ -788,7 +828,7 @@ namespace JsobjTests { class ToDate { public: - void run(){ + void run() { OID oid; { @@ -812,7 +852,7 @@ namespace JsobjTests { class FromDate { public: - void run(){ + void run() { OID min, oid, max; Date_t now = jsTime(); oid.init(); // slight chance this has different time. If its a problem, can change. 
@@ -890,26 +930,26 @@ namespace JsobjTests { class LabelMulti : public LabelBase { BSONObj expected() { return BSON( "z" << "q" - << "a" << BSON( "$gt" << 1 << "$lte" << "x" ) - << "b" << BSON( "$ne" << 1 << "$ne" << "f" << "$ne" << 22.3 ) - << "x" << "p" ); + << "a" << BSON( "$gt" << 1 << "$lte" << "x" ) + << "b" << BSON( "$ne" << 1 << "$ne" << "f" << "$ne" << 22.3 ) + << "x" << "p" ); } BSONObj actual() { return BSON( "z" << "q" - << "a" << GT << 1 << LTE << "x" - << "b" << NE << 1 << NE << "f" << NE << 22.3 - << "x" << "p" ); + << "a" << GT << 1 << LTE << "x" + << "b" << NE << 1 << NE << "f" << NE << 22.3 + << "x" << "p" ); } }; class LabelishOr : public LabelBase { BSONObj expected() { return BSON( "$or" << BSON_ARRAY( - BSON("a" << BSON( "$gt" << 1 << "$lte" << "x" )) - << BSON("b" << BSON( "$ne" << 1 << "$ne" << "f" << "$ne" << 22.3 )) - << BSON("x" << "p" ))); + BSON("a" << BSON( "$gt" << 1 << "$lte" << "x" )) + << BSON("b" << BSON( "$ne" << 1 << "$ne" << "f" << "$ne" << 22.3 )) + << BSON("x" << "p" ))); } BSONObj actual() { - return OR( BSON( "a" << GT << 1 << LTE << "x"), + return OR( BSON( "a" << GT << 1 << LTE << "x"), BSON( "b" << NE << 1 << NE << "f" << NE << 22.3), BSON( "x" << "p" ) ); } @@ -925,7 +965,7 @@ namespace JsobjTests { class ElementAppend { public: - void run(){ + void run() { BSONObj a = BSON( "a" << 17 ); BSONObj b = BSON( "b" << a["a"] ); ASSERT_EQUALS( NumberInt , a["a"].type() ); @@ -998,23 +1038,39 @@ namespace JsobjTests { } }; + class MinMaxKeyBuilder { + public: + void run() { + BSONObj min = BSON( "a" << MINKEY ); + BSONObj max = BSON( "b" << MAXKEY ); + + ASSERT( min.valid() ); + ASSERT( max.valid() ); + + BSONElement minElement = min["a"]; + BSONElement maxElement = max["b"]; + ASSERT( minElement.type() == MinKey ); + ASSERT( maxElement.type() == MaxKey ); + } + }; + class MinMaxElementTest { public: - BSONObj min( int t ){ + BSONObj min( int t ) { BSONObjBuilder b; b.appendMinForType( "a" , t ); return b.obj(); } - BSONObj max( int t ){ + BSONObj max( int t ) { BSONObjBuilder b; b.appendMaxForType( "a" , t ); return b.obj(); } - void run(){ - for ( int t=1; t i = sorter.iterator(); int num=0; - while ( i->more() ){ + while ( i->more() ) { pair p = i->next(); if ( num == 0 ) assert( p.first["x"].number() == 2 ); - else if ( num <= 2 ){ + else if ( num <= 2 ) { assert( p.first["x"].number() == 5 ); } else if ( num == 3 ) @@ -1117,15 +1170,15 @@ namespace JsobjTests { ASSERT( 0 ); num++; } - - + + ASSERT_EQUALS( 0 , sorter.numFiles() ); } }; class Basic2 { public: - void run(){ + void run() { BSONObjExternalSorter sorter( BSONObj() , 10 ); sorter.add( BSON( "x" << 10 ) , 5 , 11 ); sorter.add( BSON( "x" << 2 ) , 3 , 1 ); @@ -1133,18 +1186,18 @@ namespace JsobjTests { sorter.add( BSON( "x" << 5 ) , 7 , 1 ); sorter.sort(); - + auto_ptr i = sorter.iterator(); int num=0; - while ( i->more() ){ + while ( i->more() ) { pair p = i->next(); - if ( num == 0 ){ + if ( num == 0 ) { assert( p.first["x"].number() == 2 ); ASSERT_EQUALS( p.second.toString() , "3:1" ); } else if ( num <= 2 ) assert( p.first["x"].number() == 5 ); - else if ( num == 3 ){ + else if ( num == 3 ) { assert( p.first["x"].number() == 10 ); ASSERT_EQUALS( p.second.toString() , "5:b" ); } @@ -1158,7 +1211,7 @@ namespace JsobjTests { class Basic3 { public: - void run(){ + void run() { BSONObjExternalSorter sorter( BSONObj() , 10 ); sorter.sort(); @@ -1171,23 +1224,23 @@ namespace JsobjTests { class ByDiskLock { public: - void run(){ + void run() { BSONObjExternalSorter sorter; sorter.add( BSON( "x" 
<< 10 ) , 5 , 4); sorter.add( BSON( "x" << 2 ) , 3 , 0 ); sorter.add( BSON( "x" << 5 ) , 6 , 2 ); sorter.add( BSON( "x" << 5 ) , 7 , 3 ); sorter.add( BSON( "x" << 5 ) , 2 , 1 ); - + sorter.sort(); auto_ptr i = sorter.iterator(); int num=0; - while ( i->more() ){ + while ( i->more() ) { pair p = i->next(); if ( num == 0 ) assert( p.first["x"].number() == 2 ); - else if ( num <= 3 ){ + else if ( num <= 3 ) { assert( p.first["x"].number() == 5 ); } else if ( num == 4 ) @@ -1205,9 +1258,9 @@ namespace JsobjTests { class Big1 { public: - void run(){ + void run() { BSONObjExternalSorter sorter( BSONObj() , 2000 ); - for ( int i=0; i<10000; i++ ){ + for ( int i=0; i<10000; i++ ) { sorter.add( BSON( "x" << rand() % 10000 ) , 5 , i ); } @@ -1216,7 +1269,7 @@ namespace JsobjTests { auto_ptr i = sorter.iterator(); int num=0; double prev = 0; - while ( i->more() ){ + while ( i->more() ) { pair p = i->next(); num++; double cur = p.first["x"].number(); @@ -1226,22 +1279,22 @@ namespace JsobjTests { assert( num == 10000 ); } }; - + class Big2 { public: - void run(){ + void run() { const int total = 100000; BSONObjExternalSorter sorter( BSONObj() , total * 2 ); - for ( int i=0; i i = sorter.iterator(); int num=0; double prev = 0; - while ( i->more() ){ + while ( i->more() ) { pair p = i->next(); num++; double cur = p.first["x"].number(); @@ -1255,21 +1308,21 @@ namespace JsobjTests { class D1 { public: - void run(){ - + void run() { + BSONObjBuilder b; b.appendNull(""); BSONObj x = b.obj(); - + BSONObjExternalSorter sorter; sorter.add(x, DiskLoc(3,7)); sorter.add(x, DiskLoc(4,7)); sorter.add(x, DiskLoc(2,7)); sorter.add(x, DiskLoc(1,7)); sorter.add(x, DiskLoc(3,77)); - + sorter.sort(); - + auto_ptr i = sorter.iterator(); while( i->more() ) { BSONObjExternalSorter::Data d = i->next(); @@ -1280,14 +1333,14 @@ namespace JsobjTests { } }; } - + class CompatBSON { public: - + #define JSONBSONTEST(j,s,m) ASSERT_EQUALS( fromjson( j ).objsize() , s ); ASSERT_EQUALS( fromjson( j ).md5() , m ); #define RAWBSONTEST(j,s,m) ASSERT_EQUALS( j.objsize() , s ); ASSERT_EQUALS( j.md5() , m ); - void run(){ + void run() { JSONBSONTEST( "{ 'x' : true }" , 9 , "6fe24623e4efc5cf07f027f9c66b5456" ); JSONBSONTEST( "{ 'x' : null }" , 8 , "12d43430ff6729af501faf0638e68888" ); @@ -1297,20 +1350,20 @@ namespace JsobjTests { JSONBSONTEST( "{ 'a' : { 'b' : 1.1 } }" , 24 , "31887a4b9d55cd9f17752d6a8a45d51f" ); JSONBSONTEST( "{ 'x' : 5.2 , 'y' : { 'a' : 'eliot' , b : true } , 'z' : null }" , 44 , "b3de8a0739ab329e7aea138d87235205" ); JSONBSONTEST( "{ 'x' : 5.2 , 'y' : [ 'a' , 'eliot' , 'b' , true ] , 'z' : null }" , 62 , "cb7bad5697714ba0cbf51d113b6a0ee8" ); - + RAWBSONTEST( BSON( "x" << 4 ) , 12 , "d1ed8dbf79b78fa215e2ded74548d89d" ); - + } }; - + class CompareDottedFieldNamesTest { public: - void t( FieldCompareResult res , const string& l , const string& r ){ + void t( FieldCompareResult res , const string& l , const string& r ) { ASSERT_EQUALS( res , compareDottedFieldNames( l , r ) ); ASSERT_EQUALS( -1 * res , compareDottedFieldNames( r , l ) ); } - - void run(){ + + void run() { t( SAME , "x" , "x" ); t( SAME , "x.a" , "x.a" ); t( LEFT_BEFORE , "a" , "b" ); @@ -1320,13 +1373,13 @@ namespace JsobjTests { } }; - struct NestedDottedConversions{ - void t(const BSONObj& nest, const BSONObj& dot){ + struct NestedDottedConversions { + void t(const BSONObj& nest, const BSONObj& dot) { ASSERT_EQUALS( nested2dotted(nest), dot); ASSERT_EQUALS( nest, dotted2nested(dot)); } - void run(){ + void run() { t( BSON("a" << BSON("b" << 1)), BSON("a.b" 
<< 1) ); t( BSON("a" << BSON("b" << 1 << "c" << 1)), BSON("a.b" << 1 << "a.c" << 1) ); t( BSON("a" << BSON("b" << 1 << "c" << 1) << "d" << 1), BSON("a.b" << 1 << "a.c" << 1 << "d" << 1) ); @@ -1334,8 +1387,8 @@ namespace JsobjTests { } }; - struct BSONArrayBuilderTest{ - void run(){ + struct BSONArrayBuilderTest { + void run() { int i = 0; BSONObjBuilder objb; BSONArrayBuilder arrb; @@ -1374,13 +1427,13 @@ namespace JsobjTests { ASSERT_EQUALS(o["arr2"].type(), Array); } }; - - struct ArrayMacroTest{ - void run(){ + + struct ArrayMacroTest { + void run() { BSONArray arr = BSON_ARRAY( "hello" << 1 << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) ); BSONObj obj = BSON( "0" << "hello" - << "1" << 1 - << "2" << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) ); + << "1" << 1 + << "2" << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) ); ASSERT_EQUALS(arr, obj); ASSERT_EQUALS(arr["2"].type(), Object); @@ -1390,25 +1443,25 @@ namespace JsobjTests { class NumberParsing { public: - void run(){ + void run() { BSONObjBuilder a; BSONObjBuilder b; a.append( "a" , (int)1 ); ASSERT( b.appendAsNumber( "a" , "1" ) ); - + a.append( "b" , 1.1 ); ASSERT( b.appendAsNumber( "b" , "1.1" ) ); a.append( "c" , (int)-1 ); ASSERT( b.appendAsNumber( "c" , "-1" ) ); - + a.append( "d" , -1.1 ); ASSERT( b.appendAsNumber( "d" , "-1.1" ) ); a.append( "e" , (long long)32131231231232313LL ); ASSERT( b.appendAsNumber( "e" , "32131231231232313" ) ); - + ASSERT( ! b.appendAsNumber( "f" , "zz" ) ); ASSERT( ! b.appendAsNumber( "f" , "5zz" ) ); ASSERT( ! b.appendAsNumber( "f" , "zz5" ) ); @@ -1416,10 +1469,10 @@ namespace JsobjTests { ASSERT_EQUALS( a.obj() , b.obj() ); } }; - + class bson2settest { public: - void run(){ + void run() { BSONObj o = BSON( "z" << 1 << "a" << 2 << "m" << 3 << "c" << 4 ); BSONObjIteratorSorted i( o ); stringstream ss; @@ -1429,7 +1482,7 @@ namespace JsobjTests { { Timer t; - for ( int i=0; i<10000; i++ ){ + for ( int i=0; i<10000; i++ ) { BSONObjIteratorSorted j( o ); int l = 0; while ( j.more() ) @@ -1444,22 +1497,22 @@ namespace JsobjTests { class checkForStorageTests { public: - - void good( string s ){ + + void good( string s ) { BSONObj o = fromjson( s ); if ( o.okForStorage() ) return; throw UserException( 12528 , (string)"should be ok for storage:" + s ); } - void bad( string s ){ + void bad( string s ) { BSONObj o = fromjson( s ); if ( ! 
o.okForStorage() ) return; throw UserException( 12529 , (string)"should NOT be ok for storage:" + s ); } - void run(){ + void run() { good( "{x:1}" ); bad( "{'x.y':1}" ); @@ -1470,7 +1523,7 @@ namespace JsobjTests { class InvalidIDFind { public: - void run(){ + void run() { BSONObj x = BSON( "_id" << 5 << "t" << 2 ); { char * crap = (char*)malloc( x.objsize() ); @@ -1479,7 +1532,7 @@ namespace JsobjTests { ASSERT_EQUALS( x , y ); free( crap ); } - + { char * crap = (char*)malloc( x.objsize() ); memcpy( crap , x.objdata() , x.objsize() ); @@ -1490,21 +1543,21 @@ namespace JsobjTests { BSONObj y( crap , false ); state = 1; } - catch ( std::exception& e ){ + catch ( std::exception& e ) { state = 2; ASSERT( strstr( e.what() , "_id: 5" ) > 0 ); } free( crap ); ASSERT_EQUALS( 2 , state ); } - - + + } }; class ElementSetTest { public: - void run(){ + void run() { BSONObj x = BSON( "a" << 1 << "b" << 1 << "c" << 2 ); BSONElement a = x["a"]; BSONElement b = x["b"]; @@ -1512,7 +1565,7 @@ namespace JsobjTests { cout << "c: " << c << endl; ASSERT( a.woCompare( b ) != 0 ); ASSERT( a.woCompare( b , false ) == 0 ); - + BSONElementSet s; s.insert( a ); ASSERT_EQUALS( 1U , s.size() ); @@ -1523,8 +1576,8 @@ namespace JsobjTests { ASSERT( s.find( a ) != s.end() ); ASSERT( s.find( b ) != s.end() ); ASSERT( s.find( c ) == s.end() ); - - + + s.insert( c ); ASSERT_EQUALS( 2U , s.size() ); @@ -1536,12 +1589,22 @@ namespace JsobjTests { ASSERT( s.count( a ) ); ASSERT( s.count( b ) ); ASSERT( s.count( c ) ); + + { + BSONElementSet x; + BSONObj o = fromjson( "{ 'a' : [ 1 , 2 , 1 ] }" ); + BSONObjIterator i( o["a"].embeddedObjectUserCheck() ); + while ( i.more() ) { + x.insert( i.next() ); + } + ASSERT_EQUALS( 2U , x.size() ); + } } }; class EmbeddedNumbers { public: - void run(){ + void run() { BSONObj x = BSON( "a" << BSON( "b" << 1 ) ); BSONObj y = BSON( "a" << BSON( "b" << 1.0 ) ); ASSERT_EQUALS( x , y ); @@ -1551,12 +1614,12 @@ namespace JsobjTests { class BuilderPartialItearte { public: - void run(){ + void run() { { BSONObjBuilder b; b.append( "x" , 1 ); b.append( "y" , 2 ); - + BSONObjIterator i = b.iterator(); ASSERT( i.more() ); ASSERT_EQUALS( 1 , i.next().numberInt() ); @@ -1577,13 +1640,13 @@ namespace JsobjTests { ASSERT_EQUALS( BSON( "x" << 1 << "y" << 2 << "z" << 3 ) , b.obj() ); } - + } }; class BSONFieldTests { public: - void run(){ + void run() { { BSONField x("x"); BSONObj o = BSON( x << 5 ); @@ -1610,11 +1673,11 @@ namespace JsobjTests { class BSONForEachTest { public: - void run(){ + void run() { BSONObj obj = BSON("a" << 1 << "a" << 2 << "a" << 3); - + int count = 0; - BSONForEach(e, obj){ + BSONForEach(e, obj) { ASSERT_EQUALS( e.fieldName() , string("a") ); count += e.Int(); } @@ -1625,7 +1688,7 @@ namespace JsobjTests { class StringDataTest { public: - void run(){ + void run() { StringData a( string( "aaa" ) ); ASSERT_EQUALS( 3u , a.size() ); @@ -1645,8 +1708,8 @@ namespace JsobjTests { class CompareOps { public: - void run(){ - + void run() { + BSONObj a = BSON("a"<<1); BSONObj b = BSON("a"<<1); BSONObj c = BSON("a"<<2); @@ -1657,7 +1720,7 @@ namespace JsobjTests { ASSERT( ! ( a < b ) ); ASSERT( a <= b ); ASSERT( a < c ); - + ASSERT( f > d ); ASSERT( f >= e ); ASSERT( ! 
( f > e ) ); @@ -1666,12 +1729,12 @@ namespace JsobjTests { class HashingTest { public: - void run(){ + void run() { int N = 100000; - BSONObj x = BSON( "name" << "eliot was here" + BSONObj x = BSON( "name" << "eliot was here" << "x" << 5 << "asdasdasdas" << "asldkasldjasldjasldjlasjdlasjdlasdasdasdasdasdasdasd" ); - + { Timer t; for ( int i=0; i(); add< BSONElementBasic >(); add< BSONObjTests::Create >(); @@ -1724,6 +1787,10 @@ namespace JsobjTests { add< BSONObjTests::ToStringArray >(); add< BSONObjTests::ToStringNumber >(); add< BSONObjTests::NullString >(); + add< BSONObjTests::AppendAs >(); + add< BSONObjTests::ArrayAppendAs >(); + add< BSONObjTests::GetField >(); + add< BSONObjTests::Validation::BadType >(); add< BSONObjTests::Validation::EooBeforeEnd >(); add< BSONObjTests::Validation::Undefined >(); @@ -1771,16 +1838,13 @@ namespace JsobjTests { add< ValueStreamTests::LabelishOr >(); add< ValueStreamTests::Unallowed >(); add< ValueStreamTests::ElementAppend >(); - add< SubObjectBuilder >(); - add< DateBuilder >(); - add< DateNowBuilder >(); - add< TimeTBuilder >(); add< ValueStreamTests::Unallowed >(); add< ValueStreamTests::ElementAppend >(); add< SubObjectBuilder >(); add< DateBuilder >(); add< DateNowBuilder >(); add< TimeTBuilder >(); + add< MinMaxKeyBuilder >(); add< MinMaxElementTest >(); add< ComparatorTest >(); add< ExtractFieldsTest >(); @@ -1810,6 +1874,6 @@ namespace JsobjTests { add< HashingTest >(); } } myall; - + } // namespace JsobjTests diff --git a/dbtests/jsontests.cpp b/dbtests/jsontests.cpp index 990558e..b630523 100644 --- a/dbtests/jsontests.cpp +++ b/dbtests/jsontests.cpp @@ -205,11 +205,11 @@ namespace JsonTests { b.appendDBRef( "a", "namespace", oid ); BSONObj built = b.done(); ASSERT_EQUALS( "{ \"a\" : { \"$ref\" : \"namespace\", \"$id\" : \"ffffffffffffffffffffffff\" } }", - built.jsonString( Strict ) ); + built.jsonString( Strict ) ); ASSERT_EQUALS( "{ \"a\" : { \"$ref\" : \"namespace\", \"$id\" : \"ffffffffffffffffffffffff\" } }", - built.jsonString( JS ) ); + built.jsonString( JS ) ); ASSERT_EQUALS( "{ \"a\" : Dbref( \"namespace\", \"ffffffffffffffffffffffff\" ) }", - built.jsonString( TenGen ) ); + built.jsonString( TenGen ) ); } }; @@ -221,7 +221,7 @@ namespace JsonTests { BSONObjBuilder b; b.appendDBRef( "a", "namespace", oid ); ASSERT_EQUALS( "{ \"a\" : { \"$ref\" : \"namespace\", \"$id\" : \"000000000000000000000000\" } }", - b.done().jsonString( Strict ) ); + b.done().jsonString( Strict ) ); } }; @@ -234,9 +234,9 @@ namespace JsonTests { b.appendOID( "a", &oid ); BSONObj built = b.done(); ASSERT_EQUALS( "{ \"a\" : { \"$oid\" : \"ffffffffffffffffffffffff\" } }", - built.jsonString( Strict ) ); + built.jsonString( Strict ) ); ASSERT_EQUALS( "{ \"a\" : ObjectId( \"ffffffffffffffffffffffff\" ) }", - built.jsonString( TenGen ) ); + built.jsonString( TenGen ) ); } }; @@ -258,12 +258,12 @@ namespace JsonTests { BSONObjBuilder c; c.appendBinData( "a", 2, BinDataGeneral, z ); ASSERT_EQUALS( "{ \"a\" : { \"$binary\" : \"YWI=\", \"$type\" : \"00\" } }", - c.done().jsonString( Strict ) ); + c.done().jsonString( Strict ) ); BSONObjBuilder d; d.appendBinData( "a", 1, BinDataGeneral, z ); ASSERT_EQUALS( "{ \"a\" : { \"$binary\" : \"YQ==\", \"$type\" : \"00\" } }", - d.done().jsonString( Strict ) ); + d.done().jsonString( Strict ) ); } }; @@ -295,7 +295,7 @@ namespace JsonTests { b.appendRegex( "a", "abc", "i" ); BSONObj built = b.done(); ASSERT_EQUALS( "{ \"a\" : { \"$regex\" : \"abc\", \"$options\" : \"i\" } }", - built.jsonString( Strict ) ); + 
built.jsonString( Strict ) ); ASSERT_EQUALS( "{ \"a\" : /abc/i }", built.jsonString( TenGen ) ); ASSERT_EQUALS( "{ \"a\" : /abc/i }", built.jsonString( JS ) ); } @@ -308,7 +308,7 @@ namespace JsonTests { b.appendRegex( "a", "/\"", "i" ); BSONObj built = b.done(); ASSERT_EQUALS( "{ \"a\" : { \"$regex\" : \"/\\\"\", \"$options\" : \"i\" } }", - built.jsonString( Strict ) ); + built.jsonString( Strict ) ); ASSERT_EQUALS( "{ \"a\" : /\\/\\\"/i }", built.jsonString( TenGen ) ); ASSERT_EQUALS( "{ \"a\" : /\\/\\\"/i }", built.jsonString( JS ) ); } @@ -321,7 +321,7 @@ namespace JsonTests { b.appendRegex( "a", "z", "abcgimx" ); BSONObj built = b.done(); ASSERT_EQUALS( "{ \"a\" : { \"$regex\" : \"z\", \"$options\" : \"abcgimx\" } }", - built.jsonString( Strict ) ); + built.jsonString( Strict ) ); ASSERT_EQUALS( "{ \"a\" : /z/gim }", built.jsonString( TenGen ) ); ASSERT_EQUALS( "{ \"a\" : /z/gim }", built.jsonString( JS ) ); } @@ -329,17 +329,17 @@ namespace JsonTests { class CodeTests { public: - void run(){ + void run() { BSONObjBuilder b; b.appendCode( "x" , "function(){ return 1; }" ); BSONObj o = b.obj(); ASSERT_EQUALS( "{ \"x\" : function(){ return 1; } }" , o.jsonString() ); } }; - + class TimestampTests { public: - void run(){ + void run() { BSONObjBuilder b; b.appendTimestamp( "x" , 4000 , 10 ); BSONObj o = b.obj(); @@ -349,7 +349,7 @@ namespace JsonTests { class NullString { public: - void run(){ + void run() { BSONObjBuilder b; b.append( "x" , "a\0b" , 4 ); BSONObj o = b.obj(); @@ -359,7 +359,7 @@ namespace JsonTests { class AllTypes { public: - void run(){ + void run() { OID oid; oid.init(); @@ -384,12 +384,12 @@ namespace JsonTests { b.appendTimestamp( "s" , 123123123123123LL ); b.append( "t" , 12321312312LL ); b.appendMaxKey( "u" ); - + BSONObj o = b.obj(); cout << o.jsonString() << endl; } }; - + } // namespace JsonStringTests namespace FromJsonTests { @@ -504,7 +504,7 @@ namespace JsonTests { virtual ~FancyNumber() {} void run() { ASSERT_EQUALS( int( 1000000 * bson().firstElement().number() ), - int( 1000000 * fromjson( json() ).firstElement().number() ) ); + int( 1000000 * fromjson( json() ).firstElement().number() ) ); } virtual BSONObj bson() const { BSONObjBuilder b; @@ -978,8 +978,8 @@ namespace JsonTests { }; class NumericTypes : public Base { - public: - void run(){ + public: + void run() { Base::run(); BSONObj o = fromjson(json()); @@ -990,12 +990,12 @@ namespace JsonTests { ASSERT(o["long"].numberLong() == 9223372036854775807ll); } - + virtual BSONObj bson() const { return BSON( "int" << 123 - << "long" << 9223372036854775807ll // 2**63 - 1 - << "double" << 3.14 - ); + << "long" << 9223372036854775807ll // 2**63 - 1 + << "double" << 3.14 + ); } virtual string json() const { return "{ \"int\": 123, \"long\": 9223372036854775807, \"double\": 3.14 }"; @@ -1003,8 +1003,8 @@ namespace JsonTests { }; class NegativeNumericTypes : public Base { - public: - void run(){ + public: + void run() { Base::run(); BSONObj o = fromjson(json()); @@ -1015,12 +1015,12 @@ namespace JsonTests { ASSERT(o["long"].numberLong() == -9223372036854775807ll); } - + virtual BSONObj bson() const { return BSON( "int" << -123 - << "long" << -9223372036854775807ll // -1 * (2**63 - 1) - << "double" << -3.14 - ); + << "long" << -9223372036854775807ll // -1 * (2**63 - 1) + << "double" << -3.14 + ); } virtual string json() const { return "{ \"int\": -123, \"long\": -9223372036854775807, \"double\": -3.14 }"; @@ -1029,8 +1029,8 @@ namespace JsonTests { class EmbeddedDatesBase : public Base { public: - - virtual void 
run(){ + + virtual void run() { BSONObj o = fromjson( json() ); ASSERT_EQUALS( 3 , (o["time.valid"].type()) ); BSONObj e = o["time.valid"].embeddedObjectUserCheck(); @@ -1038,7 +1038,7 @@ namespace JsonTests { ASSERT_EQUALS( 9 , e["$lt"].type() ); Base::run(); } - + BSONObj bson() const { BSONObjBuilder e; e.appendDate( "$gt" , 1257829200000LL ); @@ -1082,10 +1082,10 @@ namespace JsonTests { class All : public Suite { public: - All() : Suite( "json" ){ + All() : Suite( "json" ) { } - void setupTests(){ + void setupTests() { add< JsonStringTests::Empty >(); add< JsonStringTests::SingleStringMember >(); add< JsonStringTests::EscapedCharacters >(); @@ -1116,7 +1116,7 @@ namespace JsonTests { add< JsonStringTests::TimestampTests >(); add< JsonStringTests::NullString >(); add< JsonStringTests::AllTypes >(); - + add< FromJsonTests::Empty >(); add< FromJsonTests::EmptyWithSpace >(); add< FromJsonTests::SingleString >(); diff --git a/dbtests/jstests.cpp b/dbtests/jstests.cpp index a9d9db8..c33b200 100644 --- a/dbtests/jstests.cpp +++ b/dbtests/jstests.cpp @@ -1,4 +1,4 @@ -// javajstests.cpp +// javajstests.cpp // /** @@ -22,15 +22,16 @@ #include "../pch.h" #include "../scripting/engine.h" +#include "../util/timer.h" #include "dbtests.h" namespace mongo { - bool dbEval(const char *ns, BSONObj& cmd, BSONObjBuilder& result, string& errmsg); + bool dbEval(const string& dbName , BSONObj& cmd, BSONObjBuilder& result, string& errmsg); } // namespace mongo namespace JSTests { - + class Fundamental { public: void run() { @@ -42,26 +43,26 @@ namespace JSTests { globalScriptEngine->runTest(); } }; - + class BasicScope { public: - void run(){ + void run() { auto_ptr s; s.reset( globalScriptEngine->newScope() ); s->setNumber( "x" , 5 ); ASSERT( 5 == s->getNumber( "x" ) ); - + s->setNumber( "x" , 1.67 ); ASSERT( 1.67 == s->getNumber( "x" ) ); s->setString( "s" , "eliot was here" ); ASSERT( "eliot was here" == s->getString( "s" ) ); - + s->setBoolean( "b" , true ); ASSERT( s->getBoolean( "b" ) ); - if ( 0 ){ + if ( 0 ) { s->setBoolean( "b" , false ); ASSERT( ! s->getBoolean( "b" ) ); } @@ -70,12 +71,12 @@ namespace JSTests { class ResetScope { public: - void run(){ + void run() { // Not worrying about this for now SERVER-446. /* auto_ptr s; s.reset( globalScriptEngine->newScope() ); - + s->setBoolean( "x" , true ); ASSERT( s->getBoolean( "x" ) ); @@ -84,36 +85,36 @@ namespace JSTests { */ } }; - + class FalseTests { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); ASSERT( ! s->getBoolean( "x" ) ); - + s->setString( "z" , "" ); ASSERT( ! s->getBoolean( "z" ) ); - - + + delete s ; } }; class SimpleFunctions { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); s->invoke( "x=5;" , BSONObj() ); ASSERT( 5 == s->getNumber( "x" ) ); - + s->invoke( "return 17;" , BSONObj() ); ASSERT( 17 == s->getNumber( "return" ) ); - + s->invoke( "function(){ return 17; }" , BSONObj() ); ASSERT( 17 == s->getNumber( "return" ) ); - + s->setNumber( "x" , 1.76 ); s->invoke( "return x == 1.76; " , BSONObj() ); ASSERT( s->getBoolean( "return" ) ); @@ -121,7 +122,7 @@ namespace JSTests { s->setNumber( "x" , 1.76 ); s->invoke( "return x == 1.79; " , BSONObj() ); ASSERT( ! 
s->getBoolean( "return" ) ); - + s->invoke( "function( z ){ return 5 + z; }" , BSON( "" << 11 ) ); ASSERT_EQUALS( 16 , s->getNumber( "return" ) ); @@ -131,9 +132,9 @@ namespace JSTests { class ObjectMapping { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); - + BSONObj o = BSON( "x" << 17 << "y" << "eliot" << "z" << "sara" ); s->setObject( "blah" , o ); @@ -154,7 +155,7 @@ namespace JSTests { s->invoke( "this.z == 'asara';" , BSONObj() ); ASSERT_EQUALS( false , s->getBoolean( "return" ) ); - + s->invoke( "return this.x == 17;" , BSONObj() ); ASSERT_EQUALS( true , s->getBoolean( "return" ) ); @@ -169,28 +170,28 @@ namespace JSTests { s->invoke( "function (){ return this.x == 17; }" , BSONObj() ); ASSERT_EQUALS( true , s->getBoolean( "return" ) ); - + s->invoke( "function z(){ return this.x == 18; }" , BSONObj() ); ASSERT_EQUALS( false , s->getBoolean( "return" ) ); s->invoke( "function (){ this.x == 17; }" , BSONObj() ); ASSERT_EQUALS( false , s->getBoolean( "return" ) ); - + s->invoke( "function z(){ this.x == 18; }" , BSONObj() ); ASSERT_EQUALS( false , s->getBoolean( "return" ) ); s->invoke( "x = 5; for( ; x <10; x++){ a = 1; }" , BSONObj() ); ASSERT_EQUALS( 10 , s->getNumber( "x" ) ); - + delete s; } }; class ObjectDecoding { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); - + s->invoke( "z = { num : 1 };" , BSONObj() ); BSONObj out = s->getObject( "z" ); ASSERT_EQUALS( 1 , out["num"].number() ); @@ -200,43 +201,43 @@ namespace JSTests { out = s->getObject( "z" ); ASSERT_EQUALS( (string)"eliot" , out["x"].valuestr() ); ASSERT_EQUALS( 1 , out.nFields() ); - + BSONObj o = BSON( "x" << 17 ); - s->setObject( "blah" , o ); + s->setObject( "blah" , o ); out = s->getObject( "blah" ); ASSERT_EQUALS( 17 , out["x"].number() ); - + delete s; } }; - + class JSOIDTests { public: - void run(){ + void run() { #ifdef MOZJS Scope * s = globalScriptEngine->newScope(); - + s->localConnect( "blah" ); - + s->invoke( "z = { _id : new ObjectId() , a : 123 };" , BSONObj() ); BSONObj out = s->getObject( "z" ); ASSERT_EQUALS( 123 , out["a"].number() ); ASSERT_EQUALS( jstOID , out["_id"].type() ); - + OID save = out["_id"].__oid(); - + s->setObject( "a" , out ); - - s->invoke( "y = { _id : a._id , a : 124 };" , BSONObj() ); + + s->invoke( "y = { _id : a._id , a : 124 };" , BSONObj() ); out = s->getObject( "y" ); ASSERT_EQUALS( 124 , out["a"].number() ); - ASSERT_EQUALS( jstOID , out["_id"].type() ); + ASSERT_EQUALS( jstOID , out["_id"].type() ); ASSERT_EQUALS( out["_id"].__oid().str() , save.str() ); - s->invoke( "y = { _id : new ObjectId( a._id ) , a : 125 };" , BSONObj() ); + s->invoke( "y = { _id : new ObjectId( a._id ) , a : 125 };" , BSONObj() ); out = s->getObject( "y" ); ASSERT_EQUALS( 125 , out["a"].number() ); - ASSERT_EQUALS( jstOID , out["_id"].type() ); + ASSERT_EQUALS( jstOID , out["_id"].type() ); ASSERT_EQUALS( out["_id"].__oid().str() , save.str() ); delete s; @@ -267,9 +268,9 @@ namespace JSTests { class ObjectModReadonlyTests { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); - + BSONObj o = BSON( "x" << 17 << "y" << "eliot" << "z" << "sara" << "zz" << BSONObj() ); s->setObject( "blah" , o , true ); @@ -288,16 +289,16 @@ namespace JSTests { s->setObject( "blah.zz", BSON( "a" << 19 ) ); out = s->getObject( "blah" ); ASSERT( out["zz"].embeddedObject()["a"].eoo() ); - + s->invoke( "delete blah['x']" , BSONObj() ); out = s->getObject( "blah" ); ASSERT( !out["x"].eoo() ); - + // read-only object itself 
can be overwritten s->invoke( "blah = {}", BSONObj() ); out = s->getObject( "blah" ); ASSERT( out.isEmpty() ); - + // test array - can't implement this in v8 // o = fromjson( "{a:[1,2,3]}" ); // s->setObject( "blah", o, true ); @@ -307,45 +308,47 @@ namespace JSTests { // out = s->getObject( "blah" ); // ASSERT_EQUALS( 1.0, out[ "a" ].embeddedObject()[ 0 ].number() ); // ASSERT_EQUALS( 3.0, out[ "a" ].embeddedObject()[ 2 ].number() ); - + delete s; } }; class OtherJSTypes { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); - - { // date + + { + // date BSONObj o; - { + { BSONObjBuilder b; b.appendDate( "d" , 123456789 ); o = b.obj(); } s->setObject( "x" , o ); - + s->invoke( "return x.d.getTime() != 12;" , BSONObj() ); ASSERT_EQUALS( true, s->getBoolean( "return" ) ); - + s->invoke( "z = x.d.getTime();" , BSONObj() ); ASSERT_EQUALS( 123456789 , s->getNumber( "z" ) ); - + s->invoke( "z = { z : x.d }" , BSONObj() ); BSONObj out = s->getObject( "z" ); ASSERT( out["z"].type() == Date ); } - { // regex + { + // regex BSONObj o; - { + { BSONObjBuilder b; b.appendRegex( "r" , "^a" , "i" ); o = b.obj(); } s->setObject( "x" , o ); - + s->invoke( "z = x.r.test( 'b' );" , BSONObj() ); ASSERT_EQUALS( false , s->getBoolean( "z" ) ); @@ -362,26 +365,26 @@ namespace JSTests { ASSERT_EQUALS( (string)"i" , out["a"].regexFlags() ); } - + // array { BSONObj o = fromjson( "{r:[1,2,3]}" ); - s->setObject( "x", o, false ); + s->setObject( "x", o, false ); BSONObj out = s->getObject( "x" ); ASSERT_EQUALS( Array, out.firstElement().type() ); - s->setObject( "x", o, true ); + s->setObject( "x", o, true ); out = s->getObject( "x" ); ASSERT_EQUALS( Array, out.firstElement().type() ); } - + delete s; } }; class SpecialDBTypes { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); BSONObjBuilder b; @@ -389,7 +392,7 @@ namespace JSTests { b.appendMinKey( "b" ); b.appendMaxKey( "c" ); b.appendTimestamp( "d" , 1234000 , 9876 ); - + { BSONObj t = b.done(); @@ -398,7 +401,7 @@ namespace JSTests { } s->setObject( "z" , b.obj() ); - + ASSERT( s->invoke( "y = { a : z.a , b : z.b , c : z.c , d: z.d }" , BSONObj() ) == 0 ); BSONObj out = s->getObject( "y" ); @@ -414,14 +417,14 @@ namespace JSTests { delete s; } }; - + class TypeConservation { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); - + // -- A -- - + BSONObj o; { BSONObjBuilder b ; @@ -431,7 +434,7 @@ namespace JSTests { } ASSERT_EQUALS( NumberInt , o["a"].type() ); ASSERT_EQUALS( NumberDouble , o["b"].type() ); - + s->setObject( "z" , o ); s->invoke( "return z" , BSONObj() ); BSONObj out = s->getObject( "return" ); @@ -442,7 +445,7 @@ namespace JSTests { ASSERT_EQUALS( NumberInt , out["a"].type() ); // -- B -- - + { BSONObjBuilder b ; b.append( "a" , (int)5 ); @@ -459,31 +462,31 @@ namespace JSTests { ASSERT_EQUALS( NumberDouble , out["b"].type() ); ASSERT_EQUALS( NumberInt , out["a"].type() ); - + // -- C -- - + { BSONObjBuilder b ; - + { BSONObjBuilder c; c.append( "0" , 5.5 ); c.append( "1" , 6 ); b.appendArray( "a" , c.obj() ); } - + o = b.obj(); } - + ASSERT_EQUALS( NumberDouble , o["a"].embeddedObjectUserCheck()["0"].type() ); ASSERT_EQUALS( NumberInt , o["a"].embeddedObjectUserCheck()["1"].type() ); - + s->setObject( "z" , o , false ); out = s->getObject( "z" ); ASSERT_EQUALS( NumberDouble , out["a"].embeddedObjectUserCheck()["0"].type() ); ASSERT_EQUALS( NumberInt , out["a"].embeddedObjectUserCheck()["1"].type() ); - + s->invokeSafe( "z.z = 5;" , BSONObj() ); out = 
s->getObject( "z" ); ASSERT_EQUALS( 5 , out["z"].number() ); @@ -493,9 +496,9 @@ namespace JSTests { // Eliot says I don't have to worry about this case - + // // -- D -- -// +// // o = fromjson( "{a:3.0,b:4.5}" ); // ASSERT_EQUALS( NumberDouble , o["a"].type() ); // ASSERT_EQUALS( NumberDouble , o["b"].type() ); @@ -505,20 +508,20 @@ namespace JSTests { // out = s->getObject( "return" ); // ASSERT_EQUALS( 3 , out["a"].number() ); // ASSERT_EQUALS( 4.5 , out["b"].number() ); -// +// // ASSERT_EQUALS( NumberDouble , out["b"].type() ); // ASSERT_EQUALS( NumberDouble , out["a"].type() ); -// - +// + delete s; } - + }; - + class NumberLong { public: void run() { - Scope * s = globalScriptEngine->newScope(); + auto_ptr s( globalScriptEngine->newScope() ); s->localConnect( "blah" ); BSONObjBuilder b; long long val = (long long)( 0xbabadeadbeefbaddULL ); @@ -527,7 +530,7 @@ namespace JSTests { s->setObject( "a", in ); BSONObj out = s->getObject( "a" ); ASSERT_EQUALS( mongo::NumberLong, out.firstElement().type() ); - + ASSERT( s->exec( "printjson( a ); b = {b:a.a}", "foo", false, true, false ) ); out = s->getObject( "b" ); ASSERT_EQUALS( mongo::NumberLong, out.firstElement().type() ); @@ -537,7 +540,7 @@ namespace JSTests { cout << out.toString() << endl; ASSERT_EQUALS( val, out.firstElement().numberLong() ); } - + ASSERT( s->exec( "c = {c:a.a.toString()}", "foo", false, true, false ) ); out = s->getObject( "c" ); stringstream ss; @@ -552,12 +555,12 @@ namespace JSTests { ASSERT( s->exec( "e = {e:a.a.floatApprox}", "foo", false, true, false ) ); out = s->getObject( "e" ); ASSERT_EQUALS( NumberDouble, out.firstElement().type() ); - ASSERT_EQUALS( double( val ), out.firstElement().number() ); + ASSERT_EQUALS( double( val ), out.firstElement().number() ); ASSERT( s->exec( "f = {f:a.a.top}", "foo", false, true, false ) ); out = s->getObject( "f" ); ASSERT( NumberDouble == out.firstElement().type() || NumberInt == out.firstElement().type() ); - + s->setObject( "z", BSON( "z" << (long long)( 4 ) ) ); ASSERT( s->exec( "y = {y:z.z.top}", "foo", false, true, false ) ); out = s->getObject( "y" ); @@ -566,36 +569,64 @@ namespace JSTests { ASSERT( s->exec( "x = {x:z.z.floatApprox}", "foo", false, true, false ) ); out = s->getObject( "x" ); ASSERT( NumberDouble == out.firstElement().type() || NumberInt == out.firstElement().type() ); - ASSERT_EQUALS( double( 4 ), out.firstElement().number() ); + ASSERT_EQUALS( double( 4 ), out.firstElement().number() ); ASSERT( s->exec( "w = {w:z.z}", "foo", false, true, false ) ); out = s->getObject( "w" ); ASSERT_EQUALS( mongo::NumberLong, out.firstElement().type() ); - ASSERT_EQUALS( 4, out.firstElement().numberLong() ); - + ASSERT_EQUALS( 4, out.firstElement().numberLong() ); + } }; - + + class NumberLong2 { + public: + void run() { + auto_ptr s( globalScriptEngine->newScope() ); + s->localConnect( "blah" ); + + BSONObj in; + { + BSONObjBuilder b; + b.append( "a" , 5 ); + b.append( "b" , (long long)5 ); + b.append( "c" , (long long)pow( 2.0, 29 ) ); + b.append( "d" , (long long)pow( 2.0, 30 ) ); + b.append( "e" , (long long)pow( 2.0, 31 ) ); + b.append( "f" , (long long)pow( 2.0, 45 ) ); + in = b.obj(); + } + s->setObject( "a" , in ); + + ASSERT( s->exec( "x = tojson( a ); " ,"foo" , false , true , false ) ); + string outString = s->getString( "x" ); + + ASSERT( s->exec( (string)"y = " + outString , "foo2" , false , true , false ) ); + BSONObj out = s->getObject( "y" ); + ASSERT_EQUALS( in , out ); + } + }; + class WeirdObjects { public: - BSONObj build( int depth ){ + 
BSONObj build( int depth ) { BSONObjBuilder b; b.append( "0" , depth ); if ( depth > 0 ) b.appendArray( "1" , build( depth - 1 ) ); return b.obj(); } - - void run(){ + + void run() { Scope * s = globalScriptEngine->newScope(); s->localConnect( "blah" ); - - for ( int i=5; i<100 ; i += 10 ){ + + for ( int i=5; i<100 ; i += 10 ) { s->setObject( "a" , build(i) , false ); s->invokeSafe( "tojson( a )" , BSONObj() ); - + s->setObject( "a" , build(5) , true ); s->invokeSafe( "tojson( a )" , BSONObj() ); } @@ -609,11 +640,12 @@ namespace JSTests { BSONObj cmd; BSONObjBuilder result; string errmsg; - dbEval( "", cmd, result, errmsg); + dbEval( "test", cmd, result, errmsg); + assert(0); } DBDirectClient client; - + class Utf8Check { public: Utf8Check() { reset(); } @@ -638,7 +670,7 @@ namespace JSTests { } void reset() { client.dropCollection( ns() ); - } + } static const char *ns() { return "unittest.jstests.utf8check"; } }; @@ -654,13 +686,13 @@ namespace JSTests { private: void reset() { client.dropCollection( ns() ); - } + } static const char *ns() { return "unittest.jstests.longutf8string"; } }; class InvalidUTF8Check { public: - void run(){ + void run() { if( !globalScriptEngine->utf8Ok() ) return; @@ -676,24 +708,24 @@ namespace JSTests { crap[2] = (char) 128; crap[3] = 17; crap[4] = 0; - + BSONObjBuilder bb; bb.append( "x" , crap ); b = bb.obj(); } - + //cout << "ELIOT: " << b.jsonString() << endl; s->setThis( &b ); // its ok if this is handled by js, just can't create a c++ exception - s->invoke( "x=this.x.length;" , BSONObj() ); + s->invoke( "x=this.x.length;" , BSONObj() ); } }; - + class CodeTests { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); - + { BSONObjBuilder b; b.append( "a" , 1 ); @@ -702,10 +734,10 @@ namespace JSTests { b.appendCodeWScope( "d" , "function(){ out.d = 13 + bleh; }" , BSON( "bleh" << 5 ) ); s->setObject( "foo" , b.obj() ); } - + s->invokeSafe( "out = {}; out.a = foo.a; foo.b(); foo.c();" , BSONObj() ); BSONObj out = s->getObject( "out" ); - + ASSERT_EQUALS( 1 , out["a"].number() ); ASSERT_EQUALS( 11 , out["b"].number() ); ASSERT_EQUALS( 12 , out["c"].number() ); @@ -714,7 +746,7 @@ namespace JSTests { //s->invokeSafe( "foo.d() " , BSONObj() ); //out = s->getObject( "out" ); //ASSERT_EQUALS( 18 , out["d"].number() ); - + delete s; } @@ -722,19 +754,19 @@ namespace JSTests { class DBRefTest { public: - DBRefTest(){ + DBRefTest() { _a = "unittest.dbref.a"; _b = "unittest.dbref.b"; reset(); } - ~DBRefTest(){ + ~DBRefTest() { //reset(); } - - void run(){ + + void run() { client.insert( _a , BSON( "a" << "17" ) ); - + { BSONObj fromA = client.findOne( _a , BSONObj() ); assert( fromA.valid() ); @@ -744,28 +776,28 @@ namespace JSTests { b.appendDBRef( "c" , "dbref.a" , fromA["_id"].__oid() ); client.insert( _b , b.obj() ); } - + ASSERT( client.eval( "unittest" , "x = db.dbref.b.findOne(); assert.eq( 17 , x.c.fetch().a , 'ref working' );" ) ); - + // BSON DBRef <=> JS DBPointer ASSERT( client.eval( "unittest", "x = db.dbref.b.findOne(); db.dbref.b.drop(); x.c = new DBPointer( x.c.ns, x.c.id ); db.dbref.b.insert( x );" ) ); ASSERT_EQUALS( DBRef, client.findOne( "unittest.dbref.b", "" )[ "c" ].type() ); - + // BSON Object <=> JS DBRef ASSERT( client.eval( "unittest", "x = db.dbref.b.findOne(); db.dbref.b.drop(); x.c = new DBRef( x.c.ns, x.c.id ); db.dbref.b.insert( x );" ) ); ASSERT_EQUALS( Object, client.findOne( "unittest.dbref.b", "" )[ "c" ].type() ); ASSERT_EQUALS( string( "dbref.a" ), client.findOne( "unittest.dbref.b", "" )[ "c" 
].embeddedObject().getStringField( "$ref" ) ); } - - void reset(){ + + void reset() { client.dropCollection( _a ); client.dropCollection( _b ); } - + const char * _a; const char * _b; }; - + class InformalDBRef { public: void run() { @@ -775,20 +807,20 @@ namespace JSTests { client.insert( ns(), BSON( "r" << BSON( "$ref" << "jstests.informaldbref" << "$id" << obj["_id"].__oid() << "foo" << "bar" ) ) ); obj = client.findOne( ns(), BSONObj() ); ASSERT_EQUALS( "bar", obj[ "r" ].embeddedObject()[ "foo" ].str() ); - + ASSERT( client.eval( "unittest", "x = db.jstests.informaldbref.findOne(); y = { r:x.r }; db.jstests.informaldbref.drop(); y.r[ \"a\" ] = \"b\"; db.jstests.informaldbref.save( y );" ) ); obj = client.findOne( ns(), BSONObj() ); - ASSERT_EQUALS( "bar", obj[ "r" ].embeddedObject()[ "foo" ].str() ); - ASSERT_EQUALS( "b", obj[ "r" ].embeddedObject()[ "a" ].str() ); + ASSERT_EQUALS( "bar", obj[ "r" ].embeddedObject()[ "foo" ].str() ); + ASSERT_EQUALS( "b", obj[ "r" ].embeddedObject()[ "a" ].str() ); } private: static const char *ns() { return "unittest.jstests.informaldbref"; } }; - + class BinDataType { public: - - void pp( const char * s , BSONElement e ){ + + void pp( const char * s , BSONElement e ) { int len; const char * data = e.binData( len ); cout << s << ":" << e.binDataType() << "\t" << len << endl; @@ -798,12 +830,12 @@ namespace JSTests { cout << endl; } - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); s->localConnect( "asd" ); const char * foo = "asdas\0asdasd"; const char * base64 = "YXNkYXMAYXNkYXNk"; - + BSONObj in; { BSONObjBuilder b; @@ -812,10 +844,10 @@ namespace JSTests { in = b.obj(); s->setObject( "x" , in ); } - + s->invokeSafe( "myb = x.b; print( myb ); printjson( myb );" , BSONObj() ); s->invokeSafe( "y = { c : myb };" , BSONObj() ); - + BSONObj out = s->getObject( "y" ); ASSERT_EQUALS( BinData , out["c"].type() ); // pp( "in " , in["b"] ); @@ -827,14 +859,14 @@ namespace JSTests { stringstream expected; expected << "BinData(" << BinDataGeneral << ",\"" << base64 << "\")"; ASSERT_EQUALS( expected.str(), s->getString( "q" ) ); - + stringstream scriptBuilder; scriptBuilder << "z = { c : new BinData( " << BinDataGeneral << ", \"" << base64 << "\" ) };"; string script = scriptBuilder.str(); s->invokeSafe( script.c_str(), BSONObj() ); out = s->getObject( "z" ); // pp( "out" , out["c"] ); - ASSERT_EQUALS( 0 , in["b"].woCompare( out["c"] , false ) ); + ASSERT_EQUALS( 0 , in["b"].woCompare( out["c"] , false ) ); s->invokeSafe( "a = { f: new BinData( 128, \"\" ) };", BSONObj() ); out = s->getObject( "a" ); @@ -842,16 +874,16 @@ namespace JSTests { out[ "f" ].binData( len ); ASSERT_EQUALS( 0, len ); ASSERT_EQUALS( 128, out[ "f" ].binDataType() ); - + delete s; } }; class VarTests { public: - void run(){ + void run() { Scope * s = globalScriptEngine->newScope(); - + ASSERT( s->exec( "a = 5;" , "a" , false , true , false ) ); ASSERT_EQUALS( 5 , s->getNumber("a" ) ); @@ -863,19 +895,19 @@ namespace JSTests { class Speed1 { public: - void run(){ + void run() { BSONObj start = BSON( "x" << 5 ); BSONObj empty; auto_ptr s; s.reset( globalScriptEngine->newScope() ); - + ScriptingFunction f = s->createFunction( "return this.x + 6;" ); s->setThis( &start ); - + Timer t; double n = 0; - for ( ; n < 100000; n++ ){ + for ( ; n < 100000; n++ ) { s->invoke( f , empty ); ASSERT_EQUALS( 11 , s->getNumber( "return" ) ); } @@ -885,10 +917,10 @@ namespace JSTests { class ScopeOut { public: - void run(){ + void run() { auto_ptr s; s.reset( 
globalScriptEngine->newScope() ); - + s->invokeSafe( "x = 5;" , BSONObj() ); { BSONObjBuilder b; @@ -910,18 +942,39 @@ namespace JSTests { } }; + class RenameTest { + public: + void run() { + auto_ptr s; + s.reset( globalScriptEngine->newScope() ); + + s->setNumber( "x" , 5 ); + ASSERT_EQUALS( 5 , s->getNumber( "x" ) ); + ASSERT_EQUALS( Undefined , s->type( "y" ) ); + + s->rename( "x" , "y" ); + ASSERT_EQUALS( 5 , s->getNumber( "y" ) ); + ASSERT_EQUALS( Undefined , s->type( "x" ) ); + + s->rename( "y" , "x" ); + ASSERT_EQUALS( 5 , s->getNumber( "x" ) ); + ASSERT_EQUALS( Undefined , s->type( "y" ) ); + } + }; + + class All : public Suite { public: All() : Suite( "js" ) { } - - void setupTests(){ + + void setupTests() { add< Fundamental >(); add< BasicScope >(); add< ResetScope >(); add< FalseTests >(); add< SimpleFunctions >(); - + add< ObjectMapping >(); add< ObjectDecoding >(); add< JSOIDTests >(); @@ -931,15 +984,17 @@ namespace JSTests { add< SpecialDBTypes >(); add< TypeConservation >(); add< NumberLong >(); - + add< NumberLong2 >(); + add< RenameTest >(); + add< WeirdObjects >(); add< CodeTests >(); add< DBRefTest >(); add< InformalDBRef >(); add< BinDataType >(); - + add< VarTests >(); - + add< Speed1 >(); add< InvalidUTF8Check >(); @@ -949,6 +1004,6 @@ namespace JSTests { add< ScopeOut >(); } } myall; - + } // namespace JavaJSTests diff --git a/dbtests/matchertests.cpp b/dbtests/matchertests.cpp index 696c924..380b8b8 100644 --- a/dbtests/matchertests.cpp +++ b/dbtests/matchertests.cpp @@ -18,12 +18,15 @@ */ #include "pch.h" -#include "../db/matcher.h" +#include "../util/timer.h" +#include "../db/matcher.h" #include "../db/json.h" #include "dbtests.h" + + namespace MatcherTests { class Basic { @@ -34,26 +37,26 @@ namespace MatcherTests { ASSERT( m.matches( fromjson( "{\"a\":\"b\"}" ) ) ); } }; - + class DoubleEqual { public: void run() { BSONObj query = fromjson( "{\"a\":5}" ); Matcher m( query ); - ASSERT( m.matches( fromjson( "{\"a\":5}" ) ) ); + ASSERT( m.matches( fromjson( "{\"a\":5}" ) ) ); } }; - + class MixedNumericEqual { public: void run() { BSONObjBuilder query; query.append( "a", 5 ); Matcher m( query.done() ); - ASSERT( m.matches( fromjson( "{\"a\":5}" ) ) ); - } + ASSERT( m.matches( fromjson( "{\"a\":5}" ) ) ); + } }; - + class MixedNumericGt { public: void run() { @@ -62,16 +65,16 @@ namespace MatcherTests { BSONObjBuilder b; b.append( "a", 5 ); ASSERT( m.matches( b.done() ) ); - } + } }; - + class MixedNumericIN { public: - void run(){ + void run() { BSONObj query = fromjson( "{ a : { $in : [4,6] } }" ); ASSERT_EQUALS( 4 , query["a"].embeddedObject()["$in"].embeddedObject()["0"].number() ); ASSERT_EQUALS( NumberInt , query["a"].embeddedObject()["$in"].embeddedObject()["0"].type() ); - + Matcher m( query ); { @@ -92,19 +95,19 @@ namespace MatcherTests { b.append( "a" , 4 ); ASSERT( m.matches( b.done() ) ); } - + } }; class MixedNumericEmbedded { public: - void run(){ + void run() { Matcher m( BSON( "a" << BSON( "x" << 1 ) ) ); ASSERT( m.matches( BSON( "a" << BSON( "x" << 1 ) ) ) ); ASSERT( m.matches( BSON( "a" << BSON( "x" << 1.0 ) ) ) ); } }; - + class Size { public: void run() { @@ -113,16 +116,38 @@ namespace MatcherTests { ASSERT( !m.matches( fromjson( "{a:[1,2,3]}" ) ) ); ASSERT( !m.matches( fromjson( "{a:[1,2,3,'a','b']}" ) ) ); ASSERT( !m.matches( fromjson( "{a:[[1,2,3,4]]}" ) ) ); - } + } + }; + + + class TimingBase { + public: + long time( const BSONObj& patt , const BSONObj& obj ) { + Matcher m( patt ); + Timer t; + for ( int i=0; i<10000; i++ ) { + ASSERT( 
m.matches( obj ) ); + } + return t.millis(); + } + }; + + class AllTiming : public TimingBase { + public: + void run() { + long normal = time( BSON( "x" << 5 ) , BSON( "x" << 5 ) ); + long all = time( BSON( "x" << BSON( "$all" << BSON_ARRAY( 5 ) ) ) , BSON( "x" << 5 ) ); + + cout << "normal: " << normal << " all: " << all << endl; + } }; - class All : public Suite { public: - All() : Suite( "matcher" ){ + All() : Suite( "matcher" ) { } - - void setupTests(){ + + void setupTests() { add< Basic >(); add< DoubleEqual >(); add< MixedNumericEqual >(); @@ -130,8 +155,9 @@ namespace MatcherTests { add< MixedNumericIN >(); add< Size >(); add< MixedNumericEmbedded >(); + add< AllTiming >(); } } dball; - + } // namespace MatcherTests diff --git a/dbtests/mmaptests.cpp b/dbtests/mmaptests.cpp new file mode 100644 index 0000000..7fb6eee --- /dev/null +++ b/dbtests/mmaptests.cpp @@ -0,0 +1,219 @@ +// @file mmaptests.cpp + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "pch.h" +#include "../db/mongommf.h" +#include "../util/timer.h" +#include "dbtests.h" + +namespace MMapTests { + + class LeakTest { + const string fn; + const int optOld; + public: + LeakTest() : + fn( (path(dbpath) / "testfile.map").string() ), optOld(cmdLine.durOptions) + { + cmdLine.durOptions = 0; // DurParanoid doesn't make sense with this test + } + ~LeakTest() { + cmdLine.durOptions = optOld; + try { boost::filesystem::remove(fn); } + catch(...) { } + } + void run() { + + try { boost::filesystem::remove(fn); } + catch(...) { } + + writelock lk; + + { + MongoMMF f; + unsigned long long len = 256 * 1024 * 1024; + assert( f.create(fn, len, /*sequential*/false) ); + { + char *p = (char *) f.getView(); + assert(p); + // write something to the private view as a test + if( cmdLine.dur ) + MemoryMappedFile::makeWritable(p, 6); + strcpy(p, "hello"); + } + if( cmdLine.dur ) { + char *w = (char *) f.view_write(); + strcpy(w + 6, "world"); + } + MongoFileFinder ff; + ASSERT( ff.findByPath(fn) ); + ASSERT( ff.findByPath("asdf") == 0 ); + } + { + MongoFileFinder ff; + ASSERT( ff.findByPath(fn) == 0 ); + } + + int N = 10000; +#if !defined(_WIN32) && !defined(__linux__) + // seems this test is slow on OS X. + N = 100; +#endif + + // we make a lot here -- if we were leaking, presumably it would fail doing this many. 
+ Timer t; + for( int i = 0; i < N; i++ ) { + MongoMMF f; + assert( f.open(fn, i%4==1) ); + { + char *p = (char *) f.getView(); + assert(p); + if( cmdLine.dur ) + MemoryMappedFile::makeWritable(p, 4); + strcpy(p, "zzz"); + } + if( cmdLine.dur ) { + char *w = (char *) f.view_write(); + if( i % 2 == 0 ) + ++(*w); + assert( w[6] == 'w' ); + } + } + if( t.millis() > 10000 ) { + log() << "warning: MMap LeakTest is unusually slow N:" << N << ' ' << t.millis() << "ms" << endl; + } + + } + }; + + class All : public Suite { + public: + All() : Suite( "mmap" ) {} + void setupTests() { + add< LeakTest >(); + } + } myall; + +#if 0 + + class CopyOnWriteSpeedTest { + public: + void run() { + + string fn = "/tmp/testfile.map"; + boost::filesystem::remove(fn); + + MemoryMappedFile f; + char *p = (char *) f.create(fn, 1024 * 1024 * 1024, true); + assert(p); + strcpy(p, "hello"); + + { + void *x = f.testGetCopyOnWriteView(); + Timer tt; + for( int i = 11; i < 1000000000; i++ ) + p[i] = 'z'; + cout << "fill 1GB time: " << tt.millis() << "ms" << endl; + f.testCloseCopyOnWriteView(x); + } + + /* test a lot of view/unviews */ + { + Timer t; + + char *q; + for( int i = 0; i < 1000; i++ ) { + q = (char *) f.testGetCopyOnWriteView(); + assert( q ); + if( i == 999 ) { + strcpy(q+2, "there"); + } + f.testCloseCopyOnWriteView(q); + } + + cout << "view unview: " << t.millis() << "ms" << endl; + } + + f.flush(true); + + /* plain old mmaped writes */ + { + Timer t; + for( int i = 0; i < 10; i++ ) { + memset(p+100, 'c', 200 * 1024 * 1024); + } + cout << "traditional writes: " << t.millis() << "ms" << endl; + } + + f.flush(true); + + /* test doing some writes */ + { + Timer t; + char *q = (char *) f.testGetCopyOnWriteView(); + for( int i = 0; i < 10; i++ ) { + assert( q ); + memset(q+100, 'c', 200 * 1024 * 1024); + } + f.testCloseCopyOnWriteView(q); + + cout << "inc style some writes: " << t.millis() << "ms" << endl; + } + + /* test doing some writes */ + { + Timer t; + for( int i = 0; i < 10; i++ ) { + char *q = (char *) f.testGetCopyOnWriteView(); + assert( q ); + memset(q+100, 'c', 200 * 1024 * 1024); + f.testCloseCopyOnWriteView(q); + } + + cout << "some writes: " << t.millis() << "ms" << endl; + } + + /* more granular */ + { + Timer t; + for( int i = 0; i < 100; i++ ) { + char *q = (char *) f.testGetCopyOnWriteView(); + assert( q ); + memset(q+100, 'c', 20 * 1024 * 1024); + f.testCloseCopyOnWriteView(q); + } + + cout << "more granular some writes: " << t.millis() << "ms" << endl; + } + + p[10] = 0; + cout << p << endl; + } + }; + + class All : public Suite { + public: + All() : Suite( "mmap" ) {} + void setupTests() { + add< CopyOnWriteSpeedTest >(); + } + } myall; + +#endif + +} diff --git a/dbtests/mockdbclient.h b/dbtests/mockdbclient.h index 9119075..fda0963 100644 --- a/dbtests/mockdbclient.h +++ b/dbtests/mockdbclient.h @@ -64,8 +64,8 @@ public: virtual void afterCommand() {} }; DirectDBClientConnection( ReplPair *rp, ConnectionCallback *cc = 0 ) : - rp_( rp ), - cc_( cc ) { + rp_( rp ), + cc_( cc ) { } virtual BSONObj findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) { BSONObj c = query.obj.copy(); diff --git a/dbtests/namespacetests.cpp b/dbtests/namespacetests.cpp index ca051fe..c2be0b0 100644 --- a/dbtests/namespacetests.cpp +++ b/dbtests/namespacetests.cpp @@ -32,7 +32,7 @@ namespace NamespaceTests { dblock lk; Client::Context _context; public: - Base() : _context(ns()){ + Base() : _context(ns()) { } virtual ~Base() { if ( id_.info.isNull() ) @@ -323,7 
+323,7 @@ namespace NamespaceTests { return k.obj(); } }; - + class ArraySubobjectSingleMissing : public Base { public: void run() { @@ -336,7 +336,7 @@ namespace NamespaceTests { elts.push_back( simpleBC( i ) ); BSONObjBuilder b; b.append( "a", elts ); - + BSONObjSetDefaultOrder keys; id().getKeysFromObject( b.done(), keys ); checkSize( 4, keys ); @@ -353,7 +353,7 @@ namespace NamespaceTests { return aDotB(); } }; - + class ArraySubobjectMissing : public Base { public: void run() { @@ -376,7 +376,7 @@ namespace NamespaceTests { return aDotB(); } }; - + class MissingField : public Base { public: void run() { @@ -391,7 +391,7 @@ namespace NamespaceTests { return BSON( "a" << 1 ); } }; - + class SubobjectMissing : public Base { public: void run() { @@ -406,12 +406,12 @@ namespace NamespaceTests { return aDotB(); } }; - + class CompoundMissing : public Base { public: - void run(){ + void run() { create(); - + { BSONObjSetDefaultOrder keys; id().getKeysFromObject( fromjson( "{x:'a',y:'b'}" ) , keys ); @@ -428,16 +428,16 @@ namespace NamespaceTests { b.appendNull( "" ); assertEquals( b.obj() , *keys.begin() ); } - + } private: virtual BSONObj key() const { return BSON( "x" << 1 << "y" << 1 ); } - + }; - + class ArraySubelementComplex : public Base { public: void run() { @@ -508,17 +508,17 @@ namespace NamespaceTests { return aDotB(); } }; - + class EmptyArray : Base { public: - void run(){ + void run() { create(); BSONObjSetDefaultOrder keys; id().getKeysFromObject( fromjson( "{a:[1,2]}" ), keys ); checkSize(2, keys ); keys.clear(); - + id().getKeysFromObject( fromjson( "{a:[1]}" ), keys ); checkSize(1, keys ); keys.clear(); @@ -535,14 +535,14 @@ namespace NamespaceTests { class MultiEmptyArray : Base { public: - void run(){ + void run() { create(); BSONObjSetDefaultOrder keys; id().getKeysFromObject( fromjson( "{a:1,b:[1,2]}" ), keys ); checkSize(2, keys ); keys.clear(); - + id().getKeysFromObject( fromjson( "{a:1,b:[1]}" ), keys ); checkSize(1, keys ); keys.clear(); @@ -551,7 +551,7 @@ namespace NamespaceTests { //cout << "YO : " << *(keys.begin()) << endl; checkSize(1, keys ); keys.clear(); - + id().getKeysFromObject( fromjson( "{a:1,b:[]}" ), keys ); checkSize(1, keys ); //cout << "YO : " << *(keys.begin()) << endl; @@ -600,11 +600,11 @@ namespace NamespaceTests { if ( fileNo == -1 ) continue; for ( int j = i.ext()->firstRecord.getOfs(); j != DiskLoc::NullOfs; - j = DiskLoc( fileNo, j ).rec()->nextOfs ) { + j = DiskLoc( fileNo, j ).rec()->nextOfs ) { ++count; } } - ASSERT_EQUALS( count, nsd()->nrecords ); + ASSERT_EQUALS( count, nsd()->stats.nrecords ); return count; } int nExtents() const { @@ -620,7 +620,7 @@ namespace NamespaceTests { return ns_; } NamespaceDetails *nsd() const { - return nsdetails( ns() ); + return nsdetails( ns() )->writingWithExtra(); } static BSONObj bigObj() { string as( 187, 'a' ); @@ -700,7 +700,7 @@ namespace NamespaceTests { } }; - /* test NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc loc) + /* test NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc loc) */ class TruncateCapped : public Base { virtual string spec() const { @@ -737,9 +737,9 @@ namespace NamespaceTests { } DiskLoc d = l[6]; - long long n = nsd->nrecords; + long long n = nsd->stats.nrecords; nsd->cappedTruncateAfter(ns(), d, false); - ASSERT_EQUALS( nsd->nrecords , n-1 ); + ASSERT_EQUALS( nsd->stats.nrecords , n-1 ); { ForwardCappedCursor c(nsd); @@ -770,7 +770,7 @@ namespace NamespaceTests { void run() { create(); nsd()->deletedList[ 2 ] = 
nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted.drec()->nextDeleted; - nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted.drec()->nextDeleted = DiskLoc(); + nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted.drec()->nextDeleted.writing() = DiskLoc(); nsd()->cappedLastDelRecLastExtent().Null(); NamespaceDetails *d = nsd(); zero( &d->capExtent ); @@ -820,15 +820,15 @@ namespace NamespaceTests { ASSERT_EQUALS( 496U, sizeof( NamespaceDetails ) ); } }; - + } // namespace NamespaceDetailsTests class All : public Suite { public: - All() : Suite( "namespace" ){ + All() : Suite( "namespace" ) { } - void setupTests(){ + void setupTests() { add< IndexDetailsTests::Create >(); add< IndexDetailsTests::GetKeysFromObjectSimple >(); add< IndexDetailsTests::GetKeysFromObjectDotted >(); diff --git a/dbtests/pairingtests.cpp b/dbtests/pairingtests.cpp index 68d4c0e..9cca548 100644 --- a/dbtests/pairingtests.cpp +++ b/dbtests/pairingtests.cpp @@ -37,7 +37,7 @@ namespace PairingTests { ~Base() { pairSync = backup; dblock lk; - Helpers::emptyCollection( "local.pair.sync" ); + Helpers::emptyCollection( "local.pair.sync" ); if ( pairSync->initialSyncCompleted() ) { // save to db pairSync->setInitialSyncCompleted(); @@ -63,7 +63,7 @@ namespace PairingTests { private: static void init() { dblock lk; - Helpers::emptyCollection( "local.pair.sync" ); + Helpers::emptyCollection( "local.pair.sync" ); if ( synced != 0 && notSynced != 0 ) return; notSynced = new PairSync(); @@ -71,7 +71,7 @@ namespace PairingTests { synced = new PairSync(); synced->init(); synced->setInitialSyncCompleted(); - Helpers::emptyCollection( "local.pair.sync" ); + Helpers::emptyCollection( "local.pair.sync" ); } PairSync *backup; static PairSync *synced; @@ -199,24 +199,24 @@ namespace PairingTests { TestableReplPair rp4( true, fromjson( "{ok:1,you_are:1}" ) ); rp4.arbitrate(); - ASSERT( rp4.state == ReplPair::State_Master ); + ASSERT( rp4.state == ReplPair::State_Master ); TestableReplPair rp5( true, fromjson( "{ok:1,you_are:0}" ) ); rp5.arbitrate(); - ASSERT( rp5.state == ReplPair::State_Slave ); + ASSERT( rp5.state == ReplPair::State_Slave ); TestableReplPair rp6( true, fromjson( "{ok:1,you_are:-1}" ) ); rp6.arbitrate(); // unchanged from initial value - ASSERT( rp6.state == ReplPair::State_Negotiating ); + ASSERT( rp6.state == ReplPair::State_Negotiating ); } private: class TestableReplPair : public ReplPair { public: TestableReplPair( bool connect, const BSONObj &one ) : - ReplPair( "a", "z" ), - connect_( connect ), - one_( one ) { + ReplPair( "a", "z" ), + connect_( connect ), + one_( one ) { } virtual DBClientConnection *newClientConnection() const { @@ -326,10 +326,10 @@ namespace PairingTests { class All : public Suite { public: - All() : Suite( "pairing" ){ + All() : Suite( "pairing" ) { } - - void setupTests(){ + + void setupTests() { add< ReplPairTests::Create >(); add< ReplPairTests::Dominant >(); add< ReplPairTests::SetMaster >(); diff --git a/dbtests/pdfiletests.cpp b/dbtests/pdfiletests.cpp index 7e92783..2844fc4 100644 --- a/dbtests/pdfiletests.cpp +++ b/dbtests/pdfiletests.cpp @@ -31,7 +31,7 @@ namespace PdfileTests { class Base { public: - Base() : _context( ns() ){ + Base() : _context( ns() ) { } virtual ~Base() { if ( !nsd() ) @@ -71,6 +71,7 @@ namespace PdfileTests { BSONObj o = b.done(); int len = o.objsize(); Extent *e = ext.ext(); + e = getDur().writing(e); int ofs; if ( e->lastRecord.isNull() ) ofs = ext.getOfs() + ( e->_extentData - (char *)e ); @@ -78,6 +79,7 @@ namespace PdfileTests { ofs 
= e->lastRecord.getOfs() + e->lastRecord.rec()->lengthWithHeaders; DiskLoc dl( ext.a(), ofs ); Record *r = dl.rec(); + r = (Record*) getDur().writingPtr(r, Record::HeaderSize + len); r->lengthWithHeaders = Record::HeaderSize + len; r->extentOfs = e->myLoc.getOfs(); r->nextOfs = DiskLoc::NullOfs; @@ -86,7 +88,7 @@ namespace PdfileTests { if ( e->firstRecord.isNull() ) e->firstRecord = dl; else - e->lastRecord.rec()->nextOfs = ofs; + getDur().writingInt(e->lastRecord.rec()->nextOfs) = ofs; e->lastRecord = dl; return dl; } @@ -110,7 +112,7 @@ namespace PdfileTests { class EmptyLooped : public Base { virtual void prepare() { - nsd()->capFirstNewRecord = DiskLoc(); + nsd()->writingWithExtra()->capFirstNewRecord = DiskLoc(); } virtual int count() const { return 0; @@ -119,7 +121,7 @@ namespace PdfileTests { class EmptyMultiExtentLooped : public Base { virtual void prepare() { - nsd()->capFirstNewRecord = DiskLoc(); + nsd()->writingWithExtra()->capFirstNewRecord = DiskLoc(); } virtual int count() const { return 0; @@ -131,7 +133,7 @@ namespace PdfileTests { class Single : public Base { virtual void prepare() { - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 0 ); + nsd()->writingWithExtra()->capFirstNewRecord = insert( nsd()->capExtent, 0 ); } virtual int count() const { return 1; @@ -140,7 +142,8 @@ namespace PdfileTests { class NewCapFirst : public Base { virtual void prepare() { - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 0 ); + DiskLoc x = insert( nsd()->capExtent, 0 ); + nsd()->writingWithExtra()->capFirstNewRecord = x; insert( nsd()->capExtent, 1 ); } virtual int count() const { @@ -151,7 +154,7 @@ namespace PdfileTests { class NewCapLast : public Base { virtual void prepare() { insert( nsd()->capExtent, 0 ); - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 1 ); + nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 1 ); } virtual int count() const { return 2; @@ -161,7 +164,7 @@ namespace PdfileTests { class NewCapMiddle : public Base { virtual void prepare() { insert( nsd()->capExtent, 0 ); - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 1 ); + nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 1 ); insert( nsd()->capExtent, 2 ); } virtual int count() const { @@ -173,7 +176,7 @@ namespace PdfileTests { virtual void prepare() { insert( nsd()->capExtent, 0 ); insert( nsd()->lastExtent, 1 ); - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 2 ); + nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 2 ); insert( nsd()->capExtent, 3 ); } virtual int count() const { @@ -186,10 +189,10 @@ namespace PdfileTests { class LastExtent : public Base { virtual void prepare() { - nsd()->capExtent = nsd()->lastExtent; + nsd()->capExtent.writing() = nsd()->lastExtent; insert( nsd()->capExtent, 0 ); insert( nsd()->firstExtent, 1 ); - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 2 ); + nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 2 ); insert( nsd()->capExtent, 3 ); } virtual int count() const { @@ -202,11 +205,11 @@ namespace PdfileTests { class MidExtent : public Base { virtual void prepare() { - nsd()->capExtent = nsd()->firstExtent.ext()->xnext; + nsd()->capExtent.writing() = nsd()->firstExtent.ext()->xnext; insert( nsd()->capExtent, 0 ); insert( nsd()->lastExtent, 1 ); insert( nsd()->firstExtent, 2 ); - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 3 ); + nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 3 ); insert( nsd()->capExtent, 4 ); } virtual int count() const { @@ -219,10 +222,10 @@ 
namespace PdfileTests { class AloneInExtent : public Base { virtual void prepare() { - nsd()->capExtent = nsd()->firstExtent.ext()->xnext; + nsd()->capExtent.writing() = nsd()->firstExtent.ext()->xnext; insert( nsd()->lastExtent, 0 ); insert( nsd()->firstExtent, 1 ); - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 2 ); + nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 2 ); } virtual int count() const { return 3; @@ -234,10 +237,10 @@ namespace PdfileTests { class FirstInExtent : public Base { virtual void prepare() { - nsd()->capExtent = nsd()->firstExtent.ext()->xnext; + nsd()->capExtent.writing() = nsd()->firstExtent.ext()->xnext; insert( nsd()->lastExtent, 0 ); insert( nsd()->firstExtent, 1 ); - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 2 ); + nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 2 ); insert( nsd()->capExtent, 3 ); } virtual int count() const { @@ -250,11 +253,11 @@ namespace PdfileTests { class LastInExtent : public Base { virtual void prepare() { - nsd()->capExtent = nsd()->firstExtent.ext()->xnext; + nsd()->capExtent.writing() = nsd()->firstExtent.ext()->xnext; insert( nsd()->capExtent, 0 ); insert( nsd()->lastExtent, 1 ); insert( nsd()->firstExtent, 2 ); - nsd()->capFirstNewRecord = insert( nsd()->capExtent, 3 ); + nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 3 ); } virtual int count() const { return 4; @@ -265,11 +268,11 @@ namespace PdfileTests { }; } // namespace ScanCapped - + namespace Insert { class Base { public: - Base() : _context( ns() ){ + Base() : _context( ns() ) { } virtual ~Base() { if ( !nsd() ) @@ -288,7 +291,7 @@ namespace PdfileTests { dblock lk_; Client::Context _context; }; - + class UpdateDate : public Base { public: void run() { @@ -301,12 +304,86 @@ namespace PdfileTests { } }; } // namespace Insert - + + class ExtentSizing { + public: + struct SmallFilesControl { + SmallFilesControl() { + old = cmdLine.smallfiles; + cmdLine.smallfiles = false; + } + ~SmallFilesControl() { + cmdLine.smallfiles = old; + } + bool old; + }; + void run() { + SmallFilesControl c; + // test that no matter what we start with, we always get to max extent size + for ( int obj=16; objaddAFile( big , false ); + cout << f->length() << ' ' << n << endl; + if ( f->length() == l ) + break; + l = f->length(); + } + + int start = d->numFiles(); + for ( int i=0; iallocExtent( c1.c_str() , d->getFile( i )->getHeader()->unusedLength , false ); + ASSERT_EQUALS( start , d->numFiles() ); + + { + DBDirectClient db; + db.dropDatabase( dbname ); + } + } + }; + + class All : public Suite { public: - All() : Suite( "pdfile" ){} - - void setupTests(){ + All() : Suite( "pdfile" ) {} + + void setupTests() { add< ScanCapped::Empty >(); add< ScanCapped::EmptyLooped >(); add< ScanCapped::EmptyMultiExtentLooped >(); @@ -321,6 +398,8 @@ namespace PdfileTests { add< ScanCapped::FirstInExtent >(); add< ScanCapped::LastInExtent >(); add< Insert::UpdateDate >(); + add< ExtentSizing >(); + add< ExtentAllocOrder >(); } } myall; diff --git a/dbtests/perf/btreeperf.cpp b/dbtests/perf/btreeperf.cpp new file mode 100644 index 0000000..7d68d8f --- /dev/null +++ b/dbtests/perf/btreeperf.cpp @@ -0,0 +1,442 @@ +// btreeperf.cpp + +/* Copyright 2010 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Performance timing and space utilization testing for btree indexes. + */ + +#include <iostream> + +#include <boost/random/bernoulli_distribution.hpp> +#include <boost/random/geometric_distribution.hpp> +#include <boost/random/mersenne_twister.hpp> +#include <boost/random/uniform_int.hpp> +#include <boost/random/variate_generator.hpp> + +#include "client/dbclient.h" +#include "../../util/timer.h" + +using namespace std; +using namespace mongo; +using namespace boost; + +const char *ns = "test.btreeperf"; +const char *db = "test"; +const char *index_collection = "btreeperf.$_id_"; + +// This random number generator has a much larger period than the default +// generator and is half as fast as the default. Given that we intend to +// generate large numbers of documents and will utilize more than one random +// sample per document, choosing this generator seems like a worthwhile tradeoff. +mt19937 randomNumberGenerator; + +/** + * An interface for generating documents to be inserted and document specs for + * remove requests. + */ +class InsertAndRemoveStrategy { +public: + virtual ~InsertAndRemoveStrategy() {} + virtual BSONObj insertObj() = 0; + virtual BSONObj removeObj() = 0; +protected: + /** + * Helper functions for converting a sample value to a sample object with + * specified _id, to be inserted or removed. + */ + + template< class T > + BSONObj insertObjWithVal( const T &val ) { + BSONObjBuilder b; + b.append( "_id", val ); + return b.obj(); + } + template< class T > + BSONObj removeObjWithVal( const T &val ) { + BSONObjBuilder b; + b.append( "_id", val ); + return b.obj(); + } +}; + +/** + * Manages a set of elements of type T. Supports inserting unique elements and + * sampling a random element without replacement. + * + * TODO In the contexts where this class is currently used, duplicate keys are + * either impossible or highly unlikely. And an occasional duplicate value will + * not much affect the procedure by which a random element is chosen. We could + * stop checking for duplicates in push(), eliminate _set from the implementation, + * and potentially improve performance and memory requirements somewhat. + */ +template< class T > +class SetSampler { +public: + /** @param val Insert this value in the set if not already present. */ + void push( const T& val ) { + if ( _set.insert( val ).second ) { + _vector.push_back( val ); + } + } + /** @return a random element removed from the set */ + T pull() { + if ( _vector.size() == 0 ) { + return T(); + } + uniform_int< size_t > sizeRange( 0, _vector.size() - 1 ); + variate_generator< mt19937&, uniform_int< size_t > > sizeGenerator( randomNumberGenerator, sizeRange ); + size_t toRemove = sizeGenerator(); + T val = _vector[ toRemove ]; + // Replace the random element with the last element, then remove the + // last element. + _vector[ toRemove ] = _vector.back(); + _vector.pop_back(); + _set.erase( val ); + return val; + } +private: + vector< T > _vector; + set< T > _set; +}; + +/** + * Tracks values that have been specified for insertion by the derived class's + * implementation of insertVal() and selects uniformly from among values that + * have been inserted but not yet removed for the next value to remove.
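A minimal usage sketch of the SetSampler defined above (the element type and values here are arbitrary, chosen only for illustration): push() records distinct values and pull() returns one of them uniformly at random, without replacement.

    SetSampler< long long > sampler;
    sampler.push( 10 );                  // recorded
    sampler.push( 10 );                  // duplicate, ignored by push()
    sampler.push( 42 );
    long long first = sampler.pull();    // 10 or 42, each with probability 1/2
    long long second = sampler.pull();   // the value not returned first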
+ * + * The implementation is probabilistically sound, but may be resource intensive + * and slow due to the use of a SetSampler. + */ +template< class T > +class InsertAndUniformRemoveStrategy : public InsertAndRemoveStrategy { +public: + virtual BSONObj insertObj() { + T val = insertVal(); + _sampler.push( val ); + return insertObjWithVal( val ); + } + virtual BSONObj removeObj() { return removeObjWithVal( _sampler.pull() ); } +protected: + /** @return value to insert. This is the only function a derived class need implement. */ + virtual T insertVal() = 0; +private: + SetSampler< T > _sampler; +}; + +/** + * The derived class supplies keys to be inserted and removed. The key removal + * strategy is similar to the strategy for selecting a random element described + * in the MongoDB cookbook: the first key in the collection greater than or + * equal to the supplied removal key is removed. This allows selecting an + * existing key for removal without the overhead required by a SetSampler. + * + * While this ranged selection strategy can work well for selecting a random + * element, there are some theoretical and empirically observed shortcomings + * when the strategy is applied to removing nodes for btree performance measurement: + * 1 The likelihood that a given key is removed is proportional to the difference + * in value between it and the previous key. Because key deletion increases + * the difference in value between adjacent keys, neighboring keys will be + * more likely to be deleted than they would be in a true uniform distribution. + * 2 MongoDB 1.6 uses 'unused' nodes in the btree implementation. With a ranged + * removal strategy, those nodes must be traversed to find a node available + * for removal. + * 3 Ranged removal was observed to be biased against the balancing policy of + * MongoDB 1.7 in some cases, in terms of storage size. This may be a + * consequence of point 1 above. + * 4 Ranged removal was observed to be significantly biased against the btree + * implementation in MongoDB 1.6 in terms of performance. This is likely a + * consequence of point 2 above. + * 5 In some cases the biases described above were not evident in tests lasting + * several minutes, but were evident in tests lasting several hours.
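A small worked example of the ranged removal described above (the key values are invented): if the collection currently holds _id keys 3, 7 and 20 and the strategy draws 5 as its removal value, rangedRemoveObjWithVal() produces the spec sketched below, and a single-document remove then deletes _id 7, the first key greater than or equal to 5.

    BSONObj spec = BSON( "_id" << BSON( "$gte" << 5 ) );   // equivalent to { _id: { $gte: 5 } }
    // conn.remove( ns, spec, true );                      // justOne = true, as in InsertAndRemoveRunner below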
+ */ +template< class T > +class InsertAndRangedRemoveStrategy : public InsertAndRemoveStrategy { +public: + virtual BSONObj insertObj() { return insertObjWithVal( insertVal() ); } + virtual BSONObj removeObj() { return rangedRemoveObjWithVal( removeVal() ); } +protected: + /** Small likelihood that this removal spec will not match any document */ + template< class U > + BSONObj rangedRemoveObjWithVal( const U &val ) { + BSONObjBuilder b1; + BSONObjBuilder b2( b1.subobjStart( "_id" ) ); + b2.append( "$gte", val ); + b2.done(); + return b1.obj(); + } + virtual T insertVal() = 0; + virtual T removeVal() = 0; +}; + +/** + * Integer Keys + * Uniform Inserts + * Uniform Removes + */ +class UniformInsertRangedUniformRemoveInteger : public InsertAndRangedRemoveStrategy< long long > { +public: + UniformInsertRangedUniformRemoveInteger() : + _uniform_int( 0ULL, ~0ULL ), + _nextLongLong( randomNumberGenerator, _uniform_int ) { + } + /** Small likelihood of duplicates */ + virtual long long insertVal() { return _nextLongLong(); } + virtual long long removeVal() { return _nextLongLong(); } +private: + uniform_int< unsigned long long > _uniform_int; + variate_generator< mt19937&, uniform_int< unsigned long long > > _nextLongLong; +}; + +class UniformInsertUniformRemoveInteger : public InsertAndUniformRemoveStrategy< long long > { +public: + virtual long long insertVal() { return _gen.insertVal(); } +private: + UniformInsertRangedUniformRemoveInteger _gen; +}; + +/** + * String Keys + * Uniform Inserts + * Uniform Removes + */ +class UniformInsertRangedUniformRemoveString : public InsertAndRangedRemoveStrategy< string > { +public: + UniformInsertRangedUniformRemoveString() : + _geometric_distribution( 0.9 ), + _nextLength( randomNumberGenerator, _geometric_distribution ), + _uniform_char( 'a', 'z' ), + _nextChar( randomNumberGenerator, _uniform_char ) { + } + /** Small likelihood of duplicates */ + virtual string insertVal() { return nextString(); } + virtual string removeVal() { return nextString(); } +private: + string nextString() { + // The longer the minimum string length, the lower the likelihood of duplicates + int len = _nextLength() + 5; + len = len > 100 ? 100 : len; + string ret( len, 'x' ); + for( int i = 0; i < len; ++i ) { + ret[ i ] = _nextChar(); + } + return ret; + } + geometric_distribution<> _geometric_distribution; + variate_generator< mt19937&, geometric_distribution<> > _nextLength; + uniform_int< char > _uniform_char; + variate_generator< mt19937&, uniform_int< char > > _nextChar; +}; + +class UniformInsertUniformRemoveString : public InsertAndUniformRemoveStrategy< string > { +public: + virtual string insertVal() { return _gen.insertVal(); } +private: + UniformInsertRangedUniformRemoveString _gen; +}; + +/** + * OID Keys + * Increasing Inserts + * Uniform Removes + */ +class IncreasingInsertRangedUniformRemoveOID : public InsertAndRangedRemoveStrategy< OID > { +public: + IncreasingInsertRangedUniformRemoveOID() : + _max( -1 ) { + } + virtual OID insertVal() { return oidFromULL( ++_max ); } + virtual OID removeVal() { + uniform_int< unsigned long long > distribution( 0, _max > 0 ? 
_max : 0 ); + variate_generator< mt19937&, uniform_int< unsigned long long > > generator( randomNumberGenerator, distribution ); + return oidFromULL( generator() ); + } +private: + static OID oidFromULL( unsigned long long val ) { + val = __builtin_bswap64( val ); + OID oid; + oid.clear(); + memcpy( (char*)&oid + 4, &val, 8 ); + return oid; + } + long long _max; +}; + +class IncreasingInsertUniformRemoveOID : public InsertAndUniformRemoveStrategy< OID > { +public: + virtual OID insertVal() { return _gen.insertVal(); } +private: + IncreasingInsertRangedUniformRemoveOID _gen; +}; + +/** + * Integer Keys + * Increasing Inserts + * Increasing Removes (on remove, the lowest key is always removed) + */ +class IncreasingInsertIncreasingRemoveInteger : public InsertAndRemoveStrategy { +public: + IncreasingInsertIncreasingRemoveInteger() : + // Start with a large value so data type will be preserved if we round + // trip through json. + _min( 1LL << 32 ), + _max( 1LL << 32 ) { + } + virtual BSONObj insertObj() { return insertObjWithVal( ++_max ); } + virtual BSONObj removeObj() { return removeObjWithVal( _min < _max ? ++_min : _min ); } +private: + long long _min; + long long _max; +}; + +/** Generate a random boolean value. */ +class BernoulliGenerator { +public: + /** + * @param excessFalsePercent This specifies the desired rate of false values + * vs true values. If we want false to be 5% more likely than true, we + * specify 5 for this argument. + */ + BernoulliGenerator( int excessFalsePercent ) : + _bernoulli_distribution( 1.0 / ( 2.0 + excessFalsePercent / 100.0 ) ), + _generator( randomNumberGenerator, _bernoulli_distribution ) { + } + bool operator()() { return _generator(); } +private: + bernoulli_distribution<> _bernoulli_distribution; + variate_generator< mt19937&, bernoulli_distribution<> > _generator; +}; + +/** Runs a strategy on a connection, with specified mix of inserts and removes. */ +class InsertAndRemoveRunner { +public: + InsertAndRemoveRunner( DBClientConnection &conn, InsertAndRemoveStrategy &strategy, int excessInsertPercent ) : + _conn( conn ), + _strategy( strategy ), + _nextOpTypeRemove( excessInsertPercent ) { + } + void writeOne() { + if ( _nextOpTypeRemove() ) { + _conn.remove( ns, _strategy.removeObj(), true ); + } + else { + _conn.insert( ns, _strategy.insertObj() ); + } + } +private: + DBClientConnection &_conn; + InsertAndRemoveStrategy &_strategy; + BernoulliGenerator _nextOpTypeRemove; +}; + +/** + * Writes a test script to cout based on a strategy and specified mix of inserts + * and removes. The script can be subsequently executed by InsertAndRemoveRunner. + * Script generation is intended for strategies that are memory or cpu intensive + * and might either divert resources from a mongod instance being analyzed on the + * same machine or fail to generate requests as quickly as the mongod might + * accept them. + * The script contains one line per operation. Each line begins + * with a letter indicating the operation type, followed by a space. Next + * follows the json representation of a document for the specified operation + * type. 
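For illustration, two lines such a generated script might contain (the key values are made up): 'i' marks an insert and 'r' a remove, each followed by the JSON form of the operation document.

    i { "_id" : 5902130560128389 }
    r { "_id" : { "$gte" : 3044705383948526 } }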
+ */ +class InsertAndRemoveScriptGenerator { +public: + InsertAndRemoveScriptGenerator( InsertAndRemoveStrategy &strategy, int excessInsertPercent ) : + _strategy( strategy ), + _nextOpTypeRemove( excessInsertPercent ) { + } + void writeOne() { + if ( _nextOpTypeRemove() ) { + cout << "r " << _strategy.removeObj().jsonString() << endl; + } + else { + cout << "i " << _strategy.insertObj().jsonString() << endl; + } + } +private: + InsertAndRemoveStrategy &_strategy; + BernoulliGenerator _nextOpTypeRemove; +}; + +/** + * Run a test script from cin that was generated by + * InsertAndRemoveScriptGenerator. Running the script is intended to be + * lightweight in terms of memory and cpu usage, and fast. + */ +class InsertAndRemoveScriptRunner { +public: + InsertAndRemoveScriptRunner( DBClientConnection &conn ) : + _conn( conn ) { + } + void writeOne() { + cin.getline( _buf, 1024 ); + BSONObj val = fromjson( _buf + 2 ); + if ( _buf[ 0 ] == 'r' ) { + _conn.remove( ns, val, true ); + } + else { + _conn.insert( ns, val ); + } + } +private: + DBClientConnection &_conn; + char _buf[ 1024 ]; +}; + +int main( int argc, const char **argv ) { + + DBClientConnection conn; + conn.connect( "127.0.0.1:27017" ); + conn.dropCollection( ns ); + +// UniformInsertRangedUniformRemoveInteger strategy; +// UniformInsertUniformRemoveInteger strategy; +// UniformInsertRangedUniformRemoveString strategy; +// UniformInsertUniformRemoveString strategy; +// IncreasingInsertRangedUniformRemoveOID strategy; +// IncreasingInsertUniformRemoveOID strategy; +// IncreasingInsertIncreasingRemoveInteger strategy; +// InsertAndRemoveScriptGenerator runner( strategy, 5 ); + InsertAndRemoveScriptRunner runner( conn ); + + Timer t; + BSONObj statsCmd = BSON( "collstats" << index_collection ); + + // Print header, unless we are generating a script (in that case, comment this out). + cout << "ops,milliseconds,docs,totalBucketSize" << endl; + + long long i = 0; + long long n = 10000000000; + while( i < n ) { + runner.writeOne(); + // Print statistics, unless we are generating a script (in that case, comment this out). + // The stats collection requests below provide regular read operations, + // ensuring we are caught up with the progress being made by the mongod + // under analysis. + if ( ++i % 50000 == 0 ) { + // The total number of documents present. + long long docs = conn.count( ns ); + BSONObj result; + conn.runCommand( db, statsCmd, result ); + // The total number of bytes used for all allocated 8K buckets of the + // btree. 
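    // For scale (the count here is hypothetical): a stats result reporting 1,000,000
    // buckets corresponds to 1,000,000 * 8192 = 8,192,000,000 bytes, roughly 8.2 GB
    // of allocated btree storage.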
+ long long totalBucketSize = result.getField( "count" ).numberLong() * 8192; + cout << i << ',' << t.millis() << ',' << docs << ',' << totalBucketSize << endl; + } + } +} diff --git a/dbtests/perf/perftest.cpp b/dbtests/perf/perftest.cpp index f86a1c3..ef03551 100644 --- a/dbtests/perf/perftest.cpp +++ b/dbtests/perf/perftest.cpp @@ -74,14 +74,14 @@ public: << "}" << endl; } ~Runner() { - theFileAllocator().waitUntilFinished(); + FileAllocator::get()->waitUntilFinished(); client_->dropDatabase( testDb< T >().c_str() ); } }; class RunnerSuite : public Suite { public: - RunnerSuite( string name ) : Suite( name ){} + RunnerSuite( string name ) : Suite( name ) {} protected: template< class T > void add() { @@ -168,9 +168,9 @@ namespace Insert { class All : public RunnerSuite { public: - All() : RunnerSuite( "insert" ){} + All() : RunnerSuite( "insert" ) {} - void setupTests(){ + void setupTests() { add< IdIndex >(); add< TwoIndex >(); add< TenIndex >(); @@ -252,8 +252,8 @@ namespace Update { class All : public RunnerSuite { public: - All() : RunnerSuite( "update" ){} - void setupTests(){ + All() : RunnerSuite( "update" ) {} + void setupTests() { add< Smaller >(); add< Bigger >(); add< Inc >(); @@ -266,33 +266,33 @@ namespace Update { namespace BSON { const char *sample = - "{\"one\":2, \"two\":5, \"three\": {}," - "\"four\": { \"five\": { \"six\" : 11 } }," - "\"seven\": [ \"a\", \"bb\", \"ccc\", 5 ]," - "\"eight\": Dbref( \"rrr\", \"01234567890123456789aaaa\" )," - "\"_id\": ObjectId( \"deadbeefdeadbeefdeadbeef\" )," - "\"nine\": { \"$binary\": \"abc=\", \"$type\": \"02\" }," - "\"ten\": Date( 44 ), \"eleven\": /foooooo/i }"; + "{\"one\":2, \"two\":5, \"three\": {}," + "\"four\": { \"five\": { \"six\" : 11 } }," + "\"seven\": [ \"a\", \"bb\", \"ccc\", 5 ]," + "\"eight\": Dbref( \"rrr\", \"01234567890123456789aaaa\" )," + "\"_id\": ObjectId( \"deadbeefdeadbeefdeadbeef\" )," + "\"nine\": { \"$binary\": \"abc=\", \"$type\": \"02\" }," + "\"ten\": Date( 44 ), \"eleven\": /foooooo/i }"; const char *shopwikiSample = - "{ '_id' : '289780-80f85380b5c1d4a0ad75d1217673a4a2' , 'site_id' : 289780 , 'title'" - ": 'Jubilee - Margaret Walker' , 'image_url' : 'http://www.heartlanddigsandfinds.c" - "om/store/graphics/Product_Graphics/Product_8679.jpg' , 'url' : 'http://www.heartla" - "nddigsandfinds.com/store/store_product_detail.cfm?Product_ID=8679&Category_ID=2&Su" - "b_Category_ID=910' , 'url_hash' : 3450626119933116345 , 'last_update' : null , '" - "features' : { '$imagePrefetchDate' : '2008Aug30 22:39' , '$image.color.rgb' : '5a7" - "574' , 'Price' : '$10.99' , 'Description' : 'Author--s 1st Novel. A Houghton Miffl" - "in Literary Fellowship Award novel by the esteemed poet and novelist who has demon" - "strated a lifelong commitment to the heritage of black culture. An acclaimed story" - "of Vyry, a negro slave during the 19th Century, facing the biggest challenge of h" - "er lifetime - that of gaining her freedom, fighting for all the things she had nev" - "er known before. The author, great-granddaughter of Vyry, reveals what the Civil W" - "ar in America meant to the Negroes. 
Slavery W' , '$priceHistory-1' : '2008Dec03 $1" - "0.99' , 'Brand' : 'Walker' , '$brands_in_title' : 'Walker' , '--path' : '//HTML[1]" - "/BODY[1]/TABLE[1]/TR[1]/TD[1]/P[1]/TABLE[1]/TR[1]/TD[1]/TABLE[1]/TR[2]/TD[2]/TABLE" - "[1]/TR[1]/TD[1]/P[1]/TABLE[1]/TR[1]' , '~location' : 'en_US' , '$crawled' : '2009J" - "an11 03:22' , '$priceHistory-2' : '2008Nov15 $10.99' , '$priceHistory-0' : '2008De" - "c24 $10.99'}}"; + "{ '_id' : '289780-80f85380b5c1d4a0ad75d1217673a4a2' , 'site_id' : 289780 , 'title'" + ": 'Jubilee - Margaret Walker' , 'image_url' : 'http://www.heartlanddigsandfinds.c" + "om/store/graphics/Product_Graphics/Product_8679.jpg' , 'url' : 'http://www.heartla" + "nddigsandfinds.com/store/store_product_detail.cfm?Product_ID=8679&Category_ID=2&Su" + "b_Category_ID=910' , 'url_hash' : 3450626119933116345 , 'last_update' : null , '" + "features' : { '$imagePrefetchDate' : '2008Aug30 22:39' , '$image.color.rgb' : '5a7" + "574' , 'Price' : '$10.99' , 'Description' : 'Author--s 1st Novel. A Houghton Miffl" + "in Literary Fellowship Award novel by the esteemed poet and novelist who has demon" + "strated a lifelong commitment to the heritage of black culture. An acclaimed story" + "of Vyry, a negro slave during the 19th Century, facing the biggest challenge of h" + "er lifetime - that of gaining her freedom, fighting for all the things she had nev" + "er known before. The author, great-granddaughter of Vyry, reveals what the Civil W" + "ar in America meant to the Negroes. Slavery W' , '$priceHistory-1' : '2008Dec03 $1" + "0.99' , 'Brand' : 'Walker' , '$brands_in_title' : 'Walker' , '--path' : '//HTML[1]" + "/BODY[1]/TABLE[1]/TR[1]/TD[1]/P[1]/TABLE[1]/TR[1]/TD[1]/TABLE[1]/TR[2]/TD[2]/TABLE" + "[1]/TR[1]/TD[1]/P[1]/TABLE[1]/TR[1]' , '~location' : 'en_US' , '$crawled' : '2009J" + "an11 03:22' , '$priceHistory-2' : '2008Nov15 $10.99' , '$priceHistory-0' : '2008De" + "c24 $10.99'}}"; class Parse { public: @@ -332,8 +332,8 @@ namespace BSON { class All : public RunnerSuite { public: - All() : RunnerSuite( "bson" ){} - void setupTests(){ + All() : RunnerSuite( "bson" ) {} + void setupTests() { add< Parse >(); add< ShopwikiParse >(); add< Json >(); @@ -402,8 +402,8 @@ namespace Index { class All : public RunnerSuite { public: - All() : RunnerSuite( "index" ){} - void setupTests(){ + All() : RunnerSuite( "index" ) {} + void setupTests() { add< Int >(); add< ObjectId >(); add< String >(); @@ -435,7 +435,7 @@ namespace QueryTests { } void run() { client_->findOne( ns_.c_str(), - QUERY( "a" << "b" ).hint( BSON( "_id" << 1 ) ) ); + QUERY( "a" << "b" ).hint( BSON( "_id" << 1 ) ) ); } string ns_; }; @@ -465,7 +465,7 @@ namespace QueryTests { } void run() { auto_ptr< DBClientCursor > c = - client_->query( ns_.c_str(), Query( BSONObj() ).sort( BSON( "_id" << 1 ) ) ); + client_->query( ns_.c_str(), Query( BSONObj() ).sort( BSON( "_id" << 1 ) ) ); int i = 0; for( ; c->more(); c->nextSafe(), ++i ); ASSERT_EQUALS( 50000, i ); @@ -481,7 +481,7 @@ namespace QueryTests { } void run() { auto_ptr< DBClientCursor > c = - client_->query( ns_.c_str(), Query( BSONObj() ).sort( BSON( "_id" << 1 ) ) ); + client_->query( ns_.c_str(), Query( BSONObj() ).sort( BSON( "_id" << 1 ) ) ); int i = 0; for( ; c->more(); c->nextSafe(), ++i ); ASSERT_EQUALS( 50000, i ); @@ -541,8 +541,8 @@ namespace QueryTests { class All : public RunnerSuite { public: - All() : RunnerSuite( "query" ){} - void setupTests(){ + All() : RunnerSuite( "query" ) {} + void setupTests() { add< NoMatch >(); add< NoMatchIndex >(); add< NoMatchLong >(); @@ -602,8 
+602,8 @@ namespace Count { class All : public RunnerSuite { public: - All() : RunnerSuite( "count" ){} - void setupTests(){ + All() : RunnerSuite( "count" ) {} + void setupTests() { add< Count >(); add< CountIndex >(); add< CountSimpleIndex >(); @@ -677,8 +677,8 @@ namespace Plan { class All : public RunnerSuite { public: - All() : RunnerSuite("plan" ){} - void setupTests(){ + All() : RunnerSuite("plan" ) {} + void setupTests() { add< Hint >(); add< Sort >(); add< Query >(); diff --git a/dbtests/perftests.cpp b/dbtests/perftests.cpp new file mode 100644 index 0000000..182595c --- /dev/null +++ b/dbtests/perftests.cpp @@ -0,0 +1,336 @@ +/** @file perftests.cpp : unit tests relating to performance + + The idea herein is tests that run fast and can be part of the normal CI suite. So no tests herein that take + a long time to run. Obviously we need those too, but they will be separate. + + These tests use DBDirectClient; they are a bit white-boxish. +*/ + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "pch.h" +#include "../db/query.h" +#include "../db/db.h" +#include "../db/instance.h" +#include "../db/json.h" +#include "../db/lasterror.h" +#include "../db/update.h" +#include "../db/taskqueue.h" +#include "../util/timer.h" +#include "dbtests.h" +#include "../db/dur_stats.h" + +namespace PerfTests { + typedef DBDirectClient DBClientType; + //typedef DBClientConnection DBClientType; + + class ClientBase { + public: + // NOTE: Not bothering to backup the old error record. + ClientBase() { + //_client.connect("localhost"); + mongo::lastError.reset( new LastError() ); + } + virtual ~ClientBase() { + //mongo::lastError.release(); + } + protected: + static void insert( const char *ns, BSONObj o ) { + _client.insert( ns, o ); + } + static void update( const char *ns, BSONObj q, BSONObj o, bool upsert = 0 ) { + _client.update( ns, Query( q ), o, upsert ); + } + static bool error() { + return !_client.getPrevError().getField( "err" ).isNull(); + } + DBClientBase &client() const { return _client; } + private: + static DBClientType _client; + }; + DBClientType ClientBase::_client; + + // todo: use a couple threads. not a very good test yet.
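The TaskQueueTest below exercises the deferred-work pattern of ../db/taskqueue.h: items are queued with defer() while the write lock is held and are later executed in a batch when invoke() runs the item type's static go(). A compressed sketch of that pattern (the Counter type and the value 7 are invented for illustration):

    struct Counter {
        int val;
        static int total;
        static void go( const Counter &c ) { total += c.val; }   // run once for each deferred item
    };
    int Counter::total = 0;

    TaskQueue<Counter> q;
    {
        writelock lk;              // the test defers while holding the write lock
        Counter c; c.val = 7;
        q.defer( c );              // queued, not yet executed
    }
    q.invoke();                    // drains the queue, calling Counter::go for each entry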
+ class TaskQueueTest { + static int tot; + struct V { + int val; + static void go(const V &v) { tot += v.val; } + }; + public: + void run() { + tot = 0; + TaskQueue<V> d; + int x = 0; + for( int i = 0; i < 100; i++ ) { + if( i % 30 == 0 ) + d.invoke(); + + x += i; + writelock lk; + V v; + v.val = i; + d.defer(v); + } + d.invoke(); + assert( x == tot ); + } + }; + int TaskQueueTest::tot; + + class CappedTest : public ClientBase { + }; + + class B : public ClientBase { + string _ns; + protected: + const char *ns() { return _ns.c_str(); } + virtual void prep() = 0; + + virtual void timed() = 0; + + // optional 2nd test phase to be timed separately + // return name of it + virtual const char * timed2() { return 0; } + + virtual void post() { } + virtual string name() = 0; + virtual unsigned long long expectation() = 0; + virtual int howLongMillis() { return 5000; } + public: + void say(unsigned long long n, int ms, string s) { + cout << setw(36) << left << s << ' ' << right << setw(7) << n*1000/ms << "/sec " << setw(4) << ms << "ms" << endl; + cout << dur::stats.curr->_asObj().toString() << endl; + } + void run() { + _ns = string("perftest.") + name(); + client().dropCollection(ns()); + + prep(); + + int hlm = howLongMillis(); + + dur::stats._intervalMicros = 0; // no auto rotate + dur::stats.curr->reset(); + Timer t; + unsigned long long n = 0; + const unsigned Batch = 50; + do { + unsigned i; + for( i = 0; i < Batch; i++ ) + timed(); + n += i; + } + while( t.millis() < hlm ); + client().getLastError(); // block until all ops are finished + int ms = t.millis(); + say(n, ms, name()); + + if( n < expectation() ) { + cout << "\ntest " << name() << " seems slow n:" << n << " ops/sec but expect greater than:" << expectation() << endl; + cout << endl; + } + + { + const char *test2name = timed2(); + if( test2name ) { + dur::stats.curr->reset(); + Timer t; + unsigned long long n = 0; + while( 1 ) { + unsigned i; + for( i = 0; i < Batch; i++ ) + timed2(); + n += i; + if( t.millis() > hlm ) + break; + } + int ms = t.millis(); + say(n, ms, test2name); + } + } + } + }; + + class InsertDup : public B { + const BSONObj o; + public: + InsertDup() : o( BSON("_id" << 1) ) { } // dup keys + string name() { + return "insert duplicate _ids"; + } + void prep() { + client().insert( ns(), o ); + } + void timed() { + client().insert( ns(), o ); + } + void post() { + assert( client().count(ns()) == 1 ); + } + unsigned long long expectation() { return 1000; } + }; + + class Insert1 : public InsertDup { + const BSONObj x; + public: + Insert1() : x( BSON("x" << 99) ) { } + string name() { return "insert simple"; } + void timed() { + client().insert( ns(), x ); + } + void post() { + assert( client().count(ns()) > 100 ); + } + unsigned long long expectation() { return 1000; } + }; + + class InsertBig : public InsertDup { + BSONObj x; + virtual int howLongMillis() { + if( sizeof(void*) == 4 ) + return 1000; // could exceed mmapping if run too long, as this function adds a lot of data fast + return 5000; + } + public: + InsertBig() { + char buf[200000]; + BSONObjBuilder b; + b.append("x", 99); + b.appendBinData("bin", 200000, (BinDataType) 129, buf); + x = b.obj(); + } + string name() { return "insert big"; } + void timed() { + client().insert( ns(), x ); + } + unsigned long long expectation() { return 20; } + }; + + class InsertRandom : public B { + public: + string name() { return "random inserts"; } + void prep() { + client().insert( ns(), BSONObj() ); + client().ensureIndex(ns(), BSON("x"<<1)); + } + void timed() { + int x =
rand(); + BSONObj y = BSON("x" << x << "y" << rand() << "z" << 33); + client().insert(ns(), y); + } + void post() { + } + unsigned long long expectation() { return 1000; } + }; + + /** upserts about 32k records and then keeps updating them + 2 indexes + */ + class Update1 : public B { + public: + static int rand() { + return std::rand() & 0x7fff; + } + string name() { return "random upserts"; } + void prep() { + client().insert( ns(), BSONObj() ); + client().ensureIndex(ns(), BSON("x"<<1)); + } + void timed() { + int x = rand(); + BSONObj q = BSON("x" << x); + BSONObj y = BSON("x" << x << "y" << rand() << "z" << 33); + client().update(ns(), q, y, /*upsert*/true); + } + + const char * timed2() { + static BSONObj I = BSON( "$inc" << BSON( "y" << 1 ) ); + + // test some $inc's + + int x = rand(); + BSONObj q = BSON("x" << x); + client().update(ns(), q, I); + + return "inc"; + } + + void post() { + } + unsigned long long expectation() { return 1000; } + }; + + template <typename T> + class MoreIndexes : public T { + public: + string name() { return T::name() + " with more indexes"; } + void prep() { + T::prep(); + this->client().ensureIndex(this->ns(), BSON("y"<<1)); + this->client().ensureIndex(this->ns(), BSON("z"<<1)); + } + }; + + void t() { + for( int i = 0; i < 20; i++ ) { + sleepmillis(21); + string fn = "/tmp/t1"; + MongoMMF f; + unsigned long long len = 1 * 1024 * 1024; + assert( f.create(fn, len, /*sequential*/rand()%2==0) ); + { + char *p = (char *) f.getView(); + assert(p); + // write something to the private view as a test + strcpy(p, "hello"); + } + if( cmdLine.dur ) { + char *w = (char *) f.view_write(); + strcpy(w + 6, "world"); + } + MongoFileFinder ff; + ASSERT( ff.findByPath(fn) ); + } + } + + class All : public Suite { + public: + All() : Suite( "perf" ) + { + } + ~All() { + } + Result * run( const string& filter ) { + boost::thread a(t); + Result * res = Suite::run(filter); + a.join(); + return res; + } + + void setupTests() { + add< TaskQueueTest >(); + add< InsertDup >(); + add< Insert1 >(); + add< InsertRandom >(); + add< MoreIndexes<InsertRandom> >(); + add< Update1 >(); + add< MoreIndexes<Update1> >(); + add< InsertBig >(); + } + } myall; +} diff --git a/dbtests/queryoptimizertests.cpp b/dbtests/queryoptimizertests.cpp index f5d1155..acf9217 100644 --- a/dbtests/queryoptimizertests.cpp +++ b/dbtests/queryoptimizertests.cpp @@ -27,12 +27,12 @@ namespace mongo { extern BSONObj id_obj; - void runQuery(Message& m, QueryMessage& q, Message &response ){ + void runQuery(Message& m, QueryMessage& q, Message &response ) { CurOp op( &(cc()) ); op.ensureStarted(); runQuery( m , q , op, response ); } - void runQuery(Message& m, QueryMessage& q ){ + void runQuery(Message& m, QueryMessage& q ) { Message response; runQuery( m, q, response ); } @@ -64,14 +64,14 @@ namespace QueryOptimizerTests { } } }; - + class NumericBase : public Base { public: - NumericBase(){ + NumericBase() { o = BSON( "min" << -numeric_limits<double>::max() << "max" << numeric_limits<double>::max() ); } - + virtual BSONElement lower() { return o["min"]; } virtual BSONElement upper() { return o["max"]; } private: @@ -81,7 +81,7 @@ namespace QueryOptimizerTests { class Empty : public Base { virtual BSONObj query() { return BSONObj(); } }; - + class Eq : public Base { public: Eq() : o_( BSON( "a" << 1 ) ) {} @@ -94,7 +94,7 @@ namespace QueryOptimizerTests { class DupEq : public Eq { public: virtual BSONObj query() { return BSON( "a" << 1 << "b" << 2 << "a" << 1 ); } - }; + }; class Lt : public NumericBase { public: @@ -103,13 +103,13 @@ namespace
QueryOptimizerTests { virtual BSONElement upper() { return o_.firstElement(); } virtual bool upperInclusive() { return false; } BSONObj o_; - }; + }; class Lte : public Lt { - virtual BSONObj query() { return BSON( "a" << LTE << 1 ); } + virtual BSONObj query() { return BSON( "a" << LTE << 1 ); } virtual bool upperInclusive() { return true; } }; - + class Gt : public NumericBase { public: Gt() : o_( BSON( "-" << 1 ) ) {} @@ -117,23 +117,23 @@ namespace QueryOptimizerTests { virtual BSONElement lower() { return o_.firstElement(); } virtual bool lowerInclusive() { return false; } BSONObj o_; - }; - + }; + class Gte : public Gt { - virtual BSONObj query() { return BSON( "a" << GTE << 1 ); } + virtual BSONObj query() { return BSON( "a" << GTE << 1 ); } virtual bool lowerInclusive() { return true; } }; - + class TwoLt : public Lt { - virtual BSONObj query() { return BSON( "a" << LT << 1 << LT << 5 ); } + virtual BSONObj query() { return BSON( "a" << LT << 1 << LT << 5 ); } }; class TwoGt : public Gt { - virtual BSONObj query() { return BSON( "a" << GT << 0 << GT << 1 ); } - }; + virtual BSONObj query() { return BSON( "a" << GT << 0 << GT << 1 ); } + }; class EqGte : public Eq { - virtual BSONObj query() { return BSON( "a" << 1 << "a" << GTE << 1 ); } + virtual BSONObj query() { return BSON( "a" << 1 << "a" << GTE << 1 ); } }; class EqGteInvalid { @@ -142,7 +142,7 @@ namespace QueryOptimizerTests { FieldRangeSet fbs( "ns", BSON( "a" << 1 << "a" << GTE << 2 ) ); ASSERT( !fbs.matchPossible() ); } - }; + }; struct RegexBase : Base { void run() { //need to only look at first interval @@ -166,7 +166,7 @@ namespace QueryOptimizerTests { virtual BSONElement upper() { return o2_.firstElement(); } virtual bool upperInclusive() { return false; } BSONObj o1_, o2_; - }; + }; class RegexObj : public RegexBase { public: @@ -177,7 +177,7 @@ namespace QueryOptimizerTests { virtual bool upperInclusive() { return false; } BSONObj o1_, o2_; }; - + class UnhelpfulRegex : public RegexBase { public: UnhelpfulRegex() { @@ -191,13 +191,13 @@ namespace QueryOptimizerTests { BSONObjBuilder b; b.appendRegex( "a", "abc" ); return b.obj(); - } + } virtual BSONElement lower() { return limits["lower"]; } virtual BSONElement upper() { return limits["upper"]; } virtual bool upperInclusive() { return false; } BSONObj limits; }; - + class In : public Base { public: In() : o1_( BSON( "-" << -3 ) ), o2_( BSON( "-" << 44 ) ) {} @@ -219,7 +219,7 @@ namespace QueryOptimizerTests { virtual BSONElement upper() { return o2_.firstElement(); } BSONObj o1_, o2_; }; - + class Equality { public: void run() { @@ -237,7 +237,7 @@ namespace QueryOptimizerTests { ASSERT( !s6.range( "a" ).equality() ); } }; - + class SimplifiedQuery { public: void run() { @@ -251,7 +251,7 @@ namespace QueryOptimizerTests { ASSERT( !simple.getObjectField( "e" ).woCompare( fromjson( "{$gte:0,$lte:10}" ) ) ); } }; - + class QueryPatternTest { public: void run() { @@ -277,14 +277,14 @@ namespace QueryOptimizerTests { return FieldRangeSet( "", query ).pattern( sort ); } }; - + class NoWhere { public: void run() { ASSERT_EQUALS( 0, FieldRangeSet( "ns", BSON( "$where" << 1 ) ).nNontrivialRanges() ); } }; - + class Numeric { public: void run() { @@ -311,29 +311,39 @@ namespace QueryOptimizerTests { ASSERT( f.range( "a" ).max().woCompare( BSON( "a" << 3.0 ).firstElement(), false ) == 0 ); } }; - - class MultiBound { - public: - void run() { + + class UnionBound { + public: + void run() { + FieldRangeSet frs( "", fromjson( "{a:{$gt:1,$lt:9},b:{$gt:9,$lt:12}}" ) ); + 
FieldRange ret = frs.range( "a" ); + ret |= frs.range( "b" ); + ASSERT_EQUALS( 2U, ret.intervals().size() ); + } + }; + + class MultiBound { + public: + void run() { FieldRangeSet frs1( "", fromjson( "{a:{$in:[1,3,5,7,9]}}" ) ); FieldRangeSet frs2( "", fromjson( "{a:{$in:[2,3,5,8,9]}}" ) ); - FieldRange fr1 = frs1.range( "a" ); - FieldRange fr2 = frs2.range( "a" ); - fr1 &= fr2; + FieldRange fr1 = frs1.range( "a" ); + FieldRange fr2 = frs2.range( "a" ); + fr1 &= fr2; ASSERT( fr1.min().woCompare( BSON( "a" << 3.0 ).firstElement(), false ) == 0 ); ASSERT( fr1.max().woCompare( BSON( "a" << 9.0 ).firstElement(), false ) == 0 ); - vector< FieldInterval > intervals = fr1.intervals(); - vector< FieldInterval >::const_iterator j = intervals.begin(); - double expected[] = { 3, 5, 9 }; - for( int i = 0; i < 3; ++i, ++j ) { - ASSERT_EQUALS( expected[ i ], j->_lower._bound.number() ); - ASSERT( j->_lower._inclusive ); - ASSERT( j->_lower == j->_upper ); - } - ASSERT( j == intervals.end() ); - } - }; - + vector< FieldInterval > intervals = fr1.intervals(); + vector< FieldInterval >::const_iterator j = intervals.begin(); + double expected[] = { 3, 5, 9 }; + for( int i = 0; i < 3; ++i, ++j ) { + ASSERT_EQUALS( expected[ i ], j->_lower._bound.number() ); + ASSERT( j->_lower._inclusive ); + ASSERT( j->_lower == j->_upper ); + } + ASSERT( j == intervals.end() ); + } + }; + class DiffBase { public: virtual ~DiffBase() {} @@ -341,7 +351,7 @@ namespace QueryOptimizerTests { FieldRangeSet frs( "", fromjson( obj().toString() ) ); FieldRange ret = frs.range( "a" ); ret -= frs.range( "b" ); - check( ret ); + check( ret ); } protected: void check( const FieldRange &fr ) { @@ -366,7 +376,7 @@ namespace QueryOptimizerTests { class TwoRangeBase : public DiffBase { public: TwoRangeBase( string obj, int low, int high, bool lowI, bool highI ) - : _obj( obj ) { + : _obj( obj ) { _n[ 0 ] = low; _n[ 1 ] = high; _b[ 0 ] = lowI; @@ -381,7 +391,7 @@ namespace QueryOptimizerTests { int _n[ 2 ]; bool _b[ 2 ]; }; - + struct Diff1 : public TwoRangeBase { Diff1() : TwoRangeBase( "{a:{$gt:1,$lt:2},b:{$gt:3,$lt:4}}", 1, 2, false, false ) {} }; @@ -389,7 +399,7 @@ namespace QueryOptimizerTests { struct Diff2 : public TwoRangeBase { Diff2() : TwoRangeBase( "{a:{$gt:1,$lt:2},b:{$gt:2,$lt:4}}", 1, 2, false, false ) {} }; - + struct Diff3 : public TwoRangeBase { Diff3() : TwoRangeBase( "{a:{$gt:1,$lte:2},b:{$gt:2,$lt:4}}", 1, 2, false, true ) {} }; @@ -397,11 +407,11 @@ namespace QueryOptimizerTests { struct Diff4 : public TwoRangeBase { Diff4() : TwoRangeBase( "{a:{$gt:1,$lt:2},b:{$gte:2,$lt:4}}", 1, 2, false, false) {} }; - + struct Diff5 : public TwoRangeBase { Diff5() : TwoRangeBase( "{a:{$gt:1,$lte:2},b:{$gte:2,$lt:4}}", 1, 2, false, false) {} }; - + struct Diff6 : public TwoRangeBase { Diff6() : TwoRangeBase( "{a:{$gt:1,$lte:3},b:{$gte:2,$lt:4}}", 1, 2, false, false) {} }; @@ -409,7 +419,7 @@ namespace QueryOptimizerTests { struct Diff7 : public TwoRangeBase { Diff7() : TwoRangeBase( "{a:{$gt:1,$lte:3},b:{$gt:2,$lt:4}}", 1, 2, false, true) {} }; - + struct Diff8 : public TwoRangeBase { Diff8() : TwoRangeBase( "{a:{$gt:1,$lt:4},b:{$gt:2,$lt:4}}", 1, 2, false, true) {} }; @@ -420,22 +430,45 @@ namespace QueryOptimizerTests { struct Diff10 : public TwoRangeBase { Diff10() : TwoRangeBase( "{a:{$gt:1,$lte:4},b:{$gt:2,$lte:4}}", 1, 2, false, true) {} - }; - - struct Diff11 : public TwoRangeBase { - Diff11() : TwoRangeBase( "{a:{$gt:1,$lte:4},b:{$gt:2,$lt:4}}", 1, 4, false, true) {} }; - struct Diff12 : public TwoRangeBase { - Diff12() : 
TwoRangeBase( "{a:{$gt:1,$lt:5},b:{$gt:2,$lt:4}}", 1, 5, false, false) {} + class SplitRangeBase : public DiffBase { + public: + SplitRangeBase( string obj, int low1, bool low1I, int high1, bool high1I, int low2, bool low2I, int high2, bool high2I ) + : _obj( obj ) { + _n[ 0 ] = low1; + _n[ 1 ] = high1; + _n[ 2 ] = low2; + _n[ 3 ] = high2; + _b[ 0 ] = low1I; + _b[ 1 ] = high1I; + _b[ 2 ] = low2I; + _b[ 3 ] = high2I; + } + private: + virtual unsigned len() const { return 2; } + virtual const int *nums() const { return _n; } + virtual const bool *incs() const { return _b; } + virtual BSONObj obj() const { return fromjson( _obj ); } + string _obj; + int _n[ 4 ]; + bool _b[ 4 ]; + }; + + struct Diff11 : public SplitRangeBase { + Diff11() : SplitRangeBase( "{a:{$gt:1,$lte:4},b:{$gt:2,$lt:4}}", 1, false, 2, true, 4, true, 4, true) {} + }; + + struct Diff12 : public SplitRangeBase { + Diff12() : SplitRangeBase( "{a:{$gt:1,$lt:5},b:{$gt:2,$lt:4}}", 1, false, 2, true, 4, true, 5, false) {} }; - + struct Diff13 : public TwoRangeBase { Diff13() : TwoRangeBase( "{a:{$gt:1,$lt:5},b:{$gt:1,$lt:4}}", 4, 5, true, false) {} }; - - struct Diff14 : public TwoRangeBase { - Diff14() : TwoRangeBase( "{a:{$gte:1,$lt:5},b:{$gt:1,$lt:4}}", 1, 5, true, false) {} + + struct Diff14 : public SplitRangeBase { + Diff14() : SplitRangeBase( "{a:{$gte:1,$lt:5},b:{$gt:1,$lt:4}}", 1, true, 1, true, 4, true, 5, false) {} }; struct Diff15 : public TwoRangeBase { @@ -481,7 +514,7 @@ namespace QueryOptimizerTests { struct Diff25 : public TwoRangeBase { Diff25() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:0}", 1, 5, true, true) {} }; - + struct Diff26 : public TwoRangeBase { Diff26() : TwoRangeBase( "{a:{$gt:1,$lte:5},b:1}", 1, 5, false, true) {} }; @@ -490,14 +523,14 @@ namespace QueryOptimizerTests { Diff27() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:1}", 1, 5, false, true) {} }; - struct Diff28 : public TwoRangeBase { - Diff28() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:3}", 1, 5, true, true) {} + struct Diff28 : public SplitRangeBase { + Diff28() : SplitRangeBase( "{a:{$gte:1,$lte:5},b:3}", 1, true, 3, false, 3, false, 5, true) {} }; struct Diff29 : public TwoRangeBase { Diff29() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:5}", 1, 5, true, false) {} }; - + struct Diff30 : public TwoRangeBase { Diff30() : TwoRangeBase( "{a:{$gte:1,$lt:5},b:5}", 1, 5, true, false) {} }; @@ -505,7 +538,7 @@ namespace QueryOptimizerTests { struct Diff31 : public TwoRangeBase { Diff31() : TwoRangeBase( "{a:{$gte:1,$lt:5},b:6}", 1, 5, true, false) {} }; - + struct Diff32 : public TwoRangeBase { Diff32() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:6}", 1, 5, true, true) {} }; @@ -513,7 +546,7 @@ namespace QueryOptimizerTests { class EmptyBase : public DiffBase { public: EmptyBase( string obj ) - : _obj( obj ) {} + : _obj( obj ) {} private: virtual unsigned len() const { return 0; } virtual const int *nums() const { return 0; } @@ -521,7 +554,7 @@ namespace QueryOptimizerTests { virtual BSONObj obj() const { return fromjson( _obj ); } string _obj; }; - + struct Diff33 : public EmptyBase { Diff33() : EmptyBase( "{a:{$gte:1,$lte:5},b:{$gt:0,$lt:6}}" ) {} }; @@ -553,7 +586,7 @@ namespace QueryOptimizerTests { struct Diff40 : public EmptyBase { Diff40() : EmptyBase( "{a:{$gt:1,$lte:5},b:{$gt:0,$lte:5}}" ) {} }; - + struct Diff41 : public TwoRangeBase { Diff41() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:{$gt:0,$lt:5}}", 5, 5, true, true ) {} }; @@ -606,8 +639,8 @@ namespace QueryOptimizerTests { Diff53() : EmptyBase( "{a:{$gte:1,$lt:5},b:{$gte:1,$lte:5}}" ) {} }; - struct Diff54 : 
public TwoRangeBase { - Diff54() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:{$gt:1,$lt:5}}", 1, 5, true, true ) {} + struct Diff54 : public SplitRangeBase { + Diff54() : SplitRangeBase( "{a:{$gte:1,$lte:5},b:{$gt:1,$lt:5}}", 1, true, 1, true, 5, true, 5, true ) {} }; struct Diff55 : public TwoRangeBase { @@ -621,7 +654,7 @@ namespace QueryOptimizerTests { struct Diff57 : public EmptyBase { Diff57() : EmptyBase( "{a:{$gte:1,$lte:5},b:{$gte:1,$lte:5}}" ) {} }; - + struct Diff58 : public TwoRangeBase { Diff58() : TwoRangeBase( "{a:1,b:{$gt:1,$lt:5}}", 1, 1, true, true ) {} }; @@ -645,7 +678,11 @@ namespace QueryOptimizerTests { struct Diff63 : public EmptyBase { Diff63() : EmptyBase( "{a:5,b:5}" ) {} }; - + + struct Diff64 : public TwoRangeBase { + Diff64() : TwoRangeBase( "{a:{$gte:1,$lte:2},b:{$gt:0,$lte:1}}", 1, 2, false, true ) {} + }; + class DiffMulti1 : public DiffBase { public: void run() { @@ -656,12 +693,12 @@ namespace QueryOptimizerTests { other |= frs.range( "d" ); other |= frs.range( "e" ); ret -= other; - check( ret ); + check( ret ); } protected: - virtual unsigned len() const { return 1; } - virtual const int *nums() const { static int n[] = { 2, 7 }; return n; } - virtual const bool *incs() const { static bool b[] = { true, true }; return b; } + virtual unsigned len() const { return 3; } + virtual const int *nums() const { static int n[] = { 2, 3, 3, 4, 5, 7 }; return n; } + virtual const bool *incs() const { static bool b[] = { true, false, false, true, true, true }; return b; } virtual BSONObj obj() const { return BSONObj(); } }; @@ -675,7 +712,7 @@ namespace QueryOptimizerTests { ret |= frs.range( "d" ); ret |= frs.range( "e" ); ret -= mask; - check( ret ); + check( ret ); } protected: virtual unsigned len() const { return 2; } @@ -683,7 +720,7 @@ namespace QueryOptimizerTests { virtual const bool *incs() const { static bool b[] = { false, true, true, false }; return b; } virtual BSONObj obj() const { return BSONObj(); } }; - + class SetIntersect { public: void run() { @@ -693,9 +730,9 @@ namespace QueryOptimizerTests { ASSERT_EQUALS( fromjson( "{a:1,b:5,c:7,d:{$gte:8,$lte:9},e:10}" ), frs1.simplifiedQuery( BSONObj() ) ); } }; - + } // namespace FieldRangeTests - + namespace QueryPlanTests { class Base { public: @@ -742,23 +779,25 @@ namespace QueryOptimizerTests { static DBDirectClient client_; }; DBDirectClient Base::client_; - + // There's a limit of 10 indexes total, make sure not to exceed this in a given test. 
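For readability of the tests that follow, here is a sketch of what a typical constructor call expands to once the FBS and FBS2 helpers defined just below are substituted (the query { a: 4 } is only an example): each macro builds a fresh FieldRangeSet over the same query, parks it in the corresponding FieldRangeSet_GLOBAL auto_ptr so it outlives the expression, and yields a reference that is passed to QueryPlan alongside the query and order specs.

    // QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << 4 ) ), FBS2( BSON( "a" << 4 ) ),
    //              BSON( "a" << 4 ), BSONObj() );
    // ...is roughly equivalent to:
    // FieldRangeSet_GLOBAL.reset( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) );
    // FieldRangeSet_GLOBAL2.reset( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) );
    // QueryPlan p( nsd(), INDEXNO( "a" << 1 ), *FieldRangeSet_GLOBAL, *FieldRangeSet_GLOBAL2,
    //              BSON( "a" << 4 ), BSONObj() );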
#define INDEXNO(x) nsd()->idxNo( *this->index( BSON(x) ) ) #define INDEX(x) this->index( BSON(x) ) auto_ptr< FieldRangeSet > FieldRangeSet_GLOBAL; #define FBS(x) ( FieldRangeSet_GLOBAL.reset( new FieldRangeSet( ns(), x ) ), *FieldRangeSet_GLOBAL ) - + auto_ptr< FieldRangeSet > FieldRangeSet_GLOBAL2; +#define FBS2(x) ( FieldRangeSet_GLOBAL2.reset( new FieldRangeSet( ns(), x ) ), *FieldRangeSet_GLOBAL2 ) + class NoIndex : public Base { public: void run() { - QueryPlan p( nsd(), -1, FBS( BSONObj() ), BSONObj(), BSONObj() ); + QueryPlan p( nsd(), -1, FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSONObj() ); ASSERT( !p.optimal() ); ASSERT( !p.scanAndOrderRequired() ); ASSERT( !p.exactKeyMatch() ); } }; - + class SimpleOrder : public Base { public: void run() { @@ -768,43 +807,43 @@ namespace QueryOptimizerTests { BSONObjBuilder b2; b2.appendMaxKey( "" ); BSONObj end = b2.obj(); - - QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); + + QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); ASSERT( !p.scanAndOrderRequired() ); ASSERT( !startKey( p ).woCompare( start ) ); ASSERT( !endKey( p ).woCompare( end ) ); - QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << 1 ) ); + QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << 1 ) ); ASSERT( !p2.scanAndOrderRequired() ); - QueryPlan p3( nsd(), INDEXNO( "a" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "b" << 1 ) ); + QueryPlan p3( nsd(), INDEXNO( "a" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "b" << 1 ) ); ASSERT( p3.scanAndOrderRequired() ); ASSERT( !startKey( p3 ).woCompare( start ) ); ASSERT( !endKey( p3 ).woCompare( end ) ); } }; - + class MoreIndexThanNeeded : public Base { public: void run() { - QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); - ASSERT( !p.scanAndOrderRequired() ); + QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); + ASSERT( !p.scanAndOrderRequired() ); } }; - + class IndexSigns : public Base { public: void run() { - QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << -1 ) , FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) ); - ASSERT( !p.scanAndOrderRequired() ); + QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << -1 ) , FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) ); + ASSERT( !p.scanAndOrderRequired() ); ASSERT_EQUALS( 1, p.direction() ); - QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) ); - ASSERT( p2.scanAndOrderRequired() ); + QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) ); + ASSERT( p2.scanAndOrderRequired() ); ASSERT_EQUALS( 0, p2.direction() ); - QueryPlan p3( nsd(), indexno( id_obj ), FBS( BSONObj() ), BSONObj(), BSON( "_id" << 1 ) ); + QueryPlan p3( nsd(), indexno( id_obj ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "_id" << 1 ) ); ASSERT( !p3.scanAndOrderRequired() ); ASSERT_EQUALS( 1, p3.direction() ); - } + } }; - + class IndexReverse : public Base { public: void run() { @@ -816,18 +855,18 @@ namespace QueryOptimizerTests { b2.appendMaxKey( "" ); b2.appendMinKey( "" ); BSONObj end = b2.obj(); - QueryPlan p( nsd(), INDEXNO( "a" << -1 << "b" << 1 ),FBS( 
BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) ); - ASSERT( !p.scanAndOrderRequired() ); + QueryPlan p( nsd(), INDEXNO( "a" << -1 << "b" << 1 ),FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) ); + ASSERT( !p.scanAndOrderRequired() ); ASSERT_EQUALS( -1, p.direction() ); ASSERT( !startKey( p ).woCompare( start ) ); ASSERT( !endKey( p ).woCompare( end ) ); - QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << -1 << "b" << -1 ) ); - ASSERT( !p2.scanAndOrderRequired() ); + QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << -1 << "b" << -1 ) ); + ASSERT( !p2.scanAndOrderRequired() ); ASSERT_EQUALS( -1, p2.direction() ); - QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << -1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << -1 << "b" << -1 ) ); - ASSERT( p3.scanAndOrderRequired() ); + QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << -1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << -1 << "b" << -1 ) ); + ASSERT( p3.scanAndOrderRequired() ); ASSERT_EQUALS( 0, p3.direction() ); - } + } }; class NoOrder : public Base { @@ -841,143 +880,143 @@ namespace QueryOptimizerTests { b2.append( "", 3 ); b2.appendMaxKey( "" ); BSONObj end = b2.obj(); - QueryPlan p( nsd(), INDEXNO( "a" << -1 << "b" << 1 ), FBS( BSON( "a" << 3 ) ), BSON( "a" << 3 ), BSONObj() ); - ASSERT( !p.scanAndOrderRequired() ); + QueryPlan p( nsd(), INDEXNO( "a" << -1 << "b" << 1 ), FBS( BSON( "a" << 3 ) ), FBS2( BSON( "a" << 3 ) ), BSON( "a" << 3 ), BSONObj() ); + ASSERT( !p.scanAndOrderRequired() ); ASSERT( !startKey( p ).woCompare( start ) ); ASSERT( !endKey( p ).woCompare( end ) ); - QueryPlan p2( nsd(), INDEXNO( "a" << -1 << "b" << 1 ), FBS( BSON( "a" << 3 ) ), BSON( "a" << 3 ), BSONObj() ); - ASSERT( !p2.scanAndOrderRequired() ); + QueryPlan p2( nsd(), INDEXNO( "a" << -1 << "b" << 1 ), FBS( BSON( "a" << 3 ) ), FBS2( BSON( "a" << 3 ) ), BSON( "a" << 3 ), BSONObj() ); + ASSERT( !p2.scanAndOrderRequired() ); ASSERT( !startKey( p ).woCompare( start ) ); ASSERT( !endKey( p ).woCompare( end ) ); - } + } }; - + class EqualWithOrder : public Base { public: void run() { - QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 4 ) ), BSON( "a" << 4 ), BSON( "b" << 1 ) ); - ASSERT( !p.scanAndOrderRequired() ); - QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "b" << 4 ) ), BSON( "b" << 4 ), BSON( "a" << 1 << "c" << 1 ) ); - ASSERT( !p2.scanAndOrderRequired() ); - QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 4 ) ), BSON( "b" << 4 ), BSON( "a" << 1 << "c" << 1 ) ); - ASSERT( p3.scanAndOrderRequired() ); + QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 4 ) ), FBS2( BSON( "a" << 4 ) ), BSON( "a" << 4 ), BSON( "b" << 1 ) ); + ASSERT( !p.scanAndOrderRequired() ); + QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "b" << 4 ) ), FBS2( BSON( "b" << 4 ) ), BSON( "b" << 4 ), BSON( "a" << 1 << "c" << 1 ) ); + ASSERT( !p2.scanAndOrderRequired() ); + QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 4 ) ), FBS2( BSON( "b" << 4 ) ), BSON( "b" << 4 ), BSON( "a" << 1 << "c" << 1 ) ); + ASSERT( p3.scanAndOrderRequired() ); } }; - + class Optimal : public Base { public: void run() { - QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); + QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), 
BSON( "a" << 1 ) ); ASSERT( p.optimal() ); - QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); + QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); ASSERT( p2.optimal() ); - QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSON( "a" << 1 ) ); + QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 1 ) ), FBS2( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSON( "a" << 1 ) ); ASSERT( p3.optimal() ); - QueryPlan p4( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSON( "a" << 1 ) ); + QueryPlan p4( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 1 ) ), FBS2( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSON( "a" << 1 ) ); ASSERT( !p4.optimal() ); - QueryPlan p5( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSON( "b" << 1 ) ); + QueryPlan p5( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 1 ) ), FBS2( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSON( "b" << 1 ) ); ASSERT( p5.optimal() ); - QueryPlan p6( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSON( "b" << 1 ) ); + QueryPlan p6( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 1 ) ), FBS2( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSON( "b" << 1 ) ); ASSERT( !p6.optimal() ); - QueryPlan p7( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 1 << "b" << 1 ) ), BSON( "a" << 1 << "b" << 1 ), BSON( "a" << 1 ) ); + QueryPlan p7( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 1 << "b" << 1 ) ), FBS2( BSON( "a" << 1 << "b" << 1 ) ), BSON( "a" << 1 << "b" << 1 ), BSON( "a" << 1 ) ); ASSERT( p7.optimal() ); - QueryPlan p8( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSON( "a" << 1 ) ); + QueryPlan p8( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << 1 << "b" << LT << 1 ) ), FBS2( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSON( "a" << 1 ) ); ASSERT( p8.optimal() ); - QueryPlan p9( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSON( "a" << 1 ) ); + QueryPlan p9( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << 1 << "b" << LT << 1 ) ), FBS2( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSON( "a" << 1 ) ); ASSERT( p9.optimal() ); } }; - + class MoreOptimal : public Base { public: void run() { - QueryPlan p10( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSONObj() ); - ASSERT( p10.optimal() ); - QueryPlan p11( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSONObj() ); - ASSERT( p11.optimal() ); - QueryPlan p12( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << LT << 1 ) ), BSON( "a" << LT << 1 ), BSONObj() ); - ASSERT( p12.optimal() ); - QueryPlan p13( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << LT << 1 ) ), BSON( "a" << LT << 1 ), BSON( "a" << 1 ) ); - ASSERT( p13.optimal() ); + QueryPlan p10( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << 1 ) ), FBS2( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSONObj() ); + ASSERT( p10.optimal() ); + QueryPlan p11( nsd(), INDEXNO( "a" << 1 << "b" << 1 << 
"c" << 1 ), FBS( BSON( "a" << 1 << "b" << LT << 1 ) ), FBS2( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSONObj() ); + ASSERT( p11.optimal() ); + QueryPlan p12( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << LT << 1 ) ), FBS2( BSON( "a" << LT << 1 ) ), BSON( "a" << LT << 1 ), BSONObj() ); + ASSERT( p12.optimal() ); + QueryPlan p13( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FBS( BSON( "a" << LT << 1 ) ), FBS2( BSON( "a" << LT << 1 ) ), BSON( "a" << LT << 1 ), BSON( "a" << 1 ) ); + ASSERT( p13.optimal() ); } }; - + class KeyMatch : public Base { public: void run() { - QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); + QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); ASSERT( !p.exactKeyMatch() ); - QueryPlan p2( nsd(), INDEXNO( "b" << 1 << "a" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); + QueryPlan p2( nsd(), INDEXNO( "b" << 1 << "a" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); ASSERT( !p2.exactKeyMatch() ); - QueryPlan p3( nsd(), INDEXNO( "b" << 1 << "a" << 1 ), FBS( BSON( "b" << "z" ) ), BSON( "b" << "z" ), BSON( "a" << 1 ) ); + QueryPlan p3( nsd(), INDEXNO( "b" << 1 << "a" << 1 ), FBS( BSON( "b" << "z" ) ), FBS2( BSON( "b" << "z" ) ), BSON( "b" << "z" ), BSON( "a" << 1 ) ); ASSERT( !p3.exactKeyMatch() ); - QueryPlan p4( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FBS( BSON( "c" << "y" << "b" << "z" ) ), BSON( "c" << "y" << "b" << "z" ), BSON( "a" << 1 ) ); + QueryPlan p4( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FBS( BSON( "c" << "y" << "b" << "z" ) ), FBS2( BSON( "c" << "y" << "b" << "z" ) ), BSON( "c" << "y" << "b" << "z" ), BSON( "a" << 1 ) ); ASSERT( !p4.exactKeyMatch() ); - QueryPlan p5( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FBS( BSON( "c" << "y" << "b" << "z" ) ), BSON( "c" << "y" << "b" << "z" ), BSONObj() ); + QueryPlan p5( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FBS( BSON( "c" << "y" << "b" << "z" ) ), FBS2( BSON( "c" << "y" << "b" << "z" ) ), BSON( "c" << "y" << "b" << "z" ), BSONObj() ); ASSERT( !p5.exactKeyMatch() ); - QueryPlan p6( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FBS( BSON( "c" << LT << "y" << "b" << GT << "z" ) ), BSON( "c" << LT << "y" << "b" << GT << "z" ), BSONObj() ); + QueryPlan p6( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FBS( BSON( "c" << LT << "y" << "b" << GT << "z" ) ), FBS2( BSON( "c" << LT << "y" << "b" << GT << "z" ) ), BSON( "c" << LT << "y" << "b" << GT << "z" ), BSONObj() ); ASSERT( !p6.exactKeyMatch() ); - QueryPlan p7( nsd(), INDEXNO( "b" << 1 ), FBS( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); + QueryPlan p7( nsd(), INDEXNO( "b" << 1 ), FBS( BSONObj() ), FBS2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) ); ASSERT( !p7.exactKeyMatch() ); - QueryPlan p8( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << "y" << "a" << "z" ) ), BSON( "b" << "y" << "a" << "z" ), BSONObj() ); + QueryPlan p8( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << "y" << "a" << "z" ) ), FBS2( BSON( "b" << "y" << "a" << "z" ) ), BSON( "b" << "y" << "a" << "z" ), BSONObj() ); ASSERT( p8.exactKeyMatch() ); - QueryPlan p9( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << "z" ) ), BSON( "a" << "z" ), BSON( "a" << 1 ) ); + QueryPlan p9( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << "z" ) ), FBS2( BSON( "a" << "z" ) ), BSON( "a" << "z" ), BSON( "a" << 1 ) ); ASSERT( p9.exactKeyMatch() ); } }; - + class 
MoreKeyMatch : public Base { public: void run() { - QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << "r" << "b" << NE << "q" ) ), BSON( "a" << "r" << "b" << NE << "q" ), BSON( "a" << 1 ) ); - ASSERT( !p.exactKeyMatch() ); + QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << "r" << "b" << NE << "q" ) ), FBS2( BSON( "a" << "r" << "b" << NE << "q" ) ), BSON( "a" << "r" << "b" << NE << "q" ), BSON( "a" << 1 ) ); + ASSERT( !p.exactKeyMatch() ); } }; - + class ExactKeyQueryTypes : public Base { public: void run() { - QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << "b" ) ), BSON( "a" << "b" ), BSONObj() ); + QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << "b" ) ), FBS2( BSON( "a" << "b" ) ), BSON( "a" << "b" ), BSONObj() ); ASSERT( p.exactKeyMatch() ); - QueryPlan p2( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << 4 ) ), BSON( "a" << 4 ), BSONObj() ); + QueryPlan p2( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << 4 ) ), FBS2( BSON( "a" << 4 ) ), BSON( "a" << 4 ), BSONObj() ); ASSERT( !p2.exactKeyMatch() ); - QueryPlan p3( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << BSON( "c" << "d" ) ) ), BSON( "a" << BSON( "c" << "d" ) ), BSONObj() ); + QueryPlan p3( nsd(), INDEXNO( "a" << 1 ), FBS( BSON( "a" << BSON( "c" << "d" ) ) ), FBS2( BSON( "a" << BSON( "c" << "d" ) ) ), BSON( "a" << BSON( "c" << "d" ) ), BSONObj() ); ASSERT( !p3.exactKeyMatch() ); BSONObjBuilder b; b.appendRegex( "a", "^ddd" ); BSONObj q = b.obj(); - QueryPlan p4( nsd(), INDEXNO( "a" << 1 ), FBS( q ), q, BSONObj() ); + QueryPlan p4( nsd(), INDEXNO( "a" << 1 ), FBS( q ), FBS2( q ), q, BSONObj() ); ASSERT( !p4.exactKeyMatch() ); - QueryPlan p5( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << "z" << "b" << 4 ) ), BSON( "a" << "z" << "b" << 4 ), BSONObj() ); + QueryPlan p5( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "a" << "z" << "b" << 4 ) ), FBS2( BSON( "a" << "z" << "b" << 4 ) ), BSON( "a" << "z" << "b" << 4 ), BSONObj() ); ASSERT( !p5.exactKeyMatch() ); } }; - + class Unhelpful : public Base { public: void run() { - QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSONObj() ); + QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 1 ) ), FBS2( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSONObj() ); ASSERT( !p.range( "a" ).nontrivial() ); ASSERT( p.unhelpful() ); - QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 1 << "c" << 1 ) ), BSON( "b" << 1 << "c" << 1 ), BSON( "a" << 1 ) ); + QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FBS( BSON( "b" << 1 << "c" << 1 ) ), FBS2( BSON( "b" << 1 << "c" << 1 ) ), BSON( "b" << 1 << "c" << 1 ), BSON( "a" << 1 ) ); ASSERT( !p2.scanAndOrderRequired() ); ASSERT( !p2.range( "a" ).nontrivial() ); ASSERT( !p2.unhelpful() ); - QueryPlan p3( nsd(), INDEXNO( "b" << 1 ), FBS( BSON( "b" << 1 << "c" << 1 ) ), BSON( "b" << 1 << "c" << 1 ), BSONObj() ); + QueryPlan p3( nsd(), INDEXNO( "b" << 1 ), FBS( BSON( "b" << 1 << "c" << 1 ) ), FBS2( BSON( "b" << 1 << "c" << 1 ) ), BSON( "b" << 1 << "c" << 1 ), BSONObj() ); ASSERT( p3.range( "b" ).nontrivial() ); ASSERT( !p3.unhelpful() ); - QueryPlan p4( nsd(), INDEXNO( "b" << 1 << "c" << 1 ), FBS( BSON( "c" << 1 << "d" << 1 ) ), BSON( "c" << 1 << "d" << 1 ), BSONObj() ); + QueryPlan p4( nsd(), INDEXNO( "b" << 1 << "c" << 1 ), FBS( BSON( "c" << 1 << "d" << 1 ) ), FBS2( BSON( "c" << 1 << "d" << 1 ) ), BSON( "c" << 1 << "d" << 1 ), BSONObj() ); ASSERT( !p4.range( "b" ).nontrivial() ); ASSERT( p4.unhelpful() ); } }; - + } // 
namespace QueryPlanTests namespace QueryPlanSetTests { class Base { public: - Base() : _context( ns() ){ + Base() : _context( ns() ) { string err; userCreateNS( ns(), BSONObj(), err, false ); } @@ -1000,7 +1039,7 @@ namespace QueryOptimizerTests { if ( fieldsToReturn ) fieldsToReturn->appendSelfToBufBuilder(b); toSend.setData(dbQuery, b.buf(), b.len()); - } + } protected: static const char *ns() { return "unittests.QueryPlanSetTests"; } static NamespaceDetails *nsd() { return nsdetails( ns() ); } @@ -1008,24 +1047,26 @@ namespace QueryOptimizerTests { dblock lk_; Client::Context _context; }; - + class NoIndexes : public Base { public: void run() { auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSON( "b" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) ); ASSERT_EQUALS( 1, s.nPlans() ); } }; - + class Optimal : public Base { public: void run() { Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "b_2" ); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSONObj() ); - ASSERT_EQUALS( 1, s.nPlans() ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSONObj() ); + ASSERT_EQUALS( 1, s.nPlans() ); } }; @@ -1035,7 +1076,8 @@ namespace QueryOptimizerTests { Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" ); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSON( "b" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) ); ASSERT_EQUALS( 3, s.nPlans() ); } }; @@ -1046,11 +1088,12 @@ namespace QueryOptimizerTests { Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" ); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSONObj() ) ); - QueryPlanSet s( ns(), frs, BSONObj(), BSONObj() ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSONObj(), BSONObj() ); ASSERT_EQUALS( 1, s.nPlans() ); } }; - + class HintSpec : public Base { public: void run() { @@ -1059,8 +1102,9 @@ namespace QueryOptimizerTests { BSONObj b = BSON( "hint" << BSON( "a" << 1 ) ); BSONElement e = b.firstElement(); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 1 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 1 ), BSON( "b" << 1 ), &e ); - ASSERT_EQUALS( 1, s.nPlans() ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 1 ), BSON( "b" << 1 ), &e ); + ASSERT_EQUALS( 1, s.nPlans() ); } }; @@ -1072,11 +1116,12 @@ namespace QueryOptimizerTests { BSONObj b = BSON( "hint" << "a_1" ); BSONElement e = b.firstElement(); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 1 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 1 ), BSON( "b" << 1 ), &e ); - ASSERT_EQUALS( 1, s.nPlans() ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 1 ), BSON( "b" << 1 ), &e ); + ASSERT_EQUALS( 1, s.nPlans() ); } }; - + class NaturalHint : 
public Base { public: void run() { @@ -1085,8 +1130,9 @@ namespace QueryOptimizerTests { BSONObj b = BSON( "hint" << BSON( "$natural" << 1 ) ); BSONElement e = b.firstElement(); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 1 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 1 ), BSON( "b" << 1 ), &e ); - ASSERT_EQUALS( 1, s.nPlans() ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 1 ), BSON( "b" << 1 ), &e ); + ASSERT_EQUALS( 1, s.nPlans() ); } }; @@ -1096,7 +1142,8 @@ namespace QueryOptimizerTests { Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "b_2" ); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 1 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 1 ), BSON( "$natural" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 1 ), BSON( "$natural" << 1 ) ); ASSERT_EQUALS( 1, s.nPlans() ); } }; @@ -1107,11 +1154,12 @@ namespace QueryOptimizerTests { BSONObj b = BSON( "hint" << "a_1" ); BSONElement e = b.firstElement(); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 1 ) ) ); - ASSERT_EXCEPTION( QueryPlanSet s( ns(), frs, BSON( "a" << 1 ), BSON( "b" << 1 ), &e ), - AssertionException ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + ASSERT_EXCEPTION( QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 1 ), BSON( "b" << 1 ), &e ), + AssertionException ); } }; - + class Count : public Base { public: void run() { @@ -1136,7 +1184,7 @@ namespace QueryOptimizerTests { ASSERT_EQUALS( 0, runCount( ns(), BSON( "query" << BSON( "a" << GT << 0 << LT << -1 ) ), err ) ); } }; - + class QueryMissingNs : public Base { public: QueryMissingNs() { log() << "querymissingns starts" << endl; } @@ -1154,25 +1202,27 @@ namespace QueryOptimizerTests { } }; - + class UnhelpfulIndex : public Base { public: void run() { Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" ); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 1 << "c" << 2 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 1 << "c" << 2 ), BSONObj() ); - ASSERT_EQUALS( 2, s.nPlans() ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 1 << "c" << 2 ), BSONObj() ); + ASSERT_EQUALS( 2, s.nPlans() ); } - }; - + }; + class SingleException : public Base { public: void run() { Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" ); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSON( "b" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) ); ASSERT_EQUALS( 3, s.nPlans() ); bool threw = false; auto_ptr< TestOp > t( new TestOp( true, threw ) ); @@ -1200,6 +1250,7 @@ namespace QueryOptimizerTests { return op; } virtual bool mayRecordPlan() const { return true; } + virtual long long nscanned() { return 0; } private: bool iThrow_; bool &threw_; @@ -1207,14 +1258,15 @@ namespace QueryOptimizerTests { mutable bool youThrow_; }; }; - + class AllException : public Base { public: void run() { Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); Helpers::ensureIndex( ns(), BSON( "b" 
<< 1 ), false, "b_1" ); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSON( "b" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) ); ASSERT_EQUALS( 3, s.nPlans() ); auto_ptr< TestOp > t( new TestOp() ); boost::shared_ptr< TestOp > done = s.runOp( *t ); @@ -1233,9 +1285,10 @@ namespace QueryOptimizerTests { return new TestOp(); } virtual bool mayRecordPlan() const { return true; } + virtual long long nscanned() { return 0; } }; }; - + class SaveGoodIndex : public Base { public: void run() { @@ -1249,7 +1302,7 @@ namespace QueryOptimizerTests { nPlans( 3 ); runQuery(); nPlans( 1 ); - + { DBDirectClient client; for( int i = 0; i < 34; ++i ) { @@ -1259,9 +1312,10 @@ namespace QueryOptimizerTests { } } nPlans( 3 ); - + auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSON( "b" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) ); NoRecordTestOp original; s.runOp( original ); nPlans( 3 ); @@ -1269,29 +1323,33 @@ namespace QueryOptimizerTests { BSONObj hint = fromjson( "{hint:{$natural:1}}" ); BSONElement hintElt = hint.firstElement(); auto_ptr< FieldRangeSet > frs2( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s2( ns(), frs2, BSON( "a" << 4 ), BSON( "b" << 1 ), &hintElt ); + auto_ptr< FieldRangeSet > frsOrig2( new FieldRangeSet( *frs2 ) ); + QueryPlanSet s2( ns(), frs2, frsOrig2, BSON( "a" << 4 ), BSON( "b" << 1 ), &hintElt ); TestOp newOriginal; s2.runOp( newOriginal ); nPlans( 3 ); auto_ptr< FieldRangeSet > frs3( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s3( ns(), frs3, BSON( "a" << 4 ), BSON( "b" << 1 << "c" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig3( new FieldRangeSet( *frs3 ) ); + QueryPlanSet s3( ns(), frs3, frsOrig3, BSON( "a" << 4 ), BSON( "b" << 1 << "c" << 1 ) ); TestOp newerOriginal; s3.runOp( newerOriginal ); - nPlans( 3 ); - + nPlans( 3 ); + runQuery(); nPlans( 1 ); } private: void nPlans( int n ) { auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSON( "b" << 1 ) ); - ASSERT_EQUALS( n, s.nPlans() ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) ); + ASSERT_EQUALS( n, s.nPlans() ); } void runQuery() { auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSON( "b" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) ); TestOp original; s.runOp( original ); } @@ -1305,33 +1363,37 @@ namespace QueryOptimizerTests { return new TestOp(); } virtual bool mayRecordPlan() const { return true; } + virtual long long nscanned() { return 0; } }; class NoRecordTestOp : public TestOp { virtual bool mayRecordPlan() const { return false; } virtual QueryOp *_createChild() const { return new NoRecordTestOp(); } }; - }; - + }; + class TryAllPlansOnErr : public Base { public: void run() { Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s( ns(), frs, BSON( "a" << 4 ), BSON( "b" << 1 
) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) ); ScanOnlyTestOp op; s.runOp( op ); ASSERT( fromjson( "{$natural:1}" ).woCompare( NamespaceDetailsTransient::_get( ns() ).indexForPattern( s.fbs().pattern( BSON( "b" << 1 ) ) ) ) == 0 ); ASSERT_EQUALS( 1, NamespaceDetailsTransient::_get( ns() ).nScannedForPattern( s.fbs().pattern( BSON( "b" << 1 ) ) ) ); - + auto_ptr< FieldRangeSet > frs2( new FieldRangeSet( ns(), BSON( "a" << 4 ) ) ); - QueryPlanSet s2( ns(), frs2, BSON( "a" << 4 ), BSON( "b" << 1 ) ); + auto_ptr< FieldRangeSet > frsOrig2( new FieldRangeSet( *frs2 ) ); + QueryPlanSet s2( ns(), frs2, frsOrig2, BSON( "a" << 4 ), BSON( "b" << 1 ) ); TestOp op2; ASSERT( s2.runOp( op2 )->complete() ); } private: class TestOp : public QueryOp { public: + TestOp() {} virtual void _init() {} virtual void next() { if ( qp().indexKey().firstElement().fieldName() == string( "$natural" ) ) @@ -1342,6 +1404,7 @@ namespace QueryOptimizerTests { return new TestOp(); } virtual bool mayRecordPlan() const { return true; } + virtual long long nscanned() { return 1; } }; class ScanOnlyTestOp : public TestOp { virtual void next() { @@ -1354,7 +1417,7 @@ namespace QueryOptimizerTests { } }; }; - + class FindOne : public Base { public: void run() { @@ -1362,12 +1425,12 @@ namespace QueryOptimizerTests { theDataFileMgr.insertWithObjMod( ns(), one ); BSONObj result; ASSERT( Helpers::findOne( ns(), BSON( "a" << 1 ), result ) ); - ASSERT_EXCEPTION( Helpers::findOne( ns(), BSON( "a" << 1 ), result, true ), AssertionException ); + ASSERT_EXCEPTION( Helpers::findOne( ns(), BSON( "a" << 1 ), result, true ), AssertionException ); Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" ); - ASSERT( Helpers::findOne( ns(), BSON( "a" << 1 ), result, true ) ); + ASSERT( Helpers::findOne( ns(), BSON( "a" << 1 ), result, true ) ); } }; - + class Delete : public Base { public: void run() { @@ -1380,10 +1443,10 @@ namespace QueryOptimizerTests { theDataFileMgr.insertWithObjMod( ns(), one ); deleteObjects( ns(), BSON( "a" << 1 ), false ); ASSERT( BSON( "a" << 1 ).woCompare( NamespaceDetailsTransient::_get( ns() ).indexForPattern( FieldRangeSet( ns(), BSON( "a" << 1 ) ).pattern() ) ) == 0 ); - ASSERT_EQUALS( 2, NamespaceDetailsTransient::_get( ns() ).nScannedForPattern( FieldRangeSet( ns(), BSON( "a" << 1 ) ).pattern() ) ); + ASSERT_EQUALS( 1, NamespaceDetailsTransient::_get( ns() ).nScannedForPattern( FieldRangeSet( ns(), BSON( "a" << 1 ) ).pattern() ) ); } }; - + class DeleteOneScan : public Base { public: void run() { @@ -1410,7 +1473,7 @@ namespace QueryOptimizerTests { theDataFileMgr.insertWithObjMod( ns(), one ); theDataFileMgr.insertWithObjMod( ns(), two ); theDataFileMgr.insertWithObjMod( ns(), three ); - deleteObjects( ns(), BSON( "a" << GTE << 0 << "_id" << GT << 0 ), true ); + deleteObjects( ns(), BSON( "a" << GTE << 0 ), true ); for( boost::shared_ptr c = theDataFileMgr.findAll( ns() ); c->ok(); c->advance() ) ASSERT( 2 != c->current().getIntField( "_id" ) ); } @@ -1436,7 +1499,7 @@ namespace QueryOptimizerTests { runQuery( m, q); } ASSERT( BSON( "$natural" << 1 ).woCompare( NamespaceDetailsTransient::_get( ns() ).indexForPattern( FieldRangeSet( ns(), BSON( "b" << 0 << "a" << GTE << 0 ) ).pattern() ) ) == 0 ); - + Message m2; assembleRequest( ns(), QUERY( "b" << 99 << "a" << GTE << 0 ).obj, 2, 0, 0, 0, m2 ); { @@ -1444,11 +1507,11 @@ namespace QueryOptimizerTests { QueryMessage q(d); runQuery( m2, q); } - ASSERT( BSON( "a" 
<< 1 ).woCompare( NamespaceDetailsTransient::_get( ns() ).indexForPattern( FieldRangeSet( ns(), BSON( "b" << 0 << "a" << GTE << 0 ) ).pattern() ) ) == 0 ); - ASSERT_EQUALS( 2, NamespaceDetailsTransient::_get( ns() ).nScannedForPattern( FieldRangeSet( ns(), BSON( "b" << 0 << "a" << GTE << 0 ) ).pattern() ) ); + ASSERT( BSON( "a" << 1 ).woCompare( NamespaceDetailsTransient::_get( ns() ).indexForPattern( FieldRangeSet( ns(), BSON( "b" << 0 << "a" << GTE << 0 ) ).pattern() ) ) == 0 ); + ASSERT_EQUALS( 3, NamespaceDetailsTransient::_get( ns() ).nScannedForPattern( FieldRangeSet( ns(), BSON( "b" << 0 << "a" << GTE << 0 ) ).pattern() ) ); } }; - + class InQueryIntervals : public Base { public: void run() { @@ -1460,30 +1523,32 @@ namespace QueryOptimizerTests { BSONObj hint = fromjson( "{$hint:{a:1}}" ); BSONElement hintElt = hint.firstElement(); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ) ) ); - QueryPlanSet s( ns(), frs, fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSONObj(), &hintElt ); - QueryPlan qp( nsd(), 1, s.fbs(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSONObj() ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSONObj(), &hintElt ); + QueryPlan qp( nsd(), 1, s.fbs(), s.originalFrs(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSONObj() ); boost::shared_ptr c = qp.newCursor(); double expected[] = { 2, 3, 6, 9 }; for( int i = 0; i < 4; ++i, c->advance() ) { ASSERT_EQUALS( expected[ i ], c->current().getField( "a" ).number() ); } ASSERT( !c->ok() ); - + // now check reverse { auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ) ) ); - QueryPlanSet s( ns(), frs, fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSON( "a" << -1 ), &hintElt ); - QueryPlan qp( nsd(), 1, s.fbs(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSON( "a" << -1 ) ); + auto_ptr< FieldRangeSet > frsOrig( new FieldRangeSet( *frs ) ); + QueryPlanSet s( ns(), frs, frsOrig, fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSON( "a" << -1 ), &hintElt ); + QueryPlan qp( nsd(), 1, s.fbs(), s.originalFrs(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSON( "a" << -1 ) ); boost::shared_ptr c = qp.newCursor(); double expected[] = { 9, 6, 3, 2 }; for( int i = 0; i < 4; ++i, c->advance() ) { ASSERT_EQUALS( expected[ i ], c->current().getField( "a" ).number() ); } - ASSERT( !c->ok() ); + ASSERT( !c->ok() ); } } }; - + class EqualityThenIn : public Base { public: void run() { @@ -1494,8 +1559,8 @@ namespace QueryOptimizerTests { } BSONObj hint = fromjson( "{$hint:{a:1,b:1}}" ); BSONElement hintElt = hint.firstElement(); - auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ) ) ); - QueryPlan qp( nsd(), 1, *frs, fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ), BSONObj() ); + auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ) ) ); + QueryPlan qp( nsd(), 1, *frs, *frs, fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ), BSONObj() ); boost::shared_ptr c = qp.newCursor(); double expected[] = { 2, 3, 6, 9 }; ASSERT( c->ok() ); @@ -1506,7 +1571,7 @@ namespace QueryOptimizerTests { ASSERT( !c->ok() ); } }; - + class NotEqualityThenIn : public Base { public: void run() { @@ -1518,7 +1583,7 @@ namespace QueryOptimizerTests { BSONObj hint = fromjson( "{$hint:{a:1,b:1}}" ); BSONElement hintElt = hint.firstElement(); auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns(), fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ) ) ); - QueryPlan qp( 
nsd(), 1, *frs, fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ), BSONObj() ); + QueryPlan qp( nsd(), 1, *frs, *frs, fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ), BSONObj() ); boost::shared_ptr c = qp.newCursor(); int matches[] = { 2, 3, 6, 9 }; for( int i = 0; i < 4; ++i, c->advance() ) { @@ -1529,7 +1594,7 @@ namespace QueryOptimizerTests { }; } // namespace QueryPlanSetTests - + class Base { public: Base() : _ctx( ns() ) { @@ -1549,7 +1614,7 @@ namespace QueryOptimizerTests { dblock lk_; Client::Context _ctx; }; - + class BestGuess : public Base { public: void run() { @@ -1559,7 +1624,7 @@ namespace QueryOptimizerTests { theDataFileMgr.insertWithObjMod( ns(), temp ); temp = BSON( "b" << 1 ); theDataFileMgr.insertWithObjMod( ns(), temp ); - + boost::shared_ptr< Cursor > c = bestGuessCursor( ns(), BSON( "b" << 1 ), BSON( "a" << 1 ) ); ASSERT_EQUALS( string( "a" ), c->indexKeyPattern().firstElement().fieldName() ); c = bestGuessCursor( ns(), BSON( "a" << 1 ), BSON( "b" << 1 ) ); @@ -1568,22 +1633,22 @@ namespace QueryOptimizerTests { ASSERT_EQUALS( string( "a" ), m->sub_c()->indexKeyPattern().firstElement().fieldName() ); m = dynamic_pointer_cast< MultiCursor >( bestGuessCursor( ns(), fromjson( "{a:1,$or:[{y:1}]}" ), BSON( "b" << 1 ) ) ); ASSERT_EQUALS( string( "b" ), m->sub_c()->indexKeyPattern().firstElement().fieldName() ); - + FieldRangeSet frs( "ns", BSON( "a" << 1 ) ); { scoped_lock lk(NamespaceDetailsTransient::_qcMutex); - NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( frs.pattern( BSON( "b" << 1 ) ), BSON( "a" << 1 ), 0 ); + NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( frs.pattern( BSON( "b" << 1 ) ), BSON( "a" << 1 ), 0 ); } m = dynamic_pointer_cast< MultiCursor >( bestGuessCursor( ns(), fromjson( "{a:1,$or:[{y:1}]}" ), BSON( "b" << 1 ) ) ); ASSERT_EQUALS( string( "b" ), m->sub_c()->indexKeyPattern().firstElement().fieldName() ); } }; - + class All : public Suite { public: - All() : Suite( "queryoptimizer" ){} - - void setupTests(){ + All() : Suite( "queryoptimizer" ) {} + + void setupTests() { add< FieldRangeTests::Empty >(); add< FieldRangeTests::Eq >(); add< FieldRangeTests::DupEq >(); @@ -1606,6 +1671,7 @@ namespace QueryOptimizerTests { add< FieldRangeTests::Numeric >(); add< FieldRangeTests::InLowerBound >(); add< FieldRangeTests::InUpperBound >(); + add< FieldRangeTests::UnionBound >(); add< FieldRangeTests::MultiBound >(); add< FieldRangeTests::Diff1 >(); add< FieldRangeTests::Diff2 >(); @@ -1670,6 +1736,7 @@ namespace QueryOptimizerTests { add< FieldRangeTests::Diff61 >(); add< FieldRangeTests::Diff62 >(); add< FieldRangeTests::Diff63 >(); + add< FieldRangeTests::Diff64 >(); add< FieldRangeTests::DiffMulti1 >(); add< FieldRangeTests::DiffMulti2 >(); add< FieldRangeTests::SetIntersect >(); @@ -1713,6 +1780,6 @@ namespace QueryOptimizerTests { add< BestGuess >(); } } myall; - + } // namespace QueryOptimizerTests diff --git a/dbtests/querytests.cpp b/dbtests/querytests.cpp index 31e1879..d008e4d 100644 --- a/dbtests/querytests.cpp +++ b/dbtests/querytests.cpp @@ -25,6 +25,8 @@ #include "../db/json.h" #include "../db/lasterror.h" +#include "../util/timer.h" + #include "dbtests.h" namespace mongo { @@ -37,7 +39,7 @@ namespace QueryTests { dblock lk; Client::Context _context; public: - Base() : _context( ns() ){ + Base() : _context( ns() ) { addIndex( fromjson( "{\"a\":1}" ) ); } ~Base() { @@ -48,7 +50,8 @@ namespace QueryTests { toDelete.push_back( c->currLoc() ); for( vector< DiskLoc >::iterator i = toDelete.begin(); i != 
toDelete.end(); ++i ) theDataFileMgr.deleteRecord( ns(), i->rec(), *i, false ); - } catch ( ... ) { + } + catch ( ... ) { FAIL( "Exception while cleaning up records" ); } } @@ -129,7 +132,7 @@ namespace QueryTests { ASSERT_EQUALS( 1, runCount( ns(), cmd, err ) ); } }; - + class FindOne : public Base { public: void run() { @@ -145,12 +148,11 @@ namespace QueryTests { class ClientBase { public: - // NOTE: Not bothering to backup the old error record. ClientBase() { mongo::lastError.reset( new LastError() ); } ~ClientBase() { - mongo::lastError.release(); + //mongo::lastError.release(); } protected: static void insert( const char *ns, BSONObj o ) { @@ -170,6 +172,9 @@ namespace QueryTests { class BoundedKey : public ClientBase { public: + ~BoundedKey() { + client().dropCollection( "unittests.querytests.BoundedKey" ); + } void run() { const char *ns = "unittests.querytests.BoundedKey"; insert( ns, BSON( "a" << 1 ) ); @@ -210,7 +215,7 @@ namespace QueryTests { client().dropCollection( ns ); } - void testLimit(int limit){ + void testLimit(int limit) { ASSERT_EQUALS(client().query( ns, BSONObj(), limit )->itcount(), limit); } void run() { @@ -285,7 +290,7 @@ namespace QueryTests { insert( ns, BSON( "a" << 0 ) ); c = client().query( ns, QUERY( "a" << 1 ).hint( BSON( "$natural" << 1 ) ), 2, 0, 0, QueryOption_CursorTailable ); ASSERT( 0 != c->getCursorId() ); - ASSERT( !c->isDead() ); + ASSERT( !c->isDead() ); } }; @@ -345,7 +350,7 @@ namespace QueryTests { ASSERT( !client().getLastError().empty() ); } }; - + class TailableQueryOnId : public ClientBase { public: ~TailableQueryOnId() { @@ -511,7 +516,13 @@ namespace QueryTests { static const char *ns() { return "unittests.querytests.AutoResetIndexCache"; } static const char *idxNs() { return "unittests.system.indexes"; } void index() const { ASSERT( !client().findOne( idxNs(), BSON( "name" << NE << "_id_" ) ).isEmpty() ); } - void noIndex() const { ASSERT( client().findOne( idxNs(), BSON( "name" << NE << "_id_" ) ).isEmpty() ); } + void noIndex() const { + BSONObj o = client().findOne( idxNs(), BSON( "name" << NE << "_id_" ) ); + if( !o.isEmpty() ) { + cout << o.toString() << endl; + ASSERT( false ); + } + } void checkIndex() { client().ensureIndex( ns(), BSON( "a" << 1 ) ); index(); @@ -598,8 +609,8 @@ namespace QueryTests { client().insert( ns, fromjson( "{a:[1,2,3]}" ) ); ASSERT( client().query( ns, Query( "{a:[1,2,3]}" ) )->more() ); client().ensureIndex( ns, BSON( "a" << 1 ) ); - ASSERT( client().query( ns, Query( "{a:{$in:[1,[1,2,3]]}}" ).hint( BSON( "a" << 1 ) ) )->more() ); - ASSERT( client().query( ns, Query( "{a:[1,2,3]}" ).hint( BSON( "a" << 1 ) ) )->more() ); // SERVER-146 + ASSERT( client().query( ns, Query( "{a:{$in:[1,[1,2,3]]}}" ).hint( BSON( "a" << 1 ) ) )->more() ); + ASSERT( client().query( ns, Query( "{a:[1,2,3]}" ).hint( BSON( "a" << 1 ) ) )->more() ); // SERVER-146 } }; @@ -613,7 +624,7 @@ namespace QueryTests { client().insert( ns, fromjson( "{a:[[1],2]}" ) ); check( "$natural" ); client().ensureIndex( ns, BSON( "a" << 1 ) ); - check( "a" ); // SERVER-146 + check( "a" ); // SERVER-146 } private: void check( const string &hintField ) { @@ -756,12 +767,12 @@ namespace QueryTests { class DifferentNumbers : public ClientBase { public: - ~DifferentNumbers(){ + ~DifferentNumbers() { client().dropCollection( "unittests.querytests.DifferentNumbers" ); } - void t( const char * ns ){ + void t( const char * ns ) { auto_ptr< DBClientCursor > cursor = client().query( ns, Query().sort( "7" ) ); - while ( cursor->more() ){ + while ( 
cursor->more() ) { BSONObj o = cursor->next(); assert( o.valid() ); //cout << " foo " << o << endl; @@ -782,37 +793,37 @@ namespace QueryTests { t(ns); } }; - + class CollectionBase : public ClientBase { public: - - CollectionBase( string leaf ){ + + CollectionBase( string leaf ) { _ns = "unittests.querytests."; _ns += leaf; client().dropCollection( ns() ); } - - virtual ~CollectionBase(){ + + virtual ~CollectionBase() { client().dropCollection( ns() ); } - - int count(){ + + int count() { return (int) client().count( ns() ); } - const char * ns(){ + const char * ns() { return _ns.c_str(); } - + private: string _ns; }; class SymbolStringSame : public CollectionBase { public: - SymbolStringSame() : CollectionBase( "symbolstringsame" ){} + SymbolStringSame() : CollectionBase( "symbolstringsame" ) {} - void run(){ + void run() { { BSONObjBuilder b; b.appendSymbol( "x" , "eliot" ); b.append( "z" , 17 ); client().insert( ns() , b.obj() ); } ASSERT_EQUALS( 17 , client().findOne( ns() , BSONObj() )["z"].number() ); { @@ -828,46 +839,46 @@ namespace QueryTests { class TailableCappedRaceCondition : public CollectionBase { public: - - TailableCappedRaceCondition() : CollectionBase( "tailablecappedrace" ){ + + TailableCappedRaceCondition() : CollectionBase( "tailablecappedrace" ) { client().dropCollection( ns() ); _n = 0; } - void run(){ + void run() { string err; - writelock lk(""); + writelock lk(""); Client::Context ctx( "unittests" ); ASSERT( userCreateNS( ns() , fromjson( "{ capped : true , size : 2000 }" ) , err , false ) ); - for ( int i=0; i<100; i++ ){ + for ( int i=0; i<100; i++ ) { insertNext(); ASSERT( count() < 45 ); } - + int a = count(); - + auto_ptr< DBClientCursor > c = client().query( ns() , QUERY( "i" << GT << 0 ).hint( BSON( "$natural" << 1 ) ), 0, 0, 0, QueryOption_CursorTailable ); int n=0; - while ( c->more() ){ + while ( c->more() ) { BSONObj z = c->next(); n++; } - + ASSERT_EQUALS( a , n ); insertNext(); ASSERT( c->more() ); - for ( int i=0; i<50; i++ ){ + for ( int i=0; i<50; i++ ) { insertNext(); } - while ( c->more() ){ c->next(); } + while ( c->more() ) { c->next(); } ASSERT( c->isDead() ); } - - void insertNext(){ + + void insertNext() { insert( ns() , BSON( "i" << _n++ ) ); } @@ -876,89 +887,71 @@ namespace QueryTests { class HelperTest : public CollectionBase { public: - - HelperTest() : CollectionBase( "helpertest" ){ + + HelperTest() : CollectionBase( "helpertest" ) { } - void run(){ + void run() { writelock lk(""); Client::Context ctx( "unittests" ); - - for ( int i=0; i<50; i++ ){ + + for ( int i=0; i<50; i++ ) { insert( ns() , BSON( "_id" << i << "x" << i * 2 ) ); } ASSERT_EQUALS( 50 , count() ); - + BSONObj res; ASSERT( Helpers::findOne( ns() , BSON( "_id" << 20 ) , res , true ) ); ASSERT_EQUALS( 40 , res["x"].numberInt() ); - + ASSERT( Helpers::findById( cc(), ns() , BSON( "_id" << 20 ) , res ) ); ASSERT_EQUALS( 40 , res["x"].numberInt() ); ASSERT( ! 
Helpers::findById( cc(), ns() , BSON( "_id" << 200 ) , res ) ); unsigned long long slow , fast; - + int n = 10000; { Timer t; - for ( int i=0; i i = Helpers::find( ns() ); - int n = 0; - while ( i->hasNext() ){ - BSONObj o = i->next(); - n++; - } - ASSERT_EQUALS( 50 , n ); - i = Helpers::find( ns() , BSON( "_id" << 20 ) ); - n = 0; - while ( i->hasNext() ){ - BSONObj o = i->next(); - n++; - } - ASSERT_EQUALS( 1 , n ); - } - } }; class HelperByIdTest : public CollectionBase { public: - - HelperByIdTest() : CollectionBase( "helpertestbyid" ){ + + HelperByIdTest() : CollectionBase( "helpertestbyid" ) { } - void run(){ + void run() { writelock lk(""); Client::Context ctx( "unittests" ); - for ( int i=0; i<1000; i++ ){ + for ( int i=0; i<1000; i++ ) { insert( ns() , BSON( "_id" << i << "x" << i * 2 ) ); } - for ( int i=0; i<1000; i+=2 ){ + for ( int i=0; i<1000; i+=2 ) { client_.remove( ns() , BSON( "_id" << i ) ); } - BSONObj res; - for ( int i=0; i<1000; i++ ){ + BSONObj res; + for ( int i=0; i<1000; i++ ) { bool found = Helpers::findById( cc(), ns() , BSON( "_id" << i ) , res ); ASSERT_EQUALS( i % 2 , int(found) ); } @@ -966,19 +959,19 @@ namespace QueryTests { } }; - class ClientCursorTest : public CollectionBase{ - ClientCursorTest() : CollectionBase( "clientcursortest" ){ + class ClientCursorTest : public CollectionBase { + ClientCursorTest() : CollectionBase( "clientcursortest" ) { } - void run(){ + void run() { writelock lk(""); Client::Context ctx( "unittests" ); - - for ( int i=0; i<1000; i++ ){ + + for ( int i=0; i<1000; i++ ) { insert( ns() , BSON( "_id" << i << "x" << i * 2 ) ); } - + } }; @@ -990,19 +983,19 @@ namespace QueryTests { ~FindingStart() { __findingStartInitialTimeout = _old; } - + void run() { BSONObj info; ASSERT( client().runCommand( "unittests", BSON( "create" << "querytests.findingstart" << "capped" << true << "size" << 1000 << "$nExtents" << 5 << "autoIndexId" << false ), info ) ); - + int i = 0; for( int oldCount = -1; - count() != oldCount; - oldCount = count(), client().insert( ns(), BSON( "ts" << i++ ) ) ); + count() != oldCount; + oldCount = count(), client().insert( ns(), BSON( "ts" << i++ ) ) ); for( int k = 0; k < 5; ++k ) { client().insert( ns(), BSON( "ts" << i++ ) ); - int min = client().query( ns(), Query().sort( BSON( "$natural" << 1 ) ) )->next()[ "ts" ].numberInt(); + int min = client().query( ns(), Query().sort( BSON( "$natural" << 1 ) ) )->next()[ "ts" ].numberInt(); for( int j = -1; j < i; ++j ) { auto_ptr< DBClientCursor > c = client().query( ns(), QUERY( "ts" << GTE << j ), 0, 0, 0, QueryOption_OplogReplay ); ASSERT( c->more() ); @@ -1012,7 +1005,7 @@ namespace QueryTests { } } } - + private: int _old; }; @@ -1025,17 +1018,19 @@ namespace QueryTests { ~FindingStartPartiallyFull() { __findingStartInitialTimeout = _old; } - + void run() { + unsigned startNumCursors = ClientCursor::numCursors(); + BSONObj info; ASSERT( client().runCommand( "unittests", BSON( "create" << "querytests.findingstart" << "capped" << true << "size" << 10000 << "$nExtents" << 5 << "autoIndexId" << false ), info ) ); - + int i = 0; for( ; i < 150; client().insert( ns(), BSON( "ts" << i++ ) ) ); - + for( int k = 0; k < 5; ++k ) { client().insert( ns(), BSON( "ts" << i++ ) ); - int min = client().query( ns(), Query().sort( BSON( "$natural" << 1 ) ) )->next()[ "ts" ].numberInt(); + int min = client().query( ns(), Query().sort( BSON( "$natural" << 1 ) ) )->next()[ "ts" ].numberInt(); for( int j = -1; j < i; ++j ) { auto_ptr< DBClientCursor > c = client().query( ns(), QUERY( 
"ts" << GTE << j ), 0, 0, 0, QueryOption_OplogReplay ); ASSERT( c->more() ); @@ -1044,13 +1039,15 @@ namespace QueryTests { ASSERT_EQUALS( ( j > min ? j : min ), next[ "ts" ].numberInt() ); } } + + ASSERT_EQUALS( startNumCursors, ClientCursor::numCursors() ); } - + private: int _old; }; - - + + class WhatsMyUri : public CollectionBase { public: WhatsMyUri() : CollectionBase( "whatsmyuri" ) {} @@ -1060,15 +1057,15 @@ namespace QueryTests { ASSERT_EQUALS( unknownAddress.toString(), result[ "you" ].str() ); } }; - + namespace parsedtests { class basic1 { public: - void _test( const BSONObj& in ){ + void _test( const BSONObj& in ) { ParsedQuery q( "a.b" , 5 , 6 , 9 , in , BSONObj() ); ASSERT_EQUALS( BSON( "x" << 5 ) , q.getFilter() ); } - void run(){ + void run() { _test( BSON( "x" << 5 ) ); _test( BSON( "query" << BSON( "x" << 5 ) ) ); _test( BSON( "$query" << BSON( "x" << 5 ) ) ); @@ -1090,23 +1087,23 @@ namespace QueryTests { namespace queryobjecttests { class names1 { public: - void run(){ + void run() { ASSERT_EQUALS( BSON( "x" << 1 ) , QUERY( "query" << BSON( "x" << 1 ) ).getFilter() ); ASSERT_EQUALS( BSON( "x" << 1 ) , QUERY( "$query" << BSON( "x" << 1 ) ).getFilter() ); } - + }; } class OrderingTest { public: - void run(){ + void run() { { Ordering o = Ordering::make( BSON( "a" << 1 << "b" << -1 << "c" << 1 ) ); ASSERT_EQUALS( 1 , o.get(0) ); ASSERT_EQUALS( -1 , o.get(1) ); ASSERT_EQUALS( 1 , o.get(2) ); - + ASSERT( ! o.descending( 1 ) ); ASSERT( o.descending( 1 << 1 ) ); ASSERT( ! o.descending( 1 << 2 ) ); @@ -1117,7 +1114,7 @@ namespace QueryTests { ASSERT_EQUALS( 1 , o.get(0) ); ASSERT_EQUALS( 1 , o.get(1) ); ASSERT_EQUALS( -1 , o.get(2) ); - + ASSERT( ! o.descending( 1 ) ); ASSERT( ! o.descending( 1 << 1 ) ); ASSERT( o.descending( 1 << 2 ) ); @@ -1126,12 +1123,100 @@ namespace QueryTests { } }; + namespace proj { // Projection tests + + class T1 { + public: + void run() { + + Projection m; + m.init( BSON( "a" << 1 ) ); + ASSERT_EQUALS( BSON( "a" << 5 ) , m.transform( BSON( "x" << 1 << "a" << 5 ) ) ); + } + }; + + class K1 { + public: + void run() { + + Projection m; + m.init( BSON( "a" << 1 ) ); + + scoped_ptr x( m.checkKey( BSON( "a" << 1 ) ) ); + ASSERT( ! x ); + + x.reset( m.checkKey( BSON( "a" << 1 << "_id" << 1 ) ) ); + ASSERT( x ); + + ASSERT_EQUALS( BSON( "a" << 5 << "_id" << 17 ) , + x->hydrate( BSON( "" << 5 << "" << 17 ) ) ); + + x.reset( m.checkKey( BSON( "a" << 1 << "x" << 1 << "_id" << 1 ) ) ); + ASSERT( x ); + + ASSERT_EQUALS( BSON( "a" << 5 << "_id" << 17 ) , + x->hydrate( BSON( "" << 5 << "" << 123 << "" << 17 ) ) ); + + } + }; + + class K2 { + public: + void run() { + + Projection m; + m.init( BSON( "a" << 1 << "_id" << 0 ) ); + + scoped_ptr x( m.checkKey( BSON( "a" << 1 ) ) ); + ASSERT( x ); + + ASSERT_EQUALS( BSON( "a" << 17 ) , + x->hydrate( BSON( "" << 17 ) ) ); + + x.reset( m.checkKey( BSON( "x" << 1 << "a" << 1 << "_id" << 1 ) ) ); + ASSERT( x ); + + ASSERT_EQUALS( BSON( "a" << 123 ) , + x->hydrate( BSON( "" << 5 << "" << 123 << "" << 17 ) ) ); + + } + }; + + + class K3 { + public: + void run() { + + { + Projection m; + m.init( BSON( "a" << 1 << "_id" << 0 ) ); + + scoped_ptr x( m.checkKey( BSON( "a" << 1 << "x.a" << 1 ) ) ); + ASSERT( x ); + } + + + { + // TODO: this is temporary SERVER-2104 + Projection m; + m.init( BSON( "x.a" << 1 << "_id" << 0 ) ); + + scoped_ptr x( m.checkKey( BSON( "a" << 1 << "x.a" << 1 ) ) ); + ASSERT( ! 
x ); + } + + } + }; + + + } + class All : public Suite { public: All() : Suite( "query" ) { } - void setupTests(){ + void setupTests() { add< CountBasic >(); add< CountQuery >(); add< CountFields >(); @@ -1176,14 +1261,19 @@ namespace QueryTests { add< FindingStart >(); add< FindingStartPartiallyFull >(); add< WhatsMyUri >(); - + add< parsedtests::basic1 >(); - + add< queryobjecttests::names1 >(); add< OrderingTest >(); + + add< proj::T1 >(); + add< proj::K1 >(); + add< proj::K2 >(); + add< proj::K3 >(); } } myall; - + } // namespace QueryTests diff --git a/dbtests/repltests.cpp b/dbtests/repltests.cpp index a190dc8..c6ffba2 100644 --- a/dbtests/repltests.cpp +++ b/dbtests/repltests.cpp @@ -34,13 +34,13 @@ namespace ReplTests { BSONObj f( const char *s ) { return fromjson( s ); - } - + } + class Base { dblock lk; Client::Context _context; public: - Base() : _context( ns() ){ + Base() : _context( ns() ) { replSettings.master = true; createOplog(); ensureHaveIdIndex( ns() ); @@ -50,7 +50,8 @@ namespace ReplTests { replSettings.master = false; deleteAll( ns() ); deleteAll( cllNS() ); - } catch ( ... ) { + } + catch ( ... ) { FAIL( "Exception while cleaning up test" ); } } @@ -63,7 +64,7 @@ namespace ReplTests { } DBDirectClient *client() const { return &client_; } BSONObj one( const BSONObj &query = BSONObj() ) const { - return client()->findOne( ns(), query ); + return client()->findOne( ns(), query ); } void checkOne( const BSONObj &o ) const { check( o, one( o ) ); @@ -78,11 +79,11 @@ namespace ReplTests { void check( const BSONObj &expected, const BSONObj &got ) const { if ( expected.woCompare( got ) ) { out() << "expected: " << expected.toString() - << ", got: " << got.toString() << endl; + << ", got: " << got.toString() << endl; } ASSERT_EQUALS( expected , got ); } - BSONObj oneOp() const { + BSONObj oneOp() const { return client()->findOne( cllNS(), BSONObj() ); } int count() const { @@ -131,7 +132,7 @@ namespace ReplTests { out() << "all for " << ns << endl; for(; c->ok(); c->advance() ) { out() << c->current().toString() << endl; - } + } } // These deletes don't get logged. 
static void deleteAll( const char *ns ) { @@ -143,7 +144,7 @@ namespace ReplTests { toDelete.push_back( c->currLoc() ); } for( vector< DiskLoc >::iterator i = toDelete.begin(); i != toDelete.end(); ++i ) { - theDataFileMgr.deleteRecord( ns, i->rec(), *i, true ); + theDataFileMgr.deleteRecord( ns, i->rec(), *i, true ); } } static void insert( const BSONObj &o, bool god = false ) { @@ -163,7 +164,7 @@ namespace ReplTests { static DBDirectClient client_; }; DBDirectClient Base::client_; - + class LogBasic : public Base { public: void run() { @@ -172,9 +173,9 @@ namespace ReplTests { ASSERT_EQUALS( 2, opCount() ); } }; - + namespace Idempotence { - + class Base : public ReplTests::Base { public: virtual ~Base() {} @@ -186,7 +187,7 @@ namespace ReplTests { applyAllOperations(); check(); ASSERT_EQUALS( nOps, opCount() ); - + reset(); applyAllOperations(); check(); @@ -200,7 +201,7 @@ namespace ReplTests { virtual void check() const = 0; virtual void reset() const = 0; }; - + class InsertTimestamp : public Base { public: void doIt() const { @@ -221,7 +222,7 @@ namespace ReplTests { private: mutable Date_t date_; }; - + class InsertAutoId : public Base { public: InsertAutoId() : o_( fromjson( "{\"a\":\"b\"}" ) ) {} @@ -248,12 +249,12 @@ namespace ReplTests { checkOne( o_ ); } }; - + class InsertTwo : public Base { public: - InsertTwo() : - o_( fromjson( "{'_id':1,a:'b'}" ) ), - t_( fromjson( "{'_id':2,c:'d'}" ) ) {} + InsertTwo() : + o_( fromjson( "{'_id':1,a:'b'}" ) ), + t_( fromjson( "{'_id':2,c:'d'}" ) ) {} void doIt() const { vector< BSONObj > v; v.push_back( o_ ); @@ -287,7 +288,7 @@ namespace ReplTests { deleteAll( ns() ); } private: - BSONObj o_; + BSONObj o_; }; class UpdateTimestamp : public Base { @@ -311,14 +312,14 @@ namespace ReplTests { private: mutable Date_t date_; }; - + class UpdateSameField : public Base { public: UpdateSameField() : - q_( fromjson( "{a:'b'}" ) ), - o1_( wid( "{a:'b'}" ) ), - o2_( wid( "{a:'b'}" ) ), - u_( fromjson( "{a:'c'}" ) ){} + q_( fromjson( "{a:'b'}" ) ), + o1_( wid( "{a:'b'}" ) ), + o2_( wid( "{a:'b'}" ) ), + u_( fromjson( "{a:'c'}" ) ) {} void doIt() const { client()->update( ns(), q_, u_ ); } @@ -334,14 +335,14 @@ namespace ReplTests { } private: BSONObj q_, o1_, o2_, u_; - }; - + }; + class UpdateSameFieldWithId : public Base { public: UpdateSameFieldWithId() : - o_( fromjson( "{'_id':1,a:'b'}" ) ), - q_( fromjson( "{a:'b'}" ) ), - u_( fromjson( "{'_id':1,a:'c'}" ) ){} + o_( fromjson( "{'_id':1,a:'b'}" ) ), + q_( fromjson( "{a:'b'}" ) ), + u_( fromjson( "{'_id':1,a:'c'}" ) ) {} void doIt() const { client()->update( ns(), q_, u_ ); } @@ -356,14 +357,14 @@ namespace ReplTests { insert( fromjson( "{'_id':2,a:'b'}" ) ); } private: - BSONObj o_, q_, u_; - }; + BSONObj o_, q_, u_; + }; class UpdateSameFieldExplicitId : public Base { public: UpdateSameFieldExplicitId() : - o_( fromjson( "{'_id':1,a:'b'}" ) ), - u_( fromjson( "{'_id':1,a:'c'}" ) ){} + o_( fromjson( "{'_id':1,a:'b'}" ) ), + u_( fromjson( "{'_id':1,a:'c'}" ) ) {} void doIt() const { client()->update( ns(), o_, u_ ); } @@ -376,46 +377,15 @@ namespace ReplTests { insert( o_ ); } protected: - BSONObj o_, u_; - }; - - class UpdateId : public UpdateSameFieldExplicitId { - public: - UpdateId() { - o_ = fromjson( "{'_id':1}" ); - u_ = fromjson( "{'_id':2}" ); - } - }; - - class UpdateId2 : public ReplTests::Base { - public: - UpdateId2() : - o_( fromjson( "{'_id':1}" ) ), - u_( fromjson( "{'_id':2}" ) ){} - void run() { - deleteAll( ns() ); - insert( o_ ); - client()->update( ns(), o_, u_ ); - 
ASSERT_EQUALS( 1, count() ); - checkOne( u_ ); - - deleteAll( ns() ); - insert( o_ ); - insert( u_ ); // simulate non snapshot replication, then op application - applyAllOperations(); - ASSERT_EQUALS( 1, count() ); - checkOne( u_ ); - } - protected: - BSONObj o_, u_; + BSONObj o_, u_; }; class UpdateDifferentFieldExplicitId : public Base { public: UpdateDifferentFieldExplicitId() : - o_( fromjson( "{'_id':1,a:'b'}" ) ), - q_( fromjson( "{'_id':1}" ) ), - u_( fromjson( "{'_id':1,a:'c'}" ) ){} + o_( fromjson( "{'_id':1,a:'b'}" ) ), + q_( fromjson( "{'_id':1}" ) ), + u_( fromjson( "{'_id':1,a:'c'}" ) ) {} void doIt() const { client()->update( ns(), q_, u_ ); } @@ -428,28 +398,28 @@ namespace ReplTests { insert( o_ ); } protected: - BSONObj o_, q_, u_; - }; - + BSONObj o_, q_, u_; + }; + class UpsertUpdateNoMods : public UpdateDifferentFieldExplicitId { void doIt() const { client()->update( ns(), q_, u_, true ); } }; - + class UpsertInsertNoMods : public InsertAutoId { void doIt() const { client()->update( ns(), fromjson( "{a:'c'}" ), o_, true ); } }; - + class UpdateSet : public Base { public: UpdateSet() : - o_( fromjson( "{'_id':1,a:5}" ) ), - q_( fromjson( "{a:5}" ) ), - u_( fromjson( "{$set:{a:7}}" ) ), - ou_( fromjson( "{'_id':1,a:7}" ) ) {} + o_( fromjson( "{'_id':1,a:5}" ) ), + q_( fromjson( "{a:5}" ) ), + u_( fromjson( "{$set:{a:7}}" ) ), + ou_( fromjson( "{'_id':1,a:7}" ) ) {} void doIt() const { client()->update( ns(), q_, u_ ); } @@ -462,16 +432,16 @@ namespace ReplTests { insert( o_ ); } protected: - BSONObj o_, q_, u_, ou_; + BSONObj o_, q_, u_, ou_; }; - + class UpdateInc : public Base { public: UpdateInc() : - o_( fromjson( "{'_id':1,a:5}" ) ), - q_( fromjson( "{a:5}" ) ), - u_( fromjson( "{$inc:{a:3}}" ) ), - ou_( fromjson( "{'_id':1,a:8}" ) ) {} + o_( fromjson( "{'_id':1,a:5}" ) ), + q_( fromjson( "{a:5}" ) ), + u_( fromjson( "{$inc:{a:3}}" ) ), + ou_( fromjson( "{'_id':1,a:8}" ) ) {} void doIt() const { client()->update( ns(), q_, u_ ); } @@ -484,16 +454,16 @@ namespace ReplTests { insert( o_ ); } protected: - BSONObj o_, q_, u_, ou_; + BSONObj o_, q_, u_, ou_; }; class UpdateInc2 : public Base { public: UpdateInc2() : - o_( fromjson( "{'_id':1,a:5}" ) ), - q_( fromjson( "{a:5}" ) ), - u_( fromjson( "{$inc:{a:3},$set:{x:5}}" ) ), - ou_( fromjson( "{'_id':1,a:8,x:5}" ) ) {} + o_( fromjson( "{'_id':1,a:5}" ) ), + q_( fromjson( "{a:5}" ) ), + u_( fromjson( "{$inc:{a:3},$set:{x:5}}" ) ), + ou_( fromjson( "{'_id':1,a:8,x:5}" ) ) {} void doIt() const { client()->update( ns(), q_, u_ ); } @@ -506,16 +476,16 @@ namespace ReplTests { insert( o_ ); } protected: - BSONObj o_, q_, u_, ou_; + BSONObj o_, q_, u_, ou_; }; - + class IncEmbedded : public Base { public: IncEmbedded() : - o_( fromjson( "{'_id':1,a:{b:3},b:{b:1}}" ) ), - q_( fromjson( "{'_id':1}" ) ), - u_( fromjson( "{$inc:{'a.b':1,'b.b':1}}" ) ), - ou_( fromjson( "{'_id':1,a:{b:4},b:{b:2}}" ) ) + o_( fromjson( "{'_id':1,a:{b:3},b:{b:1}}" ) ), + q_( fromjson( "{'_id':1}" ) ), + u_( fromjson( "{$inc:{'a.b':1,'b.b':1}}" ) ), + ou_( fromjson( "{'_id':1,a:{b:4},b:{b:2}}" ) ) {} void doIt() const { client()->update( ns(), q_, u_ ); @@ -529,16 +499,16 @@ namespace ReplTests { insert( o_ ); } protected: - BSONObj o_, q_, u_, ou_; + BSONObj o_, q_, u_, ou_; }; class IncCreates : public Base { public: IncCreates() : - o_( fromjson( "{'_id':1}" ) ), - q_( fromjson( "{'_id':1}" ) ), - u_( fromjson( "{$inc:{'a':1}}" ) ), - ou_( fromjson( "{'_id':1,a:1}") ) + o_( fromjson( "{'_id':1}" ) ), + q_( fromjson( "{'_id':1}" ) ), + u_( fromjson( 
"{$inc:{'a':1}}" ) ), + ou_( fromjson( "{'_id':1,a:1}") ) {} void doIt() const { client()->update( ns(), q_, u_ ); @@ -552,16 +522,16 @@ namespace ReplTests { insert( o_ ); } protected: - BSONObj o_, q_, u_, ou_; + BSONObj o_, q_, u_, ou_; }; class UpsertInsertIdMod : public Base { public: UpsertInsertIdMod() : - q_( fromjson( "{'_id':5,a:4}" ) ), - u_( fromjson( "{$inc:{a:3}}" ) ), - ou_( fromjson( "{'_id':5,a:7}" ) ) {} + q_( fromjson( "{'_id':5,a:4}" ) ), + u_( fromjson( "{$inc:{a:3}}" ) ), + ou_( fromjson( "{'_id':5,a:7}" ) ) {} void doIt() const { client()->update( ns(), q_, u_, true ); } @@ -573,15 +543,15 @@ namespace ReplTests { deleteAll( ns() ); } protected: - BSONObj q_, u_, ou_; + BSONObj q_, u_, ou_; }; - + class UpsertInsertSet : public Base { public: UpsertInsertSet() : - q_( fromjson( "{a:5}" ) ), - u_( fromjson( "{$set:{a:7}}" ) ), - ou_( fromjson( "{a:7}" ) ) {} + q_( fromjson( "{a:5}" ) ), + u_( fromjson( "{$set:{a:7}}" ) ), + ou_( fromjson( "{a:7}" ) ) {} void doIt() const { client()->update( ns(), q_, u_, true ); } @@ -594,15 +564,15 @@ namespace ReplTests { insert( fromjson( "{'_id':7,a:7}" ) ); } protected: - BSONObj o_, q_, u_, ou_; + BSONObj o_, q_, u_, ou_; }; - + class UpsertInsertInc : public Base { public: UpsertInsertInc() : - q_( fromjson( "{a:5}" ) ), - u_( fromjson( "{$inc:{a:3}}" ) ), - ou_( fromjson( "{a:8}" ) ) {} + q_( fromjson( "{a:5}" ) ), + u_( fromjson( "{$inc:{a:3}}" ) ), + ou_( fromjson( "{a:8}" ) ) {} void doIt() const { client()->update( ns(), q_, u_, true ); } @@ -614,38 +584,38 @@ namespace ReplTests { deleteAll( ns() ); } protected: - BSONObj o_, q_, u_, ou_; + BSONObj o_, q_, u_, ou_; }; - + class MultiInc : public Base { public: - + string s() const { stringstream ss; auto_ptr cc = client()->query( ns() , Query().sort( BSON( "_id" << 1 ) ) ); bool first = true; - while ( cc->more() ){ + while ( cc->more() ) { if ( first ) first = false; else ss << ","; - + BSONObj o = cc->next(); ss << o["x"].numberInt(); } return ss.str(); } - + void doIt() const { client()->insert( ns(), BSON( "_id" << 1 << "x" << 1 ) ); client()->insert( ns(), BSON( "_id" << 2 << "x" << 5 ) ); - + ASSERT_EQUALS( "1,5" , s() ); - + client()->update( ns() , BSON( "_id" << 1 ) , BSON( "$inc" << BSON( "x" << 1 ) ) ); ASSERT_EQUALS( "2,5" , s() ); - + client()->update( ns() , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) ); ASSERT_EQUALS( "3,5" , s() ); - + client()->update( ns() , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) , false , true ); check(); } @@ -653,18 +623,18 @@ namespace ReplTests { void check() const { ASSERT_EQUALS( "4,6" , s() ); } - + void reset() const { deleteAll( ns() ); } }; - + class UpdateWithoutPreexistingId : public Base { public: UpdateWithoutPreexistingId() : - o_( fromjson( "{a:5}" ) ), - u_( fromjson( "{a:5}" ) ), - ot_( fromjson( "{b:4}" ) ) {} + o_( fromjson( "{a:5}" ) ), + u_( fromjson( "{a:5}" ) ), + ot_( fromjson( "{b:4}" ) ) {} void doIt() const { client()->update( ns(), o_, u_ ); } @@ -679,15 +649,15 @@ namespace ReplTests { insert( o_, true ); } protected: - BSONObj o_, u_, ot_; - }; - + BSONObj o_, u_, ot_; + }; + class Remove : public Base { public: Remove() : - o1_( f( "{\"_id\":\"010101010101010101010101\",\"a\":\"b\"}" ) ), - o2_( f( "{\"_id\":\"010101010101010101010102\",\"a\":\"b\"}" ) ), - q_( f( "{\"a\":\"b\"}" ) ) {} + o1_( f( "{\"_id\":\"010101010101010101010101\",\"a\":\"b\"}" ) ), + o2_( f( "{\"_id\":\"010101010101010101010102\",\"a\":\"b\"}" ) ), + q_( f( "{\"a\":\"b\"}" ) ) {} void doIt() const { client()->remove( ns(), 
q_ ); } @@ -700,23 +670,23 @@ namespace ReplTests { insert( o2_ ); } protected: - BSONObj o1_, o2_, q_; + BSONObj o1_, o2_, q_; }; - + class RemoveOne : public Remove { void doIt() const { client()->remove( ns(), q_, true ); - } + } void check() const { ASSERT_EQUALS( 1, count() ); } }; - + class FailingUpdate : public Base { public: FailingUpdate() : - o_( fromjson( "{'_id':1,a:'b'}" ) ), - u_( fromjson( "{'_id':1,c:'d'}" ) ) {} + o_( fromjson( "{'_id':1,a:'b'}" ) ), + u_( fromjson( "{'_id':1,c:'d'}" ) ) {} void doIt() const { client()->update( ns(), o_, u_ ); client()->insert( ns(), o_ ); @@ -731,7 +701,7 @@ namespace ReplTests { protected: BSONObj o_, u_; }; - + class SetNumToStr : public Base { public: void doIt() const { @@ -746,7 +716,7 @@ namespace ReplTests { insert( BSON( "_id" << 0 << "a" << 4.0 ) ); } }; - + class Push : public Base { public: void doIt() const { @@ -760,9 +730,9 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4]}" ) ); - } + } }; - + class PushUpsert : public Base { public: void doIt() const { @@ -776,7 +746,7 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4]}" ) ); - } + } }; class MultiPush : public Base { @@ -792,7 +762,7 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4]}" ) ); - } + } }; class EmptyPush : public Base { @@ -808,13 +778,13 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0}" ) ); - } + } }; class PushAll : public Base { public: void doIt() const { - client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pushAll:{a:[5.0,6.0]}}" ) ); + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pushAll:{a:[5.0,6.0]}}" ) ); } using ReplTests::Base::check; void check() const { @@ -824,13 +794,13 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4]}" ) ); - } + } }; - + class PushAllUpsert : public Base { public: void doIt() const { - client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pushAll:{a:[5.0,6.0]}}" ), true ); + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pushAll:{a:[5.0,6.0]}}" ), true ); } using ReplTests::Base::check; void check() const { @@ -840,7 +810,7 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4]}" ) ); - } + } }; class EmptyPushAll : public Base { @@ -856,7 +826,7 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0}" ) ); - } + } }; class Pull : public Base { @@ -872,9 +842,9 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4,5]}" ) ); - } + } }; - + class PullNothing : public Base { public: void doIt() const { @@ -888,13 +858,13 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4,5]}" ) ); - } + } }; - + class PullAll : public Base { public: void doIt() const { - client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pullAll:{a:[4,5]}}" ) ); + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pullAll:{a:[4,5]}}" ) ); } using ReplTests::Base::check; void check() const { @@ -904,7 +874,7 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4,5,6]}" ) ); - } + } }; class Pop : public Base { @@ -920,7 +890,7 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4,5,6]}" ) ); - } + } }; class PopReverse : public 
Base { @@ -936,7 +906,7 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:[4,5,6]}" ) ); - } + } }; class BitOp : public Base { @@ -952,13 +922,78 @@ namespace ReplTests { void reset() const { deleteAll( ns() ); insert( fromjson( "{'_id':0,a:3}" ) ); - } + } + }; + + class Rename : public Base { + public: + void doIt() const { + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$rename:{a:'b'}}" ) ); + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$set:{a:50}}" ) ); + } + using ReplTests::Base::check; + void check() const { + ASSERT_EQUALS( 1, count() ); + check( BSON( "_id" << 0 << "a" << 50 << "b" << 3 ) , one( fromjson( "{'_id':0}" ) ) ); + } + void reset() const { + deleteAll( ns() ); + insert( fromjson( "{'_id':0,a:3}" ) ); + } + }; + + class RenameReplace : public Base { + public: + void doIt() const { + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$rename:{a:'b'}}" ) ); + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$set:{a:50}}" ) ); + } + using ReplTests::Base::check; + void check() const { + ASSERT_EQUALS( 1, count() ); + check( BSON( "_id" << 0 << "a" << 50 << "b" << 3 ) , one( fromjson( "{'_id':0}" ) ) ); + } + void reset() const { + deleteAll( ns() ); + insert( fromjson( "{'_id':0,a:3,b:100}" ) ); + } + }; + + class RenameOverwrite : public Base { + public: + void doIt() const { + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$rename:{a:'b'}}" ) ); + } + using ReplTests::Base::check; + void check() const { + ASSERT_EQUALS( 1, count() ); + check( BSON( "_id" << 0 << "b" << 3 << "z" << 1 ) , one( fromjson( "{'_id':0}" ) ) ); + } + void reset() const { + deleteAll( ns() ); + insert( fromjson( "{'_id':0,z:1,a:3}" ) ); + } + }; + + class NoRename : public Base { + public: + void doIt() const { + client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$rename:{c:'b'},$set:{z:1}}" ) ); + } + using ReplTests::Base::check; + void check() const { + ASSERT_EQUALS( 1, count() ); + check( BSON( "_id" << 0 << "a" << 3 << "z" << 1 ) , one( fromjson( "{'_id':0}" ) ) ); + } + void reset() const { + deleteAll( ns() ); + insert( fromjson( "{'_id':0,a:3}" ) ); + } }; - } // namespace Idempotence - + class DeleteOpIsIdBased : public Base { public: void run() { @@ -968,21 +1003,21 @@ namespace ReplTests { client()->remove( ns(), BSON( "a" << 10 ) ); ASSERT_EQUALS( 1U, client()->count( ns(), BSONObj() ) ); insert( BSON( "_id" << 0 << "a" << 11 ) ); - insert( BSON( "_id" << 2 << "a" << 10 ) ); + insert( BSON( "_id" << 2 << "a" << 10 ) ); insert( BSON( "_id" << 3 << "a" << 10 ) ); - + applyAllOperations(); ASSERT_EQUALS( 2U, client()->count( ns(), BSONObj() ) ); ASSERT( !one( BSON( "_id" << 1 ) ).isEmpty() ); ASSERT( !one( BSON( "_id" << 2 ) ).isEmpty() ); } }; - + class DbIdsTest { public: void run() { Client::Context ctx( "unittests.repltest.DbIdsTest" ); - + s_.reset( new DbIds( "local.temp.DbIdsTest" ) ); s_->reset(); check( false, false, false ); @@ -991,7 +1026,7 @@ namespace ReplTests { check( true, false, false ); s_->set( "a", BSON( "_id" << 4 ), false ); check( false, false, false ); - + s_->set( "b", BSON( "_id" << 4 ), true ); check( false, true, false ); s_->set( "b", BSON( "_id" << 4 ), false ); @@ -1009,7 +1044,7 @@ namespace ReplTests { s_->reset(); check( false, false, false ); - + s_->set( "a", BSON( "_id" << 4 ), true ); s_->set( "a", BSON( "_id" << 4 ), true ); check( true, false, false ); @@ -1020,17 +1055,17 @@ namespace ReplTests { void check( bool one, bool two, bool three ) { 
ASSERT_EQUALS( one, s_->get( "a", BSON( "_id" << 4 ) ) ); ASSERT_EQUALS( two, s_->get( "b", BSON( "_id" << 4 ) ) ); - ASSERT_EQUALS( three, s_->get( "a", BSON( "_id" << 5 ) ) ); + ASSERT_EQUALS( three, s_->get( "a", BSON( "_id" << 5 ) ) ); } dblock lk_; auto_ptr< DbIds > s_; }; - + class MemIdsTest { public: void run() { int n = sizeof( BSONObj ) + BSON( "_id" << 4 ).objsize(); - + s_.reset(); ASSERT_EQUALS( 0, s_.roughSize() ); ASSERT( !s_.get( "a", BSON( "_id" << 4 ) ) ); @@ -1057,7 +1092,7 @@ namespace ReplTests { public: void run() { Client::Context ctx( "unittests.repltests.IdTrackerTest" ); - + ASSERT( s_.inMem() ); s_.reset( 4 * sizeof( BSONObj ) - 1 ); s_.haveId( "a", BSON( "_id" << 0 ), true ); @@ -1069,34 +1104,34 @@ namespace ReplTests { s_.mayUpgradeStorage(); ASSERT( !s_.inMem() ); check(); - + s_.haveId( "a", BSON( "_id" << 1 ), false ); ASSERT( !s_.haveId( "a", BSON( "_id" << 1 ) ) ); s_.haveId( "a", BSON( "_id" << 1 ), true ); check(); - ASSERT( !s_.inMem() ); - + ASSERT( !s_.inMem() ); + s_.reset( 4 * sizeof( BSONObj ) - 1 ); s_.mayUpgradeStorage(); - ASSERT( s_.inMem() ); + ASSERT( s_.inMem() ); } private: void check() { ASSERT( s_.haveId( "a", BSON( "_id" << 0 ) ) ); ASSERT( s_.haveId( "a", BSON( "_id" << 1 ) ) ); ASSERT( s_.haveId( "b", BSON( "_id" << 0 ) ) ); - ASSERT( s_.haveModId( "b", BSON( "_id" << 0 ) ) ); + ASSERT( s_.haveModId( "b", BSON( "_id" << 0 ) ) ); } dblock lk_; IdTracker s_; }; - + class All : public Suite { public: - All() : Suite( "repl" ){ + All() : Suite( "repl" ) { } - - void setupTests(){ + + void setupTests() { add< LogBasic >(); add< Idempotence::InsertTimestamp >(); add< Idempotence::InsertAutoId >(); @@ -1107,8 +1142,6 @@ namespace ReplTests { add< Idempotence::UpdateSameField >(); add< Idempotence::UpdateSameFieldWithId >(); add< Idempotence::UpdateSameFieldExplicitId >(); - add< Idempotence::UpdateId >(); - add< Idempotence::UpdateId2 >(); add< Idempotence::UpdateDifferentFieldExplicitId >(); add< Idempotence::UpsertUpdateNoMods >(); add< Idempotence::UpsertInsertNoMods >(); @@ -1140,12 +1173,16 @@ namespace ReplTests { add< Idempotence::Pop >(); add< Idempotence::PopReverse >(); add< Idempotence::BitOp >(); + add< Idempotence::Rename >(); + add< Idempotence::RenameReplace >(); + add< Idempotence::RenameOverwrite >(); + add< Idempotence::NoRename >(); add< DeleteOpIsIdBased >(); add< DbIdsTest >(); add< MemIdsTest >(); add< IdTrackerTest >(); } } myall; - + } // namespace ReplTests diff --git a/dbtests/sharding.cpp b/dbtests/sharding.cpp index 2473366..19edd55 100644 --- a/dbtests/sharding.cpp +++ b/dbtests/sharding.cpp @@ -27,17 +27,17 @@ namespace ShardingTests { namespace serverandquerytests { class test1 { public: - void run(){ + void run() { ServerAndQuery a( "foo:1" , BSON( "a" << GT << 0 << LTE << 100 ) ); ServerAndQuery b( "foo:1" , BSON( "a" << GT << 200 << LTE << 1000 ) ); - + ASSERT( a < b ); ASSERT( ! 
( b < a ) ); set s; s.insert( a ); s.insert( b ); - + ASSERT_EQUALS( (unsigned int)2 , s.size() ); } }; @@ -45,12 +45,12 @@ namespace ShardingTests { class All : public Suite { public: - All() : Suite( "sharding" ){ + All() : Suite( "sharding" ) { } - void setupTests(){ + void setupTests() { add< serverandquerytests::test1 >(); } } myall; - + } diff --git a/dbtests/socktests.cpp b/dbtests/socktests.cpp index 267b1d6..5cd42f5 100644 --- a/dbtests/socktests.cpp +++ b/dbtests/socktests.cpp @@ -19,7 +19,6 @@ #include "pch.h" #include "../util/sock.h" - #include "dbtests.h" namespace SockTests { @@ -30,16 +29,20 @@ namespace SockTests { ASSERT_EQUALS( "127.0.0.1", hostbyname( "localhost" ) ); ASSERT_EQUALS( "127.0.0.1", hostbyname( "127.0.0.1" ) ); // ASSERT_EQUALS( "::1", hostbyname( "::1" ) ); // IPv6 disabled at runtime by default. + + HostAndPort h("asdfasdfasdf_no_such_host"); + // this fails uncomment when fixed. + ASSERT( !h.isSelf() ); } }; - + class All : public Suite { public: - All() : Suite( "sock" ){} - void setupTests(){ + All() : Suite( "sock" ) {} + void setupTests() { add< HostByName >(); } } myall; - + } // namespace SockTests diff --git a/dbtests/spin_lock_test.cpp b/dbtests/spin_lock_test.cpp index d053d61..01eb7b3 100644 --- a/dbtests/spin_lock_test.cpp +++ b/dbtests/spin_lock_test.cpp @@ -26,26 +26,26 @@ namespace { using mongo::SpinLock; - class LockTester{ + class LockTester { public: LockTester( SpinLock* spin, int* counter ) - : _spin(spin), _counter(counter), _requests(0){} + : _spin(spin), _counter(counter), _requests(0) {} - ~LockTester(){ + ~LockTester() { delete _t; } - void start( int increments ){ - _t = new boost::thread( boost::bind(&LockTester::test, this, increments) ); + void start( int increments ) { + _t = new boost::thread( boost::bind(&LockTester::test, this, increments) ); } - void join(){ + void join() { if ( _t ) _t->join(); } - int requests() const{ - return _requests; - } + int requests() const { + return _requests; + } private: SpinLock* _spin; // not owned here @@ -53,7 +53,7 @@ namespace { int _requests; boost::thread* _t; - void test( int increments ){ + void test( int increments ) { while ( increments-- > 0 ) { _spin->lock(); ++(*_counter); @@ -61,14 +61,14 @@ namespace { _spin->unlock(); } } - + LockTester( LockTester& ); LockTester& operator=( LockTester& ); }; - class ConcurrentIncs{ + class ConcurrentIncs { public: - void run(){ + void run() { #if defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) @@ -77,37 +77,37 @@ namespace { const int threads = 64; const int incs = 10000; - LockTester* testers[threads]; - - for ( int i = 0; i < threads; i++ ){ - testers[i] = new LockTester( &spin, &counter ); - } - for ( int i = 0; i < threads; i++ ){ - testers[i]->start( incs ); - } - for ( int i = 0; i < threads; i++ ){ - testers[i]->join(); - ASSERT_EQUALS( testers[i]->requests(), incs ); - delete testers[i]; - } - - ASSERT_EQUALS( counter, threads*incs ); + LockTester* testers[threads]; + + for ( int i = 0; i < threads; i++ ) { + testers[i] = new LockTester( &spin, &counter ); + } + for ( int i = 0; i < threads; i++ ) { + testers[i]->start( incs ); + } + for ( int i = 0; i < threads; i++ ) { + testers[i]->join(); + ASSERT_EQUALS( testers[i]->requests(), incs ); + delete testers[i]; + } + + ASSERT_EQUALS( counter, threads*incs ); #else - // WARNING "TODO Missing spin lock in this platform." - ASSERT( true ); + // WARNING "TODO Missing spin lock in this platform." 
+ ASSERT( true ); + - #endif } }; - class SpinLockSuite : public Suite{ + class SpinLockSuite : public Suite { public: - SpinLockSuite() : Suite( "spinlock" ){} + SpinLockSuite() : Suite( "spinlock" ) {} - void setupTests(){ + void setupTests() { add< ConcurrentIncs >(); } } spinLockSuite; diff --git a/dbtests/test.vcproj b/dbtests/test.vcproj deleted file mode 100644 index c297d85..0000000 --- a/dbtests/test.vcproj +++ /dev/null @@ -1,1453 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dbtests/test.vcxproj b/dbtests/test.vcxproj index d52278a..b80a730 100644 --- a/dbtests/test.vcxproj +++ b/dbtests/test.vcxproj @@ -68,7 +68,7 @@ $(SolutionDir)$(Configuration)\ $(Configuration)\ $(Configuration)\ - true + false true $(SolutionDir)$(Configuration)\ $(SolutionDir)$(Configuration)\ @@ -88,6 +88,10 @@ + ..;$(IncludePath) + ..;$(IncludePath) + ..;$(IncludePath) + ..;$(IncludePath) @@ -100,7 +104,7 @@ Use pch.h Level3 - EditAndContinue + ProgramDatabase 4355;4800;%(DisableSpecificWarnings) true @@ -112,13 +116,14 @@ true Console MachineX86 + true Disabled ..\..\js\src;..\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories) - _UNICODE;UNICODE;SUPPORT_UCP;SUPPORT_UTF8;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions) + _DURABLE;_UNICODE;UNICODE;SUPPORT_UCP;SUPPORT_UTF8;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions) EnableFastChecks MultiThreadedDebugDLL Use @@ -191,16 +196,17 @@ + + + + + + - - - - - @@ -244,6 +250,7 @@ + @@ -253,14 +260,28 @@ + + + + + + + + + + + + + + @@ -270,6 +291,7 @@ + @@ -510,7 +532,6 @@ - @@ -542,21 +563,29 @@ + + - + + NotUsing + + + + + @@ -564,7 +593,6 @@ - @@ -598,7 +626,6 @@ - @@ -610,8 +637,7 @@ - - + NotUsing @@ -624,14 +650,17 @@ + + + diff --git a/dbtests/test.vcxproj.filters b/dbtests/test.vcxproj.filters index ba4c4af..c52f7f6 100755 --- a/dbtests/test.vcxproj.filters +++ b/dbtests/test.vcxproj.filters @@ -7,9 +7,6 @@ {0a50fb63-4ac3-4e30-a9d4-b0841878ee73} - - {eb2684bf-ca8d-4162-9313-56a81233c471} - {45dab36c-864e-45de-bb8e-cf1d87a2c4f6} @@ -44,15 +41,18 @@ {9320a670-3b28-471a-bf92-6c8d881a37a4} - - {4fff2dbf-30c4-4295-8db8-d513c1e36220} - 
{d499fdba-b256-4b12-af20-cdd1ae1addff} {353b6f01-1cab-4156-a576-bc75ab204776} + + {4fff2dbf-30c4-4295-8db8-d513c1e36220} + + + {c296d097-0d46-46ee-9097-f2df659d9596} + @@ -73,21 +73,6 @@ misc and third party\pcre - - storage related - - - storage related - - - storage related - - - storage related - - - storage related - client @@ -188,7 +173,7 @@ db\h - btree related + btree util\concurrency @@ -238,6 +223,27 @@ util\h + + dur + + + dur + + + dur + + + dur + + + dur + + + db + + + db + @@ -326,9 +332,6 @@ misc and third party\pcre - - storage related - client @@ -422,9 +425,6 @@ db\cpp - - db\cpp - db\cpp @@ -485,9 +485,6 @@ util\cpp - - util\cpp - util\cpp @@ -591,10 +588,10 @@ replsets - btree related + btree - btree related + btree db\cpp @@ -614,9 +611,6 @@ shard - - shard - util\concurrency @@ -698,6 +692,81 @@ db\cpp + + db\cpp + + + dbtests + + + scripting + + + db\cpp + + + db\cpp + + + dur + + + dur + + + dur + + + dur + + + db\cpp + + + db\cpp + + + db\cpp + + + db\cpp + + + db\cpp + + + util + + + db + + + util + + + dur + + + dur + + + client + + + dur + + + dbtests + + + dbtests + + + db\cpp + + + util\cpp + diff --git a/dbtests/threadedtests.cpp b/dbtests/threadedtests.cpp index af413cc..805b2d5 100644 --- a/dbtests/threadedtests.cpp +++ b/dbtests/threadedtests.cpp @@ -21,6 +21,7 @@ #include "../bson/util/atomic_int.h" #include "../util/concurrency/mvar.h" #include "../util/concurrency/thread_pool.h" +#include "../util/timer.h" #include #include @@ -29,34 +30,108 @@ namespace ThreadedTests { template - class ThreadedTest{ - public: - virtual void setup() {} //optional - virtual void subthread() = 0; - virtual void validate() = 0; + class ThreadedTest { + public: + virtual void setup() {} //optional + virtual void subthread() = 0; + virtual void validate() = 0; - static const int nthreads = nthreads_param; + static const int nthreads = nthreads_param; - void run(){ - setup(); + void run() { + setup(); + launch_subthreads(nthreads); + validate(); + } - launch_subthreads(nthreads); + virtual ~ThreadedTest() {}; // not necessary, but makes compilers happy - validate(); - } + private: + void launch_subthreads(int remaining) { + if (!remaining) return; - virtual ~ThreadedTest() {}; // not necessary, but makes compilers happy + boost::thread athread(boost::bind(&ThreadedTest::subthread, this)); - private: - void launch_subthreads(int remaining){ - if (!remaining) return; + launch_subthreads(remaining - 1); - boost::thread athread(boost::bind(&ThreadedTest::subthread, this)); - - launch_subthreads(remaining - 1); + athread.join(); + } + }; - athread.join(); + class MongoMutexTest : public ThreadedTest<135> { +#if defined(_DEBUG) + enum { N = 5000 }; +#else + enum { N = 40000 }; +#endif + MongoMutex *mm; + public: + void run() { + Timer t; + cout << "MongoMutexTest N:" << N << endl; + ThreadedTest<135>::run(); + cout << "MongoMutexTest " << t.millis() << "ms" << endl; + } + private: + virtual void setup() { + mm = new MongoMutex("MongoMutexTest"); + } + virtual void subthread() { + Client::initThread("mongomutextest"); + sleepmillis(0); + for( int i = 0; i < N; i++ ) { + if( i % 7 == 0 ) { + mm->lock_shared(); + mm->lock_shared(); + mm->unlock_shared(); + mm->unlock_shared(); + } + else if( i % 7 == 1 ) { + mm->lock_shared(); + ASSERT( mm->atLeastReadLocked() ); + mm->unlock_shared(); + } + else if( i % 7 == 2 ) { + mm->lock(); + ASSERT( mm->isWriteLocked() ); + mm->unlock(); + } + else if( i % 7 == 3 ) { + mm->lock(); + mm->lock_shared(); + ASSERT( mm->isWriteLocked() ); + 
mm->unlock_shared(); + mm->unlock(); + } + else if( i % 7 == 4 ) { + mm->lock(); + mm->releaseEarly(); + mm->unlock(); + } + else if( i % 7 == 5 ) { + if( mm->lock_try(1) ) { + mm->unlock(); + } + } + else if( i % 7 == 6 ) { + if( mm->lock_shared_try(0) ) { + mm->unlock_shared(); + } + } + else { + mm->lock_shared(); + mm->unlock_shared(); + } } + cc().shutdown(); + } + virtual void validate() { + ASSERT( !mm->atLeastReadLocked() ); + mm->lock(); + mm->unlock(); + mm->lock_shared(); + mm->unlock_shared(); + } }; // Tested with up to 30k threads @@ -64,13 +139,13 @@ namespace ThreadedTests { static const int iterations = 1000000; AtomicUInt target; - void subthread(){ - for(int i=0; i < iterations; i++){ + void subthread() { + for(int i=0; i < iterations; i++) { //target.x++; // verified to fail with this version target++; } } - void validate(){ + void validate() { ASSERT_EQUALS(target.x , unsigned(nthreads * iterations)); AtomicUInt u; @@ -80,6 +155,12 @@ namespace ThreadedTests { ASSERT_EQUALS(2u, u--); ASSERT_EQUALS(0u, --u); ASSERT_EQUALS(0u, u); + + u++; + ASSERT( u > 0 ); + + u--; + ASSERT( ! ( u > 0 ) ); } }; @@ -87,10 +168,10 @@ namespace ThreadedTests { static const int iterations = 10000; MVar target; - public: + public: MVarTest() : target(0) {} - void subthread(){ - for(int i=0; i < iterations; i++){ + void subthread() { + for(int i=0; i < iterations; i++) { int val = target.take(); #if BOOST_VERSION >= 103500 //increase chances of catching failure @@ -99,30 +180,30 @@ namespace ThreadedTests { target.put(val+1); } } - void validate(){ + void validate() { ASSERT_EQUALS(target.take() , nthreads * iterations); } }; - class ThreadPoolTest{ + class ThreadPoolTest { static const int iterations = 10000; static const int nThreads = 8; AtomicUInt counter; - void increment(int n){ - for (int i=0; i(); add< MVarTest >(); add< ThreadPoolTest >(); add< LockTest >(); + add< MongoMutexTest >(); } } myall; } diff --git a/dbtests/updatetests.cpp b/dbtests/updatetests.cpp index 17f861e..0f95a32 100644 --- a/dbtests/updatetests.cpp +++ b/dbtests/updatetests.cpp @@ -110,14 +110,14 @@ namespace UpdateTests { class PushAllNonArray : public Fail { void doIt() { - insert( ns(), fromjson( "{a:[1]}" ) ); + insert( ns(), fromjson( "{a:[1]}" ) ); update( ns(), BSONObj(), fromjson( "{$pushAll:{a:'d'}}" ) ); } }; class PullAllNonArray : public Fail { void doIt() { - insert( ns(), fromjson( "{a:[1]}" ) ); + insert( ns(), fromjson( "{a:[1]}" ) ); update( ns(), BSONObj(), fromjson( "{$pullAll:{a:'d'}}" ) ); } }; @@ -241,12 +241,12 @@ namespace UpdateTests { class MultiInc : public SetBase { public: - - string s(){ + + string s() { stringstream ss; auto_ptr cc = client().query( ns() , Query().sort( BSON( "_id" << 1 ) ) ); bool first = true; - while ( cc->more() ){ + while ( cc->more() ) { if ( first ) first = false; else ss << ","; @@ -255,11 +255,11 @@ namespace UpdateTests { } return ss.str(); } - - void run(){ + + void run() { client().insert( ns(), BSON( "_id" << 1 << "x" << 1 ) ); client().insert( ns(), BSON( "_id" << 2 << "x" << 5 ) ); - + ASSERT_EQUALS( "1,5" , s() ); client().update( ns() , BSON( "_id" << 1 ) , BSON( "$inc" << BSON( "x" << 1 ) ) ); @@ -270,7 +270,7 @@ namespace UpdateTests { client().update( ns() , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) , false , true ); ASSERT_EQUALS( "4,6" , s() ); - + } }; @@ -498,10 +498,10 @@ namespace UpdateTests { client().insert( ns(), BSON( "_id" << 55 << "i" << 5 ) ); client().update( ns(), BSON( "i" << 5 ), BSON( "i" << 6 ) ); ASSERT( 
!client().findOne( ns(), Query( BSON( "_id" << 55 ) ).hint - ( "{\"_id\":ObjectId(\"000000000000000000000000\")}" ) ).isEmpty() ); + ( "{\"_id\":ObjectId(\"000000000000000000000000\")}" ) ).isEmpty() ); } }; - + class CheckNoMods : public SetBase { public: void run() { @@ -509,7 +509,7 @@ namespace UpdateTests { ASSERT( error() ); } }; - + class UpdateMissingToNull : public SetBase { public: void run() { @@ -520,10 +520,10 @@ namespace UpdateTests { }; namespace ModSetTests { - + class internal1 { public: - void run(){ + void run() { BSONObj b = BSON( "$inc" << BSON( "x" << 1 << "a.b" << 1 ) ); ModSet m(b); @@ -532,7 +532,7 @@ namespace UpdateTests { ASSERT( ! m.haveModForField( "y" ) ); ASSERT( ! m.haveModForField( "a.c" ) ); ASSERT( ! m.haveModForField( "a" ) ); - + ASSERT( m.haveConflictingMod( "x" ) ); ASSERT( m.haveConflictingMod( "a" ) ); ASSERT( m.haveConflictingMod( "a.b" ) ); @@ -541,14 +541,14 @@ namespace UpdateTests { ASSERT( ! m.haveConflictingMod( "a.a" ) ); } }; - + class Base { public: - virtual ~Base(){} + virtual ~Base() {} + - - void test( BSONObj morig , BSONObj in , BSONObj wanted ){ + void test( BSONObj morig , BSONObj in , BSONObj wanted ) { BSONObj m = morig.copy(); ModSet set(m); @@ -556,20 +556,20 @@ namespace UpdateTests { ASSERT_EQUALS( wanted , out ); } }; - + class inc1 : public Base { public: - void run(){ + void run() { BSONObj m = BSON( "$inc" << BSON( "x" << 1 ) ); test( m , BSON( "x" << 5 ) , BSON( "x" << 6 ) ); test( m , BSON( "a" << 5 ) , BSON( "a" << 5 << "x" << 1 ) ); test( m , BSON( "z" << 5 ) , BSON( "x" << 1 << "z" << 5 ) ); } }; - + class inc2 : public Base { public: - void run(){ + void run() { BSONObj m = BSON( "$inc" << BSON( "a.b" << 1 ) ); test( m , BSONObj() , BSON( "a" << BSON( "b" << 1 ) ) ); test( m , BSON( "a" << BSON( "b" << 2 ) ) , BSON( "a" << BSON( "b" << 3 ) ) ); @@ -577,23 +577,23 @@ namespace UpdateTests { m = BSON( "$inc" << BSON( "a.b" << 1 << "a.c" << 1 ) ); test( m , BSONObj() , BSON( "a" << BSON( "b" << 1 << "c" << 1 ) ) ); - + } }; class set1 : public Base { public: - void run(){ + void run() { test( BSON( "$set" << BSON( "x" << 17 ) ) , BSONObj() , BSON( "x" << 17 ) ); test( BSON( "$set" << BSON( "x" << 17 ) ) , BSON( "x" << 5 ) , BSON( "x" << 17 ) ); test( BSON( "$set" << BSON( "x.a" << 17 ) ) , BSON( "z" << 5 ) , BSON( "x" << BSON( "a" << 17 )<< "z" << 5 ) ); } - }; - + }; + class push1 : public Base { public: - void run(){ + void run() { test( BSON( "$push" << BSON( "a" << 5 ) ) , fromjson( "{a:[1]}" ) , fromjson( "{a:[1,5]}" ) ); } }; @@ -602,33 +602,45 @@ namespace UpdateTests { namespace basic { class Base : public ClientBase { + protected: + virtual const char * ns() = 0; virtual void dotest() = 0; - - protected: - void test( const char* initial , const char* mod , const char* after ){ + void insert( const BSONObj& o ) { + client().insert( ns() , o ); + } + + void update( const BSONObj& m ) { + client().update( ns() , BSONObj() , m ); + } + + BSONObj findOne() { + return client().findOne( ns() , BSONObj() ); + } + + void test( const char* initial , const char* mod , const char* after ) { test( fromjson( initial ) , fromjson( mod ) , fromjson( after ) ); } - void test( const BSONObj& initial , const BSONObj& mod , const BSONObj& after ){ + void test( const BSONObj& initial , const BSONObj& mod , const BSONObj& after ) { client().dropCollection( ns() ); - client().insert( ns() , initial ); - client().update( ns() , BSONObj() , mod ); - ASSERT_EQUALS( after , client().findOne( ns(), BSONObj() )); + insert( initial ); + 
update( mod ); + ASSERT_EQUALS( after , findOne() ); client().dropCollection( ns() ); } public: - - Base(){} - virtual ~Base(){ + + Base() {} + virtual ~Base() { } - void run(){ + void run() { client().dropCollection( ns() ); - + dotest(); client().dropCollection( ns() ); @@ -640,98 +652,124 @@ namespace UpdateTests { virtual BSONObj mod() = 0; virtual BSONObj after() = 0; - void dotest(){ + void dotest() { test( initial() , mod() , after() ); } - + }; - + class inc1 : public SingleTest { - virtual BSONObj initial(){ + virtual BSONObj initial() { return BSON( "_id" << 1 << "x" << 1 ); } - virtual BSONObj mod(){ + virtual BSONObj mod() { return BSON( "$inc" << BSON( "x" << 2 ) ); } - virtual BSONObj after(){ + virtual BSONObj after() { return BSON( "_id" << 1 << "x" << 3 ); } - virtual const char * ns(){ + virtual const char * ns() { return "unittests.inc1"; } }; class inc2 : public SingleTest { - virtual BSONObj initial(){ + virtual BSONObj initial() { return BSON( "_id" << 1 << "x" << 1 ); } - virtual BSONObj mod(){ + virtual BSONObj mod() { return BSON( "$inc" << BSON( "x" << 2.5 ) ); } - virtual BSONObj after(){ + virtual BSONObj after() { return BSON( "_id" << 1 << "x" << 3.5 ); } - virtual const char * ns(){ + virtual const char * ns() { return "unittests.inc2"; } }; - + class inc3 : public SingleTest { - virtual BSONObj initial(){ + virtual BSONObj initial() { return BSON( "_id" << 1 << "x" << 537142123123LL ); } - virtual BSONObj mod(){ + virtual BSONObj mod() { return BSON( "$inc" << BSON( "x" << 2 ) ); } - virtual BSONObj after(){ + virtual BSONObj after() { return BSON( "_id" << 1 << "x" << 537142123125LL ); } - virtual const char * ns(){ - return "unittests.inc2"; + virtual const char * ns() { + return "unittests.inc3"; } }; class inc4 : public SingleTest { - virtual BSONObj initial(){ + virtual BSONObj initial() { return BSON( "_id" << 1 << "x" << 537142123123LL ); } - virtual BSONObj mod(){ + virtual BSONObj mod() { return BSON( "$inc" << BSON( "x" << 2LL ) ); } - virtual BSONObj after(){ + virtual BSONObj after() { return BSON( "_id" << 1 << "x" << 537142123125LL ); } - virtual const char * ns(){ - return "unittests.inc2"; + virtual const char * ns() { + return "unittests.inc4"; } }; class inc5 : public SingleTest { - virtual BSONObj initial(){ + virtual BSONObj initial() { return BSON( "_id" << 1 << "x" << 537142123123LL ); } - virtual BSONObj mod(){ + virtual BSONObj mod() { return BSON( "$inc" << BSON( "x" << 2.0 ) ); } - virtual BSONObj after(){ + virtual BSONObj after() { return BSON( "_id" << 1 << "x" << 537142123125LL ); } - virtual const char * ns(){ - return "unittests.inc2"; + virtual const char * ns() { + return "unittests.inc5"; } }; + class inc6 : public Base { + + virtual const char * ns() { + return "unittests.inc6"; + } + + + virtual BSONObj initial() { return BSONObj(); } + virtual BSONObj mod() { return BSONObj(); } + virtual BSONObj after() { return BSONObj(); } + + void dotest() { + client().insert( ns() , BSON( "x" << 5 ) ); + ASSERT( findOne()["x"].type() == NumberInt ); + long long start = 5; + long long max = numeric_limits::max(); + max *= 32; + + while ( start < max ) { + update( BSON( "$inc" << BSON( "x" << 500000 ) ) ); + start += 500000; + ASSERT_EQUALS( start , findOne()["x"].numberLong() ); // SERVER-2005 + } + + } + }; class bit1 : public Base { - const char * ns(){ + const char * ns() { return "unittests.bit1"; } - void dotest(){ + void dotest() { test( BSON( "_id" << 1 << "x" << 3 ) , BSON( "$bit" << BSON( "x" << BSON( "and" << 2 ) ) ) , 
BSON( "_id" << 1 << "x" << ( 3 & 2 ) ) ); test( BSON( "_id" << 1 << "x" << 1 ) , BSON( "$bit" << BSON( "x" << BSON( "or" << 4 ) ) ) , BSON( "_id" << 1 << "x" << ( 1 | 4 ) ) ); test( BSON( "_id" << 1 << "x" << 3 ) , BSON( "$bit" << BSON( "x" << BSON( "and" << 2 << "or" << 8 ) ) ) , BSON( "_id" << 1 << "x" << ( ( 3 & 2 ) | 8 ) ) ); @@ -739,21 +777,21 @@ namespace UpdateTests { } }; - + class unset : public Base { - const char * ns(){ + const char * ns() { return "unittests.unset"; } - void dotest(){ + void dotest() { test( "{_id:1,x:1}" , "{$unset:{x:1}}" , "{_id:1}" ); } }; class setswitchint : public Base { - const char * ns(){ + const char * ns() { return "unittests.int1"; } - void dotest(){ + void dotest() { test( BSON( "_id" << 1 << "x" << 1 ) , BSON( "$set" << BSON( "x" << 5.6 ) ) , BSON( "_id" << 1 << "x" << 5.6 ) ); test( BSON( "_id" << 1 << "x" << 5.6 ) , BSON( "$set" << BSON( "x" << 1 ) ) , BSON( "_id" << 1 << "x" << 1 ) ); } @@ -761,12 +799,12 @@ namespace UpdateTests { }; - + class All : public Suite { public: All() : Suite( "update" ) { } - void setupTests(){ + void setupTests() { add< ModId >(); add< ModNonmodMix >(); add< InvalidMod >(); @@ -815,18 +853,19 @@ namespace UpdateTests { add< PreserveIdWithIndex >(); add< CheckNoMods >(); add< UpdateMissingToNull >(); - + add< ModSetTests::internal1 >(); add< ModSetTests::inc1 >(); add< ModSetTests::inc2 >(); add< ModSetTests::set1 >(); add< ModSetTests::push1 >(); - + add< basic::inc1 >(); add< basic::inc2 >(); add< basic::inc3 >(); add< basic::inc4 >(); add< basic::inc5 >(); + add< basic::inc6 >(); add< basic::bit1 >(); add< basic::unset >(); add< basic::setswitchint >(); diff --git a/debian/changelog b/debian/changelog deleted file mode 100644 index c3b32b6..0000000 --- a/debian/changelog +++ /dev/null @@ -1,134 +0,0 @@ -mongodb (1.6.5) unstable; urgency=low - - * full change log http://jira.mongodb.org/browse/SERVER/fixforversion/10207 - - -- Richard Kreuter Tue, 7 Dec 2010 16:56:28 -0500 - -mongodb (1.6.4) unstable; urgency=low - - * replica_sets shell helpers - * sharding chunk safety, yielding during migrate cleanup - * full change log http://jira.mongodb.org/browse/SERVER/fixforversion/10191 - - -- Richard Kreuter Tue, 26 Oct 2010 16:56:28 -0500 - -mongodb (1.6.3) unstable; urgency=low - - * replica_sets slavedelay, rollback - * sharding optimization for larger than ram data sets - * full change log http://jira.mongodb.org/browse/SERVER/fixforversion/10190 - - -- Richard Kreuter Thu, 23 Sep 2010 16:56:28 -0500 - -mongodb (1.6.2) unstable; urgency=low - - * replica_sets some fixes - * sharding some fixes with rs - * full change log http://jira.mongodb.org/browse/SERVER/fixforversion/10187 - - -- Richard Kreuter Wed, 1 Sep 2010 16:56:28 -0500 - - -mongodb (1.6.1) unstable; urgency=low - - * replica_sets some fixes - * sharding some fixes with rs - * full change log http://jira.mongodb.org/browse/SERVER/fixforversion/10183 - - -- Richard Kreuter Tue, 17 Aug 2010 16:56:28 -0500 - -mongodb (1.6.0) unstable; urgency=low - - * sharding stable - * replica_sets stable - - -- Richard Kreuter Thu, 05 Aug 2010 16:56:28 -0500 - -mongodb (1.5.8) unstable; urgency=low - - * sharding lots of changes - * replica_sets lots of changes - - -- Richard Kreuter Tue, 03 Aug 2010 16:56:28 -0500 - -mongodb (1.5.7) unstable; urgency=low - - * sharding lots of changes - * replica_sets lots of changes - - -- Richard Kreuter Fri, 30 Jul 2010 16:56:28 -0500 - - -mongodb (1.5.6) unstable; urgency=low - - * sharding lots of changes, see 
http://jira.mongodb.org/browse/SERVER/fixforversion/10179 - - -- Richard Kreuter Sat, 24 Jul 2010 16:56:28 -0500 - -mongodb (1.5.5) unstable; urgency=low - - * sharding lots of changes, see http://jira.mongodb.org/browse/SERVER/fixforversion/10157 - - -- Richard Kreuter Fri, 16 Jul 2010 16:56:28 -0500 - -mongodb (1.5.4) unstable; urgency=low - - * sharding lots of changes, see http://jira.mongodb.org/browse/SERVER/fixforversion/10157 - - -- Richard Kreuter Fri, 2 Jul 2010 16:56:28 -0500 - -mongodb (1.5.3) unstable; urgency=low - - * sharding lots of changes, see http://jira.mongodb.org/browse/SERVER/fixforversion/10157 - - -- Richard Kreuter Thu, 17 Jun 2010 16:56:28 -0500 - -mongodb (1.5.2) unstable; urgency=low - - * sharding lots of changes, see http://jira.mongodb.org/browse/SERVER/fixforversion/10143 - - -- Richard Kreuter Wed, 27 May 2010 16:56:28 -0500 - -mongodb (1.5.1) unstable; urgency=low - - * sharding lots of changes, see http://jira.mongodb.org/browse/SERVER/fixforversion/10142 - - -- Richard Kreuter Wed, 3 May 2010 16:56:28 -0500 - -mongodb (1.5.0) unstable; urgency=low - - * replication w & real-time, see http://jira.mongodb.org/browse/SERVER/fixforversion/10125 - - -- Richard Kreuter Wed, 22 Mar 2010 16:56:28 -0500 - - -mongodb (1.3.5) unstable; urgency=low - - * bug fixes - - -- Richard Kreuter Wed, 22 Mar 2010 16:56:28 -0500 - -mongodb (1.3.4) unstable; urgency=low - - * bufg fixes - - -- Richard Kreuter Wed, 17 Mar 2010 16:56:28 -0500 - -mongodb (1.3.3) unstable; urgency=low - - * geo - - -- Richard Kreuter Fri, 05 Feb 2010 16:56:28 -0500 - -mongodb (1.3.2) unstable; urgency=low - - * munged debian files - - -- Richard Kreuter Fri, 05 Feb 2010 16:56:28 -0500 - -mongodb (1.3.1) unstable; urgency=low - - * Initial release - - -- Kristina Chodorow Tue, 07 Apr 2009 10:18:58 -0400 - diff --git a/debian/compat b/debian/compat deleted file mode 100644 index 7f8f011..0000000 --- a/debian/compat +++ /dev/null @@ -1 +0,0 @@ -7 diff --git a/debian/control b/debian/control deleted file mode 100644 index 2aef1c3..0000000 --- a/debian/control +++ /dev/null @@ -1,29 +0,0 @@ -Source: mongodb -Section: devel -Priority: optional -Maintainer: Richard Kreuter -Build-Depends: debhelper (>= 7), libpcre3, libpcre3-dev, scons, xulrunner-dev, libboost1.35-dev | libboost1.37-dev | libboost1.38-dev | libboost1.40-dev, libboost-thread1.35-dev | libboost-thread1.37-dev | libboost-thread1.38-dev | libboost-thread1.40-dev, libboost-filesystem1.35-dev | libboost-filesystem1.37-dev | libboost-filesystem1.38-dev | libboost-filesystem1.40-dev, libboost-program-options1.35-dev | libboost-program-options1.37-dev | libboost-program-options1.38-dev | libboost-program-options1.40-dev, libboost-date-time1.35-dev | libboost-date-time1.37-dev | libboost-date-time1.38-dev | libboost-date-time1.40-dev, libpcap-dev, libreadline-dev -Standards-Version: 3.8.0 -Homepage: http://www.mongodb.org - -Package: mongodb -Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends}, xulrunner-dev -Description: An object/document-oriented database - MongoDB is a high-performance, open source, schema-free - document-oriented data store that's easy to deploy, manage - and use. It's network accessible, written in C++ and offers - the following features : - . - * Collection oriented storage - easy storage of object- - style data - * Full index support, including on inner objects - * Query profiling - * Replication and fail-over support - * Efficient storage of binary data including large - objects (e.g. 
videos) - * Auto-sharding for cloud-level scalability (Q209) - . - High performance, scalability, and reasonable depth of - functionality are the goals for the project. - diff --git a/debian/copyright b/debian/copyright deleted file mode 100644 index 478c6f9..0000000 --- a/debian/copyright +++ /dev/null @@ -1,23 +0,0 @@ -This package was debianized by Kristina Chodorow on -Tue, 07 Apr 2009 10:18:58 -0400. - -It was downloaded from http://www.mongodb.org - -Upstream Authors: - - Eliot Horowitz - Dwight Merriman - Aaron Staple - Michael Dirolf - Kristina Chodorow - -Copyright: - - 2009 10gen - -License: - - AGPL - -The Debian packaging is (C) 2009, Kristina Chodorow and -is licensed under the AGPL, see `http://www.fsf.org/licensing/licenses/agpl-3.0.html'. diff --git a/debian/dirs b/debian/dirs deleted file mode 100644 index a7b6e78..0000000 --- a/debian/dirs +++ /dev/null @@ -1,3 +0,0 @@ -usr/bin -usr/sbin -var/lib/mongodb diff --git a/debian/init.d b/debian/init.d deleted file mode 100644 index 47a10a0..0000000 --- a/debian/init.d +++ /dev/null @@ -1,243 +0,0 @@ -#!/bin/sh -# -# init.d script with LSB support. -# -# Copyright (c) 2007 Javier Fernandez-Sanguino -# -# This is free software; you may redistribute it and/or modify -# it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2, -# or (at your option) any later version. -# -# This is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License with -# the Debian operating system, in /usr/share/common-licenses/GPL; if -# not, write to the Free Software Foundation, Inc., 59 Temple Place, -# Suite 330, Boston, MA 02111-1307 USA -# -### BEGIN INIT INFO -# Provides: mongodb -# Required-Start: $network $local_fs $remote_fs -# Required-Stop: $network $local_fs $remote_fs -# Should-Start: $named -# Should-Stop: -# Default-Start: 2 3 4 5 -# Default-Stop: 0 1 6 -# Short-Description: An object/document-oriented database -# Description: MongoDB is a high-performance, open source, schema-free -# document-oriented data store that's easy to deploy, manage -# and use. It's network accessible, written in C++ and offers -# the following features: -# -# * Collection oriented storage - easy storage of object- -# style data -# * Full index support, including on inner objects -# * Query profiling -# * Replication and fail-over support -# * Efficient storage of binary data including large -# objects (e.g. videos) -# * Auto-sharding for cloud-level scalability (Q209) -# -# High performance, scalability, and reasonable depth of -# functionality are the goals for the project. -### END INIT INFO - -PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin -DAEMON=/usr/bin/mongod -DESC=database - -# Default defaults. Can be overridden by the /etc/default/$NAME -NAME=mongodb -CONF=/etc/mongodb.conf -DATA=/var/lib/mongodb -LOGDIR=/var/log/mongodb -PIDFILE=/var/run/$NAME.pid -LOGFILE=$LOGDIR/$NAME.log # Server logfile -ENABLE_MONGODB=yes - -# Include mongodb defaults if available -if [ -f /etc/default/$NAME ] ; then - . /etc/default/$NAME -fi - -if test ! -x $DAEMON; then - echo "Could not find $DAEMON" - exit 0 -fi - -if test "x$ENABLE_MONGODB" != "xyes"; then - exit 0 -fi - -if test ! -x $DATA; then - mkdir $DATA || exit 0 -fi - -. 
/lib/lsb/init-functions - -STARTTIME=1 -DIETIME=10 # Time to wait for the server to die, in seconds - # If this value is set too low you might not - # let some servers to die gracefully and - # 'restart' will not work - -DAEMONUSER=${DAEMONUSER:-mongodb} -DAEMON_OPTS=${DAEMON_OPTS:-"--dbpath $DATA --logpath $LOGFILE run"} -DAEMON_OPTS="$DAEMON_OPTS --config $CONF" - -set -e - - -running_pid() { -# Check if a given process pid's cmdline matches a given name - pid=$1 - name=$2 - [ -z "$pid" ] && return 1 - [ ! -d /proc/$pid ] && return 1 - cmd=`cat /proc/$pid/cmdline | tr "\000" "\n"|head -n 1 |cut -d : -f 1` - # Is this the expected server - [ "$cmd" != "$name" ] && return 1 - return 0 -} - -running() { -# Check if the process is running looking at /proc -# (works for all users) - - # No pidfile, probably no daemon present - [ ! -f "$PIDFILE" ] && return 1 - pid=`cat $PIDFILE` - running_pid $pid $DAEMON || return 1 - return 0 -} - -start_server() { -# Start the process using the wrapper - start-stop-daemon --background --start --quiet --pidfile $PIDFILE \ - --make-pidfile --chuid $DAEMONUSER \ - --exec $DAEMON -- $DAEMON_OPTS - errcode=$? - return $errcode -} - -stop_server() { -# Stop the process using the wrapper - start-stop-daemon --stop --quiet --pidfile $PIDFILE \ - --user $DAEMONUSER \ - --exec $DAEMON - errcode=$? - return $errcode -} - -force_stop() { -# Force the process to die killing it manually - [ ! -e "$PIDFILE" ] && return - if running ; then - kill -15 $pid - # Is it really dead? - sleep "$DIETIME"s - if running ; then - kill -9 $pid - sleep "$DIETIME"s - if running ; then - echo "Cannot kill $NAME (pid=$pid)!" - exit 1 - fi - fi - fi - rm -f $PIDFILE -} - - -case "$1" in - start) - log_daemon_msg "Starting $DESC" "$NAME" - # Check if it's running first - if running ; then - log_progress_msg "apparently already running" - log_end_msg 0 - exit 0 - fi - if start_server ; then - # NOTE: Some servers might die some time after they start, - # this code will detect this issue if STARTTIME is set - # to a reasonable value - [ -n "$STARTTIME" ] && sleep $STARTTIME # Wait some time - if running ; then - # It's ok, the server started and is running - log_end_msg 0 - else - # It is not running after we did start - log_end_msg 1 - fi - else - # Either we could not start it - log_end_msg 1 - fi - ;; - stop) - log_daemon_msg "Stopping $DESC" "$NAME" - if running ; then - # Only stop the server if we see it running - errcode=0 - stop_server || errcode=$? - log_end_msg $errcode - else - # If it's not running don't do anything - log_progress_msg "apparently not running" - log_end_msg 0 - exit 0 - fi - ;; - force-stop) - # First try to stop gracefully the program - $0 stop - if running; then - # If it's still running try to kill it more forcefully - log_daemon_msg "Stopping (force) $DESC" "$NAME" - errcode=0 - force_stop || errcode=$? - log_end_msg $errcode - fi - ;; - restart|force-reload) - log_daemon_msg "Restarting $DESC" "$NAME" - errcode=0 - stop_server || errcode=$? - # Wait some sensible amount, some server need this - [ -n "$DIETIME" ] && sleep $DIETIME - start_server || errcode=$? - [ -n "$STARTTIME" ] && sleep $STARTTIME - running || errcode=$? - log_end_msg $errcode - ;; - status) - - log_daemon_msg "Checking status of $DESC" "$NAME" - if running ; then - log_progress_msg "running" - log_end_msg 0 - else - log_progress_msg "apparently not running" - log_end_msg 1 - exit 1 - fi - ;; - # MongoDB can't reload its configuration. 
- reload) - log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon" - log_warning_msg "cannot re-read the config file (use restart)." - ;; - - *) - N=/etc/init.d/$NAME - echo "Usage: $N {start|stop|force-stop|restart|force-reload|status}" >&2 - exit 1 - ;; -esac - -exit 0 diff --git a/debian/lintian-overrides b/debian/lintian-overrides deleted file mode 100644 index c843e9e..0000000 --- a/debian/lintian-overrides +++ /dev/null @@ -1,11 +0,0 @@ -# Agreed with upstream, that redefining rpath is necessary as xulrunner used to -# change API without changing so-name -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongo /usr/lib64/xulrunner-1.9.1 -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongod /usr/lib64/xulrunner-1.9.1 -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongodump /usr/lib64/xulrunner-1.9.1 -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongoexport /usr/lib64/xulrunner-1.9.1 -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongofiles /usr/lib64/xulrunner-1.9.1 -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongoimport /usr/lib64/xulrunner-1.9.1 -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongorestore /usr/lib64/xulrunner-1.9.1 -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongos /usr/lib64/xulrunner-1.9.1 -mongodb: binary-or-shlib-defines-rpath ./usr/bin/mongosniff /usr/lib64/xulrunner-1.9.1 diff --git a/debian/mongo.1 b/debian/mongo.1 deleted file mode 100644 index 89f4881..0000000 --- a/debian/mongo.1 +++ /dev/null @@ -1,62 +0,0 @@ -.\" Documentation for the MongoDB shell -.TH MONGO "1" "June 2009" "10gen" "Mongo Database" -.SH "NAME" -mongo \- the Mongo command\-line tool -.SH "SYNOPSIS" -\fBmongo [\fIOPTIONS\fR] [\fIDB_ADDRESS\fR] [\fIFILE+\fR]\fR -.SH "DESCRIPTION" -.PP -\fBmongo\fR -is a JavaScript shell (with GNU -readline -capabilities). It supports interactive and non\-interactive use. When used interactively, JavaScript can be used to query the database or perform any other function normally available with SpiderMonkey. Database output is displayed in JSON format. -.PP -If JavaScript files are specified on the command line, the shell will run non\-interactively, running each one in sequence and then exiting. -.SH "EXAMPLES" -.TP -.B mongo -start the shell, connecting to the server at localhost:27017 and using the test database -.TP -.B mongo foo -start the shell using the foo database at localhost:27017 -.TP -.B mongo 192.169.0.5/foo -start the shell using the foo database at 192.169.0.5:27017 -.TP -.B mongo 192.169.0.5:9999/foo -start the shell using the foo database at 192.169.0.5:9999 -.TP -.B mongo script1.js script2.js script3.js -run three scripts and exit -.SH "OPTIONS" -.TP -.B \-\-shell -run the shell after executing files -.TP -.B \-\-help -show usage information -.TP -.B \-\-host HOST -server to connect to (default HOST=localhost) -.TP -.B \-\-port PORT -port to connect to (default PORT=27017) -.TP -.B \-\-nodb -do not connect to mongod -.TP -.B \-\-eval SCRIPT -evaluate JavaScript -.TP -.B \-u USERNAME -specify user to log in as -.TP -.B \-pPASSWORD -specify password of user (notice there is no space) -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. 
-.SH "AUTHOR" -Kristina Chodorow diff --git a/debian/mongod.1 b/debian/mongod.1 deleted file mode 100644 index 7b86359..0000000 --- a/debian/mongod.1 +++ /dev/null @@ -1,16 +0,0 @@ -.\" Documentation for the MongoDB shell -.TH MONGOD "1" "June 2009" "10gen" "Mongo Database" -.SH "NAME" -mongod \- the Mongo Daemon -.SH "SYNOPSIS" -.SH "DESCRIPTION" -.PP -\fBmongod\fR -is a core MongoDB daemon. You are not supposed to call it directly, please refer to the wiki if necessary. -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. -.SH "AUTHOR" -Antonin Kral diff --git a/debian/mongodb.conf b/debian/mongodb.conf deleted file mode 100644 index 6a5de05..0000000 --- a/debian/mongodb.conf +++ /dev/null @@ -1,95 +0,0 @@ -# mongodb.conf - -# Where to store the data. - -# Note: if you run mongodb as a non-root user (recommended) you may -# need to create and set permissions for this directory manually, -# e.g., if the parent directory isn't mutable by the mongodb user. -dbpath=/var/lib/mongodb - -#where to log -logpath=/var/log/mongodb/mongodb.log - -logappend=true - -#port = 27017 - - - -# Enables periodic logging of CPU utilization and I/O wait -#cpu = true - -# Turn on/off security. Off is currently the default -#noauth = true -#auth = true - -# Verbose logging output. -#verbose = true - -# Inspect all client data for validity on receipt (useful for -# developing drivers) -#objcheck = true - -# Enable db quota management -#quota = true - -# Set oplogging level where n is -# 0=off (default) -# 1=W -# 2=R -# 3=both -# 7=W+some reads -#oplog = 0 - -# Diagnostic/debugging option -#nocursors = true - -# Ignore query hints -#nohints = true - -# Disable the HTTP interface (Defaults to localhost:27018). -#nohttpinterface = true - -# Turns off server-side scripting. This will result in greatly limited -# functionality -#noscripting = true - -# Turns off table scans. Any query that would do a table scan fails. -#notablescan = true - -# Disable data file preallocation. -#noprealloc = true - -# Specify .ns file size for new databases. -# nssize = - -# Accout token for Mongo monitoring server. -#mms-token = - -# Server name for Mongo monitoring server. -#mms-name = - -# Ping interval for Mongo monitoring server. -#mms-interval = - -# Replication Options - -# in replicated mongo databases, specify here whether this is a slave or master -#slave = true -#source = master.example.com -# Slave only: specify a single database to replicate -#only = master.example.com -# or -#master = true -#source = slave.example.com - -# Address of a server to pair with. -#pairwith = -# Address of arbiter server. -#arbiter = -# Automatically resync if slave data is stale -#autoresync -# Custom size for replication operation log. -#oplogSize = -# Size limit for in-memory storage of op ids. -#opIdMem = diff --git a/debian/mongodump.1 b/debian/mongodump.1 deleted file mode 100644 index 5cb33ce..0000000 --- a/debian/mongodump.1 +++ /dev/null @@ -1,36 +0,0 @@ -.\" Documentation for the MongoDB dump tool -.TH MONGODUMP "1" "June 2009" "10gen" "Mongo Database" -.SH "NAME" -mongodump \- the Mongo dump tool -.SH "SYNOPSIS" -\fBmongodump [\fIOPTIONS\fR]\fR -.SH "DESCRIPTION" -.PP -\fBmongodump\fR -is a tool to output a binary representation of a database. It is mostly used for doing hot backups of a database. 
-.SH "OPTIONS" -.TP -.B \-\-help -show usage information -.TP -.B \-h, \-\-host HOST -server to connect to (default HOST=localhost) -.TP -.B \-d, \-\-db DATABASE -database to use -.TP -.B \-c, \-\-c COLLECTION -collection to use -.TP -.B \-o, \-\-out FILE -output file, if not specified, stdout is used -.TP -.B \-\-dbpath PATH -directly access mongod data files in this path, instead of connecting to a mongod instance -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. -.SH "AUTHOR" -Kristina Chodorow diff --git a/debian/mongoexport.1 b/debian/mongoexport.1 deleted file mode 100644 index 1996b36..0000000 --- a/debian/mongoexport.1 +++ /dev/null @@ -1,51 +0,0 @@ -.\" Documentation for the MongoDB shell -.TH MONGOEXPORT "1" "June 2009" "10gen" "Mongo Database" -.SH "NAME" -mongoexport \- the Mongo export tool -.SH "SYNOPSIS" -\fBmongoexport [\fIOPTIONS\fR]\fR -.SH "DESCRIPTION" -.PP -\fBmongoexport\fR -is a tool to export a MongoDB collection to either JSON or CSV. The query can be filtered or a list of fields to output can be given. -.PP -If the output is CSV, the fields must be specified in order. -.SH "EXAMPLES" -.TP -.B mongoexport -d test -c test1 --csv -f "name,num" -export documents from test.test1 in CSV format -.SH "OPTIONS" -.TP -.B \-\-help -show usage information -.TP -.B \-h, \-\-host HOST -server to connect to (default HOST=localhost) -.TP -.B \-d, \-\-db DATABASE -database to use -.TP -.B \-c, \-\-c COLLECTION -collection to use -.TP -.B \-q, \-\-query QUERY -query filter -.TP -.B \-f, \-\-fields FIELDS -comma\-separated list of field names -.TP -.B \-\-csv -export to CSV instead of JSON -.TP -.B \-o, \-\-out FILE -output file, if not specified, stdout is used -.TP -.B \-\-dbpath PATH -directly access mongod data files in this path, instead of connecting to a mongod instance -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. -.SH "AUTHOR" -Kristina Chodorow diff --git a/debian/mongofiles.1 b/debian/mongofiles.1 deleted file mode 100644 index 4d7c0c5..0000000 --- a/debian/mongofiles.1 +++ /dev/null @@ -1,52 +0,0 @@ -.\" Documentation for the MongoDB dump tool -.TH MONGOFILES "1" "June 2009" "10gen" "Mongo Database" -.SH "NAME" -mongofiles \- a simple GridFS interface -.SH "SYNOPSIS" -\fBmongofiles [\fIOPTIONS\fR]\fR -.SH "DESCRIPTION" -.PP -\fBmongofiles\fR -is used to list, get, and insert files in the database. -.SH "EXAMPLES" -.TP -.B mongofiles list -lists files in test.fs.files -.TP -.B mongofiles put README.txt -inserts the file README.txt into the collection test.fs.files -.TP -.B mongofiles get photo.jpg -retrieves photo.jpg from test.fs.files and saves it locally -.SH "OPTIONS" -.TP -.B \-\-help -show usage information -.TP -.B \-h, \-\-host HOST -mongo host to which to connect -.TP -.B \-d, \-\-db DB -database to use (default DB=test) -.TP -.B \-c, \-\-collection COLLECTION (default COLLECTION=fs.files) -collection to use -.TP -.B \-\-command [list\||\|search\||\|put\||\|get] -execute a command -.TP -.B \-\-file FILE -filename for get or put -.TP -.B list -list all files. takes an optional filename. 
the file has to start with the filename -.TP -.B search -search all files for something that contains the string -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. -.SH "AUTHOR" -Kristina Chodorow diff --git a/debian/mongoimport.1 b/debian/mongoimport.1 deleted file mode 100644 index 4b6c3de..0000000 --- a/debian/mongoimport.1 +++ /dev/null @@ -1,63 +0,0 @@ -.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.37.1. -.TH MONGOIMPORT "1" "January 2010" "10gen" "Mongo Database" -.SH "NAME" -mongoimport \- the Mongo import tool -.SH "SYNOPSIS" -\fBmongoimport [\fIOPTIONS\fR]\fR -.SH "DESCRIPTION" -.PP -\fBmongoimport\fR -is a tool to import a MongoDB collection from JSON, CSV, or TSV. The query can be filtered or a list of fields to input can be given. -.\".SH "EXAMPLES" -.\".TP -.\".B mongoimport -d test -c test1 --csv -f "name,num" -.\"import documents from test.test1 in CSV format -.SS "OPTIONS" -.TP -\fB\-\-help\fR -show usage information -.TP -.B \-h, \-\-host HOST -server to connect to (default HOST=localhost) -.TP -.B \-d, \-\-db DATABASE -database to use -.TP -.B \-c, \-\-c COLLECTION -collection to use (some commands) -.TP -.B \-\-dbpath PATH -directly access mongod data files in this path, -instead of connecting to a mongod instance -.TP -.B \-v, \-\-verbose -be more verbose (include multiple times for more -verbosity e.g. \fB\-vvvvv\fR) -.TP -.B \-f, \-\-fields NAMES -comma seperated list of field names e.g. \fB\-f\fR name,age -.TP -.B \-\-fieldFile FILE -file with fields names \- 1 per line -.TP -.B \-\-ignoreBlanks -if given, empty fields in csv and tsv will be ignored -.TP -.B \-\-type TYPE -type of file to import. default: json (json,csv,tsv) -.TP -.B \-\-file FILE -file to import from; if not specified stdin is used -.TP -.B \-\-drop -drop collection first -.TP -.B \-\-headerline -CSV,TSV only \- use first line as headers -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. -.SH "AUTHOR" -Kristina Chodorow diff --git a/debian/mongorestore.1 b/debian/mongorestore.1 deleted file mode 100644 index 5f207b0..0000000 --- a/debian/mongorestore.1 +++ /dev/null @@ -1,36 +0,0 @@ -.\" Documentation for the MongoDB dump tool -.TH MONGORESTORE "1" "June 2009" "10gen" "Mongo Database" -.SH "NAME" -mongorestore \- the Mongo restoration tool -.SH "SYNOPSIS" -\fBmongorestore [\fIOPTIONS\fR]\fR -.SH "DESCRIPTION" -.PP -\fBmongorestore\fR -is a tool to use the output from mongodump to restore a database. -.SH "OPTIONS" -.TP -.B \-\-help -show usage information -.TP -.B \-h, \-\-host HOST -server to connect to (default HOST=localhost) -.TP -.B \-d, \-\-db DATABASE -database to use -.TP -.B \-c, \-\-c COLLECTION -collection to use -.TP -.B \-\-dir PATH -directory from which to restore -.TP -.B \-\-dbpath PATH -directly access mongod data files in this path, instead of connecting to a mongod instance -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. 
-.SH "AUTHOR" -Kristina Chodorow diff --git a/debian/mongos.1 b/debian/mongos.1 deleted file mode 100644 index 74d01c6..0000000 --- a/debian/mongos.1 +++ /dev/null @@ -1,39 +0,0 @@ -.\" Documentation for the MongoDB dump tool -.TH MONGOS "1" "June 2009" "10gen" "Mongo Database" -.SH "NAME" -mongos \- the Mongo sharding server -.SH "SYNOPSIS" -\fBmongos [\fIOPTIONS\fR]\fR -.SH "DESCRIPTION" -.PP -\fBmongos\fR -is used to setup, configure, and get information about sharded databases. -.SH "EXAMPLES" -.PP -.B ./mongod --port 9999 --dbpath /data/db/a # first server -.PP -.B ./mongod --port 9998 --dbpath /data/db/b # second server -.PP -.B ./mongos --configdb localhost:9999 # mongos -.PP -starts three servers to set up sharding -.SH "OPTIONS" -.TP -.B \-\-help -show usage information -.TP -.B \-\-port N -port on which to listen -.TP -.B \-\-configdb DATABASE+ -one or more databases to use as the configuration databases -.TP -.B \-v+ -verbosity -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. -.SH "AUTHOR" -Kristina Chodorow diff --git a/debian/mongosniff.1 b/debian/mongosniff.1 deleted file mode 100644 index b6f1063..0000000 --- a/debian/mongosniff.1 +++ /dev/null @@ -1,30 +0,0 @@ -.TH MONGOSNIFF "1" "Jan 2010" "10gen" "Mongo Database" -.SH "NAME" -mongosniff \- the Mongo packet analyzer -.SH "SYNOPSIS" -\fBmongosniff [\fIOPTIONS\fR] [\fI ...\fR] -.SH "DESCRIPTION" -.PP -\fBmongosniff\fR -is a analyzer tool for analyzing packets coming to your database. -.PP -.SH "OPTIONS" -.TP -.B \-\-forward -Forward all parsed request messages to mongod instance at specified host:port -.TP -.B \-\-source -Source of traffic to sniff, either a network interface or a file containing previously captured packets, in pcap format. If no source is specified, mongosniff will attempt to sniff from one of the machine's network interfaces. -.TP -.B \-\-help -print a short help message. -.TP -.B -These parameters are used to filter sniffing. By default, only port 27017 is sniffed. -.SH "COPYRIGHT" -.PP -Copyright 2007\-2009 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. -.SH "AUTHOR" -Antonin Kral diff --git a/debian/mongostat.1 b/debian/mongostat.1 deleted file mode 100644 index 5828104..0000000 --- a/debian/mongostat.1 +++ /dev/null @@ -1,39 +0,0 @@ -.\" Documentation for the MongoDB shell -.TH MONGOSTAT "15" "March 2010" "10gen" "Mongo Database" -.SH "NAME" -mongostat \- view statistics on a running mongod instance -.SH "SYNOPSIS" -\fBmongostat [\fIOPTIONS\fR] -.SH "DESCRIPTION" -.PP -\fBmongostat\fR -prints statistics on a running mongod instance. -.SH "OPTIONS" -.TP -.B \-\-help -show usage information -.TP -.B \-h, \-\-host HOST -mongo host to connect to (use "left,right" for pairs) -\" .TP -\" .B \-\-port PORT -\" port to connect to (default PORT=27017) -.TP -.B \-d, \-\-db ARG -db to use -.TP -.B \-c, \-\-collection ARG -collection to use (some commands) -.TP -.B \-u, \-\-username USERNAME -specify user to log in as -.TP -.B \-p, \-\-password PASSWORD -specify password of user (notice there is no space) -.SH "COPYRIGHT" -.PP -Copyright 2010 10gen -.SH "SEE ALSO" -For more information, please refer to the MongoDB wiki, available at http://www.mongodb.org. 
-.SH "AUTHOR" -Eliot Horowitz diff --git a/debian/postinst b/debian/postinst deleted file mode 100644 index 4d0e786..0000000 --- a/debian/postinst +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/sh -# postinst script for mongodb -# -# see: dh_installdeb(1) - -set -e - -# summary of how this script can be called: -# * `configure' -# * `abort-upgrade' -# * `abort-remove' `in-favour' -# -# * `abort-remove' -# * `abort-deconfigure' `in-favour' -# `removing' -# -# for details, see http://www.debian.org/doc/debian-policy/ or -# the debian-policy package - - -case "$1" in - configure) - # create a mongodb group and user - if ! grep -q mongodb /etc/passwd; then - adduser --system --no-create-home mongodb - addgroup --system mongodb - adduser mongodb mongodb - fi - - # create db -- note: this should agree with dbpath in mongodb.conf - mkdir -p /var/lib/mongodb - chown -R mongodb:mongodb /var/lib/mongodb - - # create logdir -- note: this should agree with logpath in mongodb.conf - mkdir -p /var/log/mongodb - chown -R mongodb:mongodb /var/log/mongodb - ;; - - abort-upgrade|abort-remove|abort-deconfigure) - ;; - - *) - echo "postinst called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - -# dh_installdeb will replace this with shell code automatically -# generated by other debhelper scripts. - -#DEBHELPER# - -exit 0 - - diff --git a/debian/postrm b/debian/postrm deleted file mode 100644 index 4bbb708..0000000 --- a/debian/postrm +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/sh -# postrm script for mongodb -# -# see: dh_installdeb(1) - -set -e - -# summary of how this script can be called: -# * `remove' -# * `purge' -# * `upgrade' -# * `failed-upgrade' -# * `abort-install' -# * `abort-install' -# * `abort-upgrade' -# * `disappear' -# -# for details, see http://www.debian.org/doc/debian-policy/ or -# the debian-policy package - - -case "$1" in - purge|remove|upgrade|failed-upgrade|abort-install|abort-upgrade|disappear) - ;; - - *) - echo "postrm called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - -# dh_installdeb will replace this with shell code automatically -# generated by other debhelper scripts. - -#DEBHELPER# - -exit 0 - - diff --git a/debian/prerm b/debian/prerm deleted file mode 100644 index 9507ade..0000000 --- a/debian/prerm +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/sh -# prerm script for mongodb -# -# see: dh_installdeb(1) - -set -e - -# summary of how this script can be called: -# * `remove' -# * `upgrade' -# * `failed-upgrade' -# * `remove' `in-favour' -# * `deconfigure' `in-favour' -# `removing' -# -# for details, see http://www.debian.org/doc/debian-policy/ or -# the debian-policy package - -echo "arg: $1" - -case "$1" in - remove|upgrade|deconfigure) - ;; - - failed-upgrade) - ;; - - *) - echo "prerm called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - -# dh_installdeb will replace this with shell code automatically -# generated by other debhelper scripts. - -#DEBHELPER# - -exit 0 - - diff --git a/debian/rules b/debian/rules deleted file mode 100644 index 2afdfdb..0000000 --- a/debian/rules +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/make -f -# -*- makefile -*- -# Sample debian/rules that uses debhelper. -# This file was originally written by Joey Hess and Craig Small. -# As a special exception, when this file is copied by dh-make into a -# dh-make output file, you may use that output file without restriction. -# This special exception was added by Craig Small in version 0.37 of dh-make. - -# Uncomment this to turn on verbose mode. 
-#export DH_VERBOSE=1 - - -configure: configure-stamp -configure-stamp: - dh_testdir - # Add here commands to configure the package. - - touch configure-stamp - - -build: build-stamp - -build-stamp: configure-stamp - dh_testdir - - # Add here commands to compile the package. - scons - #docbook-to-man debian/mongodb.sgml > mongodb.1 - ls debian/*.1 > debian/mongodb.manpages - - touch $@ - -clean: - dh_testdir - dh_testroot - rm -f build-stamp configure-stamp - - # FIXME: scons freaks out at the presence of target files - # under debian/mongodb. - #scons -c - rm -rf $(CURDIR)/debian/mongodb - rm -f config.log - rm -f mongo - rm -f mongod - rm -f mongoimportjson - rm -f mongoexport - rm -f mongorestore - rm -f mongodump - rm -f mongofiles - rm -f .sconsign.dblite - rm -f libmongoclient.a - rm -rf client/*.o - rm -rf tools/*.o - rm -rf shell/*.o - rm -rf .sconf_temp - rm -f buildscripts/*.pyc - rm -f *.pyc - rm -f buildinfo.cpp - dh_clean debian/files - -install: build - dh_testdir - dh_testroot - dh_prep - dh_installdirs - - scons --prefix=$(CURDIR)/debian/mongodb/usr install - mkdir -p $(CURDIR)/debian/mongodb/etc - cp $(CURDIR)/debian/mongodb.conf $(CURDIR)/debian/mongodb/etc/mongodb.conf - - mkdir -p $(CURDIR)/debian/mongodb/usr/share/lintian/overrides/ - install -m 644 $(CURDIR)/debian/lintian-overrides \ - $(CURDIR)/debian/mongodb/usr/share/lintian/overrides/mongodb - -# Build architecture-independent files here. -binary-indep: build install -# We have nothing to do by default. - -# Build architecture-dependent files here. -binary-arch: build install - dh_testdir - dh_testroot - dh_installchangelogs - dh_installdocs - dh_installexamples -# dh_install -# dh_installmenu -# dh_installdebconf -# dh_installlogrotate -# dh_installemacsen -# dh_installpam -# dh_installmime - dh_installinit -# dh_installinfo - dh_installman - dh_link - dh_strip - dh_compress - dh_fixperms - dh_installdeb - dh_shlibdeps - dh_gencontrol - dh_md5sums - dh_builddeb - -binary: binary-indep binary-arch -.PHONY: build clean binary-indep binary-arch binary install configure diff --git a/debian/watch b/debian/watch deleted file mode 100644 index 08ce42b..0000000 --- a/debian/watch +++ /dev/null @@ -1,10 +0,0 @@ -# Example watch control file for uscan -# Rename this file to "watch" and then you can run the "uscan" command -# to check for upstream updates and more. 
-# See uscan(1) for format - -# Compulsory line, this is a version 3 file -version=3 - -# examine a Webserver directory -http://downloads.mongodb.org/linux/mongodb-linux-(.*)\.tar\.gz diff --git a/distsrc/client/SConstruct b/distsrc/client/SConstruct index 8a8bae9..a97699e 100644 --- a/distsrc/client/SConstruct +++ b/distsrc/client/SConstruct @@ -1,6 +1,7 @@ import os +# options AddOption( "--extrapath", dest="extrapath", type="string", @@ -8,6 +9,15 @@ AddOption( "--extrapath", action="store", help="comma separated list of add'l paths (--extrapath /opt/foo/,/foo) static linking" ) +AddOption( "--prefix", + dest="prefix", + type="string", + nargs=1, + action="store", + default="/usr/local", + help="installation root" ) + + env = Environment() def addExtraLibs( s ): @@ -36,6 +46,7 @@ elif "linux2" == os.sys.platform: if nix: env.Append( CPPFLAGS=" -O3" ) + env.Append( LIBS=["pthread"] ) if linux: env.Append( LINKFLAGS=" -Wl,--as-needed -Wl,-zdefs " ) @@ -44,18 +55,37 @@ conf = Configure(env) for lib in boostLibs: if not conf.CheckLib("boost_%s-mt" % lib): conf.CheckLib("boost_%s" % lib) + +dirs = [ "" , "bson/" , "bson/util/" , + "client/" , "s/" , "shell/" , + "db/" , + "scripting/" , + "util/" , "util/concurrency/" , "util/mongoutils/" ] + allClientFiles = [] -allClientFiles += Glob( "mongo/*.cpp" ) -allClientFiles += Glob( "mongo/client/*.cpp" ) -allClientFiles += Glob( "mongo/s/*.cpp" ) -allClientFiles += Glob( "mongo/shell/*.cpp" ) -allClientFiles += Glob( "mongo/db/*.cpp" ) -allClientFiles += Glob( "mongo/scripting/*.cpp" ) -allClientFiles += Glob( "mongo/util/*.cpp" ) +for x in dirs: + allClientFiles += Glob( "mongo/" + x + "*.cpp" ) allClientFiles += Glob( "mongo/util/*.c" ) -env.SharedLibrary( "mongoclient" , allClientFiles ) -env.Library( "mongoclient" , allClientFiles ) +libs = [] +libs += env.SharedLibrary( "mongoclient" , allClientFiles ) +libs += env.Library( "mongoclient" , allClientFiles ) + +# install + +prefix = GetOption( "prefix" ) + +for x in libs: + env.Install( prefix + "/lib/" , str(x) ) + +for x in dirs: + x = "mongo/" + x + env.Install( prefix + "/include/" + x , Glob( x + "*.h" ) ) + +env.Alias( "install" , prefix ) + + +# example setup clientTests = [] clientEnv = env.Clone(); diff --git a/doxygenConfig b/doxygenConfig index 9d4bbfb..fcf10e7 100644 --- a/doxygenConfig +++ b/doxygenConfig @@ -3,7 +3,7 @@ #--------------------------------------------------------------------------- DOXYFILE_ENCODING = UTF-8 PROJECT_NAME = MongoDB -PROJECT_NUMBER = 1.6.6-pre- +PROJECT_NUMBER = 1.8.0 OUTPUT_DIRECTORY = docs/doxygen CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English @@ -101,7 +101,7 @@ WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- -INPUT = client db/jsobj.h db/json.h bson +INPUT = client db/jsobj.h db/json.h bson util INPUT_ENCODING = UTF-8 FILE_PATTERNS = *.c \ *.cc \ diff --git a/jstests/_tst.js b/jstests/_tst.js new file mode 100644 index 0000000..f208164 --- /dev/null +++ b/jstests/_tst.js @@ -0,0 +1,41 @@ +/* a general testing framework (helpers) for us in the jstests/ + + to use, from your test file: + testname="mytestname"; + load("jstests/_tst.js"); +*/ + +if( typeof tst == "undefined" ) { + tst = {} + + tst.log = function (optional_msg) { + print("\n\nstep " + ++this._step + " " + (optional_msg || "")); + } + + tst.success = function () { + print(testname + " SUCCESS"); + } + + /* diff files a 
and b, returning the difference (empty str if no difference) */ + tst.diff = function(a, b) { + function reSlash(s) { + var x = s; + if (_isWindows()) { + while (1) { + var y = x.replace('/', '\\'); + if (y == x) + break; + x = y; + } + } + return x; + } + a = reSlash(a); + b = reSlash(b); + print("diff " + a + " " + b); + return run("diff", a, b); + } +} + +print(testname + " BEGIN"); +tst._step = 0; diff --git a/jstests/apitest_db.js b/jstests/apitest_db.js index f54879c..c734d67 100644 --- a/jstests/apitest_db.js +++ b/jstests/apitest_db.js @@ -70,3 +70,8 @@ assert( asserted, "should have asserted" ); dd( "g" ); + + +assert.eq( "foo" , db.getSisterDB( "foo" ).getName() ) +assert.eq( "foo" , db.getSiblingDB( "foo" ).getName() ) + diff --git a/jstests/array4.js b/jstests/array4.js new file mode 100644 index 0000000..1053e16 --- /dev/null +++ b/jstests/array4.js @@ -0,0 +1,30 @@ + +t = db.array4; +t.drop(); + +t.insert({"a": ["1", "2", "3"]}); +t.insert({"a" : ["2", "1"]}); + +var x = {'a.0' : /1/}; + +assert.eq(t.count(x), 1); + +assert.eq(t.findOne(x).a[0], 1); +assert.eq(t.findOne(x).a[1], 2); + +t.drop(); + +t.insert({"a" : {"0" : "1"}}); +t.insert({"a" : ["2", "1"]}); + +assert.eq(t.count(x), 1); +assert.eq(t.findOne(x).a[0], 1); + +t.drop(); + +t.insert({"a" : ["0", "1", "2", "3", "4", "5", "6", "1", "1", "1", "2", "3", "2", "1"]}); +t.insert({"a" : ["2", "1"]}); + +x = {"a.12" : /2/}; +assert.eq(t.count(x), 1); +assert.eq(t.findOne(x).a[0], 0); diff --git a/jstests/arrayfind3.js b/jstests/arrayfind3.js new file mode 100644 index 0000000..60da713 --- /dev/null +++ b/jstests/arrayfind3.js @@ -0,0 +1,21 @@ + +t = db.arrayfind3; +t.drop() + +t.save({a:[1,2]}) +t.save({a:[1, 2, 6]}) +t.save({a:[1, 4, 6]}) + + +assert.eq( 2 , t.find( {a:{$gte:3, $lte: 5}} ).itcount() , "A1" ) +assert.eq( 1 , t.find( {a:{$elemMatch:{$gte:3, $lte: 5}}} ).itcount() , "A2" ) + +t.ensureIndex( { a : 1 } ) + +printjson( t.find( {a:{$gte:3, $lte: 5}} ).explain() ); + +//assert.eq( 2 , t.find( {a:{$gte:3, $lte: 5}} ).itcount() , "B1" ); // SERVER-1264 +assert.eq( 1 , t.find( {a:{$elemMatch:{$gte:3, $lte: 5}}} ).itcount() , "B2" ) + + + diff --git a/jstests/auth/auth1.js b/jstests/auth/auth1.js index 6fc6dc5..2f2a1b4 100644 --- a/jstests/auth/auth1.js +++ b/jstests/auth/auth1.js @@ -68,6 +68,6 @@ if ( db.runCommand( "features" ).readlock ){ initial: { count: 0 } }; - assert.throws( function() { return t.group( p ) }, "write reduce didn't fail" ); + assert.throws( function() { return t.group( p ) }, null , "write reduce didn't fail" ); } diff --git a/jstests/basic3.js b/jstests/basic3.js index 2deee2b..4488865 100644 --- a/jstests/basic3.js +++ b/jstests/basic3.js @@ -3,14 +3,13 @@ t = db.getCollection( "foo_basic3" ); t.find( { "a.b" : 1 } ).toArray(); -ok = false; +ok = true; try{ t.save( { "a.b" : 5 } ); ok = false; } catch ( e ){ - ok = true; } assert( ok , ". in names aren't allowed doesn't work" ); @@ -19,6 +18,33 @@ try{ ok = false; } catch ( e ){ - ok = true; } assert( ok , ". in embedded names aren't allowed doesn't work" ); + +// following tests make sure update keys are checked +t.save({"a": 0,"b": 1}) +try { + t.update({"a": 0}, {"b.b": 1}); + ok = false; +} catch (e) {} +assert( ok , "must deny '.' in key of update" ); + +// upsert with embedded doc +try { + t.update({"a": 10}, {"b": { "c.c" : 1 }}, true); + ok = false; +} catch (e) {} +assert( ok , "must deny '.' 
in key of update" ); + +// if it is a modifier, it should still go through +t.update({"a": 0}, {$set: { "c.c": 1}}) +t.update({"a": 0}, {$inc: { "c.c": 1}}) + +// edge cases +try { + t.update({"a": 0}, {"": { "c.c": 1}}) + ok = false; +} catch (e) {} +assert( ok , "must deny '.' in key of update" ); +t.update({"a": 0}, {}) + diff --git a/jstests/big_object1.js b/jstests/big_object1.js new file mode 100644 index 0000000..be841e0 --- /dev/null +++ b/jstests/big_object1.js @@ -0,0 +1,46 @@ + +t = db.big_object1 +t.drop(); + +if ( db.adminCommand( "buildinfo" ).bits == 64 ){ + + s = "" + while ( s.length < 850 * 1024 ){ + s += "x"; + } + + x = 0; + while ( true ){ + n = { _id : x , a : [] } + for ( i=0; i<14+x; i++ ) + n.a.push( s ) + try { + t.insert( n ) + o = n + } + catch ( e ){ + break; + } + + if ( db.getLastError() != null ) + break; + x++; + } + + printjson( t.stats(1024*1024) ) + + assert.lt( 15 * 1024 * 1024 , Object.bsonsize( o ) , "A1" ) + assert.gt( 17 * 1024 * 1024 , Object.bsonsize( o ) , "A2" ) + + assert.eq( x , t.count() , "A3" ) + + for ( i=0; i min ) { + // 'n' is the number of documents to remove - we must account for the + // possibility that 'inc' will be true, and avoid removing all documents + // from the collection in that case, as removing all documents is not + // allowed by 'captrunc' var n = Random.randInt( count - min - 1 ); // 0 <= x <= count - min - 1 var inc = Random.rand() > 0.5; debug( count + " " + n + " " + inc ); @@ -58,10 +78,13 @@ function doTest() { } count -= n; max -= n; + // Validate the remaining documents. checkOrder( max - 1 ); } } +// Repeatedly add up to 'oldMax' documents and then truncate the newest +// documents. Newer documents take up more space than older documents. for( var i = 0; i < 10; ++i ) { doTest(); } @@ -77,6 +100,8 @@ db.capped6.drop(); db._dbCommand( { create: "capped6", capped: true, size: 1000, $nExtents: 11, autoIndexId: false } ); tzz = db.capped6; +// Same test as above, but now the newer documents take less space than the +// older documents instead of more. for( var i = 0; i < 10; ++i ) { doTest(); } diff --git a/jstests/capped7.js b/jstests/capped7.js index ecb689e..693828d 100644 --- a/jstests/capped7.js +++ b/jstests/capped7.js @@ -1,3 +1,5 @@ +// Test NamespaceDetails::emptyCappedCollection via 'emptycapped' command + Random.setRandomSeed(); db.capped7.drop(); @@ -8,6 +10,10 @@ var ten = new Array( 11 ).toString().replace( /,/g, "-" ); count = 0; +/** + * Insert new documents until the capped collection loops and the document + * count doesn't increase on insert. + */ function insertUntilFull() { count = tzz.count(); var j = 0; @@ -23,21 +29,27 @@ while( 1 ) { insertUntilFull(); +// oldCount == count before empty oldCount = count; assert.eq.automsg( "11", "tzz.stats().numExtents" ); + +// oldSize == size before empty var oldSize = tzz.stats().storageSize; assert.commandWorked( db._dbCommand( { emptycapped: "capped7" } ) ); +// check that collection storage parameters are the same after empty assert.eq.automsg( "11", "tzz.stats().numExtents" ); assert.eq.automsg( "oldSize", "tzz.stats().storageSize" ); +// check that the collection is empty after empty assert.eq.automsg( "0", "tzz.find().itcount()" ); assert.eq.automsg( "0", "tzz.count()" ); +// check that we can reuse the empty collection, inserting as many documents +// as we were able to the first time through. 
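// Aside: a minimal, self-contained sketch of the emptycapped pattern this file
// exercises, using a hypothetical collection name ("capped7_sketch") rather than
// the test's own collection: fill a small capped collection, empty it with the
// test-only 'emptycapped' command, and confirm the documents are gone while the
// storage size is unchanged.
var sketch = db.capped7_sketch;
sketch.drop();
db._dbCommand( { create: "capped7_sketch", capped: true, size: 1000 } );
for ( var s = 0; s < 100; ++s ) sketch.save( { i: s } );
var sketchSize = sketch.stats().storageSize;
assert.commandWorked( db._dbCommand( { emptycapped: "capped7_sketch" } ) );
assert.eq( 0, sketch.count(), "sketch: empty after emptycapped" );
assert.eq( sketchSize, sketch.stats().storageSize, "sketch: storageSize unchanged" );
sketch.drop();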
insertUntilFull(); - assert.eq.automsg( "oldCount", "count" ); assert.eq.automsg( "oldCount", "tzz.find().itcount()" ); assert.eq.automsg( "oldCount", "tzz.count()" ); @@ -47,12 +59,16 @@ var oldSize = tzz.stats().storageSize; assert.commandWorked( db._dbCommand( { emptycapped: "capped7" } ) ); +// check that the collection storage parameters are unchanged after another empty assert.eq.automsg( "11", "tzz.stats().numExtents" ); assert.eq.automsg( "oldSize", "tzz.stats().storageSize" ); +// insert an arbitrary number of documents var total = Random.randInt( 2000 ); for( var j = 1; j <= total; ++j ) { tzz.save( {i:ten,j:j} ); + // occasionally check that only the oldest documents are removed to make room + // for the newest documents if ( Random.rand() > 0.95 ) { assert.automsg( "j >= tzz.count()" ); assert.eq.automsg( "tzz.count()", "tzz.find().itcount()" ); @@ -62,6 +78,7 @@ for( var j = 1; j <= total; ++j ) { while( c.hasNext() ) { assert.eq.automsg( "c.next().j", "k--" ); } + // check the same thing with a reverse iterator as well var c = tzz.find().sort( {$natural:1} ); assert.automsg( "c.hasNext()" ); while( c.hasNext() ) { diff --git a/jstests/capped8.js b/jstests/capped8.js new file mode 100644 index 0000000..cce0eec --- /dev/null +++ b/jstests/capped8.js @@ -0,0 +1,86 @@ +// Test NamespaceDetails::cappedTruncateAfter with empty extents + +Random.setRandomSeed(); + +t = db.jstests_capped8; + +function debug( x ) { +// printjson( x ); +} + +/** Generate an object with a string field of specified length */ +function obj( size ) { + return {a:new Array( size + 1 ).toString()};; +} + +function withinOne( a, b ) { + assert( Math.abs( a - b ) <= 1, "not within one: " + a + ", " + b ) +} + +/** + * Insert enough documents of the given size spec that the collection will + * contain only documents having this size spec. + */ +function insertMany( size ) { + // Add some variability, as the precise number can trigger different cases. + n = 250 + Random.randInt( 10 ); + for( i = 0; i < n; ++i ) { + t.save( obj( size ) ); + debug( t.count() ); + } +} + +/** + * Insert some documents in such a way that there may be an empty extent, then + * truncate the capped collection. + */ +function insertAndTruncate( first ) { + myInitialCount = t.count(); + // Insert enough documents to make the capped allocation loop over. + insertMany( 50 ); + myFiftyCount = t.count(); + // Insert documents that are too big to fit in the smaller extents. + insertMany( 2000 ); + myTwokCount = t.count(); + if ( first ) { + initialCount = myInitialCount; + fiftyCount = myFiftyCount; + twokCount = myTwokCount; + // Sanity checks for collection count + assert( fiftyCount > initialCount ); + assert( fiftyCount > twokCount ); + } else { + // Check that we are able to insert roughly the same number of documents + // after truncating. The exact values are slightly variable as a result + // of the capped allocation algorithm. + withinOne( initialCount, myInitialCount ); + withinOne( fiftyCount, myFiftyCount ); + withinOne( twokCount, myTwokCount ); + } + count = t.count(); + // Check that we can truncate the collection successfully. 
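    // The command below removes the 'count - 1' newest documents via the test-only
    // 'captrunc' command, leaving a single document behind; 'inc:false' is passed
    // because (per the capped6.js comments above) captrunc may not remove every
    // document in the collection.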
+ assert.commandWorked( db.runCommand( { captrunc:"jstests_capped8", n:count - 1, inc:false } ) ); +} + +/** Test truncating and subsequent inserts */ +function testTruncate() { + insertAndTruncate( true ); + insertAndTruncate( false ); + insertAndTruncate( false ); +} + +t.drop(); +db._dbCommand( { create:"jstests_capped8", capped: true, $nExtents: [ 10000, 10000, 1000 ] } ); +testTruncate(); + +t.drop(); +db._dbCommand( { create:"jstests_capped8", capped: true, $nExtents: [ 10000, 1000, 1000 ] } ); +testTruncate(); + +t.drop(); +db._dbCommand( { create:"jstests_capped8", capped: true, $nExtents: [ 10000, 1000 ] } ); +testTruncate(); + +t.drop(); +db._dbCommand( { create:"jstests_capped8", capped: true, $nExtents: [ 10000 ] } ); +testTruncate(); diff --git a/jstests/check_shard_index.js b/jstests/check_shard_index.js new file mode 100644 index 0000000..a5a1fc1 --- /dev/null +++ b/jstests/check_shard_index.js @@ -0,0 +1,45 @@ +// ------------------------- +// CHECKSHARDINGINDEX TEST UTILS +// ------------------------- + +f = db.jstests_shardingindex; +f.drop(); + + +// ------------------------- +// Case 1: all entries filled or empty should make a valid index +// + +f.drop(); +f.ensureIndex( { x: 1 , y: 1 } ); +assert.eq( 0 , f.count() , "1. initial count should be zero" ); + +res = db.runCommand( { checkShardingIndex: "test.jstests_shardingindex" , keyPattern: {x:1, y:1} , force: true }); +assert.eq( true , res.ok, "1a" ); + +f.save( { x: 1 , y : 1 } ); +assert.eq( 1 , f.count() , "1. count after initial insert should be 1" ); +res = db.runCommand( { checkShardingIndex: "test.jstests_shardingindex" , keyPattern: {x:1, y:1} , force: true }); +assert.eq( true , res.ok , "1b" ); + + +// ------------------------- +// Case 2: entry with null values would make an index unsuitable +// + +f.drop(); +f.ensureIndex( { x: 1 , y: 1 } ); +assert.eq( 0 , f.count() , "2. initial count should be zero" ); + +f.save( { x: 1 , y : 1 } ); +f.save( { x: null , y : 1 } ); + +res = db.runCommand( { checkShardingIndex: "test.jstests_shardingindex" , keyPattern: {x:1, y:1} , force: true }); +assert.eq( true , res.ok , "2a " + tojson(res) ); + +f.save( { y: 2 } ); +assert.eq( 3 , f.count() , "2. count after initial insert should be 3" ); +res = db.runCommand( { checkShardingIndex: "test.jstests_shardingindex" , keyPattern: {x:1, y:1} , force: true }); +assert.eq( false , res.ok , "2b " + tojson(res) ); + +print("PASSED"); diff --git a/jstests/conc_update.js b/jstests/conc_update.js deleted file mode 100644 index ac70861..0000000 --- a/jstests/conc_update.js +++ /dev/null @@ -1,45 +0,0 @@ -// db = db.getSisterDB("concurrency") -// db.dropDatabase(); -// -// NRECORDS=10*1024*1024 // this needs to be relatively big so that -// // the update() will take a while. -// -// print("loading data (will take a while; progress msg every 1024*1024 documents)") -// for (i=0; i<(10*1024*1024); i++) { -// db.conc.insert({x:i}) -// if ((i%(1024*1024))==0) -// print("loaded " + i/(1024*1024) + " mibi-records") -// } -// -// print("making an index (will take a while)") -// db.conc.ensureIndex({x:1}) -// -// var c1=db.conc.count({x:{$lt:NRECORDS}}) -// // this is just a flag that the child will toggle when it's done. 
-// db.concflag.update({}, {inprog:true}, true) -// -// updater=startParallelShell("db=db.getSisterDB('concurrency');\ -// db.conc.update({}, {$inc:{x: "+NRECORDS+"}}, false, true);\ -// print(db.getLastError());\ -// db.concflag.update({},{inprog:false})"); -// -// querycount=0; -// decrements=0; -// misses=0 -// while (1) { -// if (db.concflag.findOne().inprog) { -// c2=db.conc.count({x:{$lt:10*1024*1024}}) -// print(c2) -// querycount++; -// if (c2 0; } ).sort( { _id : -1 } ).limit(n).itcount() - end = new Date() + var start = null; + var ex = null; + var num = null; + var end = null; + try { + start = new Date() + ex = t.find(function () { num = 2; for (var x = 0; x < 1000; x++) num += 2; return num > 0; }).sort({ _id: -1 }).explain() + num = ex.n + end = new Date() + } + catch (e) { + print("cursora.js FAIL " + e); + join(); + throw e; + } + join() - print( "num: " + num + " time:" + ( end.getTime() - start.getTime() ) ) - assert.eq( 0 , t.count() , "after remove" ) + //print( "cursora.js num: " + num + " time:" + ( end.getTime() - start.getTime() ) ) + assert.eq( 0 , t.count() , "after remove: " + tojson( ex ) ) + // assert.lt( 0 , ex.nYields , "not enough yields : " + tojson( ex ) ); // TODO make this more reliable so cen re-enable assert if ( n == num ) - print( "warning: shouldn't have counted all n: " + n + " num: " + num ); + print( "cursora.js warning: shouldn't have counted all n: " + n + " num: " + num ); } run( 1500 ) run( 5000 ) - run( 1500 , true ) run( 5000 , true ) - - +print("cursora.js SUCCESS") diff --git a/jstests/datasize3.js b/jstests/datasize3.js index d45f34b..df79e6d 100644 --- a/jstests/datasize3.js +++ b/jstests/datasize3.js @@ -22,10 +22,12 @@ t.ensureIndex( { x : 1 } ) for ( i=2; i<100; i++ ) t.insert( { x : i } ) -a = run( { min : { x : 20 } , max : { x : 50 } } ) -b = run( { min : { x : 20 } , max : { x : 50 } , estimate : true } ) +a = run( { min : { x : 20 } , max : { x : 50 } } ).size +b = run( { min : { x : 20 } , max : { x : 50 } , estimate : true } ).size -assert.eq( a.size , b.size ); +ratio = Math.min( a , b ) / Math.max( a , b ); + +assert.lt( 0.97 , ratio , "sizes not equal a: " + a + " b: " + b ); diff --git a/jstests/dbcase.js b/jstests/dbcase.js index d76b739..21854d8 100644 --- a/jstests/dbcase.js +++ b/jstests/dbcase.js @@ -1,4 +1,6 @@ +/* +TODO SERVER-2111 a = db.getSisterDB( "dbcasetest_dbnamea" ) b = db.getSisterDB( "dbcasetest_dbnameA" ) @@ -19,5 +21,5 @@ a.dropDatabase(); b.dropDatabase(); print( db.getMongo().getDBNames() ) - +*/ diff --git a/jstests/disk/directoryperdb.js b/jstests/disk/directoryperdb.js index 90a1f03..3b65bd0 100644 --- a/jstests/disk/directoryperdb.js +++ b/jstests/disk/directoryperdb.js @@ -9,7 +9,7 @@ db[ baseName ].save( {} ); assert.eq( 1, db[ baseName ].count() , "A : " + tojson( db[baseName].find().toArray() ) ); checkDir = function( dir ) { - db.runCommand( {fsync:1} ); + db.adminCommand( {fsync:1} ); files = listFiles( dir ); found = false; for( f in files ) { @@ -60,3 +60,5 @@ assert( m.getDBs().totalSize > 0, "bad size calc" ); db.dropDatabase(); files = listFiles( dbpath ); files.forEach( function( f ) { assert( !new RegExp( baseName ).test( f.name ), "drop database - dir not cleared" ); } ); + +print("SUCCESS directoryperdb.js"); diff --git a/jstests/disk/diskfull.js b/jstests/disk/diskfull.js index 6cbcbb7..26b707d 100644 --- a/jstests/disk/diskfull.js +++ b/jstests/disk/diskfull.js @@ -1,19 +1,25 @@ doIt = false; +dbpath = "/data/db/diskfulltest"; + files = listFiles( "/data/db" ); for ( i in files ) { 
- if ( files[ i ].name == "/data/db/diskfulltest" ) { + if ( files[ i ].name == dbpath ) { doIt = true; } } if ( !doIt ) { - print( "path /data/db/diskfulltest/ missing, skipping diskfull test" ); + print( "path " + dbpath + " missing, skipping diskfull test" ); doIt = false; } if ( doIt ) { + // Clear dbpath without removing and recreating diskfulltest directory, as resetDbpath does + files = listFiles( dbpath ); + files.forEach( function( x ) { removeFile( x.name ) } ); + port = allocatePorts( 1 )[ 0 ]; - m = startMongoProgram( "mongod", "--port", port, "--dbpath", "/data/db/diskfulltest", "--nohttpinterface", "--bind_ip", "127.0.0.1" ); + m = startMongoProgram( "mongod", "--port", port, "--dbpath", dbpath, "--nohttpinterface", "--bind_ip", "127.0.0.1" ); c = m.getDB( "diskfulltest" ).getCollection( "diskfulltest" ) c.save( { a: 6 } ); assert.soon( function() { return rawMongoProgramOutput().match( /file allocation failure/ ); }, "didn't see 'file allocation failure'" ); diff --git a/jstests/disk/killall.js b/jstests/disk/killall.js new file mode 100644 index 0000000..a1487bb --- /dev/null +++ b/jstests/disk/killall.js @@ -0,0 +1,42 @@ +// running ops should be killed +// dropped collection should be ok after restart + +if ( typeof _threadInject == "undefined" ) { // don't run in v8 mode - SERVER-2076 + +port = allocatePorts( 1 )[ 0 ] + +var baseName = "jstests_disk_killall"; + +var m = startMongod( "--port", port, "--dbpath", "/data/db/" + baseName, "--nohttpinterface" ); + +m.getDB( "test" ).getCollection( baseName ).save( {} ); +m.getDB( "test" ).getLastError(); + +s1 = startParallelShell( "db." + baseName + ".count( { $where: function() { while( 1 ) { ; } } } )", port ); +sleep( 1000 ); + +s2 = startParallelShell( "db." + baseName + ".drop()", port ); +sleep( 1000 ); + +/** + * 12 == mongod's exit code on interrupt (eg standard kill) + * stopMongod sends a standard kill signal to mongod, then waits for mongod to stop. If mongod doesn't stop + * in a reasonable amount of time, stopMongod sends kill -9 and in that case will not return 12. We're checking + * in this assert that mongod will stop quickly even while evaling an infinite loop in server side js. 
+ * + * 14 is sometimes returned instead due to SERVER-2184 + */ +exitCode = stopMongod( port ); +assert( exitCode == 12 || exitCode == 14, "got unexpected exitCode: " + exitCode ); + +s1(); +s2(); + +var m = startMongoProgram( "mongod", "--port", port, "--dbpath", "/data/db/" + baseName ); + +m.getDB( "test" ).getCollection( baseName ).stats(); +m.getDB( "test" ).getCollection( baseName ).drop(); + +stopMongod( port ); + +} \ No newline at end of file diff --git a/jstests/disk/preallocate.js b/jstests/disk/preallocate.js index d772fbb..4f35866 100644 --- a/jstests/disk/preallocate.js +++ b/jstests/disk/preallocate.js @@ -2,7 +2,7 @@ port = allocatePorts( 1 )[ 0 ]; -var baseName = "jstests_preallocate2"; +var baseName = "jstests_preallocate"; var m = startMongod( "--port", port, "--dbpath", "/data/db/" + baseName ); @@ -10,7 +10,11 @@ assert.eq( 0, m.getDBs().totalSize ); m.getDB( baseName ).createCollection( baseName + "1" ); -assert.soon( function() { return m.getDBs().totalSize > 100000000; }, "expected second file to bring total size over 100MB" ); +expectedMB = 100; +if ( m.getDB( baseName ).serverBits() < 64 ) + expectedMB /= 4; + +assert.soon( function() { return m.getDBs().totalSize > expectedMB * 1000000; }, "\n\n\nFAIL preallocate.js expected second file to bring total size over " + expectedMB + "MB" ); stopMongod( port ); diff --git a/jstests/disk/preallocate2.js b/jstests/disk/preallocate2.js index ee9382c..9b2159f 100644 --- a/jstests/disk/preallocate2.js +++ b/jstests/disk/preallocate2.js @@ -8,4 +8,8 @@ var m = startMongod( "--port", port, "--dbpath", "/data/db/" + baseName ); m.getDB( baseName )[ baseName ].save( {i:1} ); -assert.soon( function() { return m.getDBs().totalSize > 100000000; }, "expected second file to bring total size over 100MB" ); \ No newline at end of file +expectedMB = 100; +if ( m.getDB( baseName ).serverBits() < 64 ) + expectedMB /= 4; + +assert.soon( function() { return m.getDBs().totalSize > expectedMB * 1000000; }, "\n\n\nFAIL preallocate.js expected second file to bring total size over " + expectedMB + "MB" ); diff --git a/jstests/disk/preallocate_directoryperdb.js b/jstests/disk/preallocate_directoryperdb.js new file mode 100644 index 0000000..fd92aaf --- /dev/null +++ b/jstests/disk/preallocate_directoryperdb.js @@ -0,0 +1,50 @@ +/** + * Test for SERVER-2417 - should not preallocate a database file while we are + * dropping its directory in directoryperdb mode. + */ + +var baseDir = "jstests_disk_preallocate_directoryperdb"; +var baseName = "preallocate_directoryperdb" +var baseName2 = "preallocate_directoryperdb2" +var baseName3 = "preallocate_directoryperdb3" +port = allocatePorts( 1 )[ 0 ]; +dbpath = "/data/db/" + baseDir + "/"; + +function checkDb2DirAbsent() { + files = listFiles( dbpath ); +// printjson( files ); + for( var f in files ) { + var name = files[ f ].name; + assert.eq( -1, name.indexOf( dbpath + baseName2 ), "baseName2 dir still present" ); + } +} + +var m = startMongod( "--smallfiles", "--directoryperdb", "--port", port, "--dbpath", dbpath, "--nohttpinterface", "--bind_ip", "127.0.0.1" ); +db = m.getDB( baseName ); +db2 = m.getDB( baseName2 ); +c = db[ baseName ]; +c2 = db2[ baseName2 ]; +big = new Array( 5000 ).toString(); +for( var i = 0; i < 3000; ++i ) { + c.save( { b:big } ); + c2.save( { b:big } ); + db.getLastError(); +} + +// Due to our write pattern, we expect db2's .3 file to be queued up in the file +// allocator behind db's .3 file at the time db2 is dropped. 
This will +// (incorrectly) cause db2's dir to be recreated until SERVER-2417 is fixed. +db2.dropDatabase(); + +checkDb2DirAbsent(); + +db.dropDatabase(); + +// Try writing a new database, to ensure file allocator is still working. +db3 = m.getDB( baseName3 ); +c3 = db[ baseName3 ]; +c3.save( {} ); +assert( !db3.getLastError() ); +assert.eq( 1, c3.count() ); + +checkDb2DirAbsent(); diff --git a/jstests/distinct1.js b/jstests/distinct1.js index 433e051..5e47400 100644 --- a/jstests/distinct1.js +++ b/jstests/distinct1.js @@ -2,6 +2,8 @@ t = db.distinct1; t.drop(); +assert.eq( 0 , t.distinct( "a" ).length , "test empty" ); + t.save( { a : 1 } ) t.save( { a : 2 } ) t.save( { a : 2 } ) diff --git a/jstests/distinct_array1.js b/jstests/distinct_array1.js index 0d41b80..f654dba 100644 --- a/jstests/distinct_array1.js +++ b/jstests/distinct_array1.js @@ -21,4 +21,5 @@ t.save( { a : [] , c : 12 } ); t.save( { a : { b : "z"} , c : 12 } ); res = t.distinct( "a.b" ); +res.sort() assert.eq( "a,b,c,d,e,f,z" , res.toString() , "B1" ); diff --git a/jstests/distinct_index1.js b/jstests/distinct_index1.js new file mode 100644 index 0000000..8677457 --- /dev/null +++ b/jstests/distinct_index1.js @@ -0,0 +1,50 @@ + +t = db.distinct_index1 +t.drop(); + +function r( x ){ + return Math.floor( Math.sqrt( x * 123123 ) ) % 10; +} + +function d( k , q ){ + return t.runCommand( "distinct" , { key : k , query : q || {} } ) +} + +for ( i=0; i<1000; i++ ){ + o = { a : r(i*5) , b : r(i) }; + t.insert( o ); +} + +x = d( "a" ); +assert.eq( 1000 , x.stats.n , "AA1" ) +assert.eq( 1000 , x.stats.nscanned , "AA2" ) +assert.eq( 1000 , x.stats.nscannedObjects , "AA3" ) + +x = d( "a" , { a : { $gt : 5 } } ); +assert.eq( 398 , x.stats.n , "AB1" ) +assert.eq( 1000 , x.stats.nscanned , "AB2" ) +assert.eq( 1000 , x.stats.nscannedObjects , "AB3" ) + +x = d( "b" , { a : { $gt : 5 } } ); +assert.eq( 398 , x.stats.n , "AC1" ) +assert.eq( 1000 , x.stats.nscanned , "AC2" ) +assert.eq( 1000 , x.stats.nscannedObjects , "AC3" ) + + + +t.ensureIndex( { a : 1 } ) + +x = d( "a" ); +assert.eq( 1000 , x.stats.n , "BA1" ) +assert.eq( 1000 , x.stats.nscanned , "BA2" ) +assert.eq( 0 , x.stats.nscannedObjects , "BA3" ) + +x = d( "a" , { a : { $gt : 5 } } ); +assert.eq( 398 , x.stats.n , "BB1" ) +assert.eq( 398 , x.stats.nscanned , "BB2" ) +assert.eq( 0 , x.stats.nscannedObjects , "BB3" ) + +x = d( "b" , { a : { $gt : 5 } } ); +assert.eq( 398 , x.stats.n , "BC1" ) +assert.eq( 398 , x.stats.nscanned , "BC2" ) +assert.eq( 398 , x.stats.nscannedObjects , "BC3" ) diff --git a/jstests/distinct_index2.js b/jstests/distinct_index2.js new file mode 100644 index 0000000..2ba65f9 --- /dev/null +++ b/jstests/distinct_index2.js @@ -0,0 +1,35 @@ +t = db.distinct_index2; +t.drop(); + +t.ensureIndex( { a : 1 , b : 1 } ) +t.ensureIndex( { c : 1 } ) + +function x(){ + return Math.floor( Math.random() * 10 ); +} + +for ( i=0; i<2000; i++ ){ + t.insert( { a : x() , b : x() , c : x() } ) +} + +correct = [] +for ( i=0; i<10; i++ ) + correct.push( i ) + +function check( field ){ + res = t.distinct( field ) + res = res.sort() + assert.eq( correct , res , "check: " + field ); + + if ( field != "a" ){ + res = t.distinct( field , { a : 1 } ) + res = res.sort() + assert.eq( correct , res , "check 2: " + field ); + } +} + +check( "a" ) +check( "b" ) +check( "c" ) + + diff --git a/jstests/drop2.js b/jstests/drop2.js new file mode 100644 index 0000000..fa239fd --- /dev/null +++ b/jstests/drop2.js @@ -0,0 +1,43 @@ +t = db.jstests_drop2; +t.drop(); + +function debug( x ) { +// 
printjson( x ); +} + +t.save( {} ); +db.getLastError(); + +function op( drop ) { + p = db.currentOp().inprog; + debug( p ); + for ( var i in p ) { + var o = p[ i ]; + if ( drop ) { + if ( o.active && o.query && o.query.drop && o.query.drop == "jstests_drop2" ) { + return o.opid; + } + } else { + if ( o.active && o.query && o.query.query && o.query.query.$where && o.ns == "test.jstests_drop2" ) { + return o.opid; + } + } + } + return null; +} + +s1 = startParallelShell( "db.jstests_drop2.count( { $where: function() { while( 1 ) { ; } } } )" ); +countOp = null; +assert.soon( function() { countOp = op( false ); return countOp; } ); + +s2 = startParallelShell( "db.jstests_drop2.drop()" ); +dropOp = null; +assert.soon( function() { dropOp = op( true ); return dropOp; } ); + +db.killOp( dropOp ); +db.killOp( countOp ); + +s1(); +s2(); + +t.drop(); // in SERVER-1818, this fails diff --git a/jstests/dropIndex.js b/jstests/dropIndex.js deleted file mode 100644 index a6e5f46..0000000 --- a/jstests/dropIndex.js +++ /dev/null @@ -1,16 +0,0 @@ - -t = db.dropIndex; -t.drop(); - -t.insert( { _id : 1 , a : 2 , b : 3 } ); -assert.eq( 1 , t.getIndexes().length , "A1" ); - -t.ensureIndex( { a : 1 } ); -t.ensureIndex( { b : 1 } ); -assert.eq( 3 , t.getIndexes().length , "A2" ); - -x = db._dbCommand( { dropIndexes: t.getName() , index : t._genIndexName( { a : 1 } ) } ); -assert.eq( 2 , t.getIndexes().length , "B1" ); - -x = db._dbCommand( { dropIndexes: t.getName() , index : { b : 1 } } ) -assert.eq( 1 , t.getIndexes().length , "B2" ); diff --git a/jstests/drop_index.js b/jstests/drop_index.js new file mode 100644 index 0000000..a6e5f46 --- /dev/null +++ b/jstests/drop_index.js @@ -0,0 +1,16 @@ + +t = db.dropIndex; +t.drop(); + +t.insert( { _id : 1 , a : 2 , b : 3 } ); +assert.eq( 1 , t.getIndexes().length , "A1" ); + +t.ensureIndex( { a : 1 } ); +t.ensureIndex( { b : 1 } ); +assert.eq( 3 , t.getIndexes().length , "A2" ); + +x = db._dbCommand( { dropIndexes: t.getName() , index : t._genIndexName( { a : 1 } ) } ); +assert.eq( 2 , t.getIndexes().length , "B1" ); + +x = db._dbCommand( { dropIndexes: t.getName() , index : { b : 1 } } ) +assert.eq( 1 , t.getIndexes().length , "B2" ); diff --git a/jstests/dur/a_quick.js b/jstests/dur/a_quick.js new file mode 100755 index 0000000..f703f3f --- /dev/null +++ b/jstests/dur/a_quick.js @@ -0,0 +1,123 @@ +/* quick.js + test durability + this file should always run quickly + other tests can be slow +*/ + +testname = "a_quick"; +load("jstests/_tst.js"); + +function checkNoJournalFiles(path, pass) { + var files = listFiles(path); + if (files.some(function (f) { return f.name.indexOf("prealloc") < 0; })) { + if (pass == null) { + // wait a bit longer for mongod to potentially finish if it is still running. 
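            // first pass only: 'pass' is null on the initial call, so we retry
            // exactly once, passing pass=1 so a second failure reports immediately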
+ sleep(10000); + return checkNoJournalFiles(path, 1); + } + print("\n\n\n"); + print("FAIL path:" + path); + print("unexpected files:"); + printjson(files); + assert(false, "FAIL a journal/lsn file is present which is unexpected"); + } +} + +// directories +var path1 = "/data/db/quicknodur"; +var path2 = "/data/db/quickdur"; + +// non-durable version +tst.log("start mongod without dur"); +var conn = startMongodEmpty("--port", 30000, "--dbpath", path1, "--nodur"); +tst.log("without dur work"); +var d = conn.getDB("test"); +d.foo.insert({ _id:123 }); +d.getLastError(); +tst.log("stop without dur"); +stopMongod(30000); + +// durable version +tst.log("start mongod with dur"); +conn = startMongodEmpty("--port", 30001, "--dbpath", path2, "--dur", "--durOptions", 8); +tst.log("with dur work"); +d = conn.getDB("test"); +d.foo.insert({ _id: 123 }); +d.getLastError(); // wait + +// we could actually do getlasterror fsync:1 now, but maybe this is agood +// as it will assure that commits happen on a timely basis. a bunch of the other dur/*js +// tests use fsync +tst.log("sleep a bit for a group commit"); +sleep(8000); + +// kill the process hard +tst.log("kill -9 mongod"); +stopMongod(30001, /*signal*/9); + +// journal file should be present, and non-empty as we killed hard + +// we will force removal of a datafile to be sure we can recreate everything +// without it being present. +removeFile(path2 + "/test.0"); + +// for that to work, we can't skip anything though: +removeFile(path2 + "/journal/lsn"); + +// with the file deleted, we MUST start from the beginning of the journal. +// thus this check to be careful +var files = listFiles(path2 + "/journal/"); +if (files.some(function (f) { return f.name.indexOf("lsn") >= 0; })) { + print("\n\n\n"); + print(path2); + printjson(files); + assert(false, "a journal/lsn file is present which will make this test potentially fail."); +} + +// restart and recover +tst.log("restart and recover"); +conn = startMongodNoReset("--port", 30002, "--dbpath", path2, "--dur", "--durOptions", 9); +tst.log("check data results"); +d = conn.getDB("test"); + +var countOk = (d.foo.count() == 1); +if (!countOk) { + print("\n\n\na_quick.js FAIL count " + d.foo.count() + " is wrong\n\n\n"); + // keep going - want to see if the diff matches. if so the sleep() above was too short? 
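    // countOk is deliberately not asserted here; it is re-checked by the assert at
    // the end of this file, after the .ns/.0 diffs have been printed, so a count
    // mismatch still shows whether the data files match.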
+} + +tst.log("stop"); +stopMongod(30002); + +// at this point, after clean shutdown, there should be no journal files +tst.log("check no journal files"); +checkNoJournalFiles(path2 + "/journal"); + +tst.log("check data matches"); +var diff = tst.diff(path1 + "/test.ns", path2 + "/test.ns"); +print("diff of .ns files returns:" + diff); + +function showfiles() { + print("\n\nERROR: files for dur and nodur do not match"); + print(path1 + " files:"); + printjson(listFiles(path1)); + print(path2 + " files:"); + printjson(listFiles(path2)); + print(); +} + +if (diff != "") { + showfiles(); + assert(diff == "", "error test.ns files differ"); +} + +diff = tst.diff(path1 + "/test.0", path2 + "/test.0"); +print("diff of .0 files returns:" + diff); +if (diff != "") { + showfiles(); + assert(diff == "", "error test.0 files differ"); +} + +assert(countOk, "a_quick.js document count after recovery was not the expected value"); + +tst.success(); diff --git a/jstests/dur/closeall.js b/jstests/dur/closeall.js new file mode 100644 index 0000000..f169f06 --- /dev/null +++ b/jstests/dur/closeall.js @@ -0,0 +1,80 @@ +// testing closealldatabases concurrency +// this is also a test of recoverFromYield() as that will get exercised by the update + +function f() { + var variant = (new Date()) % 4; + var path = "/data/db/closeall"; + var path2 = "/data/db/closeall_slave"; + var ourdb = "closealltest"; + + print("closeall.js start mongod variant:" + variant); + var options = (new Date()-0)%2==0 ? 8 : 0; + print("closeall.js --durOptions " + options); + var N = 1000; + if (options) + N = 300; + + // use replication to exercise that code too with a close, and also to test local.sources with a close + var conn = startMongodEmpty("--port", 30001, "--dbpath", path, "--dur", "--durOptions", options, "--master", "--oplogSize", 64); + var connSlave = startMongodEmpty("--port", 30002, "--dbpath", path2, "--dur", "--durOptions", options, "--slave", "--source", "localhost:30001"); + + var slave = connSlave.getDB(ourdb); + + // we'll use two connections to make a little parallelism + var db1 = conn.getDB(ourdb); + var db2 = new Mongo(db1.getMongo().host).getDB(ourdb); + + print("closeall.js run test"); + + for( var i = 0; i < N; i++ ) { + db1.foo.insert({x:1}); // this does wait for a return code so we will get some parallelism + if( i % 7 == 0 ) + db1.foo.insert({x:99, y:2}); + if( i % 49 == 0 ) + db1.foo.update({ x: 99 }, { a: 1, b: 2, c: 3, d: 4 }); + if (i % 100 == 0) + db1.foo.find(); + if( i == 800 ) + db1.foo.ensureIndex({ x: 1 }); + var res = null; + try { + if( variant == 1 ) + sleep(0); + else if( variant == 2 ) + sleep(1); + else if( variant == 3 && i % 10 == 0 ) + print(i); + res = db2.adminCommand("closeAllDatabases"); + } + catch (e) { + sleep(5000); // sleeping a little makes console output order prettier + print("\n\n\nFAIL closeall.js closeAllDatabases command invocation threw an exception. i:" + i); + try { + print("getlasterror:"); + printjson(db2.getLastErrorObj()); + print("trying one more closealldatabases:"); + res = db2.adminCommand("closeAllDatabases"); + printjson(res); + } + catch (e) { + print("got another exception : " + e); + } + print("\n\n\n"); + // sleep a little to capture possible mongod output? + sleep(2000); + throw e; + } + assert( res.ok, "closeAllDatabases res.ok=false"); + } + + print("closeall.js end test loop. 
slave.foo.count:"); + print(slave.foo.count()); + + print("closeall.js shutting down servers"); + stopMongod(30002); + stopMongod(30001); +} + +f(); +sleep(500); +print("SUCCESS closeall.js"); diff --git a/jstests/dur/diskfull.js b/jstests/dur/diskfull.js new file mode 100644 index 0000000..da45c20 --- /dev/null +++ b/jstests/dur/diskfull.js @@ -0,0 +1,136 @@ +/** Test running out of disk space with durability enabled */ + +startPath = "/data/db/diskfulltest"; +recoverPath = "/data/db/dur_diskfull"; + +doIt = false; +files = listFiles( "/data/db" ); +for ( i in files ) { + if ( files[ i ].name == startPath ) { + doIt = true; + } +} + +if ( !doIt ) { + print( "path " + startPath + " missing, skipping diskfull test" ); + doIt = false; +} + +function checkNoJournalFiles(path, pass) { + var files = listFiles(path); + if (files.some(function (f) { return f.name.indexOf("prealloc") < 0; })) { + if (pass == null) { + // wait a bit longer for mongod to potentially finish if it is still running. + sleep(10000); + return checkNoJournalFiles(path, 1); + } + print("\n\n\n"); + print("FAIL path:" + path); + print("unexpected files:"); + printjson(files); + assert(false, "FAIL a journal/lsn file is present which is unexpected"); + } +} + +/** Clear dbpath without removing and recreating diskfulltest directory, as resetDbpath does */ +function clear() { + files = listFiles( startPath ); + files.forEach( function( x ) { removeFile( x.name ) } ); +} + +function log(str) { + print(); + if(str) + print(testname+" step " + step++ + " " + str); + else + print(testname+" step " + step++); +} + +function work() { + log("work"); + try { + var d = conn.getDB("test"); + + big = new Array( 5000 ).toString(); + for( i = 0; i < 10000; ++i ) { + d.foo.insert( { _id:i, b:big } ); + } + + d.getLastError(); + } catch ( e ) { + print( e ); + raise( e ); + } finally { + log("endwork"); + } +} + +function verify() { + log("verify"); + var d = conn.getDB("test"); + c = d.foo.count(); + v = d.foo.validate(); + // not much we can guarantee about the writes, just validate when possible + if ( c != 0 && !v.valid ) { + printjson( v ); + print( c ); + assert( v.valid ); + assert.gt( c, 0 ); + } +} + +function runFirstMongodAndFillDisk() { + log(); + + clear(); + conn = startMongodNoReset("--port", 30001, "--dbpath", startPath, "--dur", "--smallfiles", "--durOptions", 8, "--noprealloc"); + + assert.throws( work, null, "no exception thrown when exceeding disk capacity" ); + waitMongoProgramOnPort( 30001 ); + + // the above wait doesn't work on windows + sleep(5000); +} + +function runSecondMongdAndRecover() { + // restart and recover + log(); + conn = startMongodNoReset("--port", 30003, "--dbpath", startPath, "--dur", "--smallfiles", "--durOptions", 8, "--noprealloc"); + verify(); + + log("stop"); + stopMongod(30003); + + // stopMongod seems to be asynchronous (hmmm) so we sleep here. + sleep(5000); + + // at this point, after clean shutdown, there should be no journal files + log("check no journal files"); + checkNoJournalFiles(startPath + "/journal/"); + + log(); +} + +function someWritesInJournal() { + runFirstMongodAndFillDisk(); + runSecondMongdAndRecover(); +} + +function noWritesInJournal() { + // It is too difficult to consistently trigger cases where there are no existing journal files due to lack of disk space, but + // if we were to test this case we would need to manualy remove the lock file. 
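// (the commented-out removeFile below names the file in question; presumably a
// stale mongod.lock with no journal files to recover from would otherwise block
// the restart on the unclean dbpath)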
+// removeFile( startPath + "/mongod.lock" ); +} + +if ( doIt ) { + + var testname = "dur_diskfull"; + var step = 1; + var conn = null; + + someWritesInJournal(); + noWritesInJournal(); + + print(testname + " SUCCESS"); + +} \ No newline at end of file diff --git a/jstests/dur/dropdb.js b/jstests/dur/dropdb.js new file mode 100644 index 0000000..7f82cd7 --- /dev/null +++ b/jstests/dur/dropdb.js @@ -0,0 +1,163 @@ +/* durability test dropping a database +*/ + +var debugging = false; +var testname = "dropdb"; +var step = 1; +var conn = null; + +function checkNoJournalFiles(path, pass) { + var files = listFiles(path); + if (files.some(function (f) { return f.name.indexOf("prealloc") < 0; })) { + if (pass == null) { + // wait a bit longer for mongod to potentially finish if it is still running. + sleep(10000); + return checkNoJournalFiles(path, 1); + } + print("\n\n\n"); + print("FAIL path:" + path); + print("unexpected files:"); + printjson(files); + assert(false, "FAIL a journal/lsn file is present which is unexpected"); + } +} + +function runDiff(a, b) { + function reSlash(s) { + var x = s; + if (_isWindows()) { + while (1) { + var y = x.replace('/', '\\'); + if (y == x) + break; + x = y; + } + } + return x; + } + a = reSlash(a); + b = reSlash(b); + print("diff " + a + " " + b); + return run("diff", a, b); +} + +function log(str) { + if (str) + print("\n" + testname + " step " + step++ + " " + str); + else + print("\n" + testname + " step " + step++); +} + +// if you do inserts here, you will want to set _id. otherwise they won't match on different +// runs so we can't do a binary diff of the resulting files to check they are consistent. +function work() { + log("work (add data, drop database)"); + + var e = conn.getDB("teste"); + e.foo.insert({ _id: 99 }); + + var d = conn.getDB("test"); + d.foo.insert({ _id: 3, x: 22 }); + d.bar.insert({ _id: 3, x: 22 }); + + d.dropDatabase(); + + d.foo.insert({ _id: 100 }); + + // assure writes applied in case we kill -9 on return from this function + assert(d.runCommand({ getlasterror: 1, fsync: 1 }).ok, "getlasterror not ok"); +} + +function verify() { + log("verify"); + var d = conn.getDB("test"); + var count = d.foo.count(); + if (count != 1) { + print("going to fail, count mismatch in verify()"); + sleep(10000); // easier to read the output this way + print("\n\n\ndropdb.js FAIL test.foo.count() should be 1 but is : " + count); + print(d.foo.count() + "\n\n\n"); + assert(false); + } + assert(d.foo.findOne()._id == 100, "100"); + + print("dropdb.js teste.foo.findOne:"); + printjson(conn.getDB("teste").foo.findOne()); + + var teste = conn.getDB("teste"); + print("dropdb count " + teste.foo.count()); + assert(teste.foo.findOne()._id == 99, "teste"); + +} + +if (debugging) { + // mongod already running in debugger + conn = db.getMongo(); + work(); + verify(); + sleep(30000); + quit(); +} + +// directories +var path1 = "/data/db/" + testname + "nodur"; +var path2 = "/data/db/" + testname + "dur"; + +// non-durable version +log("mongod nodur"); +conn = startMongodEmpty("--port", 30000, "--dbpath", path1, "--nodur", "--smallfiles"); +work(); +verify(); +stopMongod(30000); + +// durable version +log("mongod dur"); +conn = startMongodEmpty("--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8); +work(); +verify(); + +// kill the process hard +log("kill 9"); +stopMongod(30001, /*signal*/9); + +// journal file should be present, and non-empty as we killed hard + +// we will force removal of a datafile to be sure we can recreate 
everything. +removeFile(path2 + "/test.0"); +// the trick above is only valid if journals haven't rotated out, and also if lsn isn't skipping +removeFile(path2 + "/lsn"); + +log("restart and recover"); +conn = startMongodNoReset("--port", 30002, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 9); + +log("verify after recovery"); +verify(); + +log("stop mongod 30002"); +stopMongod(30002); +sleep(5000); + +// at this point, after clean shutdown, there should be no journal files +log("check no journal files"); +checkNoJournalFiles(path2 + "/journal"); + +log("check data matches ns"); +var diff = runDiff(path1 + "/test.ns", path2 + "/test.ns"); +if (diff != "") { + print("\n\n\nDIFFERS\n"); + print(diff); +} +assert(diff == "", "error test.ns files differ"); + +log("check data matches .0"); +diff = runDiff(path1 + "/test.0", path2 + "/test.0"); +if (diff != "") { + print("\n\n\nDIFFERS\n"); + print(diff); +} +assert(diff == "", "error test.0 files differ"); + +log("check data matches done"); + +print(testname + " SUCCESS"); + diff --git a/jstests/dur/dur1.js b/jstests/dur/dur1.js new file mode 100755 index 0000000..4c8f1bf --- /dev/null +++ b/jstests/dur/dur1.js @@ -0,0 +1,154 @@ +/* + test durability +*/ + +var debugging = false; +var testname = "dur1"; +var step = 1; +var conn = null; + +function checkNoJournalFiles(path, pass) { + var files = listFiles(path); + if (files.some(function (f) { return f.name.indexOf("prealloc") < 0; })) { + if (pass == null) { + // wait a bit longer for mongod to potentially finish if it is still running. + sleep(10000); + return checkNoJournalFiles(path, 1); + } + print("\n\n\n"); + print("FAIL path:" + path); + print("unexpected files:"); + printjson(files); + assert(false, "FAIL a journal/lsn file is present which is unexpected"); + } +} + +function runDiff(a, b) { + function reSlash(s) { + var x = s; + if (_isWindows()) { + while (1) { + var y = x.replace('/', '\\'); + if (y == x) + break; + x = y; + } + } + return x; + } + a = reSlash(a); + b = reSlash(b); + print("diff " + a + " " + b); + return run("diff", a, b); +} + +function log(str) { + print(); + if(str) + print(testname+" step " + step++ + " " + str); + else + print(testname+" step " + step++); +} + +// if you do inserts here, you will want to set _id. otherwise they won't match on different +// runs so we can't do a binary diff of the resulting files to check they are consistent. +function work() { + log("work"); + var d = conn.getDB("test"); + d.foo.insert({ _id: 3, x: 22 }); + d.foo.insert({ _id: 4, x: 22 }); + d.a.insert({ _id: 3, x: 22, y: [1, 2, 3] }); + d.a.insert({ _id: 4, x: 22, y: [1, 2, 3] }); + d.a.update({ _id: 4 }, { $inc: { x: 1} }); + + // try building an index. 
however, be careful as object id's in system.indexes would vary, so we do it manually: + d.system.indexes.insert({ _id: 99, ns: "test.a", key: { x: 1 }, name: "x_1", v: 0 }); + +// d.a.update({ _id: 4 }, { $inc: { x: 1} }); +// d.a.reIndex(); + + // assure writes applied in case we kill -9 on return from this function + d.getLastError(); + + log("endwork"); + return d; +} + +function verify() { + log("verify"); + var d = conn.getDB("test"); + var ct = d.foo.count(); + if (ct != 2) { + print("\n\n\nFAIL dur1.js count is wrong in verify(): " + ct + "\n\n\n"); + assert(ct == 2); + } +} + +if( debugging ) { + // mongod already running in debugger + conn = db.getMongo(); + work(); + sleep(30000); + quit(); +} + +log(); + +// directories +var path1 = "/data/db/" + testname+"nodur"; +var path2 = "/data/db/" + testname+"dur"; + +// non-durable version +log(); +conn = startMongodEmpty("--port", 30000, "--dbpath", path1, "--nodur", "--smallfiles"); +work(); +stopMongod(30000); + +// durable version +log(); +conn = startMongodEmpty("--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8); +work(); + +// wait for group commit. +printjson(conn.getDB('admin').runCommand({getlasterror:1, fsync:1})); + +// kill the process hard +stopMongod(30001, /*signal*/9); + +// journal file should be present, and non-empty as we killed hard + +// restart and recover +log(); +conn = startMongodNoReset("--port", 30002, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8); +verify(); + +log("stop"); +stopMongod(30002); + +// stopMongod seems to be asynchronous (hmmm) so we sleep here. +sleep(5000); + +// at this point, after clean shutdown, there should be no journal files +log("check no journal files"); +checkNoJournalFiles(path2 + "/journal"); + +log("check data matches ns"); +var diff = runDiff(path1 + "/test.ns", path2 + "/test.ns"); +if (diff != "") { + print("\n\n\nDIFFERS\n"); + print(diff); +} +assert(diff == "", "error test.ns files differ"); + +log("check data matches .0"); +var diff = runDiff(path1 + "/test.0", path2 + "/test.0"); +if (diff != "") { + print("\n\n\nDIFFERS\n"); + print(diff); +} +assert(diff == "", "error test.0 files differ"); + +log("check data matches done"); + +print(testname + " SUCCESS"); + diff --git a/jstests/dur/dur2.js b/jstests/dur/dur2.js new file mode 100644 index 0000000..dd0ab0f --- /dev/null +++ b/jstests/dur/dur2.js @@ -0,0 +1,92 @@ +/* test durability + runs mongod, kill -9's, recovers +*/ + +var debugging = false; +var testname = "dur2"; +var step = 1; +var conn = null; + +var start = new Date(); +function howLongSecs() { + return (new Date() - start) / 1000; +} + +function log(str) { + if(str) + print("\n" + testname+" step " + step++ + " " + str); + else + print(testname+" step " + step++); +} + +function verify() { + log("verify"); + var d = conn.getDB("test"); + var mycount = d.foo.count(); + //print("count:" + mycount); + assert(mycount>2, "count wrong"); +} + +function work() { + log("work"); + x = 'x'; while(x.length < 1024) x+=x; + var d = conn.getDB("test"); + d.foo.drop(); + d.foo.insert({}); + + // go long enough we will have time to kill it later during recovery + var j = 2; + var MaxTime = 15; + if (Math.random() < 0.1) { + print("dur2.js DOING A LONGER (120 sec) PASS - if an error, try long pass to replicate"); + MaxTime = 120; + } + while (1) { + d.foo.insert({ _id: j, z: x }); + d.foo.update({ _id: j }, { $inc: { a: 1} }); + if (j % 25 == 0) + d.foo.remove({ _id: j }); + j++; + if( j % 3 == 0 ) + d.foo.update({ _id: j }, { 
$inc: { a: 1} }, true); + if (j % 10000 == 0) + print(j); + if (howLongSecs() > MaxTime) + break; + } + + verify(); + d.runCommand({ getLastError: 1, fsync: 1 }); +} + +if( debugging ) { + // mongod already running in debugger + print("DOING DEBUG MODE BEHAVIOR AS 'db' IS DEFINED -- RUN mongo --nodb FOR REGULAR TEST BEHAVIOR"); + conn = db.getMongo(); + work(); + sleep(30000); + quit(); +} + +// directories +var path = "/data/db/" + testname+"dur"; + +log("run mongod with --dur"); +conn = startMongodEmpty("--port", 30001, "--dbpath", path, "--dur", "--smallfiles", "--durOptions", /*DurParanoid*/8, "--master", "--oplogSize", 64); +work(); + +log("kill -9"); +stopMongod(30001, /*signal*/9); + +// journal file should be present, and non-empty as we killed hard +assert(listFiles(path + "/journal/").length > 0, "journal directory is unexpectantly empty after kill"); + +// restart and recover +log("restart mongod and recover"); +conn = startMongodNoReset("--port", 30002, "--dbpath", path, "--dur", "--smallfiles", "--durOptions", 8, "--master", "--oplogSize", 64); +verify(); + +log("stopping mongod 30002"); +stopMongod(30002); + +print(testname + " SUCCESS"); diff --git a/jstests/dur/lsn.js b/jstests/dur/lsn.js new file mode 100755 index 0000000..505d8f5 --- /dev/null +++ b/jstests/dur/lsn.js @@ -0,0 +1,126 @@ +/* test durability, specifically last sequence number function + runs mongod, kill -9's, recovers + then writes more data and verifies with DurParanoid that it matches +*/ + +var debugging = false; +var testname = "lsn"; +var step = 1; +var conn = null; + +var start = new Date(); +function howLongSecs() { + return (new Date() - start) / 1000; +} + +function log(str) { + if(str) + print("\n" + testname+" step " + step++ + " " + str); + else + print(testname+" step " + step++); +} + +function verify() { + log("verify"); + var d = conn.getDB("test"); + var mycount = d.foo.count(); + print("count:" + mycount); + assert(mycount>2, "count wrong"); +} + +// if you do inserts here, you will want to set _id. otherwise they won't match on different +// runs so we can't do a binary diff of the resulting files to check they are consistent. 
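// A condensed, illustrative sketch of the lifecycle shared by these dur tests:
// start an empty mongod with --dur, write, kill -9, restart with
// startMongodNoReset so the dbpath survives, then verify after journal recovery.
// The port and dbpath are placeholders, not values used by this test, and the
// function is defined only for reference; it is not invoked.
function durLifecycleSketch() {
    var c = startMongodEmpty("--port", 30010, "--dbpath", "/data/db/dur_sketch", "--dur", "--smallfiles");
    c.getDB("test").foo.insert({ _id: 1 });
    c.getDB("test").getLastError();      // wait for the write to be applied
    stopMongod(30010, /*signal*/9);      // hard kill; journal files left behind
    c = startMongodNoReset("--port", 30010, "--dbpath", "/data/db/dur_sketch", "--dur", "--smallfiles");
    assert.eq(1, c.getDB("test").foo.count(), "sketch: document should survive recovery");
    stopMongod(30010);
}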
+function work() { + log("work"); + x = 'x'; while(x.length < 1024) x+=x; + var d = conn.getDB("test"); + d.foo.drop(); + d.foo.insert({}); + + // go long enough we will have time to kill it later during recovery + var j = 2; + var MaxTime = 15; + if (Math.random() < 0.05) { + print("doing a longer pass"); + MaxTime = 90; + } + while (1) { + d.foo.insert({ _id: j, z: x }); + d.foo.update({ _id: j }, { $inc: { a: 1} }); + if (j % 25 == 0) + d.foo.remove({ _id: j }); + j++; + if( j % 3 == 0 ) + d.foo.update({ _id: j }, { $inc: { a: 1} }, true); + if (j % 10000 == 0) + print(j); + if (howLongSecs() > MaxTime) + break; + } + + verify(); + d.runCommand({ getLastError: 1, fsync: 1 }); +} + +if( debugging ) { + // mongod already running in debugger + print("DOING DEBUG MODE BEHAVIOR AS 'db' IS DEFINED -- RUN mongo --nodb FOR REGULAR TEST BEHAVIOR"); + conn = db.getMongo(); + work(); + sleep(30000); + quit(); +} + +// directories +var path2 = "/data/db/" + testname+"dur"; + +// run mongod with a short --syncdelay to make LSN writing sooner +log("run mongod --dur and a short --syncdelay"); +conn = startMongodEmpty("--syncdelay", 2, "--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", /*DurParanoid*/8, "--master", "--oplogSize", 64); +work(); + +log("wait a while for a sync and an lsn write"); +sleep(14); // wait for lsn write + +log("kill mongod -9"); +stopMongod(30001, /*signal*/9); + +// journal file should be present, and non-empty as we killed hard + +// check that there is an lsn file +{ + var files = listFiles(path2 + "/journal/"); + assert(files.some(function (f) { return f.name.indexOf("lsn") >= 0; }), + "lsn.js FAIL no lsn file found after kill, yet one is expected"); +} +/*assert.soon( + function () { + var files = listFiles(path2 + "/journal/"); + return files.some(function (f) { return f.name.indexOf("lsn") >= 0; }); + }, + "lsn.js FAIL no lsn file found after kill, yet one is expected" +);*/ + +// restart and recover +log("restart mongod, recover, verify"); +conn = startMongodNoReset("--port", 30002, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 24, "--master", "--oplogSize", 64); +verify(); + +// idea here is to verify (in a simplistic way) that we are in a good state to do further ops after recovery +log("add data after recovery"); +{ + var d = conn.getDB("test"); + d.xyz.insert({ x: 1 }); + d.xyz.insert({ x: 1 }); + d.xyz.insert({ x: 1 }); + d.xyz.update({}, { $set: { x: "aaaaaaaaaaaa"} }); + d.xyz.reIndex(); + d.xyz.drop(); + sleep(1); + d.xyz.insert({ x: 1 }); +} + +log("stop mongod 30002"); +stopMongod(30002); + +print(testname + " SUCCESS"); diff --git a/jstests/dur/manyRestart.js b/jstests/dur/manyRestart.js new file mode 100755 index 0000000..04e4318 --- /dev/null +++ b/jstests/dur/manyRestart.js @@ -0,0 +1,191 @@ +/* + test durability +*/ + +var debugging = false; +var testname = "manyRestarts"; +var step = 1; +var conn = null; + +function checkNoJournalFiles(path, pass) { + var files = listFiles(path); + if (files.some(function (f) { return f.name.indexOf("prealloc") < 0; })) { + if (pass == null) { + // wait a bit longer for mongod to potentially finish if it is still running. 
+ sleep(10000); + return checkNoJournalFiles(path, 1); + } + print("\n\n\n"); + print("FAIL path:" + path); + print("unexpected files:"); + printjson(files); + assert(false, "FAIL a journal/lsn file is present which is unexpected"); + } +} + +function runDiff(a, b) { + function reSlash(s) { + var x = s; + if (_isWindows()) { + while (1) { + var y = x.replace('/', '\\'); + if (y == x) + break; + x = y; + } + } + return x; + } + a = reSlash(a); + b = reSlash(b); + print("diff " + a + " " + b); + return run("diff", a, b); +} + +function log(str) { + print(); + if(str) + print(testname+" step " + step++ + " " + str); + else + print(testname+" step " + step++); +} + +// if you do inserts here, you will want to set _id. otherwise they won't match on different +// runs so we can't do a binary diff of the resulting files to check they are consistent. +function work() { + log("work"); + var d = conn.getDB("test"); + d.foo.insert({ _id: 3, x: 22 }); + d.foo.insert({ _id: 4, x: 22 }); + d.a.insert({ _id: 3, x: 22, y: [1, 2, 3] }); + d.a.insert({ _id: 4, x: 22, y: [1, 2, 3] }); + d.a.update({ _id: 4 }, { $inc: { x: 1} }); + + // try building an index. however, be careful as object id's in system.indexes would vary, so we do it manually: + d.system.indexes.insert({ _id: 99, ns: "test.a", key: { x: 1 }, name: "x_1", v: 0 }); + +// d.a.update({ _id: 4 }, { $inc: { x: 1} }); +// d.a.reIndex(); + + // assure writes applied in case we kill -9 on return from this function + d.getLastError(); + log("endwork"); + return d; +} + +function addRows() { + var rand = Random.randInt(10000); + log("add rows " + rand); + var d = conn.getDB("test"); + for (var j = 0; j < rand; ++j) { + d.rows.insert({a:1, b: "blah"}); + } + return rand; +} + +function verify() { + log("verify"); + var d = conn.getDB("test"); + assert.eq(d.foo.count(), 2, "collection count is wrong"); + assert.eq(d.a.count(), 2, "collection count is wrong"); +} + +function verifyRows(nrows) { + log("verify rows " + nrows); + var d = conn.getDB("test"); + assert.eq(d.rows.count(), nrows, "collection count is wrong"); +} + +if( debugging ) { + // mongod already running in debugger + conn = db.getMongo(); + work(); + sleep(30000); + quit(); +} + +log(); + +// directories +var path1 = "/data/db/" + testname+"nodur"; +var path2 = "/data/db/" + testname+"dur"; + +// non-durable version +log("starting 30000"); +conn = startMongodEmpty("--port", 30000, "--dbpath", path1, "--nodur", "--smallfiles"); +work(); +stopMongod(30000); + +log("starting 30001"); +conn = startMongodEmpty("--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8); +work(); +// wait for group commit. +printjson(conn.getDB('admin').runCommand({getlasterror:1, fsync:1})); + +stopMongod(30001); +sleep(5000); + +for (var i = 0; i < 3; ++i) { + + // durable version + log("restarting 30001"); + conn = startMongodNoReset("--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8); + + // wait for group commit. 
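+    // the fsync:1 form of getlasterror is what this test uses to wait for a group
+    // commit, so the hard kill below happens after the preceding writes are durable
+    // rather than racing an in-flight commit.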
+ printjson(conn.getDB('admin').runCommand({getlasterror:1, fsync:1})); + + verify(); + + // kill the process hard + log("hard kill"); + stopMongod(30001, /*signal*/9); + + sleep(5000); +} + +// journal file should be present, and non-empty as we killed hard + +// restart and recover +log("restart"); +conn = startMongodNoReset("--port", 30002, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8); +log("verify"); +verify(); +log("stop"); +stopMongod(30002); +sleep(5000); + +// at this point, after clean shutdown, there should be no journal files +log("check no journal files"); +checkNoJournalFiles(path2 + "/journal"); + +log("check data matches ns"); +var diff = runDiff(path1 + "/test.ns", path2 + "/test.ns"); +assert(diff == "", "error test.ns files differ"); + +log("check data matches .0"); +var diff = runDiff(path1 + "/test.0", path2 + "/test.0"); +assert(diff == "", "error test.0 files differ"); + +log("check data matches done"); + +var nrows = 0; +for (var i = 0; i < 5; ++i) { + + // durable version + log("restarting 30001"); + conn = startMongodNoReset("--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8); + nrows += addRows(); + // wait for group commit. + printjson(conn.getDB('admin').runCommand({getlasterror:1, fsync:1})); + + verifyRows(nrows); + + // kill the process hard + log("hard kill"); + stopMongod(30001, /*signal*/9); + + sleep(5000); +} + +print(testname + " SUCCESS"); + diff --git a/jstests/dur/md5.js b/jstests/dur/md5.js new file mode 100644 index 0000000..107476e --- /dev/null +++ b/jstests/dur/md5.js @@ -0,0 +1,101 @@ +/** + * Test md5 validation of journal file. + * This test is dependent on the journal file format and may require an update if the format changes, + * see comments near fuzzFile() below. + */ + +var debugging = false; +var testname = "dur_md5"; +var step = 1; +var conn = null; + +function log(str) { + print(); + if(str) + print(testname+" step " + step++ + " " + str); + else + print(testname+" step " + step++); +} + +/** Changes here may require updating the byte index of the md5 hash, see File comments below. */ +function work() { + log("work"); + var d = conn.getDB("test"); + d.foo.insert({ _id: 3, x: 22 }); + d.foo.insert({ _id: 4, x: 22 }); + d.a.insert({ _id: 3, x: 22, y: [1, 2, 3] }); + d.a.insert({ _id: 4, x: 22, y: [1, 2, 3] }); + d.a.update({ _id: 4 }, { $inc: { x: 1} }); + + // try building an index. however, be careful as object id's in system.indexes would vary, so we do it manually: + d.system.indexes.insert({ _id: 99, ns: "test.a", key: { x: 1 }, name: "x_1", v: 0 }); + + // d.a.update({ _id: 4 }, { $inc: { x: 1} }); + // d.a.reIndex(); + + // assure writes applied in case we kill -9 on return from this function + d.getLastError(); + + log("endwork"); +} + +if( debugging ) { + // mongod already running in debugger + conn = db.getMongo(); + work(); + sleep(30000); + quit(); +} + +log(); + +var path = "/data/db/" + testname+"dur"; + +log(); +conn = startMongodEmpty("--port", 30001, "--dbpath", path, "--dur", "--smallfiles", "--durOptions", 8); +work(); + +// wait for group commit. +printjson(conn.getDB('admin').runCommand({getlasterror:1, fsync:1})); + +log("kill -9"); + +// kill the process hard +stopMongod(30001, /*signal*/9); + +// journal file should be present, and non-empty as we killed hard + +// Bit flip the first byte of the md5sum contained within the opcode footer. +// This ensures we get an md5 exception instead of some other type of exception. 
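+// j._0 is the first journal file written under --dur; the offset handed to fuzzFile()
+// below is derived from the on-disk layout (an 8192 byte journal file header followed
+// by a ~20 byte section header), so it must be revisited if the journal format changes.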
+var file = path + "/journal/j._0"; + +// if test fails, uncomment these "cp" lines to debug: +// run("cp", file, "/tmp/before"); + +// journal header is 8192 +// jsectheader is 20 +// so a little beyond that +fuzzFile(file, 8214+8); + +// run("cp", file, "/tmp/after"); + +log("run mongod again recovery should fail"); + +// 100 exit code corresponds to EXIT_UNCAUGHT, which is triggered when there is an exception during recovery. +// 14 is is sometimes triggered instead due to SERVER-2184 +exitCode = runMongoProgram( "mongod", "--port", 30002, "--dbpath", path, "--dur", "--smallfiles", "--durOptions", /*9*/13 ); + +if (exitCode != 100 && exitCode != 14) { + print("\n\n\nFAIL md5.js expected mongod to fail but didn't? mongod exitCode: " + exitCode + "\n\n\n"); + // sleep a little longer to get more output maybe + sleep(2000); + assert(false); +} + +// TODO Possibly we could check the mongod log to verify that the correct type of exception was thrown. But +// that would introduce a dependency on the mongod log format, which we may not want. + +print("SUCCESS md5.js"); + +// if we sleep a littler here we may get more out the mongod output logged +sleep(500); diff --git a/jstests/dur/oplog.js b/jstests/dur/oplog.js new file mode 100755 index 0000000..379c1b6 --- /dev/null +++ b/jstests/dur/oplog.js @@ -0,0 +1,159 @@ +/* oplog.js */ + +var debugging = false; +var testname = "oplog"; +var step = 1; +var conn = null; + +function checkNoJournalFiles(path, pass) { + var files = listFiles(path); + if (files.some(function (f) { return f.name.indexOf("prealloc") < 0; })) { + if (pass == null) { + // wait a bit longer for mongod to potentially finish if it is still running. + sleep(10000); + return checkNoJournalFiles(path, 1); + } + print("\n\n\n"); + print("FAIL path:" + path); + print("unexpected files:"); + printjson(files); + assert(false, "FAIL a journal/lsn file is present which is unexpected"); + } +} + +function runDiff(a, b) { + function reSlash(s) { + var x = s; + if (_isWindows()) { + while (1) { + var y = x.replace('/', '\\'); + if (y == x) + break; + x = y; + } + } + return x; + } + a = reSlash(a); + b = reSlash(b); + print("diff " + a + " " + b); + return runProgram("diff", a, b); +} + +function log(str) { + print(); + if(str) + print(testname+" step " + step++ + " " + str); + else + print(testname+" step " + step++); +} + +function verify() { + log("verify"); + var d = conn.getDB("local"); + var mycount = d.oplog.$main.find({ "o.z": 3 }).count(); + print(mycount); + assert(mycount == 3, "oplog doesnt match"); +} + +// if you do inserts here, you will want to set _id. otherwise they won't match on different +// runs so we can't do a binary diff of the resulting files to check they are consistent. +function work() { + log("work"); + var d = conn.getDB("test"); + var q = conn.getDB("testq"); // use tewo db's to exercise JDbContext a bit. 
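+    // the writes below span two databases (test and testq) and include a larger string
+    // field so that, per the OpCode_ObjCopy note further down, bigger journal operations
+    // are covered as well as small ones before the hard kill.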
+ d.foo.insert({ _id: 3, x: 22 }); + d.foo.insert({ _id: 4, x: 22 }); + q.foo.insert({ _id: 4, x: 22 }); + d.a.insert({ _id: 3, x: 22, y: [1, 2, 3] }); + q.a.insert({ _id: 3, x: 22, y: [1, 2, 3] }); + d.a.insert({ _id: 4, x: 22, y: [1, 2, 3] }); + d.a.update({ _id: 4 }, { $inc: { x: 1} }); + // OpCode_ObjCopy fires on larger operations so make one that isn't tiny + var big = "axxxxxxxxxxxxxxb"; + big = big + big; + big = big + big; + big = big + big; + big = big + big; + big = big + big; + d.foo.insert({ _id: 5, q: "aaaaa", b: big, z: 3 }); + q.foo.insert({ _id: 5, q: "aaaaa", b: big, z: 3 }); + d.foo.insert({ _id: 6, q: "aaaaa", b: big, z: 3 }); + d.foo.update({ _id: 5 }, { $set: { z: 99} }); + + // assure writes applied in case we kill -9 on return from this function + d.getLastError(); + + log("endwork"); + + verify(); +} + +if( debugging ) { + // mongod already running in debugger + print("DOING DEBUG MODE BEHAVIOR AS 'db' IS DEFINED -- RUN mongo --nodb FOR REGULAR TEST BEHAVIOR"); + conn = db.getMongo(); + work(); + sleep(30000); + quit(); +} + +log(); + +// directories +var path1 = "/data/db/" + testname+"nodur"; +var path2 = "/data/db/" + testname+"dur"; + +// non-durable version +log(); +conn = startMongodEmpty("--port", 30000, "--dbpath", path1, "--nodur", "--smallfiles", "--master", "--oplogSize", 64); +work(); +stopMongod(30000); + +// durable version +log(); +conn = startMongodEmpty("--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", /*DurParanoid*/8, "--master", "--oplogSize", 64); +work(); + +// wait for group commit. +printjson(conn.getDB('admin').runCommand({getlasterror:1, fsync:1})); + +// kill the process hard +stopMongod(30001, /*signal*/9); + +// journal file should be present, and non-empty as we killed hard + +// restart and recover +log(); +conn = startMongodNoReset("--port", 30002, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8, "--master", "--oplogSize", 64); +verify(); + +log("stop"); +stopMongod(30002); + +// stopMongod seems to be asynchronous (hmmm) so we sleep here. 
+sleep(5000); + +// at this point, after clean shutdown, there should be no journal files +log("check no journal files"); +checkNoJournalFiles(path2 + "/journal"); + +log("check data matches ns"); +var diff = runDiff(path1 + "/test.ns", path2 + "/test.ns"); +if (diff != "") { + print("\n\n\nDIFFERS\n"); + print(diff); +} +assert(diff == "", "error test.ns files differ"); + +log("check data matches .0"); +diff = runDiff(path1 + "/test.0", path2 + "/test.0"); +if (diff != "") { + print("\n\n\nDIFFERS\n"); + print(diff); +} +assert(diff == "", "error test.0 files differ"); + +log("check data matches done"); + +print(testname + " SUCCESS"); diff --git a/jstests/error5.js b/jstests/error5.js index ed8d922..5884d20 100644 --- a/jstests/error5.js +++ b/jstests/error5.js @@ -2,7 +2,7 @@ t = db.error5 t.drop(); -assert.throws( function(){ t.save( 4 ); } , "A" ); +assert.throws( function(){ t.save( 4 ); printjson( t.findOne() ) } , null , "A" ); t.save( { a : 1 } ) assert.eq( 1 , t.count() , "B" ); diff --git a/jstests/eval_nolock.js b/jstests/eval_nolock.js new file mode 100644 index 0000000..2688ec5 --- /dev/null +++ b/jstests/eval_nolock.js @@ -0,0 +1,16 @@ + +t = db.eval_nolock +t.drop(); + +for ( i=0; i<10; i++ ) + t.insert( { _id : i } ); + +res = db.runCommand( { eval : + function(){ + db.eval_nolock.insert( { _id : 123 } ); + return db.eval_nolock.count(); + } + , nlock : true } ); + +assert.eq( 11 , res.retval , "A" ) + diff --git a/jstests/evalc.js b/jstests/evalc.js index 59c9467..8a9e889 100644 --- a/jstests/evalc.js +++ b/jstests/evalc.js @@ -7,20 +7,6 @@ for( i = 0; i < 10; ++i ) { // SERVER-1610 -function op() { - uri = db.runCommand( "whatsmyuri" ).you; - printjson( uri ); - p = db.currentOp().inprog; - for ( var i in p ) { - var o = p[ i ]; - if ( o.client == uri ) { - print( "found it" ); - return o.opid; - } - } - return -1; -} - s = startParallelShell( "print( 'starting forked:' + Date() ); for ( i=0; i<500000; i++ ){ db.currentOp(); } print( 'ending forked:' + Date() ); " ) print( "starting eval: " + Date() ) diff --git a/jstests/evald.js b/jstests/evald.js new file mode 100644 index 0000000..78cabb6 --- /dev/null +++ b/jstests/evald.js @@ -0,0 +1,68 @@ +t = db.jstests_evald; +t.drop(); + +function debug( x ) { +// printjson( x ); +} + +for( i = 0; i < 10; ++i ) { + t.save( {i:i} ); +} +db.getLastError(); + +function op( ev, where ) { + p = db.currentOp().inprog; + debug( p ); + for ( var i in p ) { + var o = p[ i ]; + if ( where ) { + if ( o.active && o.query && o.query.query && o.query.query.$where && o.ns == "test.jstests_evald" ) { + return o.opid; + } + } else { + if ( o.active && o.query && o.query.$eval && o.query.$eval == ev ) { + return o.opid; + } + } + } + return -1; +} + +function doIt( ev, wait, where ) { + + if ( where ) { + s = startParallelShell( ev ); + } else { + s = startParallelShell( "db.eval( '" + ev + "' )" ); + } + + o = null; + assert.soon( function() { o = op( ev, where ); return o != -1 } ); + + if ( wait ) { + sleep( 2000 ); + } + + debug( "going to kill" ); + + db.killOp( o ); + + debug( "sent kill" ); + + s(); + +} + +doIt( "db.jstests_evald.count( { $where: function() { while( 1 ) { ; } } } )", true, true ); +doIt( "db.jstests_evald.count( { $where: function() { while( 1 ) { ; } } } )", false, true ); +doIt( "while( true ) {;}", false ); +doIt( "while( true ) {;}", true ); + +// the for loops are currently required, as a spawned op masks the parent op - see SERVER-1931 +doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} 
db.jstests_evald.count( {i:10} ); }", true ); +doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} db.jstests_evald.count( {i:10} ); }", false ); +doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} db.jstests_evald.count(); }", true ); +doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} db.jstests_evald.count(); }", false ); + +doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} try { db.jstests_evald.count( {i:10} ); } catch ( e ) { } }", true ); +doIt( "while( 1 ) { try { while( 1 ) { ; } } catch ( e ) { } }", true ); diff --git a/jstests/evale.js b/jstests/evale.js new file mode 100644 index 0000000..af5a303 --- /dev/null +++ b/jstests/evale.js @@ -0,0 +1,5 @@ +t = db.jstests_evale; +t.drop(); + +db.eval( function() { return db.jstests_evale.count( { $where:function() { return true; } } ) } ); +db.eval( "db.jstests_evale.count( { $where:function() { return true; } } )" ); \ No newline at end of file diff --git a/jstests/evalf.js b/jstests/evalf.js new file mode 100644 index 0000000..12d0192 --- /dev/null +++ b/jstests/evalf.js @@ -0,0 +1,26 @@ +// test that killing a parent op interrupts the child op + +t = db.jstests_evalf; +t.drop(); + +if ( typeof _threadInject == "undefined" ) { // don't run in v8 mode - SERVER-1900 + +db.eval( function() { + opid = null; + while( opid == null ) { + ops = db.currentOp().inprog; + for( i in ops ) { + o = ops[ i ]; + if ( o.active && o.query && o.query.$eval ) { + opid = o.opid; + } + } + } + db.jstests_evalf.save( {opid:opid} ); + db.jstests_evalf.count( { $where:function() { + db.killOp( db.jstests_evalf.findOne().opid ); + while( 1 ) { ; } + } } ); + } ); + +} \ No newline at end of file diff --git a/jstests/exists.js b/jstests/exists.js index 28f69e8..3f1e904 100644 --- a/jstests/exists.js +++ b/jstests/exists.js @@ -25,7 +25,7 @@ function dotest( n ){ assert.eq( 3, t.count( {'a.b': {$exists:true}} ) , n ); assert.eq( 2, t.count( {'a.b.c': {$exists:true}} ) , n ); assert.eq( 1, t.count( {'a.b.c.d': {$exists:true}} ) , n ); - + assert.eq( 1, t.count( {a: {$exists:false}} ) , n ); assert.eq( 2, t.count( {'a.b': {$exists:false}} ) , n ); assert.eq( 3, t.count( {'a.b.c': {$exists:false}} ) , n ); @@ -38,6 +38,7 @@ t.ensureIndex( { "a.b" : 1 } ) t.ensureIndex( { "a.b.c" : 1 } ) t.ensureIndex( { "a.b.c.d" : 1 } ) dotest( "after index" ) +assert.eq( 1, t.find( {a: {$exists:false}} ).hint( {a:1} ).itcount() ); t.drop(); diff --git a/jstests/explain1.js b/jstests/explain1.js index 6d5ac55..2460c28 100644 --- a/jstests/explain1.js +++ b/jstests/explain1.js @@ -20,5 +20,5 @@ assert.eq( 20 , t.find( q ).limit(20).itcount() , "F" ); assert.eq( 49 , t.find(q).explain().n , "G" ); assert.eq( 20 , t.find(q).limit(20).explain().n , "H" ); -assert.eq( 49 , t.find(q).limit(-20).explain().n , "I" ); +assert.eq( 20 , t.find(q).limit(-20).explain().n , "I" ); diff --git a/jstests/explain2.js b/jstests/explain2.js index 4960e5a..6cb5160 100644 --- a/jstests/explain2.js +++ b/jstests/explain2.js @@ -16,12 +16,12 @@ function go( q , c , b , o ){ } q = { a : { $gt : 3 } } -go( q , 6 , 7 , 6 ); +go( q , 6 , 6 , 6 ); q.b = 5 -go( q , 1 , 1 , 1 ); +go( q , 1 , 6 , 1 ); delete q.b q.c = 5 -go( q , 1 , 7 , 6 ); +go( q , 1 , 6 , 6 ); diff --git a/jstests/explain3.js b/jstests/explain3.js new file mode 100644 index 0000000..69dcac5 --- /dev/null +++ b/jstests/explain3.js @@ -0,0 +1,24 @@ +/** SERVER-2451 Kill cursor while explain is yielding */ + +t = db.jstests_explain3; +t.drop(); + +t.ensureIndex( {i:1} ); +for( var i = 0; i < 10000; ++i ) { + t.save( 
{i:i,j:0} ); +} +db.getLastError(); + +s = startParallelShell( "sleep( 20 ); db.jstests_explain3.dropIndex( {i:1} );" ); + +try { + t.find( {i:{$gt:-1},j:1} ).hint( {i:1} ).explain() +} catch (e) { + print( "got exception" ); + printjson( e ); +} + +s(); + +// Sanity check to make sure mongod didn't seg fault. +assert.eq( 10000, t.count() ); \ No newline at end of file diff --git a/jstests/find_and_modify3.js b/jstests/find_and_modify3.js index 1d30204..4214dfb 100644 --- a/jstests/find_and_modify3.js +++ b/jstests/find_and_modify3.js @@ -8,13 +8,13 @@ t.insert({_id:2, other:2, comments:[{i:0, j:0}, {i:1, j:1}]}); orig0 = t.findOne({_id:0}) orig2 = t.findOne({_id:2}) -out = t.findAndModify({query: {_id:1, 'comments.i':0}, update: {$set: {'comments.$.j':2}}, 'new': true}); +out = t.findAndModify({query: {_id:1, 'comments.i':0}, update: {$set: {'comments.$.j':2}}, 'new': true, sort:{other:1}}); assert.eq(out.comments[0], {i:0, j:2}); assert.eq(out.comments[1], {i:1, j:1}); assert.eq(t.findOne({_id:0}), orig0); assert.eq(t.findOne({_id:2}), orig2); -out = t.findAndModify({query: {other:1, 'comments.i':1}, update: {$set: {'comments.$.j':3}}, 'new': true}); +out = t.findAndModify({query: {other:1, 'comments.i':1}, update: {$set: {'comments.$.j':3}}, 'new': true, sort:{other:1}}); assert.eq(out.comments[0], {i:0, j:2}); assert.eq(out.comments[1], {i:1, j:3}); assert.eq(t.findOne({_id:0}), orig0); diff --git a/jstests/geo_borders.js b/jstests/geo_borders.js new file mode 100644 index 0000000..85ffe35 --- /dev/null +++ b/jstests/geo_borders.js @@ -0,0 +1,189 @@ + +t = db.borders +t.drop() + +// FIXME: FAILS for all epsilon < 1 +epsilon = 1 +//epsilon = 0.99 + +// For these tests, *required* that step ends exactly on max +min = -1 +max = 1 +step = 1 +numItems = 0; + +for(var x = min; x <= max; x += step){ + for(var y = min; y <= max; y += step){ + t.insert({ loc: { x : x, y : y } }) + numItems++; + } +} + +overallMin = -1 +overallMax = 1 + +// Create a point index slightly smaller than the points we have +t.ensureIndex({ loc : "2d" }, { max : overallMax - epsilon / 2, min : overallMin + epsilon / 2}) +assert(db.getLastError(), "A1") + +// FIXME: FAILS for all epsilon < 1 +// Create a point index only slightly bigger than the points we have +t.ensureIndex({ loc : "2d" }, { max : overallMax + epsilon, min : overallMin - epsilon }) +assert.isnull(db.getLastError(), "A2") + + + + + + + + +//************ +// Box Tests +//************ + + +/* +// FIXME: Fails w/ non-nice error +// Make sure we can get all points in full bounds +assert(numItems == t.find({ loc : { $within : { $box : [[overallMin - epsilon, + overallMin - epsilon], + [overallMax + epsilon, + overallMax + epsilon]] } } }).count(), "B1"); +*/ + +// Make sure an error is thrown if the bounds are bigger than the box itself +// TODO: Do we really want an error in this case? Shouldn't we just clip the box? 
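+// the query below is expected to throw because this $box extends past the declared
+// min/max of the 2d index; the empty catch swallows the expected error.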
+try{ + t.findOne({ loc : { $within : { $box : [[overallMin - 2 * epsilon, + overallMin - 2 * epsilon], + [overallMax + 2 * epsilon, + overallMax + 2 * epsilon]] } } }); + assert(false, "B2"); +} +catch(e){} + +//Make sure we can get at least close to the bounds of the index +assert(numItems == t.find({ loc : { $within : { $box : [[overallMin - epsilon / 2, + overallMin - epsilon / 2], + [overallMax + epsilon / 2, + overallMax + epsilon / 2]] } } }).count(), "B3"); + + +//************** +//Circle tests +//************** + +center = (overallMax + overallMin) / 2 +center = [center, center] +radius = overallMax + +offCenter = [center[0] + radius, center[1] + radius] +onBounds = [offCenter[0] + epsilon, offCenter[1] + epsilon] +offBounds = [onBounds[0] + epsilon, onBounds[1] + epsilon] + + +//Make sure we can get all points when radius is exactly at full bounds +assert(0 < t.find({ loc : { $within : { $center : [center, radius + epsilon] } } }).count(), "C1"); + +//Make sure we can get points when radius is over full bounds +assert(0 < t.find({ loc : { $within : { $center : [center, radius + 2 * epsilon] } } }).count(), "C2"); + +//Make sure we can get points when radius is over full bounds, off-centered +assert(0 < t.find({ loc : { $within : { $center : [offCenter, radius + 2 * epsilon] } } }).count(), "C3"); + +//Make sure we get correct corner point when center is in bounds +// (x bounds wrap, so could get other corner) +cornerPt = t.findOne({ loc : { $within : { $center : [offCenter, step / 2] } } }); +assert(cornerPt.loc.y == overallMax, "C4") + +/* +// FIXME: FAILS, returns opposite corner +// Make sure we get correct corner point when center is on bounds +cornerPt = t.findOne({ loc : { $within : { $center : [onBounds, + Math.sqrt(2 * epsilon * epsilon) + (step / 2) ] } } }); +assert(cornerPt.loc.y == overallMax, "C5") +*/ + +// TODO: Handle gracefully? +// Make sure we can't get corner point when center is over bounds +try{ + t.findOne({ loc : { $within : { $center : [offBounds, + Math.sqrt(8 * epsilon * epsilon) + (step / 2) ] } } }); + assert(false, "C6") +} +catch(e){} + + + + + + + +//*********** +//Near tests +//*********** + +//Make sure we can get all nearby points to point in range +assert(t.find({ loc : { $near : offCenter } }).next().loc.y == overallMax, + "D1"); + +/* +// FIXME: FAILS, returns opposite list +// Make sure we can get all nearby points to point on boundary +assert(t.find({ loc : { $near : onBounds } }).next().loc.y == overallMax, + "D2"); +*/ + +//TODO: Could this work? +//Make sure we can't get all nearby points to point over boundary +try{ + t.findOne({ loc : { $near : offBounds } }) + assert(false, "D3") +} +catch(e){} + +/* +// FIXME: FAILS, returns only single point +//Make sure we can get all nearby points within one step (4 points in top corner) +assert(4 == t.find({ loc : { $near : offCenter, $maxDistance : step * 1.9 } }).count(), + "D4"); +*/ + + + +//************** +//Command Tests +//************** + + +//Make sure we can get all nearby points to point in range +assert(db.runCommand({ geoNear : "borders", near : offCenter }).results[0].obj.loc.y == overallMax, + "E1"); + + +/* +// FIXME: FAILS, returns opposite list +//Make sure we can get all nearby points to point on boundary +assert(db.runCommand({ geoNear : "borders", near : onBounds }).results[0].obj.loc.y == overallMax, + "E2"); +*/ + +//TODO: Could this work? 
+//Make sure we can't get all nearby points to point over boundary +try{ + db.runCommand({ geoNear : "borders", near : offBounds }).results.length + assert(false, "E3") +} +catch(e){} + + +/* +// FIXME: Fails, returns one point +//Make sure we can get all nearby points within one step (4 points in top corner) +assert(4 == db.runCommand({ geoNear : "borders", near : offCenter, maxDistance : step * 1.5 }).results.length, + "E4"); +*/ + + + diff --git a/jstests/geo_center_sphere1.js b/jstests/geo_center_sphere1.js new file mode 100644 index 0000000..dd7c98a --- /dev/null +++ b/jstests/geo_center_sphere1.js @@ -0,0 +1,93 @@ + +t = db.geo_center_sphere1; +t.drop(); + +skip = 3 // lower for more rigor, higher for more speed (tested with .5, .678, 1, 2, 3, and 4) + +searches = [ + // x , y rad + [ [ 5 , 0 ] , 0.05 ] , // ~200 miles + [ [ 135 , 0 ] , 0.05 ] , + + [ [ 5 , 70 ] , 0.05 ] , + [ [ 135 , 70 ] , 0.05 ] , + [ [ 5 , 85 ] , 0.05 ] , + + [ [ 20 , 0 ] , 0.25 ] , // ~1000 miles + [ [ 20 , -45 ] , 0.25 ] , + [ [ -20 , 60 ] , 0.25 ] , + [ [ -20 , -70 ] , 0.25 ] , +]; +correct = searches.map( function(z){ return []; } ); + +num = 0; + +for ( x=-179; x<=179; x += skip ){ + for ( y=-89; y<=89; y += skip ){ + o = { _id : num++ , loc : [ x , y ] } + t.save( o ) + for ( i=0; i 0 ) { + assert.eq.automsg( "2", "t.find( { a:5, b:{$gte:5.5,$lte:6}, c:5 } ).sort( sort ).explain().nscanned" ); + assert.eq.automsg( "2", "t.find( { a:5, b:{$gte:5,$lte:5.5}, c:5 } ).sort( sort ).explain().nscanned" ); + } else { + assert.eq.automsg( "2", "t.find( { a:5, b:{$gte:5.5,$lte:6}, c:5 } ).sort( sort ).explain().nscanned" ); + assert.eq.automsg( "2", "t.find( { a:5, b:{$gte:5,$lte:5.5}, c:5 } ).sort( sort ).explain().nscanned" ); + } +assert.eq.automsg( "7", "t.find( { a:5, b:{$gte:5,$lte:7}, c:5 } ).sort( sort ).explain().nscanned" ); +assert.eq.automsg( "4", "t.find( { a:{$gte:5,$lte:6}, b:5, c:5 } ).sort( sort ).explain().nscanned" ); + if ( s.a > 0 ) { + assert.eq.automsg( "2", "t.find( { a:{$gte:5.5,$lte:6}, b:5, c:5 } ).sort( sort ).explain().nscanned" ); + assert.eq.automsg( "2", "t.find( { a:{$gte:5,$lte:5.5}, b:5, c:5 } ).sort( sort ).explain().nscanned" ); + assert.eq.automsg( "3", "t.find( { a:{$gte:5.5,$lte:6}, b:5, c:{$gte:5,$lte:6} } ).sort( sort ).explain().nscanned" ); + } else { + assert.eq.automsg( "2", "t.find( { a:{$gte:5.5,$lte:6}, b:5, c:5 } ).sort( sort ).explain().nscanned" ); + assert.eq.automsg( "2", "t.find( { a:{$gte:5,$lte:5.5}, b:5, c:5 } ).sort( sort ).explain().nscanned" ); + assert.eq.automsg( "3", "t.find( { a:{$gte:5.5,$lte:6}, b:5, c:{$gte:5,$lte:6} } ).sort( sort ).explain().nscanned" ); + } +assert.eq.automsg( "7", "t.find( { a:{$gte:5,$lte:7}, b:5, c:5 } ).sort( sort ).explain().nscanned" ); +assert.eq.automsg( "6", "t.find( { a:{$gte:5,$lte:6}, b:5, c:{$gte:5,$lte:6} } ).sort( sort ).explain().nscanned" ); +assert.eq.automsg( "6", "t.find( { a:5, b:{$gte:5,$lte:6}, c:{$gte:5,$lte:6} } ).sort( sort ).explain().nscanned" ); +assert.eq.automsg( "10", "t.find( { a:{$gte:5,$lte:6}, b:{$gte:5,$lte:6}, c:5 } ).sort( sort ).explain().nscanned" ); +assert.eq.automsg( "14", "t.find( { a:{$gte:5,$lte:6}, b:{$gte:5,$lte:6}, c:{$gte:5,$lte:6} } ).sort( sort ).explain().nscanned" ); } for ( var a = -1; a <= 1; a += 2 ) { diff --git a/jstests/index_check7.js b/jstests/index_check7.js index 68102d6..1d0aaeb 100644 --- a/jstests/index_check7.js +++ b/jstests/index_check7.js @@ -11,5 +11,5 @@ assert.eq( 1 , t.find( { x : 27 } ).explain().nscanned , "A" ) t.ensureIndex( { x : -1 } ) assert.eq( 1 , 
t.find( { x : 27 } ).explain().nscanned , "B" ) -assert.eq( 41 , t.find( { x : { $gt : 59 } } ).explain().nscanned , "C" ); +assert.eq( 40 , t.find( { x : { $gt : 59 } } ).explain().nscanned , "C" ); diff --git a/jstests/index_many2.js b/jstests/index_many2.js index 3fca5f5..f113b8b 100644 --- a/jstests/index_many2.js +++ b/jstests/index_many2.js @@ -27,3 +27,5 @@ assert.eq( num - 1 , t.getIndexKeys().length , "B0" ) t.ensureIndex( { z : 1 } ) assert.eq( num , t.getIndexKeys().length , "B1" ) +t.dropIndex( "*" ); +assert.eq( 1 , t.getIndexKeys().length , "C1" ) diff --git a/jstests/index_sparse1.js b/jstests/index_sparse1.js new file mode 100644 index 0000000..f2805b3 --- /dev/null +++ b/jstests/index_sparse1.js @@ -0,0 +1,46 @@ + +t = db.index_sparse1; +t.drop(); + +t.insert( { _id : 1 , x : 1 } ) +t.insert( { _id : 2 , x : 2 } ) +t.insert( { _id : 3 , x : 2 } ) +t.insert( { _id : 4 } ) +t.insert( { _id : 5 } ) + +assert.eq( 5 , t.count() , "A1" ) +assert.eq( 5 , t.find().sort( { x : 1 } ).itcount() , "A2" ) + +t.ensureIndex( { x : 1 } ) +assert.eq( 2 , t.getIndexes().length , "B1" ) +assert.eq( 5 , t.find().sort( { x : 1 } ).itcount() , "B2" ) +t.dropIndex( { x : 1 } ) +assert.eq( 1 , t.getIndexes().length , "B3" ) + +t.ensureIndex( { x : 1 } , { sparse : 1 } ) +assert.eq( 2 , t.getIndexes().length , "C1" ) +assert.eq( 3 , t.find().sort( { x : 1 } ).itcount() , "C2" ) +t.dropIndex( { x : 1 } ) +assert.eq( 1 , t.getIndexes().length , "C3" ) + +// -- sparse & unique + +t.remove( { _id : 2 } ) + +// test that we can't create a unique index without sparse +t.ensureIndex( { x : 1 } , { unique : 1 } ) +assert( db.getLastError() , "D1" ) +assert.eq( 1 , t.getIndexes().length , "D2" ) + + +t.ensureIndex( { x : 1 } , { unique : 1 , sparse : 1 } ) +assert.eq( 2 , t.getIndexes().length , "E1" ) +t.dropIndex( { x : 1 } ) +assert.eq( 1 , t.getIndexes().length , "E3" ) + + +t.insert( { _id : 2 , x : 2 } ) +t.ensureIndex( { x : 1 } , { unique : 1 , sparse : 1 } ) +assert.eq( 1 , t.getIndexes().length , "F1" ) + + diff --git a/jstests/index_sparse2.js b/jstests/index_sparse2.js new file mode 100644 index 0000000..2b16c9d --- /dev/null +++ b/jstests/index_sparse2.js @@ -0,0 +1,21 @@ +t = db.index_sparse2; +t.drop(); + +t.insert( { _id : 1 , x : 1 , y : 1 } ) +t.insert( { _id : 2 , x : 2 } ) +t.insert( { _id : 3 } ) + +t.ensureIndex( { x : 1 , y : 1 } ) +assert.eq( 2 , t.getIndexes().length , "A1" ) +assert.eq( 3 , t.find().sort( { x : 1 , y : 1 } ).itcount() , "A2" ) +t.dropIndex( { x : 1 , y : 1 } ) +assert.eq( 1 , t.getIndexes().length , "A3" ) + +t.ensureIndex( { x : 1 , y : 1 } , { sparse : 1 } ) +assert.eq( 2 , t.getIndexes().length , "B1" ) +assert.eq( 2 , t.find().sort( { x : 1 , y : 1 } ).itcount() , "B2" ) +t.dropIndex( { x : 1 , y : 1 } ) +assert.eq( 1 , t.getIndexes().length , "B3" ) + + + diff --git a/jstests/indexh.js b/jstests/indexh.js index c6aad18..ac2a93e 100644 --- a/jstests/indexh.js +++ b/jstests/indexh.js @@ -6,11 +6,17 @@ function debug( t ) { print( t ); } +function extraDebug() { +// printjson( db.stats() ); +// db.printCollectionStats(); +} + // index extent freeing t.drop(); t.save( {} ); var s1 = db.stats().dataSize; debug( "s1: " + s1 ); +extraDebug(); t.ensureIndex( {a:1} ); var s2 = db.stats().dataSize; debug( "s2: " + s2 ); @@ -18,6 +24,7 @@ assert.automsg( "s1 < s2" ); t.dropIndex( {a:1} ); var s3 = db.stats().dataSize; debug( "s3: " + s3 ); +extraDebug(); assert.eq.automsg( "s1", "s3" ); // index node freeing diff --git a/jstests/indexi.js b/jstests/indexi.js new file 
mode 100644 index 0000000..b54ffce --- /dev/null +++ b/jstests/indexi.js @@ -0,0 +1,16 @@ +t = db.jstests_indexi; + +t.drop(); + +for( var a = 0; a < 10; ++a ) { + for( var b = 0; b < 10; ++b ) { + for( var c = 0; c < 10; ++c ) { + t.save( {a:a,b:b,c:c} ); + } + } +} + +t.ensureIndex( {a:1,b:1,c:1} ); +t.ensureIndex( {a:1,c:1} ); + +assert.automsg( "!t.find( {a:{$gt:1,$lt:10},c:{$gt:1,$lt:10}} ).explain().indexBounds.b" ); \ No newline at end of file diff --git a/jstests/indexj.js b/jstests/indexj.js new file mode 100644 index 0000000..0d1afc2 --- /dev/null +++ b/jstests/indexj.js @@ -0,0 +1,44 @@ +// SERVER-726 + +t = db.jstests_indexj; +t.drop(); + +t.ensureIndex( {a:1} ); +t.save( {a:5} ); +assert.eq( 0, t.find( { a: { $gt:4, $lt:5 } } ).explain().nscanned, "A" ); + +t.drop(); +t.ensureIndex( {a:1} ); +t.save( {a:4} ); +assert.eq( 0, t.find( { a: { $gt:4, $lt:5 } } ).explain().nscanned, "B" ); + +t.save( {a:5} ); +assert.eq( 0, t.find( { a: { $gt:4, $lt:5 } } ).explain().nscanned, "D" ); + +t.save( {a:4} ); +assert.eq( 0, t.find( { a: { $gt:4, $lt:5 } } ).explain().nscanned, "C" ); + +t.save( {a:5} ); +assert.eq( 0, t.find( { a: { $gt:4, $lt:5 } } ).explain().nscanned, "D" ); + +t.drop(); +t.ensureIndex( {a:1,b:1} ); +t.save( { a:1,b:1 } ); +t.save( { a:1,b:2 } ); +t.save( { a:2,b:1 } ); +t.save( { a:2,b:2 } ); + +assert.eq( 2, t.find( { a:{$in:[1,2]}, b:{$gt:1,$lt:2} } ).explain().nscanned ); +assert.eq( 2, t.find( { a:{$in:[1,2]}, b:{$gt:1,$lt:2} } ).sort( {a:-1,b:-1} ).explain().nscanned ); + +t.save( {a:1,b:1} ); +t.save( {a:1,b:1} ); +assert.eq( 2, t.find( { a:{$in:[1,2]}, b:{$gt:1,$lt:2} } ).explain().nscanned ); +assert.eq( 2, t.find( { a:{$in:[1,2]}, b:{$gt:1,$lt:2} } ).explain().nscanned ); +assert.eq( 2, t.find( { a:{$in:[1,2]}, b:{$gt:1,$lt:2} } ).sort( {a:-1,b:-1} ).explain().nscanned ); + +assert.eq( 1, t.find( { a:{$in:[1,1.9]}, b:{$gt:1,$lt:2} } ).explain().nscanned ); +assert.eq( 1, t.find( { a:{$in:[1.1,2]}, b:{$gt:1,$lt:2} } ).sort( {a:-1,b:-1} ).explain().nscanned ); + +t.save( { a:1,b:1.5} ); +assert.eq( 3, t.find( { a:{$in:[1,2]}, b:{$gt:1,$lt:2} } ).explain().nscanned, "F" ); diff --git a/jstests/insert2.js b/jstests/insert2.js new file mode 100644 index 0000000..442e7dc --- /dev/null +++ b/jstests/insert2.js @@ -0,0 +1,8 @@ + +t = db.insert2 +t.drop() + +assert.isnull( t.findOne() , "A" ) +t.insert( { z : 1 , $inc : { x : 1 } } , true ); +assert.isnull( t.findOne() , "B" ) + diff --git a/jstests/jni2.js b/jstests/jni2.js index 221780d..53ad58c 100644 --- a/jstests/jni2.js +++ b/jstests/jni2.js @@ -14,8 +14,8 @@ assert.throws( function(){ db.jni2t.save( { y : 1 } ); return 1; } - } ).length(); -} , "can't save from $where" ); + } ).forEach( printjson ); +} , null , "can't save from $where" ); assert.eq( 0 , db.jni2t.find().length() , "B" ) diff --git a/jstests/killop.js b/jstests/killop.js new file mode 100644 index 0000000..b5e50d9 --- /dev/null +++ b/jstests/killop.js @@ -0,0 +1,43 @@ +t = db.jstests_killop +t.drop(); + +if ( typeof _threadInject == "undefined" ) { // don't run in v8 mode - SERVER-1900 + +function debug( x ) { +// printjson( x ); +} + +t.save( {} ); +db.getLastError(); + +function ops() { + p = db.currentOp().inprog; + debug( p ); + ids = []; + for ( var i in p ) { + var o = p[ i ]; + if ( o.active && o.query && o.query.query && o.query.query.$where && o.ns == "test.jstests_killop" ) { + ids.push( o.opid ); + } + } + return ids; +} + +s1 = startParallelShell( "db.jstests_killop.count( { $where: function() { while( 1 ) { ; } } } )" ); +s2 = 
startParallelShell( "db.jstests_killop.count( { $where: function() { while( 1 ) { ; } } } )" ); + +o = []; +assert.soon( function() { o = ops(); return o.length == 2; } ); +debug( o ); +db.killOp( o[ 0 ] ); +db.killOp( o[ 1 ] ); + +start = new Date(); + +s1(); +s2(); + +// don't want to pass if timeout killed the js function +assert( ( new Date() ) - start < 30000 ); + +} \ No newline at end of file diff --git a/jstests/libs/concurrent.js b/jstests/libs/concurrent.js new file mode 100644 index 0000000..9198818 --- /dev/null +++ b/jstests/libs/concurrent.js @@ -0,0 +1,30 @@ +/* NOTE: Requires mongo shell to be built with V8 javascript engine, +which implements concurrent threads via fork() */ + +// Fork and start +function fork_(thunk) { + thread = fork(thunk) + thread.start() + return thread +} + +// In functional form, useful for high-order functions like map in fun.js +function join_(thread) {thread.join()} + +// Fork a loop on each one-arg block and wait for all of them to terminate. Foreground blocks are executed n times, background blocks are executed repeatedly until all forground loops finish. If any fail, stop all loops and reraise exception in main thread +function parallel(n, foregroundBlock1s, backgroundBlock1s) { + var err = null + var stop = false + function loop(m) {return function(block1) {return function() { + for (var i = 0; i < m; i++) {if (stop) break; block1(i)} }}} + function watch(block) {return function() { + try {block()} catch(e) {err = e; stop = true}}} + foreThunks = map(watch, map(loop(n), foregroundBlock1s)) + backThunks = map(watch, map(loop(Infinity), backgroundBlock1s)) + foreThreads = map(fork_, foreThunks) + backThreads = map(fork_, backThunks) + map(join_, foreThreads) + stop = true + map(join_, backThreads) + if (err != null) throw err +} diff --git a/jstests/libs/fun.js b/jstests/libs/fun.js new file mode 100644 index 0000000..276f32a --- /dev/null +++ b/jstests/libs/fun.js @@ -0,0 +1,32 @@ +// General high-order functions + +function forEach (action, array) { + for (var i = 0; i < array.length; i++) + action (array[i]); +} + +function foldl (combine, base, array) { + for (var i = 0; i < array.length; i++) + base = combine (base, array[i]); + return base +} + +function foldr (combine, base, array) { + for (var i = array.length - 1; i >= 0; i--) + base = combine (array[i], base); + return base +} + +function map (func, array) { + var result = []; + for (var i = 0; i < array.length; i++) + result.push (func (array[i])); + return result +} + +function filter (pred, array) { + var result = [] + for (var i = 0; i < array.length; i++) + if (pred (array[i])) result.push (array[i]); + return result +} diff --git a/jstests/libs/geo_near_random.js b/jstests/libs/geo_near_random.js new file mode 100644 index 0000000..8624ef2 --- /dev/null +++ b/jstests/libs/geo_near_random.js @@ -0,0 +1,78 @@ +GeoNearRandomTest = function(name) { + this.name = name; + this.t = db[name]; + this.nPts = 0; + + // reset state + this.t.drop(); + Random.srand(1234); + + print("starting test: " + name); +} + + +GeoNearRandomTest.prototype.mkPt = function mkPt(scale){ + scale = scale || 1; // scale is good for staying away from edges + return [((Random.rand() * 359.8) - 179.9) * scale, ((Random.rand() * 180) - 90) * scale]; +} + +GeoNearRandomTest.prototype.insertPts = function(nPts) { + assert.eq(this.nPts, 0, "insertPoints already called"); + this.nPts = nPts; + + for (var i=0; i FreshPorts Server +function Server (name) { + this.dbpath = '/data/db/' + name + nextPort + this.port = 
nextPort++ + this.noprealloc = '' + this.smallfiles = '' + this.rest = '' + this.oplogSize = 8 +} + +Server.prototype.addr = '127.0.0.1' + +// Server -> String +Server.prototype.host = function() { + return this.addr + ':' + this.port +} + +// Start a new server with this spec and return connection to it +// Server -> IO Connection +Server.prototype.begin = function() { + return startMongodEmpty(this) +} + +// Stop server and remove db directory +// Server -> IO () +Server.prototype.end = function() { + print('Stopping mongod on port ' + this.port) + stopMongod (this.port) + resetDbpath (this.dbpath) +} + +// Cut server from network so it is unreachable (but still alive) +// Requires sudo access and ipfw program (Mac OS X and BSD Unix). TODO: use iptables on Linux. +function cutServer (conn) { + var addrport = parseHost (conn.host) + cutNetwork (addrport.port) +} + +// Ensure server is connected to network (undo cutServer) +// Requires sudo access and ipfw program (Mac OS X and BSD Unix). TODO: use iptables on Linux. +function uncutServer (conn) { + var iport = parseHost (conn.host) + restoreNetwork (iport.port) +} + +// Kill server process at other end of this connection +function killServer (conn, _signal) { + var signal = _signal || 15 + var iport = parseHost (conn.host) + stopMongod (iport.port, signal) +} + +/*** ReplicaSet is the spec of a replica set, ie. options given to ReplicaSetTest. + To start a replica set call 'begin' ***/ +// new ReplicaSet :: String -> Int -> FreshPorts ReplicaSet +function ReplicaSet (name, numServers) { + this.name = name + this.host = '127.0.0.1' + this.nodes = numServers + this.startPort = nextPort + this.oplogSize = 40 + nextPort += numServers +} + +// Start a replica set with this spec and return ReplSetTest, which hold connections to the servers including the master server. Call ReplicaSetTest.stopSet() to end all servers +// ReplicaSet -> IO ReplicaSetTest +ReplicaSet.prototype.begin = function() { + var rs = new ReplSetTest(this) + rs.startSet() + rs.initiate() + rs.awaitReplication() + return rs +} + +// Create a new server and add it to replica set +// ReplicaSetTest -> IO Connection +ReplSetTest.prototype.addServer = function() { + var conn = this.add() + nextPort++ + this.reInitiate() + this.awaitReplication() + assert.soon(function() { + var doc = conn.getDB('admin').isMaster() + return doc['ismaster'] || doc['secondary'] + }) + return conn +} + +/*** ConfigSet is a set of specs (Servers) for sharding config servers. + Supply either the servers or the number of servers desired. + To start the config servers call 'begin' ***/ +// new ConfigSet :: [Server] or Int -> FreshPorts ConfigSet +function ConfigSet (configSvrsOrNumSvrs) { + if (typeof configSvrsOrNumSvrs == 'number') { + this.configSvrs = [] + for (var i = 0; i < configSvrsOrNumSvrs; i++) + this.configSvrs.push (new Server ('config')) + } else + this.configSvrs = configSvrs +} + +// Start config servers, return list of connections to them +// ConfigSet -> IO [Connection] +ConfigSet.prototype.begin = function() { + return map (function(s) {return s.begin()}, this.configSvrs) +} + +// Stop config servers +// ConfigSet -> IO () +ConfigSet.prototype.end = function() { + return map (function(s) {return s.end()}, this.configSvrs) +} + +/*** Router is the spec for a mongos, ie, its command line options. 
+ To start a router (mongos) call 'begin' ***/ +// new Router :: ConfigSet -> FreshPorts Router +function Router (configSet) { + this.port = nextPort++ + this.v = 0 + this.configdb = map (function(s) {return s.host()}, configSet.configSvrs) .join(',') + this.chunkSize = 1 +} + +// Start router (mongos) with this spec and return connection to it +// Router -> IO Connection +Router.prototype.begin = function() { + return startMongos (this) +} + +// Stop router +// Router -> IO () +Router.prototype.end = function() { + return stopMongoProgram (this.port) +} + +// Add shard to config via router (mongos) connection. Shard is either a replSet name (replSet.getURL()) or single server (server.host) +// Connection -> String -> IO () +function addShard (routerConn, repSetOrHostName) { + var ack = routerConn.getDB('admin').runCommand ({addshard: repSetOrHostName}) + assert (ack['ok'], tojson(ack)) +} + +// Connection -> String -> IO () +function enableSharding (routerConn, dbName) { + var ack = routerConn.getDB('admin').runCommand ({enablesharding: dbName}) + assert (ack['ok'], tojson(ack)) +} + +// Connection -> String -> String -> String -> IO () +function shardCollection (routerConn, dbName, collName, shardKey) { + var ack = routerConn.getDB('admin').runCommand ({shardcollection: dbName + '.' + collName, key: shardKey}) + assert (ack['ok'], tojson(ack)) +} + +// Move db from its current primary shard to given shard. Shard is either a replSet name (replSet.getURL()) or single server (server.host) +// Connection -> String -> String -> IO () +function moveDB (routerConn, dbname, repSetOrHostName) { + var ack = routerConn.getDB('admin').runCommand ({moveprimary: dbname, to: repSetOrHostName}) + printjson(ack) + assert (ack['ok'], tojson(ack)) +} diff --git a/jstests/libs/network.js b/jstests/libs/network.js new file mode 100644 index 0000000..e5b33f3 --- /dev/null +++ b/jstests/libs/network.js @@ -0,0 +1,37 @@ + +// Parse "127.0.0.1:300" into {addr: "127.0.0.1", port: 300}, +// and "127.0.0.1" into {addr: "127.0.0.1", port: undefined} +function parseHost (hostString) { + var items = hostString.match(/(\d+.\d+.\d+.\d+)(:(\d+))?/) + return {addr: items[1], port: parseInt(items[3])} +} + + +/* Network traffic shaping (packet dropping) to simulate network problems + Currently works on BSD Unix and Mac OS X only (using ipfw). + Requires sudo access. + TODO: make it work on Linux too (using iptables). 
*/ + +var nextRuleNum = 100 // this grows indefinitely but can't exceed 65534, so can't call routines below indefinitely +var portRuleNum = {} + +// Cut network connection to local port by dropping packets using iptables +function cutNetwork (port) { + portRuleNum[port] = nextRuleNum + runProgram ('sudo', 'ipfw', 'add ' + nextRuleNum++ + ' deny tcp from any to any ' + port) + runProgram ('sudo', 'ipfw', 'add ' + nextRuleNum++ + ' deny tcp from any ' + port + ' to any') + //TODO: confirm it worked (since sudo may not work) + runProgram ('sudo', 'ipfw', 'show') +} + +// Restore network connection to local port by not dropping packets using iptables +function restoreNetwork (port) { + var ruleNum = portRuleNum[port] + if (ruleNum) { + runProgram ('sudo', 'ipfw', 'delete ' + ruleNum++) + runProgram ('sudo', 'ipfw', 'delete ' + ruleNum) + delete portRuleNum[port] + } + //TODO: confirm it worked (since sudo may not work) + runProgram ('sudo', 'ipfw', 'show') +} diff --git a/jstests/misc/biginsert.js b/jstests/misc/biginsert.js new file mode 100755 index 0000000..ebbdc18 --- /dev/null +++ b/jstests/misc/biginsert.js @@ -0,0 +1,18 @@ +o = "xxxxxxxxxxxxxxxxxxx"; +o = o + o; +o + o; +o = o + o; +o = o + o; +o = o + o; + +var B = 40000; +var last = new Date(); +for (i = 0; i < 30000000; i++) { + db.foo.insert({ o: o }); + if (i % B == 0) { + var n = new Date(); + print(i); + print("per sec: " + B*1000 / (n - last)); + last = n; + } +} diff --git a/jstests/mr1.js b/jstests/mr1.js index aacd69b..dc81534 100644 --- a/jstests/mr1.js +++ b/jstests/mr1.js @@ -49,7 +49,7 @@ r2 = function( key , values ){ return total; }; -res = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r } ); +res = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r , out : "mr1_out" } ); d( res ); if ( ks == "_id" ) assert( res.ok , "not ok" ); assert.eq( 4 , res.counts.input , "A" ); @@ -66,7 +66,7 @@ assert.eq( 3 , z.b , "E" ); assert.eq( 3 , z.c , "F" ); x.drop(); -res = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r , query : { x : { "$gt" : 2 } } } ); +res = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r , query : { x : { "$gt" : 2 } } , out : "mr1_out" } ); d( res ); assert.eq( 2 , res.counts.input , "B" ); x = db[res.result]; @@ -77,7 +77,7 @@ assert.eq( 1 , z.b , "C2" ); assert.eq( 2 , z.c , "C3" ); x.drop(); -res = db.runCommand( { mapreduce : "mr1" , map : m2 , reduce : r2 , query : { x : { "$gt" : 2 } } } ); +res = db.runCommand( { mapreduce : "mr1" , map : m2 , reduce : r2 , query : { x : { "$gt" : 2 } } , out : "mr1_out" } ); d( res ); assert.eq( 2 , res.counts.input , "B" ); x = db[res.result]; @@ -104,7 +104,7 @@ for ( i=5; i<1000; i++ ){ t.save( { x : i , tags : [ "b" , "d" ] } ); } -res = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r } ); +res = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r , out : "mr1_out" } ); d( res ); assert.eq( 999 , res.counts.input , "Z1" ); x = db[res.result]; @@ -125,12 +125,12 @@ assert.eq( 995 , getk( "d" ).value.count , "ZD" ); x.drop(); if ( true ){ - printjson( db.runCommand( { mapreduce : "mr1" , map : m , reduce : r , verbose : true } ) ); + printjson( db.runCommand( { mapreduce : "mr1" , map : m , reduce : r , verbose : true , out : "mr1_out" } ) ); } print( "t1: " + Date.timeFunc( function(){ - var out = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r } ); + var out = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r , out : "mr1_out" } ); if ( ks == "_id" ) assert( out.ok , "XXX : " + tojson( out ) ); 
db[out.result].drop(); } , 10 ) + " (~500 on 2.8ghz) - itcount: " + Date.timeFunc( function(){ db.mr1.find().itcount(); } , 10 ) ); @@ -138,7 +138,7 @@ print( "t1: " + Date.timeFunc( // test doesn't exist -res = db.runCommand( { mapreduce : "lasjdlasjdlasjdjasldjalsdj12e" , map : m , reduce : r } ); +res = db.runCommand( { mapreduce : "lasjdlasjdlasjdjasldjalsdj12e" , map : m , reduce : r , out : "mr1_out" } ); assert( ! res.ok , "should be not ok" ); if ( true ){ @@ -166,11 +166,15 @@ if ( true ){ } x.drop(); - res = db.runCommand( { mapreduce : "mr1" , out : "mr1_foo" , map : m2 , reduce : r2 } ); + res = db.runCommand( { mapreduce : "mr1" , out : "mr1_foo" , map : m2 , reduce : r2 , out : "mr1_out" } ); d(res); print( "t3: " + res.timeMillis + " (~3500 on 2.8ghz)" ); + + res = db.runCommand( { mapreduce : "mr1" , map : m2 , reduce : r2 , out : { inline : true } } ); + print( "t4: " + res.timeMillis ); + } -res = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r } ); +res = db.runCommand( { mapreduce : "mr1" , map : m , reduce : r , out : "mr1_out" } ); assert( res.ok , "should be ok" ); diff --git a/jstests/mr2.js b/jstests/mr2.js index 0a8e9d6..709c305 100644 --- a/jstests/mr2.js +++ b/jstests/mr2.js @@ -29,7 +29,12 @@ function r( who , values ){ function reformat( r ){ var x = {}; - r.find().forEach( + var cursor; + if ( r.results ) + cursor = r.results; + else + cursor = r.find(); + cursor.forEach( function(z){ x[z._id] = z.value; } @@ -41,10 +46,22 @@ function f( who , res ){ res.avg = res.totalSize / res.num; return res; } -res = t.mapReduce( m , r , { finalize : f } ); + +res = t.mapReduce( m , r , { finalize : f , out : "mr2_out" } ); +printjson( res ) x = reformat( res ); -assert.eq( 9 , x.a.avg , "A" ); -assert.eq( 16 , x.b.avg , "B" ); -assert.eq( 18 , x.c.avg , "C" ); +assert.eq( 9 , x.a.avg , "A1" ); +assert.eq( 16 , x.b.avg , "A2" ); +assert.eq( 18 , x.c.avg , "A3" ); res.drop(); +res = t.mapReduce( m , r , { finalize : f , out : { inline : 1 } } ); +printjson( res ) +x = reformat( res ); +assert.eq( 9 , x.a.avg , "B1" ); +assert.eq( 16 , x.b.avg , "B2" ); +assert.eq( 18 , x.c.avg , "B3" ); +res.drop(); + +assert( ! 
( "result" in res ) , "B4" ) + diff --git a/jstests/mr3.js b/jstests/mr3.js index e7d1f2c..3b0a918 100644 --- a/jstests/mr3.js +++ b/jstests/mr3.js @@ -25,7 +25,7 @@ r = function( key , values ){ return { count : total }; }; -res = t.mapReduce( m , r ); +res = t.mapReduce( m , r , { out : "mr3_out" } ); z = res.convertToSingleObject() assert.eq( 3 , Object.keySet( z ).length , "A1" ); @@ -35,7 +35,7 @@ assert.eq( 3 , z.c.count , "A4" ); res.drop(); -res = t.mapReduce( m , r , { mapparams : [ 2 , 2 ] } ); +res = t.mapReduce( m , r , { out : "mr3_out" , mapparams : [ 2 , 2 ] } ); z = res.convertToSingleObject() assert.eq( 3 , Object.keySet( z ).length , "B1" ); @@ -52,7 +52,7 @@ realm = m; m = function(){ emit( this._id , 1 ); } -res = t.mapReduce( m , r ); +res = t.mapReduce( m , r , { out : "mr3_out" } ); res.drop(); m = function(){ @@ -60,7 +60,7 @@ m = function(){ } before = db.getCollectionNames().length; -assert.throws( function(){ t.mapReduce( m , r ); } ); +assert.throws( function(){ t.mapReduce( m , r , { out : "mr3_out" } ); } ); assert.eq( before , db.getCollectionNames().length , "after throw crap" ); @@ -69,5 +69,5 @@ r = function( k , v ){ return v.x.x.x; } before = db.getCollectionNames().length; -assert.throws( function(){ t.mapReduce( m , r ); } ); +assert.throws( function(){ t.mapReduce( m , r , "mr3_out" ) } ) assert.eq( before , db.getCollectionNames().length , "after throw crap" ); diff --git a/jstests/mr4.js b/jstests/mr4.js index b14cdfe..78c8bce 100644 --- a/jstests/mr4.js +++ b/jstests/mr4.js @@ -23,7 +23,7 @@ r = function( key , values ){ return { count : total }; }; -res = t.mapReduce( m , r , { scope : { xx : 1 } } ); +res = t.mapReduce( m , r , { out : "mr4_out" , scope : { xx : 1 } } ); z = res.convertToSingleObject() assert.eq( 3 , Object.keySet( z ).length , "A1" ); @@ -34,7 +34,7 @@ assert.eq( 3 , z.c.count , "A4" ); res.drop(); -res = t.mapReduce( m , r , { scope : { xx : 2 } } ); +res = t.mapReduce( m , r , { scope : { xx : 2 } , out : "mr4_out" } ); z = res.convertToSingleObject() assert.eq( 3 , Object.keySet( z ).length , "A1" ); diff --git a/jstests/mr5.js b/jstests/mr5.js index bbac3fe..50a63d1 100644 --- a/jstests/mr5.js +++ b/jstests/mr5.js @@ -25,7 +25,7 @@ r = function( k , v ){ return { stats : stats , total : total } } -res = t.mapReduce( m , r , { scope : { xx : 1 } } ); +res = t.mapReduce( m , r , { out : "mr5_out" , scope : { xx : 1 } } ); //res.find().forEach( printjson ) z = res.convertToSingleObject() @@ -44,7 +44,7 @@ m = function(){ -res = t.mapReduce( m , r , { scope : { xx : 1 } } ); +res = t.mapReduce( m , r , { out : "mr5_out" , scope : { xx : 1 } } ); //res.find().forEach( printjson ) z = res.convertToSingleObject() diff --git a/jstests/mr_bigobject.js b/jstests/mr_bigobject.js index 8224209..4466b8d 100644 --- a/jstests/mr_bigobject.js +++ b/jstests/mr_bigobject.js @@ -3,11 +3,11 @@ t = db.mr_bigobject t.drop() s = ""; -while ( s.length < ( 1024 * 1024 ) ){ +while ( s.length < ( 6 * 1024 * 1024 ) ){ s += "asdasdasd"; } -for ( i=0; i<10; i++ ) +for ( i=0; i<5; i++ ) t.insert( { _id : i , s : s } ) m = function(){ @@ -18,13 +18,14 @@ r = function( k , v ){ return 1; } -assert.throws( function(){ t.mapReduce( m , r ); } , "emit should fail" ) +assert.throws( function(){ r = t.mapReduce( m , r , "mr_bigobject_out" ); } , null , "emit should fail" ) + m = function(){ emit( 1 , this.s ); } -assert.eq( { 1 : 1 } , t.mapReduce( m , r ).convertToSingleObject() , "A1" ) +assert.eq( { 1 : 1 } , t.mapReduce( m , r , "mr_bigobject_out" 
).convertToSingleObject() , "A1" ) r = function( k , v ){ total = 0; @@ -38,4 +39,6 @@ r = function( k , v ){ return total; } -assert.eq( { 1 : 10 * s.length } , t.mapReduce( m , r ).convertToSingleObject() , "A1" ) +assert.eq( { 1 : t.count() * s.length } , t.mapReduce( m , r , "mr_bigobject_out" ).convertToSingleObject() , "A1" ) + +t.drop() diff --git a/jstests/mr_comments.js b/jstests/mr_comments.js new file mode 100644 index 0000000..f6a0699 --- /dev/null +++ b/jstests/mr_comments.js @@ -0,0 +1,28 @@ + +t = db.mr_comments +t.drop() + +t.insert( { foo : 1 } ) +t.insert( { foo : 1 } ) +t.insert( { foo : 2 } ) + +res = db.runCommand( + { mapreduce : "mr_comments", + map : "// This will fail\n\n // Emit some stuff\n emit(this.foo, 1)\n", + reduce : function(key, values){ + return Array.sum(values); + }, + out: "mr_comments_out" + }); +assert.eq( 3 , res.counts.emit ) + +res = db.runCommand( + { mapreduce : "mr_comments", + map : "// This will fail\nfunction(){\n // Emit some stuff\n emit(this.foo, 1)\n}\n", + reduce : function(key, values){ + return Array.sum(values); + }, + out: "mr_comments_out" + }); + +assert.eq( 3 , res.counts.emit ) diff --git a/jstests/mr_errorhandling.js b/jstests/mr_errorhandling.js index 57724f1..c4e1137 100644 --- a/jstests/mr_errorhandling.js +++ b/jstests/mr_errorhandling.js @@ -24,7 +24,7 @@ r = function( k , v ){ return total; } -res = t.mapReduce( m_good , r ); +res = t.mapReduce( m_good , r , "mr_errorhandling_out" ); assert.eq( { 1 : 1 , 2 : 2 , 3 : 2 , 4 : 1 } , res.convertToSingleObject() , "A" ); res.drop() @@ -32,7 +32,7 @@ res = null; theerror = null; try { - res = t.mapReduce( m_bad , r ); + res = t.mapReduce( m_bad , r , "mr_errorhandling_out" ); } catch ( e ){ theerror = e.toString(); @@ -42,6 +42,8 @@ assert( theerror , "B2" ); assert( theerror.indexOf( "emit" ) >= 0 , "B3" ); // test things are still in an ok state -res = t.mapReduce( m_good , r ); +res = t.mapReduce( m_good , r , "mr_errorhandling_out" ); assert.eq( { 1 : 1 , 2 : 2 , 3 : 2 , 4 : 1 } , res.convertToSingleObject() , "A" ); res.drop() + +assert.throws( function(){ t.mapReduce( m_good , r , { out : "xxx" , query : "foo" } ); } ) diff --git a/jstests/mr_index.js b/jstests/mr_index.js new file mode 100644 index 0000000..521d44d --- /dev/null +++ b/jstests/mr_index.js @@ -0,0 +1,43 @@ + +t = db.mr_index +t.drop() + +outName = "mr_index_out" +out = db[outName] +out.drop() + +t.insert( { tags : [ 1 ] } ) +t.insert( { tags : [ 1 , 2 ] } ) +t.insert( { tags : [ 1 , 2 , 3 ] } ) +t.insert( { tags : [ 3 ] } ) +t.insert( { tags : [ 2 , 3 ] } ) +t.insert( { tags : [ 2 , 3 ] } ) +t.insert( { tags : [ 1 , 2 ] } ) + +m = function(){ + for ( i=0; i .9999 ) + print( t.count() ) + } +} + +function del2( dbname ){ + var m = new Mongo( HOST ) + var db = m.getDB( "foo" + dbname ); + var t = db.del + + while ( ! DONE ){ + var r = Math.random(); + var n = Math.floor( Math.random() * N ); + var s = Math.random() > .5 ? 
1 : -1; + + if ( r < .5 ){ + t.findOne( { x : n } ) + } + else if ( r < .75 ){ + t.find( { x : { $lt : n } } ).sort( { x : s } ).itcount(); + } + else { + t.find( { x : { $gt : n } } ).sort( { x : s } ).itcount(); + } + } +} + +all = [] + +all.push( fork( del1 , "a" ) ) +all.push( fork( del2 , "a" ) ) +all.push( fork( del1 , "b" ) ) +all.push( fork( del2 , "b" ) ) + +for ( i=0; i= 6; } ) +t.update( {} , { $pull : { a : { $lt : 6 } } } ) + +assert.eq( o.a , t.findOne().a , "A2" ) + diff --git a/jstests/push2.js b/jstests/push2.js index 943ec11..b976169 100644 --- a/jstests/push2.js +++ b/jstests/push2.js @@ -18,3 +18,5 @@ for ( x=0; x<200; x++ ){ } assert( gotError , "should have gotten error" ); + +t.drop(); diff --git a/jstests/queryoptimizer2.js b/jstests/queryoptimizer2.js new file mode 100644 index 0000000..af21e95 --- /dev/null +++ b/jstests/queryoptimizer2.js @@ -0,0 +1,62 @@ + +t = db.queryoptimizer2; + +function doTest( f1, f2 ) { + +t.drop() + +for( i = 0; i < 30; ++i ) { + t.save( { a:2 } ); +} + +for( i = 0; i < 30; ++i ) { + t.save( { b:2 } ); +} + +for( i = 0; i < 60; ++i ) { + t.save( { c:2 } ); +} + +t.ensureIndex( { a:1 } ); +t.ensureIndex( { b:1 } ); + +e = t.find( { b:2 } ).batchSize( 100 ).explain( true ); +assert.eq( null, e.oldPlan ); + +t.ensureIndex( { c:1 } ); // will clear query cache + +f1(); + +assert( t.find( { a:2 } ).batchSize( 100 ).explain( true ).oldPlan ); +assert( t.find( { b:2 } ).batchSize( 100 ).explain( true ).oldPlan ); + +e = t.find( { c:2 } ).batchSize( 100 ).explain( true ); +// no pattern should be recorded as a result of the $or query +assert.eq( null, e.oldPlan ); + +t.dropIndex( { b:1 } ); // clear query cache +for( i = 0; i < 15; ++i ) { + t.save( { a:2 } ); +} + +f2(); +// pattern should be recorded, since > half of results returned from this index +assert( t.find( { c:2 } ).batchSize( 100 ).explain( true ).oldPlan ); + +} + +doTest( function() { + t.find( { $or: [ { a:2 }, { b:2 }, { c:2 } ] } ).batchSize( 100 ).toArray(); + }, + function() { + t.find( { $or: [ { a:2 }, { c:2 } ] } ).batchSize( 100 ).toArray(); + } + ); + +doTest( function() { + t.find( { $or: [ { a:2 }, { b:2 }, { c:2 } ] } ).limit( 100 ).count( true ); + }, + function() { + t.find( { $or: [ { a:2 }, { c:2 } ] } ).limit( 100 ).count( true ); + } + ); diff --git a/jstests/regex3.js b/jstests/regex3.js index ee8d9cf..7d703aa 100644 --- a/jstests/regex3.js +++ b/jstests/regex3.js @@ -23,7 +23,7 @@ t.save( { name : "c" } ); assert.eq( 3 , t.find( { name : /^aa*/ } ).count() , "B ni" ); t.ensureIndex( { name : 1 } ); assert.eq( 3 , t.find( { name : /^aa*/ } ).count() , "B i 1" ); -assert.eq( 3 , t.find( { name : /^aa*/ } ).explain().nscanned , "B i 1 e" ); +assert.eq( 4 , t.find( { name : /^aa*/ } ).explain().nscanned , "B i 1 e" ); assert.eq( 2 , t.find( { name : /^a[ab]/ } ).count() , "B i 2" ); assert.eq( 2 , t.find( { name : /^a[bc]/ } ).count() , "B i 3" ); diff --git a/jstests/regex6.js b/jstests/regex6.js index 12ed85b..8243313 100644 --- a/jstests/regex6.js +++ b/jstests/regex6.js @@ -10,10 +10,10 @@ t.save( { name : "aaron" } ); t.ensureIndex( { name : 1 } ); assert.eq( 0 , t.find( { name : /^\// } ).count() , "index count" ); -assert.eq( 0 , t.find( { name : /^\// } ).explain().nscanned , "index explain 1" ); +assert.eq( 1 , t.find( { name : /^\// } ).explain().nscanned , "index explain 1" ); assert.eq( 0 , t.find( { name : /^é/ } ).explain().nscanned , "index explain 2" ); assert.eq( 0 , t.find( { name : /^\é/ } ).explain().nscanned , "index explain 3" ); -assert.eq( 
0 , t.find( { name : /^\./ } ).explain().nscanned , "index explain 4" ); +assert.eq( 1 , t.find( { name : /^\./ } ).explain().nscanned , "index explain 4" ); assert.eq( 4 , t.find( { name : /^./ } ).explain().nscanned , "index explain 5" ); assert.eq( 4 , t.find( { name : /^\Qblah\E/ } ).explain().nscanned , "index explain 6" ); diff --git a/jstests/regex9.js b/jstests/regex9.js index 559efd9..896855c 100644 --- a/jstests/regex9.js +++ b/jstests/regex9.js @@ -1,5 +1,5 @@ -t = db.regex3; +t = db.regex9; t.drop(); t.insert( { _id : 1 , a : [ "a" , "b" , "c" ] } ) diff --git a/jstests/remove_undefined.js b/jstests/remove_undefined.js new file mode 100644 index 0000000..d5344a3 --- /dev/null +++ b/jstests/remove_undefined.js @@ -0,0 +1,28 @@ + +t = db.drop_undefined.js + +t.insert( { _id : 1 } ) +t.insert( { _id : 2 } ) +t.insert( { _id : null } ) + +z = { foo : 1 , x : null } + +t.remove( { x : z.bar } ) +assert.eq( 3 , t.count() , "A1" ) + +t.remove( { x : undefined } ) +assert.eq( 3 , t.count() , "A2" ) + +assert.throws( function(){ t.remove( { _id : z.bar } ) } , null , "B1" ) +assert.throws( function(){ t.remove( { _id : undefined } ) } , null , "B2" ) + + +t.remove( { _id : z.x } ) +assert.eq( 2 , t.count() , "C1" ) + +t.insert( { _id : null } ) +assert.eq( 3 , t.count() , "C2" ) + +assert.throws( function(){ t.remove( { _id : undefined } ) } , null, "C3" ) +assert.eq( 3 , t.count() , "C4" ) + diff --git a/jstests/rename4.js b/jstests/rename4.js new file mode 100644 index 0000000..29be374 --- /dev/null +++ b/jstests/rename4.js @@ -0,0 +1,121 @@ +t = db.jstests_rename4; +t.drop(); + +function c( f ) { + assert( !db.getLastError(), "error" ); + eval( f ); + assert( db.getLastError(), "no error" ); + db.resetError(); +} + +c( "t.update( {}, {$rename:{'a':'a'}} )" ); +c( "t.update( {}, {$rename:{'':'a'}} )" ); +c( "t.update( {}, {$rename:{'a':''}} )" ); +c( "t.update( {}, {$rename:{'_id':'a'}} )" ); +c( "t.update( {}, {$rename:{'a':'_id'}} )" ); +c( "t.update( {}, {$rename:{'_id.a':'b'}} )" ); +c( "t.update( {}, {$rename:{'b':'_id.a'}} )" ); +c( "t.update( {}, {$rename:{'_id.a':'_id.b'}} )" ); +c( "t.update( {}, {$rename:{'_id.b':'_id.a'}} )" ); +c( "t.update( {}, {$rename:{'.a':'b'}} )" ); +c( "t.update( {}, {$rename:{'a':'.b'}} )" ); +c( "t.update( {}, {$rename:{'a.':'b'}} )" ); +c( "t.update( {}, {$rename:{'a':'b.'}} )" ); +c( "t.update( {}, {$rename:{'a.b':'a'}} )" ); +c( "t.update( {}, {$rename:{'a.$':'b'}} )" ); +c( "t.update( {}, {$rename:{'a':'b.$'}} )" ); +c( "t.update( {}, {$set:{b:1},$rename:{'a':'b'}} )" ); +c( "t.update( {}, {$rename:{'a':'b'},$set:{b:1}} )" ); +c( "t.update( {}, {$rename:{'a':'b'},$set:{a:1}} )" ); +c( "t.update( {}, {$set:{'b.c':1},$rename:{'a':'b'}} )" ); +c( "t.update( {}, {$set:{b:1},$rename:{'a':'b.c'}} )" ); +c( "t.update( {}, {$rename:{'a':'b'},$set:{'b.c':1}} )" ); +c( "t.update( {}, {$rename:{'a':'b.c'},$set:{b:1}} )" ); + +t.save( {a:[1],b:{c:[1]},d:[{e:1}],f:1} ); +c( "t.update( {}, {$rename:{'a.0':'f'}} )" ); +c( "t.update( {}, {$rename:{'a.0':'g'}} )" ); +c( "t.update( {}, {$rename:{'f':'a.0'}} )" ); +c( "t.update( {}, {$rename:{'b.c.0':'f'}} )" ); +c( "t.update( {}, {$rename:{'f':'b.c.0'}} )" ); +c( "t.update( {}, {$rename:{'d.e':'d.f'}} )" ); +c( "t.update( {}, {$rename:{'d.e':'f'}} )" ); +c( "t.update( {}, {$rename:{'d.f':'d.e'}} )" ); +c( "t.update( {}, {$rename:{'f':'d.e'}} )" ); +c( "t.update( {}, {$rename:{'d.0.e':'d.f'}} )" ); +c( "t.update( {}, {$rename:{'d.0.e':'f'}} )" ); +c( "t.update( {}, {$rename:{'d.f':'d.0.e'}} )" ); +c( "t.update( 
{}, {$rename:{'f':'d.0.e'}} )" ); +c( "t.update( {}, {$rename:{'f.g':'a'}} )" ); +c( "t.update( {}, {$rename:{'a':'f.g'}} )" ); + +function v( start, mod, expected ) { + t.remove(); + t.save( start ); + t.update( {}, mod ); + assert( !db.getLastError() ); + var got = t.findOne(); + delete got._id; + assert.eq( expected, got ); +} + +v( {a:1}, {$rename:{a:'b'}}, {b:1} ); +v( {a:1}, {$rename:{a:'bb'}}, {bb:1} ); +v( {b:1}, {$rename:{b:'a'}}, {a:1} ); +v( {bb:1}, {$rename:{bb:'a'}}, {a:1} ); +v( {a:{y:1}}, {$rename:{'a.y':'a.z'}}, {a:{z:1}} ); +v( {a:{yy:1}}, {$rename:{'a.yy':'a.z'}}, {a:{z:1}} ); +v( {a:{z:1}}, {$rename:{'a.z':'a.y'}}, {a:{y:1}} ); +v( {a:{zz:1}}, {$rename:{'a.zz':'a.y'}}, {a:{y:1}} ); +v( {a:{c:1}}, {$rename:{a:'b'}}, {b:{c:1}} ); +v( {aa:{c:1}}, {$rename:{aa:'b'}}, {b:{c:1}} ); +v( {a:1,b:2}, {$rename:{a:'b'}}, {b:1} ); +v( {aa:1,b:2}, {$rename:{aa:'b'}}, {b:1} ); +v( {a:1,bb:2}, {$rename:{a:'bb'}}, {bb:1} ); +v( {a:1}, {$rename:{a:'b.c'}}, {b:{c:1}} ); +v( {aa:1}, {$rename:{aa:'b.c'}}, {b:{c:1}} ); +v( {a:1,b:{}}, {$rename:{a:'b.c'}}, {b:{c:1}} ); +v( {aa:1,b:{}}, {$rename:{aa:'b.c'}}, {b:{c:1}} ); +v( {a:1}, {$rename:{b:'c'}}, {a:1} ); +v( {aa:1}, {$rename:{b:'c'}}, {aa:1} ); +v( {}, {$rename:{b:'c'}}, {} ); +v( {a:{b:1,c:2}}, {$rename:{'a.b':'d'}}, {a:{c:2},d:1} ); +v( {a:{bb:1,c:2}}, {$rename:{'a.bb':'d'}}, {a:{c:2},d:1} ); +v( {a:{b:1}}, {$rename:{'a.b':'d'}}, {a:{},d:1} ); +v( {a:[5]}, {$rename:{a:'b'}}, {b:[5]} ); +v( {aa:[5]}, {$rename:{aa:'b'}}, {b:[5]} ); +v( {'0':1}, {$rename:{'0':'5'}}, {'5':1} ); +v( {a:1,b:2}, {$rename:{a:'c'},$set:{b:5}}, {b:5,c:1} ); +v( {aa:1,b:2}, {$rename:{aa:'c'},$set:{b:5}}, {b:5,c:1} ); +v( {a:1,b:2}, {$rename:{z:'c'},$set:{b:5}}, {a:1,b:5} ); +v( {aa:1,b:2}, {$rename:{z:'c'},$set:{b:5}}, {aa:1,b:5} ); + +// (formerly) rewriting single field +v( {a:{z:1,b:1}}, {$rename:{'a.b':'a.c'}}, {a:{c:1,z:1}} ); +v( {a:{z:1,tomato:1}}, {$rename:{'a.tomato':'a.potato'}}, {a:{potato:1,z:1}} ); +v( {a:{z:1,b:1,c:1}}, {$rename:{'a.b':'a.c'}}, {a:{c:1,z:1}} ); +v( {a:{z:1,tomato:1,potato:1}}, {$rename:{'a.tomato':'a.potato'}}, {a:{potato:1,z:1}} ); +v( {a:{z:1,b:1}}, {$rename:{'a.b':'a.cc'}}, {a:{cc:1,z:1}} ); +v( {a:{z:1,b:1,c:1}}, {$rename:{'a.b':'aa.c'}}, {a:{c:1,z:1},aa:{c:1}} ); + +// invalid target, but missing source +v( {a:1,c:4}, {$rename:{b:'c.d'}}, {a:1,c:4} ); + +// check index +t.drop(); +t.ensureIndex( {a:1} ); + +function l( start, mod, query, expected ) { + t.remove(); + t.save( start ); + t.update( {}, mod ); + assert( !db.getLastError() ); + var got = t.find( query ).hint( {a:1} ).next(); + delete got._id; + assert.eq( expected, got ); +} + +l( {a:1}, {$rename:{a:'b'}}, {a:null}, {b:1} ); +l( {a:1}, {$rename:{a:'bb'}}, {a:null}, {bb:1} ); +l( {b:1}, {$rename:{b:'a'}}, {a:1}, {a:1} ); +l( {bb:1}, {$rename:{bb:'a'}}, {a:1}, {a:1} ); diff --git a/jstests/repl/basic1.js b/jstests/repl/basic1.js index 701d71e..15fc983 100644 --- a/jstests/repl/basic1.js +++ b/jstests/repl/basic1.js @@ -60,7 +60,7 @@ r = function( key , v ){ correct = { a : 2 , b : 1 }; function checkMR( t ){ - var res = t.mapReduce( m , r ); + var res = t.mapReduce( m , r , "basic1_out" ); assert.eq( correct , res.convertToSingleObject() , "checkMR: " + tojson( t ) ); } @@ -68,7 +68,7 @@ function checkNumCollections( msg , diff ){ if ( ! 
diff ) diff = 0; var m = am.getCollectionNames(); var s = as.getCollectionNames(); - assert.eq( m.length + diff , s.length , "lengths bad \n" + tojson( m ) + "\n" + tojson( s ) ); + assert.eq( m.length + diff , s.length , msg + " lengths bad \n" + tojson( m ) + "\n" + tojson( s ) ); } checkNumCollections( "MR1" ); diff --git a/jstests/repl/block2.js b/jstests/repl/block2.js index 0e34758..f38a4e3 100644 --- a/jstests/repl/block2.js +++ b/jstests/repl/block2.js @@ -18,25 +18,26 @@ function check( msg ){ assert.eq( tm.count() , ts.count() , "check: " + msg ); } +function worked( w , wtimeout ){ + return dbm.getLastError( w , wtimeout ) == null; +} + check( "A" ); tm.save( { x : 1 } ); -dbm.getLastError( 2 ); -check( "B" ); +assert( worked( 2 ) , "B" ); tm.save( { x : 2 } ); -dbm.getLastError( 2 , 500 ); -check( "C" ); +assert( worked( 2 , 500 ) , "C" ) rt.stop( false ); tm.save( { x : 3 } ) assert.eq( 3 , tm.count() , "D1" ); -assert.throws( function(){ dbm.getLastError( 2 , 500 ); } , "D2" ) +assert( ! worked( 2 , 500 ) , "D2" ) s = rt.start( false ) setup(); -dbm.getLastError( 2 , 30000 ) -check( "D3" ) +assert( worked( 2 , 30000 ) , "E" ) rt.stop(); diff --git a/jstests/repl/mastermaster1.js b/jstests/repl/mastermaster1.js index 9f9334b..4932d5a 100644 --- a/jstests/repl/mastermaster1.js +++ b/jstests/repl/mastermaster1.js @@ -6,6 +6,8 @@ ports = allocatePorts( 2 ) left = startMongodTest( ports[0] , "mastermaster1left" , false , { master : "" , slave : "" , source : "127.0.0.1:" + ports[1] } ) right = startMongodTest( ports[1] , "mastermaster1left" , false , { master : "" , slave : "" , source : "127.0.0.1:" + ports[0] } ) +print( "check 1" ) + x = left.getDB( "admin" ).runCommand( "ismaster" ) assert( x.ismaster , "left: " + tojson( x ) ) @@ -15,6 +17,8 @@ assert( x.ismaster , "right: " + tojson( x ) ) ldb = left.getDB( "test" ) rdb = right.getDB( "test" ) +print( "check 2" ) + ldb.foo.insert( { _id : 1 , x : "eliot" } ) var result = ldb.runCommand( { getlasterror : 1 , w : 2 , wtimeout : 20000 } ); printjson(result); @@ -27,12 +31,12 @@ print( "check 3" ) assert.eq( 2 , ldb.foo.count() , "B1" ) assert.eq( 2 , rdb.foo.count() , "B2" ) - +print( "going to stop everything" ) for ( var i=0; i 0" ); + rt.stop(); } diff --git a/jstests/repl/repl11.js b/jstests/repl/repl11.js index c5c63b3..aef9872 100644 --- a/jstests/repl/repl11.js +++ b/jstests/repl/repl11.js @@ -35,6 +35,10 @@ doTest = function( signal ) { sa = s.getDB( baseName ).a; assert.soon( function() { return 1 == sa.count(); } ); + s.getDB( "local" ).auth( "repl", "foo" ); + assert.commandWorked( s.getDB( "admin" )._adminCommand( {serverStatus:1,repl:1} ) ); + assert.commandWorked( s.getDB( "admin" )._adminCommand( {serverStatus:1,repl:2} ) ); + rt.stop( false, signal ); ma.save( {} ); diff --git a/jstests/repl/repl2.js b/jstests/repl/repl2.js index c9fe6b9..42b0caf 100644 --- a/jstests/repl/repl2.js +++ b/jstests/repl/repl2.js @@ -31,6 +31,8 @@ doTest = function( signal ) { assert.soon( function() { return 1 == s.getDB( "admin" ).runCommand( { "resync" : 1 } ).ok; } ); soonCount( 1001 ); + assert.automsg( "m.getDB( 'local' ).getCollection( 'oplog.$main' ).stats().size > 0" ); + as = s.getDB("foo").a assert.eq( 1, as.find( { i: 0 } ).count() ); assert.eq( 1, as.find( { i: 999 } ).count() ); diff --git a/jstests/repl/snapshot3.js b/jstests/repl/snapshot3.js index d8d268d..02955e5 100644 --- a/jstests/repl/snapshot3.js +++ b/jstests/repl/snapshot3.js @@ -47,7 +47,7 @@ assert.eq( 500, rp.slave().getDB( baseName )[ baseName ].count() 
); rp.master().getDB( baseName )[ baseName ].save( {i:500} ); assert.soon( function() { return 501 == rp.slave().getDB( baseName )[ baseName ].count(); } ); -assert( !rawMongoProgramOutput().match( /resync/ ) ); -assert( !rawMongoProgramOutput().match( /SyncException/ ) ); +assert( !rawMongoProgramOutput().match( new RegExp( "resync.*" + baseName + ".*\n" ) ) , "last1" ); +assert( !rawMongoProgramOutput().match( /SyncException/ ) , "last2" ); print("snapshot3.js finishes"); diff --git a/jstests/replsets/auth1.js b/jstests/replsets/auth1.js new file mode 100644 index 0000000..4945869 --- /dev/null +++ b/jstests/replsets/auth1.js @@ -0,0 +1,184 @@ +// check replica set authentication + +load("jstests/replsets/rslib.js"); + +var name = "rs_auth1"; +var port = allocatePorts(4); +var path = "jstests/replsets/"; + + +print("reset permissions"); +run("chmod", "644", path+"key1"); +run("chmod", "644", path+"key2"); + + +print("try starting mongod"); +var m = runMongoProgram( "mongod", "--keyFile", path+"key1", "--port", port[0], "--dbpath", "/data/db/" + name); + + +print("should fail with wrong permissions"); +assert.eq(m, 2, "mongod should exit w/ 2: permissions too open"); +stopMongod(port[0]); + + +print("change permissions on #1 & #2"); +run("chmod", "600", path+"key1"); +run("chmod", "600", path+"key2"); + + +print("add a user to server0: foo"); +m = startMongodTest( port[0], name+"-0", 0 ); +m.getDB("admin").addUser("foo", "bar"); +m.getDB("test").addUser("bar", "baz"); +print("make sure user is written before shutting down"); +m.getDB("test").getLastError(); +stopMongod(port[0]); + + +print("start up rs"); +var rs = new ReplSetTest({"name" : name, "nodes" : 3, "startPort" : port[0]}); +m = rs.restart(0, {"keyFile" : path+"key1"}); +var s = rs.start(1, {"keyFile" : path+"key1"}); +var s2 = rs.start(2, {"keyFile" : path+"key1"}); + +var result = m.getDB("admin").auth("foo", "bar"); +assert.eq(result, 1, "login failed"); +result = m.getDB("admin").runCommand({replSetInitiate : rs.getReplSetConfig()}); +assert.eq(result.ok, 1, "couldn't initiate: "+tojson(result)); + +var master = rs.getMaster().getDB("test"); +wait(function() { + var status = master.adminCommand({replSetGetStatus:1}); + return status.members && status.members[1].state == 2 && status.members[2].state == 2; + }); + +master.foo.insert({x:1}); +master.runCommand({getlasterror:1, w:3, wtimeout:60000}); + + +print("try some legal and illegal reads"); +var r = master.foo.findOne(); +assert.eq(r.x, 1); + +s.setSlaveOk(); +slave = s.getDB("test"); + +function doQueryOn(p) { + var err = {}; + try { + r = p.foo.findOne(); + } + catch(e) { + if (typeof(JSON) != "undefined") { + err = JSON.parse(e.substring(6)); + } + else if (e.indexOf("10057") > 0) { + err.code = 10057; + } + } + assert.eq(err.code, 10057); +}; + +doQueryOn(slave); +master.adminCommand({logout:1}); +doQueryOn(master); + + +result = slave.auth("bar", "baz"); +assert.eq(result, 1); + +r = slave.foo.findOne(); +assert.eq(r.x, 1); + + +print("add some data"); +master.auth("bar", "baz"); +for (var i=0; i<1000; i++) { + master.foo.insert({x:i, foo : "bar"}); +} +master.runCommand({getlasterror:1, w:3, wtimeout:60000}); + + +print("fail over"); +rs.stop(0); + +wait(function() { + function getMaster(s) { + var result = s.getDB("admin").runCommand({isMaster: 1}); + printjson(result); + if (result.ismaster) { + master = s.getDB("test"); + return true; + } + return false; + } + + if (getMaster(s) || getMaster(s2)) { + return true; + } + return false; + }); + + +print("add some 
more data 1"); +master.auth("bar", "baz"); +for (var i=0; i<1000; i++) { + master.foo.insert({x:i, foo : "bar"}); +} +master.runCommand({getlasterror:1, w:3, wtimeout:60000}); + + +print("resync"); +rs.restart(0); + + +print("add some more data 2"); +for (var i=0; i<1000; i++) { + master.foo.insert({x:i, foo : "bar"}); +} +master.runCommand({getlasterror:1, w:3, wtimeout:60000}); + + +print("add member with wrong key"); +var conn = new MongodRunner(port[3], "/data/db/"+name+"-3", null, null, ["--replSet","rs_auth1","--rest","--oplogSize","2", "--keyFile", path+"key2"], {no_bind : true}); +conn.start(); + + +master.getSisterDB("admin").auth("foo", "bar"); +var config = master.getSisterDB("local").system.replset.findOne(); +config.members.push({_id : 3, host : getHostName()+":"+port[3]}); +config.version++; +try { + master.adminCommand({replSetReconfig:config}); +} +catch (e) { + print("error: "+e); +} +reconnect(master); +master.getSisterDB("admin").auth("foo", "bar"); + + +print("shouldn't ever sync"); +for (var i = 0; i<30; i++) { + print("iteration: " +i); + var results = master.adminCommand({replSetGetStatus:1}); + printjson(results); + assert(results.members[3].state != 2); + sleep(1000); +} + + +print("stop member"); +stopMongod(port[3]); + + +print("start back up with correct key"); +conn = new MongodRunner(port[3], "/data/db/"+name+"-3", null, null, ["--replSet","rs_auth1","--rest","--oplogSize","2", "--keyFile", path+"key1"], {no_bind : true}); +conn.start(); + +wait(function() { + var results = master.adminCommand({replSetGetStatus:1}); + printjson(results); + return results.members[3].state == 2; + }); + diff --git a/jstests/replsets/buildindexes.js b/jstests/replsets/buildindexes.js new file mode 100644 index 0000000..76de797 --- /dev/null +++ b/jstests/replsets/buildindexes.js @@ -0,0 +1,86 @@ +doTest = function( signal ) { + + var name = "buildIndexes"; + var host = getHostName(); + + var replTest = new ReplSetTest( {name: name, nodes: 3} ); + + var nodes = replTest.startSet(); + + var config = replTest.getReplSetConfig(); + config.members[2].priority = 0; + config.members[2].buildIndexes = false; + + replTest.initiate(config); + + var master = replTest.getMaster().getDB(name); + var slaveConns = replTest.liveNodes.slaves; + var slave = []; + for (var i in slaveConns) { + slaveConns[i].setSlaveOk(); + slave.push(slaveConns[i].getDB(name)); + } + replTest.awaitReplication(); + + print("creating an index on x"); + master.x.ensureIndex({y : 1}); + printjson(master.x.stats()); + + for (var i=0; i<100; i++) { + master.x.insert({x:1,y:"abc",c:1}); + } + + replTest.awaitReplication(); + + printjson(slave[0].runCommand({count: "x"})); + var ns = master.x+""; + print("namespace: "+ns); + + // can't query system.indexes from slave, so we'll look at coll.stats() + printjson(slave[0].adminCommand({replSetGetStatus:1})); + printjson(slave[0].getSisterDB("local").system.replset.findOne()); + printjson(master.stats()); + printjson(slave[0].stats()); + printjson(slave[1].stats()); + printjson(master.x.stats()); + printjson(slave[0].x.stats()); + printjson(slave[1].x.stats()); + print("sleeping"); + sleep(20000); + var indexes = slave[0].stats().indexes; + assert.eq(indexes, 2, 'number of indexes'); + + indexes = slave[1].stats().indexes; + assert.eq(indexes, 1); + + + indexes = slave[0].x.stats().indexSizes; + printjson(indexes); + + var count = 0; + for (var i in indexes) { + count++; + if (i == "_id_") { + continue; + } + print(i); + print(i.match(/y_/)); + assert(i.match(/y_/)); + } + + 
assert.eq(count, 2); + + indexes = slave[1].x.stats().indexSizes; + printjson(indexes); + + count = 0; + for (var i in indexes) { + count++; + } + + assert.eq(count, 1); + + replTest.stopSet(15); +} + +doTest(15); diff --git a/jstests/replsets/cloneDb.js b/jstests/replsets/cloneDb.js new file mode 100644 index 0000000..6d2d0f3 --- /dev/null +++ b/jstests/replsets/cloneDb.js @@ -0,0 +1,52 @@ +// Test for cloning a db from a replica set [SERVER-1643] -Tony + +load('jstests/libs/grid.js') + +doTest = function( signal ) { + + var N = 2000 + + // ~1KB string + var Text = '' + for (var i = 0; i < 40; i++) + Text += 'abcdefghijklmnopqrstuvwxyz' + + // Create replica set + var repset = new ReplicaSet ('testSet', 3) .begin() + var master = repset.getMaster() + var db1 = master.getDB('test') + + // Insert data + for (var i = 0; i < N; i++) { + db1['foo'].insert({x: i, text: Text}) + db1.getLastError(2) // wait to be copied to at least one secondary + } + + // Create single server + var solo = new Server ('singleTarget') + var soloConn = solo.begin() + var db2 = soloConn.getDB('test') + + // Clone db from replica set to single server + db2.cloneDatabase (repset.getURL()) + + // Confirm clone worked + assert.eq (Text, db2['foo'] .findOne({x: N-1}) ['text'], 'cloneDatabase failed (test1)') + + // Now test the reverse direction + db1 = master.getDB('test2') + db2 = soloConn.getDB('test2') + for (var i = 0; i < N; i++) { + db2['foo'].insert({x: i, text: Text}) + db2.getLastError() + } + db1.cloneDatabase (solo.host()) + assert.eq (Text, db2['foo'] .findOne({x: N-1}) ['text'], 'cloneDatabase failed (test2)') + + // Shut down replica set and single server + solo.end() + repset.stopSet( signal ) +} + +doTest( 15 ); +print("replsets/cloneDb.js SUCCESS"); diff --git a/jstests/replsets/config1.js b/jstests/replsets/config1.js new file mode 100644 index 0000000..748ce8f --- /dev/null +++ b/jstests/replsets/config1.js @@ -0,0 +1,21 @@ +doTest = function( signal ) { + var name = 'config1'; + + var replTest = new ReplSetTest( {name: name, nodes: 3} ); + var nodes = replTest.startSet(); + + var config = replTest.getReplSetConfig(); + config.settings = {"heartbeatSleep" : .5, heartbeatTimeout : .8}; + + replTest.initiate(config); + + // Call getMaster to return a reference to the node that's been + // elected master. + var master = replTest.getMaster(); + + config = master.getDB("local").system.replset.findOne(); + assert.eq(config.settings.heartbeatSleep, .5); + assert.eq(config.settings.heartbeatTimeout, .8); +}; + +doTest(15); diff --git a/jstests/replsets/fastsync.js b/jstests/replsets/fastsync.js new file mode 100644 index 0000000..d7c3905 --- /dev/null +++ b/jstests/replsets/fastsync.js @@ -0,0 +1,117 @@ +/* + * 1. insert 100000 objects + * 2. export to two dbpaths + * 3. add one node w/fastsync + * 4. check that we never get "errmsg" : "initial sync cloning db: whatever" + * 5. 
check writes are replicated + */ + +var w = 0; +var wait = function(f) { + w++; + var n = 0; + while (!f()) { + if( n % 4 == 0 ) + print("toostale.js waiting " + w); + if (++n == 4) { + print("" + f); + } + assert(n < 200, 'tried 200 times, giving up'); + sleep(1000); + } +} + +var reconnect = function(a) { + wait(function() { + try { + a.getDB("foo").bar.stats(); + return true; + } catch(e) { + print(e); + return false; + } + }); +}; + +ports = allocatePorts( 3 ); + +var basename = "jstests_fastsync"; +var basePath = "/data/db/" + basename; +var hostname = getHostName(); + +var pargs = new MongodRunner( ports[ 0 ], basePath + "-p", false, false, + ["--replSet", basename, "--oplogSize", 2], + {no_bind : true} ); +p = pargs.start(); + +var admin = p.getDB("admin"); +var foo = p.getDB("foo"); +var local = p.getDB("local"); + +var config = {_id : basename, members : [{_id : 0, host : hostname+":"+ports[0]}]}; +printjson(config); +var result = admin.runCommand({replSetInitiate : config}); +print("result:"); +printjson(result); + +var count = 0; +while (count < 10 && result.ok != 1) { + count++; + sleep(2000); + result = admin.runCommand({replSetInitiate : config}); +} + +assert(result.ok, tojson(result)); +assert.soon(function() { return admin.runCommand({isMaster:1}).ismaster; }); + +print("1"); +for (var i=0; i<100000; i++) { + foo.bar.insert({date : new Date(), x : i, str : "all the talk on the market"}); +} +print("total in foo: "+foo.bar.count()); + + +print("2"); +admin.runCommand( {fsync:1,lock:1} ); +copyDbpath( basePath + "-p", basePath + "-s" ); +admin.$cmd.sys.unlock.findOne(); + + +print("3"); +var sargs = new MongodRunner( ports[ 1 ], basePath + "-s", false, false, + ["--replSet", basename, "--fastsync", + "--oplogSize", 2], {no_bind : true} ); +var reuseData = true; +sargs.start(reuseData); + +config = local.system.replset.findOne(); +config.version++; +config.members.push({_id:1, host:hostname+":"+ports[1]}); + +result = admin.runCommand({replSetReconfig : config}); +assert(result.ok, "reconfig worked"); +reconnect(p); + +print("4"); +var status = admin.runCommand({replSetGetStatus : 1}); +var count = 0; +while (status.members[1].state != 2 && count < 200) { + print("not a secondary yet"); + if (count % 10 == 0) { + printjson(status); + } + assert(!status.members[1].errmsg || !status.members[1].errmsg.match("^initial sync cloning db")); + + sleep(1000); + + // disconnection could happen here + try { + status = admin.runCommand({replSetGetStatus : 1}); + } + catch (e) { + print(e); + } + count++; +} + +assert.eq(status.members[1].state, 2); diff --git a/jstests/replsets/getlasterror_w2.js b/jstests/replsets/getlasterror_w2.js new file mode 100644 index 0000000..795e667 --- /dev/null +++ b/jstests/replsets/getlasterror_w2.js @@ -0,0 +1,36 @@ +// BUG: [SERVER-1768] replica set getlasterror {w: 2} after 2000 +// inserts hangs while secondary servers log "replSet error RS102 too stale to catch up" every once in a while + +function newReplicaSet (name, numServers) { + var rs = new ReplSetTest({name: name, nodes: numServers}) + rs.startSet() + rs.initiate() + rs.awaitReplication() + return rs +} + +function go() { +var N = 2000 + +// ~1KB string +var Text = '' +for (var i = 0; i < 40; i++) + Text += 'abcdefghijklmnopqrstuvwxyz' + +// Create replica set of 3 servers +var repset = newReplicaSet('repset', 3) +var conn = repset.getMaster() +var db = conn.getDB('test') + +// Add data to it +for (var i = 0; i < N; i++) + db['foo'].insert({x: i, text: Text}) + +// wait to be copied to at 
least one secondary (BUG hangs here) +db.getLastError(2) + +print('getlasterror_w2.js SUCCESS') +} + +// turn off until fixed +//go(); diff --git a/jstests/replsets/groupAndMapReduce.js b/jstests/replsets/groupAndMapReduce.js new file mode 100644 index 0000000..539fe44 --- /dev/null +++ b/jstests/replsets/groupAndMapReduce.js @@ -0,0 +1,105 @@ +doTest = function( signal ) { + + // Test basic replica set functionality. + // -- Replication + // -- Failover + + // Replica set testing API + // Create a new replica set test. Specify set name and the number of nodes you want. + var replTest = new ReplSetTest( {name: 'testSet', nodes: 3} ); + + // call startSet() to start each mongod in the replica set + // this returns a list of nodes + var nodes = replTest.startSet(); + + // Call initiate() to send the replSetInitiate command + // This will wait for initiation + replTest.initiate(); + + // Call getMaster to return a reference to the node that's been + // elected master. + var master = replTest.getMaster(); + + // save some records + var len = 100 + for (var i = 0; i < len; ++i) { + master.getDB("foo").foo.save({a: i}); + } + + // This method will check the oplogs of the master + // and slaves in the set and wait until the change has replicated. + replTest.awaitReplication(); + print("Sleeping 10s for slaves to go to secondary state"); + sleep(10000); + + slaves = replTest.liveNodes.slaves; + assert( slaves.length == 2, "Expected 2 slaves but length was " + slaves.length ); + slaves.forEach(function(slave) { + // try to read from slave + slave.slaveOk = true; + var count = slave.getDB("foo").foo.count(); + printjson( count ); + assert.eq( len , count , "slave count wrong: " + slave ); + + print("Doing a findOne to verify we can get a row"); + var one = slave.getDB("foo").foo.findOne(); + printjson(one); + +// stats = slave.getDB("foo").adminCommand({replSetGetStatus:1}); +// printjson(stats); + + print("Calling group() with slaveOk=true, must succeed"); + slave.slaveOk = true; + count = slave.getDB("foo").foo.group({initial: {n:0}, reduce: function(obj,out){out.n++;}}); + printjson( count ); + assert.eq( len , count[0].n , "slave group count wrong: " + slave ); + + print("Calling group() with slaveOk=false, must fail"); + slave.slaveOk = false; + try { + count = slave.getDB("foo").foo.group({initial: {n:0}, reduce: function(obj,out){out.n++;}}); + assert(false, "group() succeeded with slaveOk=false"); + } catch (e) { + print("Received exception: " + e); + } + + print("Calling inline mr() with slaveOk=true, must succeed"); + slave.slaveOk = true; + map = function() { emit(this.a, 1); }; + reduce = function(key, vals) { var sum = 0; for (var i = 0; i < vals.length; ++i) { sum += vals[i]; } return sum; }; + slave.getDB("foo").foo.mapReduce(map, reduce, {out: { "inline" : 1}}); + + print("Calling mr() to collection with slaveOk=true, must fail"); + try { + slave.getDB("foo").foo.mapReduce(map, reduce, "output"); + assert(false, "mapReduce() to collection succeeded on slave"); + } catch (e) { + print("Received exception: " + e); + } + + print("Calling inline mr() with slaveOk=false, must fail"); + slave.slaveOk = false; + try { + slave.getDB("foo").foo.mapReduce(map, reduce, {out: { "inline" : 1}}); + assert(false, "mapReduce() succeeded on slave with slaveOk=false"); + } catch (e) { + print("Received exception: " + e); + } + print("Calling mr() to collection with slaveOk=false, must fail"); + try { + slave.getDB("foo").foo.mapReduce(map, reduce, "output"); + assert(false, "mapReduce() to collection 
succeeded on slave with slaveOk=false"); + } catch (e) { + print("Received exception: " + e); + } + + }); + + + + // Shut down the set and finish the test. + replTest.stopSet( signal ); +} + +doTest( 15 ); +print("SUCCESS"); diff --git a/jstests/replsets/initial_sync1.js b/jstests/replsets/initial_sync1.js new file mode 100644 index 0000000..ee30b4e --- /dev/null +++ b/jstests/replsets/initial_sync1.js @@ -0,0 +1,129 @@ +/** + * Test killing the secondary during initially sync + * + * 1. Bring up set + * 2. Insert some data + * 4. Make sure synced + * 5. Freeze #2 + * 6. Bring up #3 + * 7. Kill #2 in the middle of syncing + * 8. Eventually it should become a secondary + * 9. Bring #2 back up + * 10. Insert some stuff + * 11. Everyone happy eventually + */ + +load("jstests/replsets/rslib.js"); +var basename = "jstests_initsync1"; + + +print("1. Bring up set"); +var replTest = new ReplSetTest( {name: basename, nodes: 2} ); +var conns = replTest.startSet(); +replTest.initiate(); + +var master = replTest.getMaster(); +var foo = master.getDB("foo"); +var admin = master.getDB("admin"); + +var slave1 = replTest.liveNodes.slaves[0]; +var admin_s1 = slave1.getDB("admin"); +var local_s1 = slave1.getDB("local"); + +print("2. Insert some data"); +for (var i=0; i<10000; i++) { + foo.bar.insert({date : new Date(), x : i, str : "all the talk on the market"}); +} +print("total in foo: "+foo.bar.count()); + + +print("4. Make sure synced"); +replTest.awaitReplication(); + + +print("5. Freeze #2"); +admin_s1.runCommand({replSetFreeze:999999}); + + +print("6. Bring up #3"); +var ports = allocatePorts( 3 ); +var basePath = "/data/db/" + basename; +var hostname = getHostName(); + +var sargs = new MongodRunner( ports[ 2 ], basePath, false, false, + ["--replSet", basename, "--oplogSize", 2], + {no_bind : true} ); +var slave2 = sargs.start(); +var local_s2 = slave2.getDB("local"); +var admin_s2 = slave2.getDB("admin"); + +var config = replTest.getReplSetConfig(); +config.version = 2; +config.members.push({_id:2, host:hostname+":"+ports[2]}); + +try { + admin.runCommand({replSetReconfig:config}); +} +catch(e) { + print(e); +} +reconnect(slave1); +reconnect(slave2); + +wait(function() { + var config2 = local_s1.system.replset.findOne(); + var config3 = local_s2.system.replset.findOne(); + + printjson(config2); + printjson(config3); + + return config2.version == config.version && + (config3 && config3.version == config.version); + }); + +wait(function() { + var status = admin_s2.runCommand({replSetGetStatus:1}); + printjson(status); + return status.members && + (status.members[2].state == 3 || status.members[2].state == 2); + }); + + +print("7. Kill #2 in the middle of syncing"); +replTest.stop(1); + + +print("8. Eventually it should become a secondary"); +print("if initial sync has started, this will cause it to fail and sleep for 5 minutes"); +sleep(5*60*1000); +wait(function() { + var status = admin_s2.runCommand({replSetGetStatus:1}); + occasionally(function() { printjson(status); }); + return status.members[2].state == 2; + }); + + +print("9. Bring #2 back up"); +replTest.start(1, {}, true); +reconnect(slave1); +wait(function() { + var status = admin_s1.runCommand({replSetGetStatus:1}); + printjson(status); + return status.ok == 1 && status.members && + status.members[1].state == 2 || status.members[1].state == 1; + }); + + +/** + * TODO: this fails on buildbot + * see SERVER-2550 +print("10. 
Insert some stuff"); +master = replTest.getMaster(); +for (var i=0; i<10000; i++) { + foo.bar.insert({date : new Date(), x : i, str : "all the talk on the market"}); +} + + +print("11. Everyone happy eventually"); +replTest.awaitReplication(); +*/ diff --git a/jstests/replsets/initial_sync2.js b/jstests/replsets/initial_sync2.js new file mode 100644 index 0000000..3ad3972 --- /dev/null +++ b/jstests/replsets/initial_sync2.js @@ -0,0 +1,179 @@ +/** + * Test killing the primary during initial sync + * and don't allow the other secondary to become primary + * + * 1. Bring up set + * 2. Insert some data + * 4. Make sure synced + * 5. Freeze #2 + * 6. Bring up #3 + * 7. Kill #1 in the middle of syncing + * 8. Check that #3 makes it into secondary state + * 9. Bring #1 back up + * 10. Initial sync should succeed + * 11. Insert some stuff + * 12. Everyone happy eventually + */ + +load("jstests/replsets/rslib.js"); +var basename = "jstests_initsync2"; + +var doTest = function() { + +print("1. Bring up set"); +var replTest = new ReplSetTest( {name: basename, nodes: 2} ); +var conns = replTest.startSet(); +replTest.initiate(); + +var master = replTest.getMaster(); +var origMaster = master; +var foo = master.getDB("foo"); +var admin = master.getDB("admin"); + +var slave1 = replTest.liveNodes.slaves[0]; +var admin_s1 = slave1.getDB("admin"); +var local_s1 = slave1.getDB("local"); + +print("2. Insert some data"); +for (var i=0; i<10000; i++) { + foo.bar.insert({date : new Date(), x : i, str : "all the talk on the market"}); +} +print("total in foo: "+foo.bar.count()); + + +print("4. Make sure synced"); +replTest.awaitReplication(); + + +print("5. Freeze #2"); +admin_s1.runCommand({replSetFreeze:999999}); + + +print("6. Bring up #3"); +var ports = allocatePorts( 3 ); +var basePath = "/data/db/" + basename; +var hostname = getHostName(); + +var sargs = new MongodRunner( ports[ 2 ], basePath, false, false, + ["--replSet", basename, "--oplogSize", 2], + {no_bind : true} ); +var slave2 = sargs.start(); +var local_s2 = slave2.getDB("local"); +var admin_s2 = slave2.getDB("admin"); + +var config = replTest.getReplSetConfig(); +config.version = 2; +config.members.push({_id:2, host:hostname+":"+ports[2]}); + +try { + admin.runCommand({replSetReconfig:config}); +} +catch(e) { + print(e); +} +reconnect(slave1); +reconnect(slave2); + +wait(function() { + var config2 = local_s1.system.replset.findOne(); + var config3 = local_s2.system.replset.findOne(); + + printjson(config2); + printjson(config3); + + return config2.version == config.version && + (config3 && config3.version == config.version); + }); +admin_s2.runCommand({replSetFreeze:999999}); + + +wait(function() { + var status = admin_s2.runCommand({replSetGetStatus:1}); + printjson(status); + return status.members && + (status.members[2].state == 3 || status.members[2].state == 2); + }); + + +print("7. Kill #1 in the middle of syncing"); +replTest.stop(0); + + +print("8. Check that #3 makes it into secondary state"); +wait(function() { + var status = admin_s2.runCommand({replSetGetStatus:1}); + occasionally(function() { printjson(status);}, 10); + if (status.members[2].state == 2 || status.members[2].state == 1) { + return true; + } + return false; + }); + + +print("9. Bring #1 back up"); +replTest.start(0, {}, true); +reconnect(master); +wait(function() { + var status = admin.runCommand({replSetGetStatus:1}); + printjson(status); + return status.members && + (status.members[0].state == 1 || status.members[0].state == 2); + }); + + +print("10. 
Initial sync should succeed"); +wait(function() { + var status = admin_s2.runCommand({replSetGetStatus:1}); + printjson(status); + return status.members && + status.members[2].state == 2 || status.members[2].state == 1; + }); + + +print("11. Insert some stuff"); +// ReplSetTest doesn't find master correctly unless all nodes are defined by +// ReplSetTest +for (var i = 0; i<30; i++) { + var result = admin.runCommand({isMaster : 1}); + if (result.ismaster) { + break; + } + else if (result.primary) { + master = connect(result.primary+"/admin").getMongo(); + break; + } + sleep(1000); +} + +for (var i=0; i<10000; i++) { + foo.bar.insert({date : new Date(), x : i, str : "all the talk on the market"}); +} + + +print("12. Everyone happy eventually"); +// if 3 is master... +if (master+"" != origMaster+"") { + print("3 is master"); + slave2 = origMaster; +} + +wait(function() { + var op1 = getLatestOp(master); + var op2 = getLatestOp(slave1); + var op3 = getLatestOp(slave2); + + occasionally(function() { + print("latest ops:"); + printjson(op1); + printjson(op2); + printjson(op3); + }); + + return friendlyEqual(getLatestOp(master), getLatestOp(slave1)) && + friendlyEqual(getLatestOp(master), getLatestOp(slave2)); + }); + +replTest.stopSet(); +}; + +doTest(); diff --git a/jstests/replsets/initial_sync3.js b/jstests/replsets/initial_sync3.js new file mode 100644 index 0000000..471aa16 --- /dev/null +++ b/jstests/replsets/initial_sync3.js @@ -0,0 +1,87 @@ +/* test initial sync options + * + * {state : 1} + * {state : 2} + * {name : host+":"+port} + * {_id : 2} + * {optime : now} + * {optime : 1970} + */ + +load("jstests/replsets/rslib.js"); +var name = "initialsync3"; +var host = getHostName(); +var port = allocatePorts(7); + +print("Start set with three nodes"); +var replTest = new ReplSetTest( {name: name, nodes: 7} ); +var nodes = replTest.startSet(); +replTest.initiate({ + _id : name, + members : [ + {_id:0, host : host+":"+port[0]}, + {_id:1, host : host+":"+port[1], initialSync : {state : 1}}, + {_id:2, host : host+":"+port[2], initialSync : {state : 2}}, + {_id:3, host : host+":"+port[3], initialSync : {name : host+":"+port[2]}}, + {_id:4, host : host+":"+port[4], initialSync : {_id : 2}}, + {_id:5, host : host+":"+port[5], initialSync : {optime : new Date()}}, + {_id:6, host : host+":"+port[6], initialSync : {optime : new Date(0)}} + ]}); + +var master = replTest.getMaster(); + +print("Initial sync"); +master.getDB("foo").bar.baz.insert({x:1}); + +print("Make sure everyone's secondary"); +wait(function() { + var status = master.getDB("admin").runCommand({replSetGetStatus:1}); + occasionally(function() { + printjson(status); + }); + + if (!status.members) { + return false; + } + + for (i=0; i<7; i++) { + if (status.members[i].state != 1 && status.members[i].state != 2) { + return false; + } + } + return true; + + }); + +replTest.awaitReplication(); + +replTest.stopSet(); + +print("reconfig"); + +var rs2 = new ReplSetTest( {name: 'reconfig-isync3', nodes: 3} ); +rs2.startSet(); +rs2.initiate(); + +master = rs2.getMaster(); +var config = master.getDB("local").system.replset.findOne(); +config.version++; +config.members[0].initialSync = {state : 2}; +config.members[1].initialSync = {state : 1}; +try { + master.getDB("admin").runCommand({replSetReconfig : config}); +} +catch(e) { + print("trying to reconfigure: "+e); +} + +master = rs2.getMaster(); +config = master.getDB("local").system.replset.findOne(); + +assert(typeof(config.members[0].initialSync) == "object"); 
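// Editor's note, not part of the upstream 1.8.0 patch: elsewhere in these tests member
// state 1 is primary and state 2 is secondary, so an initialSync spec of {state : N}
// pins the sync source by member state, while the {name : host}, {_id : n} and
// {optime : date} forms used in the initiate() call above pin it by host, member id,
// or optime.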
+assert.eq(config.members[0].initialSync.state, 2); +assert.eq(config.members[1].initialSync.state, 1); + +rs2.stopSet(); + +print("initialSync3 success!"); diff --git a/jstests/replsets/ismaster1.js b/jstests/replsets/ismaster1.js new file mode 100644 index 0000000..22865e5 --- /dev/null +++ b/jstests/replsets/ismaster1.js @@ -0,0 +1,36 @@ +/** + * 1. Check passive field in isMaster + */ + +load("jstests/replsets/rslib.js"); + +var name = "ismaster"; +var host = getHostName(); + +var replTest = new ReplSetTest( {name: name, nodes: 3} ); + +var nodes = replTest.startSet(); + +var config = replTest.getReplSetConfig(); +config.members[1].priority = 0; +config.members[2].priority = 0; + +replTest.initiate(config); + +var master = replTest.getMaster(); +wait(function() { + var result = master.getDB("admin").runCommand({replSetGetStatus:1}); + return result.members && result.members[0].state == 1 && + result.members[1].state == 2 && result.members[2].state == 2; + }); + +var result = master.getDB("admin").runCommand({isMaster:1}); +assert(!('passive' in result), tojson(result)); + +result = replTest.liveNodes.slaves[0].getDB("admin").runCommand({isMaster:1}); +assert('passive' in result, tojson(result)); + +result = replTest.liveNodes.slaves[1].getDB("admin").runCommand({isMaster:1}); +assert('passive' in result, tojson(result)); + +replTest.stopSet(); diff --git a/jstests/replsets/key1 b/jstests/replsets/key1 new file mode 100644 index 0000000..b5c19e4 --- /dev/null +++ b/jstests/replsets/key1 @@ -0,0 +1 @@ +foop de doop diff --git a/jstests/replsets/key2 b/jstests/replsets/key2 new file mode 100644 index 0000000..cbde821 --- /dev/null +++ b/jstests/replsets/key2 @@ -0,0 +1 @@ +other key diff --git a/jstests/replsets/remove1.js b/jstests/replsets/remove1.js new file mode 100644 index 0000000..ebd17d6 --- /dev/null +++ b/jstests/replsets/remove1.js @@ -0,0 +1,132 @@ +/* test removing a node from a replica set + * + * Start set with three nodes + * Initial sync + * Remove slave1 + * Remove slave2 + * Bring slave1 back up + * Bring slave2 back up + * Add them back as slave + * Make sure everyone's secondary + */ + +load("jstests/replsets/rslib.js"); +var name = "removeNodes"; +var host = getHostName(); + + +print("Start set with three nodes"); +var replTest = new ReplSetTest( {name: name, nodes: 3} ); +var nodes = replTest.startSet(); +replTest.initiate(); +var master = replTest.getMaster(); + + +print("Initial sync"); +master.getDB("foo").bar.baz.insert({x:1}); + +replTest.awaitReplication(); + + +print("Remove slave2"); +var config = replTest.getReplSetConfig(); + +config.members.pop(); +config.version = 2; +try { + master.getDB("admin").runCommand({replSetReconfig:config}); +} +catch(e) { + print(e); +} +reconnect(master); + + +print("Remove slave1"); +config.members.pop(); +config.version = 3; +try { + master.getDB("admin").runCommand({replSetReconfig:config}); +} +catch(e) { + print(e); +} +reconnect(master); + +print("sleeping 1"); +sleep(10000); +// these are already down, but this clears their ports from memory so that they +// can be restarted later +stopMongod(replTest.getPort(1)); +stopMongod(replTest.getPort(2)); + + +print("Bring slave1 back up"); +var paths = [ replTest.getPath(1), replTest.getPath(2) ]; +var ports = allocatePorts(2, replTest.getPort(2)+1); +var args = ["mongod", "--port", ports[0], "--dbpath", paths[0], "--noprealloc", "--smallfiles", "--rest"]; +var conn = startMongoProgram.apply( null, args ); +conn.getDB("local").system.replset.remove(); 
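// Editor's note, not part of the upstream 1.8.0 patch: wiping local.system.replset
// makes the restarted node forget the config it was removed under, so it can be
// re-added below with the bumped (version 4) config. A hypothetical sanity check:
assert.eq(0, conn.getDB("local").system.replset.count(), "old replset config should be gone");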
+printjson(conn.getDB("local").runCommand({getlasterror:1})); +print(conn); +print("sleeping 2"); +sleep(10000); +stopMongod(ports[0]); + +replTest.restart(1); + + +print("Bring slave2 back up"); +args[2] = ports[1]; +args[4] = paths[1]; +conn = startMongoProgram.apply( null, args ); +conn.getDB("local").system.replset.remove(); +print("path: "+paths[1]); +print("sleeping 3"); +sleep(10000); +stopMongod(ports[1]); + +replTest.restart(2); +sleep(10000); + + +print("Add them back as slaves"); +config.members.push({_id:1, host : host+":"+replTest.getPort(1)}); +config.members.push({_id:2, host : host+":"+replTest.getPort(2)}); +config.version = 4; +wait(function() { + try { + master.getDB("admin").runCommand({replSetReconfig:config}); + } + catch(e) { + print(e); + } + reconnect(master); + + master.setSlaveOk(); + var newConfig = master.getDB("local").system.replset.findOne(); + return newConfig.version == 4; + }); + + +print("Make sure everyone's secondary"); +wait(function() { + var status = master.getDB("admin").runCommand({replSetGetStatus:1}); + occasionally(function() { + printjson(status); + }); + + if (!status.members || status.members.length != 3) { + return false; + } + + for (var i = 0; i<3; i++) { + if (status.members[i].state != 1 && status.members[i].state != 2) { + return false; + } + } + return true; + }); + +replTest.stopSet(); + diff --git a/jstests/replsets/replset2.js b/jstests/replsets/replset2.js index f18b467..4849620 100644 --- a/jstests/replsets/replset2.js +++ b/jstests/replsets/replset2.js @@ -1,126 +1,126 @@ -print("\n\nreplset2.js BEGIN"); - -doTest = function (signal) { - - // FAILING TEST - // See below: - - // Test replication with getLastError - - // Replica set testing API - // Create a new replica set test. Specify set name and the number of nodes you want. - var replTest = new ReplSetTest({ name: 'testSet', nodes: 3, oplogSize: 5 }); - - // call startSet() to start each mongod in the replica set - // this returns a list of nodes - var nodes = replTest.startSet(); - - // Call initiate() to send the replSetInitiate command - // This will wait for initiation - replTest.initiate(); - - var testDB = "repl-test"; - - // Call getMaster to return a reference to the node that's been - // elected master. 
- var master = replTest.getMaster(); - - // Wait for replication to a single node - master.getDB(testDB).bar.insert({ n: 1 }); - - // Wait for initial sync - replTest.awaitReplication(); - - var slaves = replTest.liveNodes.slaves; - slaves.forEach(function (slave) { slave.setSlaveOk(); }); - - var failed = false; - var callGetLastError = function (w, timeout, db) { - try { - var result = master.getDB(db).getLastErrorObj(w, timeout); - print("replset2.js getLastError result: " + tojson(result)); - if (result['ok'] != 1) { - print("replset2.js FAILURE getlasterror not ok"); - failed = true; - } - } - catch (e) { - print("\nreplset2.js exception in getLastError: " + e + '\n'); - throw e; - } - } - - // Test getlasterror with multiple inserts - // TEST FAILS HEREg - print("\n\nreplset2.js **** Try inserting a multiple records -- first insert ****") - - printjson(master.getDB("admin").runCommand("replSetGetStatus")); - - master.getDB(testDB).foo.insert({ n: 1 }); - master.getDB(testDB).foo.insert({ n: 2 }); - master.getDB(testDB).foo.insert({ n: 3 }); - - print("\nreplset2.js **** TEMP 1 ****") - - printjson(master.getDB("admin").runCommand("replSetGetStatus")); - - callGetLastError(3, 25000, testDB); - - print("replset2.js **** TEMP 1a ****") - - m1 = master.getDB(testDB).foo.findOne({ n: 1 }); - printjson(m1); - assert(m1['n'] == 1, "replset2.js Failed to save to master on multiple inserts"); - - print("replset2.js **** TEMP 1b ****") - - var s0 = slaves[0].getDB(testDB).foo.findOne({ n: 1 }); - assert(s0['n'] == 1, "replset2.js Failed to replicate to slave 0 on multiple inserts"); - - var s1 = slaves[1].getDB(testDB).foo.findOne({ n: 1 }); - assert(s1['n'] == 1, "replset2.js Failed to replicate to slave 1 on multiple inserts"); - - // Test getlasterror with a simple insert - print("replset2.js **** Try inserting a single record ****") - master.getDB(testDB).dropDatabase(); - master.getDB(testDB).foo.insert({ n: 1 }); - callGetLastError(3, 10000, testDB); - - m1 = master.getDB(testDB).foo.findOne({ n: 1 }); - printjson(m1); - assert(m1['n'] == 1, "replset2.js Failed to save to master"); - - s0 = slaves[0].getDB(testDB).foo.findOne({ n: 1 }); - assert(s0['n'] == 1, "replset2.js Failed to replicate to slave 0"); - - s1 = slaves[1].getDB(testDB).foo.findOne({ n: 1 }); - assert(s1['n'] == 1, "replset2.js Failed to replicate to slave 1"); - - // Test getlasterror with large insert - print("replset2.js **** Try inserting many records ****") +print("\n\nreplset2.js BEGIN"); + +doTest = function (signal) { + + // FAILING TEST + // See below: + + // Test replication with getLastError + + // Replica set testing API + // Create a new replica set test. Specify set name and the number of nodes you want. + var replTest = new ReplSetTest({ name: 'testSet', nodes: 3, oplogSize: 5 }); + + // call startSet() to start each mongod in the replica set + // this returns a list of nodes + var nodes = replTest.startSet(); + + // Call initiate() to send the replSetInitiate command + // This will wait for initiation + replTest.initiate(); + + var testDB = "repl-test"; + + // Call getMaster to return a reference to the node that's been + // elected master. 
+ var master = replTest.getMaster(); + + // Wait for replication to a single node + master.getDB(testDB).bar.insert({ n: 1 }); + + // Wait for initial sync + replTest.awaitReplication(); + + var slaves = replTest.liveNodes.slaves; + slaves.forEach(function (slave) { slave.setSlaveOk(); }); + + var failed = false; + var callGetLastError = function (w, timeout, db) { + try { + var result = master.getDB(db).getLastErrorObj(w, timeout); + print("replset2.js getLastError result: " + tojson(result)); + if (result['ok'] != 1) { + print("replset2.js FAILURE getlasterror not ok"); + failed = true; + } + } + catch (e) { + print("\nreplset2.js exception in getLastError: " + e + '\n'); + throw e; + } + } + + // Test getlasterror with multiple inserts + // TEST FAILS HEREg + print("\n\nreplset2.js **** Try inserting a multiple records -- first insert ****") + + printjson(master.getDB("admin").runCommand("replSetGetStatus")); + + master.getDB(testDB).foo.insert({ n: 1 }); + master.getDB(testDB).foo.insert({ n: 2 }); + master.getDB(testDB).foo.insert({ n: 3 }); + + print("\nreplset2.js **** TEMP 1 ****") + + printjson(master.getDB("admin").runCommand("replSetGetStatus")); + + callGetLastError(3, 25000, testDB); + + print("replset2.js **** TEMP 1a ****") + + m1 = master.getDB(testDB).foo.findOne({ n: 1 }); + printjson(m1); + assert(m1['n'] == 1, "replset2.js Failed to save to master on multiple inserts"); + + print("replset2.js **** TEMP 1b ****") + + var s0 = slaves[0].getDB(testDB).foo.findOne({ n: 1 }); + assert(s0['n'] == 1, "replset2.js Failed to replicate to slave 0 on multiple inserts"); + + var s1 = slaves[1].getDB(testDB).foo.findOne({ n: 1 }); + assert(s1['n'] == 1, "replset2.js Failed to replicate to slave 1 on multiple inserts"); + + // Test getlasterror with a simple insert + print("replset2.js **** Try inserting a single record ****") + master.getDB(testDB).dropDatabase(); + master.getDB(testDB).foo.insert({ n: 1 }); + callGetLastError(3, 10000, testDB); + + m1 = master.getDB(testDB).foo.findOne({ n: 1 }); + printjson(m1); + assert(m1['n'] == 1, "replset2.js Failed to save to master"); + + s0 = slaves[0].getDB(testDB).foo.findOne({ n: 1 }); + assert(s0['n'] == 1, "replset2.js Failed to replicate to slave 0"); + + s1 = slaves[1].getDB(testDB).foo.findOne({ n: 1 }); + assert(s1['n'] == 1, "replset2.js Failed to replicate to slave 1"); + + // Test getlasterror with large insert + print("replset2.js **** Try inserting many records ****") try { - bigData = new Array(2000).toString() - for (var n = 0; n < 1000; n++) { - master.getDB(testDB).baz.insert({ n: n, data: bigData }); - } - callGetLastError(3, 60000, testDB); - - print("replset2.js **** V1 ") - - var verifyReplication = function (nodeName, collection) { - data = collection.findOne({ n: 1 }); - assert(data['n'] == 1, "replset2.js Failed to save to " + nodeName); - data = collection.findOne({ n: 999 }); - assert(data['n'] == 999, "replset2.js Failed to save to " + nodeName); - } - - print("replset2.js **** V2 ") - - verifyReplication("master", master.getDB(testDB).baz); - verifyReplication("slave 0", slaves[0].getDB(testDB).baz); - verifyReplication("slave 1", slaves[1].getDB(testDB).baz); - - assert(failed == false, "replset2.js Replication with getLastError failed. 
See errors."); + bigData = new Array(2000).toString() + for (var n = 0; n < 1000; n++) { + master.getDB(testDB).baz.insert({ n: n, data: bigData }); + } + callGetLastError(3, 60000, testDB); + + print("replset2.js **** V1 ") + + var verifyReplication = function (nodeName, collection) { + data = collection.findOne({ n: 1 }); + assert(data['n'] == 1, "replset2.js Failed to save to " + nodeName); + data = collection.findOne({ n: 999 }); + assert(data['n'] == 999, "replset2.js Failed to save to " + nodeName); + } + + print("replset2.js **** V2 ") + + verifyReplication("master", master.getDB(testDB).baz); + verifyReplication("slave 0", slaves[0].getDB(testDB).baz); + verifyReplication("slave 1", slaves[1].getDB(testDB).baz); + + assert(failed == false, "replset2.js Replication with getLastError failed. See errors."); } catch(e) { print("ERROR: " + e); @@ -132,10 +132,10 @@ doTest = function (signal) { printjson(slaves[1].getDB("local").oplog.rs.find().sort({"$natural": -1}).limit(1).next()); } - - replTest.stopSet(signal); + + replTest.stopSet(signal); } -doTest( 15 ); - +doTest( 15 ); + print("\nreplset2.js SUCCESS\n"); diff --git a/jstests/replsets/replset3.js b/jstests/replsets/replset3.js index 8126b9d..faa0627 100644 --- a/jstests/replsets/replset3.js +++ b/jstests/replsets/replset3.js @@ -1,56 +1,80 @@ - -doTest = function( signal ) { - - // Test replica set step down - - // Replica set testing API - // Create a new replica set test. Specify set name and the number of nodes you want. - var replTest = new ReplSetTest( {name: 'testSet', nodes: 3} ); - - // call startSet() to start each mongod in the replica set - // this returns a list of nodes - var nodes = replTest.startSet(); - - // Call initiate() to send the replSetInitiate command - // This will wait for initiation - replTest.initiate(); - - // Get master node - var master = replTest.getMaster(); - - // Write some data to master - // NOTE: this test fails unless we write some data. - master.getDB("foo").foo.save({a: 1}); - master.getDB("foo").runCommand({getlasterror: 1, w:3, wtimeout: 20000}); - - // Step down master - master.getDB("admin").runCommand({replSetStepDown: true}); - - try { - var new_master = replTest.getMaster(); - } - catch( err ) { - throw( "Could not elect new master before timeout." ); - } - - assert( master != new_master, "Old master shouldn't be equal to new master." ); - - // Make sure that slaves are still up - var result = new_master.getDB("admin").runCommand({replSetGetStatus: 1}); - assert( result['ok'] == 1, "Could not verify that slaves were still up:" + result ); - - slaves = replTest.liveNodes.slaves; - assert.soon(function() { - res = slaves[0].getDB("admin").runCommand({replSetGetStatus: 1}) - return res.myState == 2; - }, "Slave 0 state not ready."); - - assert.soon(function() { - res = slaves[1].getDB("admin").runCommand({replSetGetStatus: 1}) - return res.myState == 2; - }, "Slave 1 state not ready."); - - replTest.stopSet( 15 ); + +doTest = function (signal) { + + // Test replica set step down + + // Replica set testing API + // Create a new replica set test. Specify set name and the number of nodes you want. 
+ var replTest = new ReplSetTest({ name: 'testSet', nodes: 3 }); + + // call startSet() to start each mongod in the replica set + // this returns a list of nodes + var nodes = replTest.startSet(); + + // Call initiate() to send the replSetInitiate command + // This will wait for initiation + replTest.initiate(); + + // Get master node + var master = replTest.getMaster(); + + // Write some data to master + // NOTE: this test fails unless we write some data. + master.getDB("foo").foo.save({ a: 1 }); + master.getDB("foo").runCommand({ getlasterror: 1, w: 3, wtimeout: 20000 }); + + var phase = 1; + + print(phase++); + + // Step down master. Note: this may close our connection! + try { + master.getDB("admin").runCommand({ replSetStepDown: true }); + } catch (err) { + print("caught: " + err + " on stepdown"); + } + + print(phase++); + + try { + var new_master = replTest.getMaster(); + } + catch (err) { + throw ("Could not elect new master before timeout."); + } + + print(phase++); + + assert(master != new_master, "Old master shouldn't be equal to new master."); + + print(phase++); + + // Make sure that slaves are still up + var result = new_master.getDB("admin").runCommand({ replSetGetStatus: 1 }); + assert(result['ok'] == 1, "Could not verify that slaves were still up:" + result); + + print(phase++); + + slaves = replTest.liveNodes.slaves; + assert.soon(function () { + try { + res = slaves[0].getDB("admin").runCommand({ replSetGetStatus: 1 }) + } catch (err) { } + return res.myState == 2; + }, "Slave 0 state not ready."); + + print(phase++); + + assert.soon(function () { + try { + res = slaves[1].getDB("admin").runCommand({ replSetGetStatus: 1 }) + } catch (err) { } + return res.myState == 2; + }, "Slave 1 state not ready."); + + print("replset3.js SUCCESS"); + + replTest.stopSet(15); } doTest( 15 ); diff --git a/jstests/replsets/replset5.js b/jstests/replsets/replset5.js index fe1761e..13ee5c9 100644 --- a/jstests/replsets/replset5.js +++ b/jstests/replsets/replset5.js @@ -23,15 +23,15 @@ doTest = function (signal) { master.getDB("barDB").bar.save({ a: 1 }); replTest.awaitReplication(); - // These writes should be replicated immediately - master.getDB(testDB).foo.insert({ n: 1 }); - master.getDB(testDB).foo.insert({ n: 2 }); - master.getDB(testDB).foo.insert({ n: 3 }); - - // *** NOTE ***: The default doesn't seem to be propogating. - // When I run getlasterror with no defaults, the slaves don't have the data: - // These getlasterror commands can be run individually to verify this. 
- //master.getDB("admin").runCommand({ getlasterror: 1, w: 3, wtimeout: 20000 }); + // These writes should be replicated immediately + var docNum = 5000; + for(var n=0; n 0); + + +print("5"); +config.members[2].arbiterOnly = true; +reconfig(); + + +print("6"); +statusSoon(7); +assert.eq(replTest.liveNodes.slaves[1].getDB("local").oplog.rs.count(), 0); + + +print("7"); +delete config.members[2].arbiterOnly; +reconfig(); + + +print("8"); +statusSoon(2); +assert(replTest.liveNodes.slaves[1].getDB("local").oplog.rs.count() > 0); + + +print("9"); +for (var i = 0; i < 10000; i++) { + master.getDB("foo").bar.insert({increment : i, c : 0, foo : "kasdlfjaklsdfalksdfakldfmalksdfmaklmfalkfmkafmdsaklfma", date : new Date(), d : Date()}); +} + + +print("10"); +config.members[2].arbiterOnly = true; +reconfig(); + + +print("11"); +statusSoon(7); +assert.eq(replTest.liveNodes.slaves[1].getDB("local").oplog.rs.count(), 0); +*/ + +replTest.stopSet( 15 ); + diff --git a/jstests/replsets/replsetfreeze.js b/jstests/replsets/replsetfreeze.js new file mode 100644 index 0000000..3721ba5 --- /dev/null +++ b/jstests/replsets/replsetfreeze.js @@ -0,0 +1,105 @@ +/* + * 1: initialize set + * 2: step down m1 + * 3: freeze set for 30 seconds + * 4: check no one is master for 30 seconds + * 5: check for new master + * 6: step down new master + * 7: freeze for 30 seconds + * 8: unfreeze + * 9: check we get a new master within 30 seconds + */ + + +var w = 0; +var wait = function(f) { + w++; + var n = 0; + while (!f()) { + if( n % 4 == 0 ) + print("toostale.js waiting " + w); + if (++n == 4) { + print("" + f); + } + assert(n < 200, 'tried 200 times, giving up'); + sleep(1000); + } +} + +var reconnect = function(a) { + wait(function() { + try { + a.getDB("foo").bar.stats(); + return true; + } catch(e) { + print(e); + return false; + } + }); +}; + + +print("1: initialize set"); +var replTest = new ReplSetTest( {name: 'unicomplex', nodes: 3} ); +var nodes = replTest.nodeList(); +var conns = replTest.startSet(); +var config = {"_id" : "unicomplex", "members" : [ + {"_id" : 0, "host" : nodes[0] }, + {"_id" : 1, "host" : nodes[1] }, + {"_id" : 2, "host" : nodes[2], "arbiterOnly" : true}]}; +var r = replTest.initiate(config); +var master = replTest.getMaster(); + + +print("2: step down m1"); +try { + master.getDB("admin").runCommand({replSetStepDown : 1}); +} +catch(e) { + print(e); +} +reconnect(master); + +print("3: freeze set for 30 seconds"); +master.getDB("admin").runCommand({replSetFreeze : 30}); + + +print("4: check no one is master for 30 seconds"); +var start = (new Date()).getTime(); +while ((new Date()).getTime() - start < 30000) { + var result = master.getDB("admin").runCommand({isMaster:1}); + assert.eq(result.ismaster, false); + assert.eq(result.primary, undefined); + sleep(1000); +} + + +print("5: check for new master"); +master = replTest.getMaster(); + + +print("6: step down new master"); +try { + master.getDB("admin").runCommand({replSetStepDown : 1}); +} +catch(e) { + print(e); +} +reconnect(master); + + +print("7: freeze for 30 seconds"); +master.getDB("admin").runCommand({replSetFreeze : 30}); +sleep(1000); + + +print("8: unfreeze"); +master.getDB("admin").runCommand({replSetFreeze : 0}); + + +print("9: check we get a new master within 30 seconds"); +master = replTest.getMaster(); + + +replTest.stopSet( 15 ); + diff --git a/jstests/replsets/rollback.js b/jstests/replsets/rollback.js index 8840371..6370e41 100644 --- a/jstests/replsets/rollback.js +++ b/jstests/replsets/rollback.js @@ -1,155 +1,186 @@ -// test 
rollback in replica sets - -// try running as : -// -// mongo --nodb rollback.js | tee out | grep -v ^m31 -// - -var debugging = 0; - -function pause(s) { - print(s); - while (debugging) { - sleep(3000); - print(s); - } -} - -function deb(obj) { - if( debugging ) { - print("\n\n\n" + obj + "\n\n"); - } -} - -w = 0; - -function wait(f) { - w++; - var n = 0; - while (!f()) { - if( n % 4 == 0 ) - print("rollback.js waiting " + w); - if (++n == 4) { - print("" + f); - } - sleep(1000); - } -} - -doTest = function (signal) { - - var replTest = new ReplSetTest({ name: 'unicomplex', nodes: 3 }); - var nodes = replTest.nodeList(); - //print(tojson(nodes)); - - var conns = replTest.startSet(); - var r = replTest.initiate({ "_id": "unicomplex", - "members": [ - { "_id": 0, "host": nodes[0] }, - { "_id": 1, "host": nodes[1] }, - { "_id": 2, "host": nodes[2], arbiterOnly: true}] - }); - - // Make sure we have a master - var master = replTest.getMaster(); - a_conn = conns[0]; - A = a_conn.getDB("admin"); - b_conn = conns[1]; - a_conn.setSlaveOk(); - b_conn.setSlaveOk(); - B = b_conn.getDB("admin"); - assert(master == conns[0], "conns[0] assumed to be master"); - assert(a_conn == master); - - //deb(master); - - // Make sure we have an arbiter - assert.soon(function () { - res = conns[2].getDB("admin").runCommand({ replSetGetStatus: 1 }); - return res.myState == 7; - }, "Arbiter failed to initialize."); - - // Wait for initial replication - var a = a_conn.getDB("foo"); - var b = b_conn.getDB("foo"); - - /* force the oplog to roll */ - if (new Date() % 2 == 0) { - print("ROLLING OPLOG AS PART OF TEST (we only do this sometimes)"); - var pass = 1; - var first = a.getSisterDB("local").oplog.rs.find().sort({ $natural: 1 }).limit(1)[0]; - a.roll.insert({ x: 1 }); - while (1) { - for (var i = 0; i < 10000; i++) - a.roll.update({}, { $inc: { x: 1} }); - var op = a.getSisterDB("local").oplog.rs.find().sort({ $natural: 1 }).limit(1)[0]; - if (tojson(op.h) != tojson(first.h)) { - printjson(op); - printjson(first); - break; - } - pass++; - a.getLastError(2); // unlikely secondary isn't keeping up, but let's avoid possible intermittent issues with that. - } - print("PASSES FOR OPLOG ROLL: " + pass); - } - else { - print("NO ROLL"); - } - - a.bar.insert({ q: 1, a: "foo" }); - a.bar.insert({ q: 2, a: "foo", x: 1 }); - a.bar.insert({ q: 3, bb: 9, a: "foo" }); - - assert(a.bar.count() == 3, "t.count"); - - // wait for secondary to get this data - wait(function () { return b.bar.count() == 3; }); - - A.runCommand({ replSetTest: 1, blind: true }); - wait(function () { return B.isMaster().ismaster; }); - - b.bar.insert({ q: 4 }); - b.bar.insert({ q: 5 }); - b.bar.insert({ q: 6 }); - assert(b.bar.count() == 6, "u.count"); - - // a should not have the new data as it was in blind state. - B.runCommand({ replSetTest: 1, blind: true }); - A.runCommand({ replSetTest: 1, blind: false }); - wait(function () { return !B.isMaster().ismaster; }); - wait(function () { return A.isMaster().ismaster; }); - - assert(a.bar.count() == 3, "t is 3"); - a.bar.insert({ q: 7 }); - a.bar.insert({ q: 8 }); - { - assert(a.bar.count() == 5); - var x = a.bar.find().toArray(); - assert(x[0].q == 1, '1'); - assert(x[1].q == 2, '2'); - assert(x[2].q == 3, '3'); - assert(x[3].q == 7, '7'); - assert(x[4].q == 8, '8'); - } - - // A is 1 2 3 7 8 - // B is 1 2 3 4 5 6 - - // bring B back online - B.runCommand({ replSetTest: 1, blind: false }); - - wait(function () { return B.isMaster().ismaster || B.isMaster().secondary; }); - - // everyone is up here... 
- assert(A.isMaster().ismaster || A.isMaster().secondary, "A up"); - assert(B.isMaster().ismaster || B.isMaster().secondary, "B up"); - - friendlyEqual(a.bar.find().sort({ _id: 1 }).toArray(), b.bar.find().sort({ _id: 1 }).toArray(), "server data sets do not match"); - - pause("rollback.js SUCCESS"); - replTest.stopSet(signal); +// test rollback in replica sets + +// try running as : +// +// mongo --nodb rollback.js | tee out | grep -v ^m31 +// + +var debugging = 0; + +function pause(s) { + print(s); + while (debugging) { + sleep(3000); + print(s); + } +} + +function deb(obj) { + if( debugging ) { + print("\n\n\n" + obj + "\n\n"); + } +} + +w = 0; + +function wait(f) { + w++; + var n = 0; + while (!f()) { + if( n % 4 == 0 ) + print("rollback.js waiting " + w); + if (++n == 4) { + print("" + f); + } + assert(n < 200, 'tried 200 times, giving up'); + sleep(1000); + } } +doTest = function (signal) { + + var replTest = new ReplSetTest({ name: 'unicomplex', nodes: 3 }); + var nodes = replTest.nodeList(); + //print(tojson(nodes)); + + var conns = replTest.startSet(); + var r = replTest.initiate({ "_id": "unicomplex", + "members": [ + { "_id": 0, "host": nodes[0] }, + { "_id": 1, "host": nodes[1] }, + { "_id": 2, "host": nodes[2], arbiterOnly: true}] + }); + + // Make sure we have a master + var master = replTest.getMaster(); + a_conn = conns[0]; + A = a_conn.getDB("admin"); + b_conn = conns[1]; + a_conn.setSlaveOk(); + b_conn.setSlaveOk(); + B = b_conn.getDB("admin"); + assert(master == conns[0], "conns[0] assumed to be master"); + assert(a_conn == master); + + //deb(master); + + // Make sure we have an arbiter + assert.soon(function () { + res = conns[2].getDB("admin").runCommand({ replSetGetStatus: 1 }); + return res.myState == 7; + }, "Arbiter failed to initialize."); + + // Wait for initial replication + var a = a_conn.getDB("foo"); + var b = b_conn.getDB("foo"); + + /* force the oplog to roll */ + if (new Date() % 2 == 0) { + print("ROLLING OPLOG AS PART OF TEST (we only do this sometimes)"); + var pass = 1; + var first = a.getSisterDB("local").oplog.rs.find().sort({ $natural: 1 }).limit(1)[0]; + a.roll.insert({ x: 1 }); + while (1) { + for (var i = 0; i < 10000; i++) + a.roll.update({}, { $inc: { x: 1} }); + var op = a.getSisterDB("local").oplog.rs.find().sort({ $natural: 1 }).limit(1)[0]; + if (tojson(op.h) != tojson(first.h)) { + printjson(op); + printjson(first); + break; + } + pass++; + a.getLastError(2); // unlikely secondary isn't keeping up, but let's avoid possible intermittent issues with that. + } + print("PASSES FOR OPLOG ROLL: " + pass); + } + else { + print("NO ROLL"); + } + + a.bar.insert({ q: 1, a: "foo" }); + a.bar.insert({ q: 2, a: "foo", x: 1 }); + a.bar.insert({ q: 3, bb: 9, a: "foo" }); + + assert(a.bar.count() == 3, "t.count"); + + // wait for secondary to get this data + wait(function () { return b.bar.count() == 3; }); + + A.runCommand({ replSetTest: 1, blind: true }); + reconnect(a,b); + wait(function () { return B.isMaster().ismaster; }); + + b.bar.insert({ q: 4 }); + b.bar.insert({ q: 5 }); + b.bar.insert({ q: 6 }); + assert(b.bar.count() == 6, "u.count"); + + // a should not have the new data as it was in blind state. 
+ B.runCommand({ replSetTest: 1, blind: true }); + print("*************** wait for server to reconnect ****************"); + reconnect(a,b); + A.runCommand({ replSetTest: 1, blind: false }); + reconnect(a,b); + + print("*************** B ****************"); + wait(function () { try { return !B.isMaster().ismaster; } catch(e) { return false; } }); + print("*************** A ****************"); + reconnect(a,b); + wait(function () { + try { + return A.isMaster().ismaster; + } catch(e) { + return false; + } + }); + + assert(a.bar.count() == 3, "t is 3"); + a.bar.insert({ q: 7 }); + a.bar.insert({ q: 8 }); + { + assert(a.bar.count() == 5); + var x = a.bar.find().toArray(); + assert(x[0].q == 1, '1'); + assert(x[1].q == 2, '2'); + assert(x[2].q == 3, '3'); + assert(x[3].q == 7, '7'); + assert(x[4].q == 8, '8'); + } + + // A is 1 2 3 7 8 + // B is 1 2 3 4 5 6 + + // bring B back online + B.runCommand({ replSetTest: 1, blind: false }); + reconnect(a,b); + + wait(function () { return B.isMaster().ismaster || B.isMaster().secondary; }); + + // everyone is up here... + assert(A.isMaster().ismaster || A.isMaster().secondary, "A up"); + assert(B.isMaster().ismaster || B.isMaster().secondary, "B up"); + replTest.awaitReplication(); + + friendlyEqual(a.bar.find().sort({ _id: 1 }).toArray(), b.bar.find().sort({ _id: 1 }).toArray(), "server data sets do not match"); + + pause("rollback.js SUCCESS"); + replTest.stopSet(signal); +}; + + +var reconnect = function(a,b) { + wait(function() { + try { + a.bar.stats(); + b.bar.stats(); + return true; + } catch(e) { + print(e); + return false; + } + }); +}; + print("rollback.js"); doTest( 15 ); diff --git a/jstests/replsets/rollback2.js b/jstests/replsets/rollback2.js index 483d221..46fb548 100644 --- a/jstests/replsets/rollback2.js +++ b/jstests/replsets/rollback2.js @@ -1,201 +1,232 @@ -// test rollback in replica sets - -// try running as : -// -// mongo --nodb rollback.js | tee out | grep -v ^m31 -// - -var debugging = 0; - -function pause(s) { - print(s); - while (debugging) { - sleep(3000); - print(s); - } -} - -function deb(obj) { - if( debugging ) { - print("\n\n\n" + obj + "\n\n"); - } -} - -w = 0; - -function wait(f) { - w++; - var n = 0; - while (!f()) { - if (n % 4 == 0) - print("rollback2.js waiting " + w); - if (++n == 4) { - print("" + f); - } - sleep(1000); - } -} - -function dbs_match(a, b) { - print("dbs_match"); - - var ac = a.system.namespaces.find().sort({name:1}).toArray(); - var bc = b.system.namespaces.find().sort({name:1}).toArray(); - if (!friendlyEqual(ac, bc)) { - print("dbs_match: namespaces don't match"); - print("\n\n"); - printjson(ac); - print("\n\n"); - printjson(bc); - print("\n\n"); - return false; - } - - var c = a.getCollectionNames(); - for( var i in c ) { - print("checking " + c[i]); - if( !friendlyEqual( a[c[i]].find().sort({_id:1}).toArray(), b[c[i]].find().sort({_id:1}).toArray() ) ) { - print("dbs_match: collections don't match " + c[i]); - return false; - } - } - return true; -} - -/* these writes will be initial data and replicate everywhere. 
*/ -function doInitialWrites(db) { - t = db.bar; - t.insert({ q:0}); - t.insert({ q: 1, a: "foo" }); - t.insert({ q: 2, a: "foo", x: 1 }); - t.insert({ q: 3, bb: 9, a: "foo" }); - t.insert({ q: 40, a: 1 }); - t.insert({ q: 40, a: 2 }); - t.insert({ q: 70, txt: 'willremove' }); - - db.createCollection("kap", { capped: true, size: 5000 }); - db.kap.insert({ foo: 1 }) - - // going back to empty on capped is a special case and must be tested - db.createCollection("kap2", { capped: true, size: 5501 }); -} - -/* these writes on one primary only and will be rolled back. */ -function doItemsToRollBack(db) { - t = db.bar; - t.insert({ q: 4 }); - t.update({ q: 3 }, { q: 3, rb: true }); - - t.remove({ q: 40 }); // multi remove test - - t.update({ q: 2 }, { q: 39, rb: true }); - - // rolling back a delete will involve reinserting the item(s) - t.remove({ q: 1 }); - - t.update({ q: 0 }, { $inc: { y: 1} }); - - db.kap.insert({ foo: 2 }) - db.kap2.insert({ foo: 2 }) - - // create a collection (need to roll back the whole thing) - db.newcoll.insert({ a: true }); - - // create a new empty collection (need to roll back the whole thing) - db.createCollection("abc"); -} - -function doWritesToKeep2(db) { - t = db.bar; - t.insert({ txt: 'foo' }); - t.remove({ q: 70 }); - t.update({ q: 0 }, { $inc: { y: 33} }); -} - -function verify(db) { - print("verify"); - t = db.bar; - assert(t.find({ q: 1 }).count() == 1); - assert(t.find({ txt: 'foo' }).count() == 1); - assert(t.find({ q: 4 }).count() == 0); -} - -doTest = function (signal) { - - var replTest = new ReplSetTest({ name: 'unicomplex', nodes: 3 }); - var nodes = replTest.nodeList(); - //print(tojson(nodes)); - - var conns = replTest.startSet(); - var r = replTest.initiate({ "_id": "unicomplex", - "members": [ - { "_id": 0, "host": nodes[0] }, - { "_id": 1, "host": nodes[1] }, - { "_id": 2, "host": nodes[2], arbiterOnly: true}] - }); - - // Make sure we have a master - var master = replTest.getMaster(); - a_conn = conns[0]; - A = a_conn.getDB("admin"); - b_conn = conns[1]; - a_conn.setSlaveOk(); - b_conn.setSlaveOk(); - B = b_conn.getDB("admin"); - assert(master == conns[0], "conns[0] assumed to be master"); - assert(a_conn == master); - - //deb(master); - - // Make sure we have an arbiter - assert.soon(function () { - res = conns[2].getDB("admin").runCommand({ replSetGetStatus: 1 }); - return res.myState == 7; - }, "Arbiter failed to initialize."); - - // Wait for initial replication - var a = a_conn.getDB("foo"); - var b = b_conn.getDB("foo"); - doInitialWrites(a); - - // wait for secondary to get this data - wait(function () { return b.bar.count() == a.bar.count(); }); - - A.runCommand({ replSetTest: 1, blind: true }); - wait(function () { return B.isMaster().ismaster; }); - - doItemsToRollBack(b); - - // a should not have the new data as it was in blind state. - B.runCommand({ replSetTest: 1, blind: true }); - A.runCommand({ replSetTest: 1, blind: false }); - wait(function () { return !B.isMaster().ismaster; }); - wait(function () { return A.isMaster().ismaster; }); - - assert(a.bar.count() >= 1, "count check"); - doWritesToKeep2(a); - - // A is 1 2 3 7 8 - // B is 1 2 3 4 5 6 - - // bring B back online - // as A is primary, B will roll back and then catch up - B.runCommand({ replSetTest: 1, blind: false }); - - wait(function () { return B.isMaster().ismaster || B.isMaster().secondary; }); - - // everyone is up here... 
- assert(A.isMaster().ismaster || A.isMaster().secondary, "A up"); - assert(B.isMaster().ismaster || B.isMaster().secondary, "B up"); - - verify(a); - - assert( dbs_match(a,b), "server data sets do not match after rollback, something is wrong"); - - pause("rollback2.js SUCCESS"); - replTest.stopSet(signal); +// a test of rollback in replica sets +// +// try running as : +// +// mongo --nodb rollback2.js | tee out | grep -v ^m31 +// + +var debugging = 0; + +function pause(s) { + print(s); + while (debugging) { + sleep(3000); + print(s); + } +} + +function deb(obj) { + if( debugging ) { + print("\n\n\n" + obj + "\n\n"); + } +} + +w = 0; + +function wait(f) { + w++; + var n = 0; + while (!f()) { + if (n % 4 == 0) + print("rollback2.js waiting " + w); + if (++n == 4) { + print("" + f); + } + assert(n < 200, 'tried 200 times, giving up'); + sleep(1000); + } +} + +function dbs_match(a, b) { + print("dbs_match"); + + var ac = a.system.namespaces.find().sort({name:1}).toArray(); + var bc = b.system.namespaces.find().sort({name:1}).toArray(); + if (!friendlyEqual(ac, bc)) { + print("dbs_match: namespaces don't match"); + print("\n\n"); + printjson(ac); + print("\n\n"); + printjson(bc); + print("\n\n"); + return false; + } + + var c = a.getCollectionNames(); + for( var i in c ) { + print("checking " + c[i]); + if( !friendlyEqual( a[c[i]].find().sort({_id:1}).toArray(), b[c[i]].find().sort({_id:1}).toArray() ) ) { + print("dbs_match: collections don't match " + c[i]); + return false; + } + } + return true; +} + +/* these writes will be initial data and replicate everywhere. */ +function doInitialWrites(db) { + t = db.bar; + t.insert({ q:0}); + t.insert({ q: 1, a: "foo" }); + t.insert({ q: 2, a: "foo", x: 1 }); + t.insert({ q: 3, bb: 9, a: "foo" }); + t.insert({ q: 40, a: 1 }); + t.insert({ q: 40, a: 2 }); + t.insert({ q: 70, txt: 'willremove' }); + + db.createCollection("kap", { capped: true, size: 5000 }); + db.kap.insert({ foo: 1 }) + + // going back to empty on capped is a special case and must be tested + db.createCollection("kap2", { capped: true, size: 5501 }); +} + +/* these writes on one primary only and will be rolled back. 
*/ +function doItemsToRollBack(db) { + t = db.bar; + t.insert({ q: 4 }); + t.update({ q: 3 }, { q: 3, rb: true }); + + t.remove({ q: 40 }); // multi remove test + + t.update({ q: 2 }, { q: 39, rb: true }); + + // rolling back a delete will involve reinserting the item(s) + t.remove({ q: 1 }); + + t.update({ q: 0 }, { $inc: { y: 1} }); + + db.kap.insert({ foo: 2 }) + db.kap2.insert({ foo: 2 }) + + // create a collection (need to roll back the whole thing) + db.newcoll.insert({ a: true }); + + // create a new empty collection (need to roll back the whole thing) + db.createCollection("abc"); } +function doWritesToKeep2(db) { + t = db.bar; + t.insert({ txt: 'foo' }); + t.remove({ q: 70 }); + t.update({ q: 0 }, { $inc: { y: 33} }); +} + +function verify(db) { + print("verify"); + t = db.bar; + assert(t.find({ q: 1 }).count() == 1); + assert(t.find({ txt: 'foo' }).count() == 1); + assert(t.find({ q: 4 }).count() == 0); +} + +doTest = function (signal) { + + var replTest = new ReplSetTest({ name: 'unicomplex', nodes: 3 }); + var nodes = replTest.nodeList(); + //print(tojson(nodes)); + + var conns = replTest.startSet(); + var r = replTest.initiate({ "_id": "unicomplex", + "members": [ + { "_id": 0, "host": nodes[0] }, + { "_id": 1, "host": nodes[1] }, + { "_id": 2, "host": nodes[2], arbiterOnly: true}] + }); + + // Make sure we have a master + var master = replTest.getMaster(); + a_conn = conns[0]; + A = a_conn.getDB("admin"); + b_conn = conns[1]; + a_conn.setSlaveOk(); + b_conn.setSlaveOk(); + B = b_conn.getDB("admin"); + assert(master == conns[0], "conns[0] assumed to be master"); + assert(a_conn == master); + + //deb(master); + + // Make sure we have an arbiter + assert.soon(function () { + res = conns[2].getDB("admin").runCommand({ replSetGetStatus: 1 }); + return res.myState == 7; + }, "Arbiter failed to initialize."); + + // Wait for initial replication + var a = a_conn.getDB("foo"); + var b = b_conn.getDB("foo"); + wait(function () { + var status = A.runCommand({replSetGetStatus : 1}); + return status.members[1].state == 2; + }); + + doInitialWrites(a); + + // wait for secondary to get this data + wait(function () { return b.bar.count() == a.bar.count(); }); + wait(function () { + var status = A.runCommand({replSetGetStatus : 1}); + return status.members[1].state == 2; + }); + + + A.runCommand({ replSetTest: 1, blind: true }); + reconnect(a, b); + + wait(function () { return B.isMaster().ismaster; }); + + doItemsToRollBack(b); + + // a should not have the new data as it was in blind state. + B.runCommand({ replSetTest: 1, blind: true }); + reconnect(a, b); + A.runCommand({ replSetTest: 1, blind: false }); + reconnect(a,b); + + wait(function () { try { return !B.isMaster().ismaster; } catch(e) { return false; } }); + wait(function () { try { return A.isMaster().ismaster; } catch(e) { return false; } }); + + assert(a.bar.count() >= 1, "count check"); + doWritesToKeep2(a); + + // A is 1 2 3 7 8 + // B is 1 2 3 4 5 6 + + // bring B back online + // as A is primary, B will roll back and then catch up + B.runCommand({ replSetTest: 1, blind: false }); + reconnect(a,b); + + wait(function () { return B.isMaster().ismaster || B.isMaster().secondary; }); + + // everyone is up here... 
+ assert(A.isMaster().ismaster || A.isMaster().secondary, "A up"); + assert(B.isMaster().ismaster || B.isMaster().secondary, "B up"); + replTest.awaitReplication(); + + verify(a); + + assert( dbs_match(a,b), "server data sets do not match after rollback, something is wrong"); + + pause("rollback2.js SUCCESS"); + replTest.stopSet(signal); +}; + +var reconnect = function(a,b) { + wait(function() { + try { + a.bar.stats(); + b.bar.stats(); + return true; + } catch(e) { + print(e); + return false; + } + }); +}; + print("rollback2.js"); doTest( 15 ); diff --git a/jstests/replsets/rollback3.js b/jstests/replsets/rollback3.js index 5c2f2f1..fa923d8 100755 --- a/jstests/replsets/rollback3.js +++ b/jstests/replsets/rollback3.js @@ -30,10 +30,10 @@ function wait(f) { if (n % 4 == 0) print("rollback3.js waiting " + w); if (++n == 4) { - print("" + f); - } - if (n == 200) { - print("rollback3.js failing waited too long"); + print("" + f); + } + if (n == 200) { + print("rollback3.js failing waited too long"); throw "wait error"; } sleep(1000); @@ -188,15 +188,20 @@ doTest = function (signal) { wait(function () { return b.bar.count() == a.bar.count(); }); A.runCommand({ replSetTest: 1, blind: true }); - wait(function () { return B.isMaster().ismaster; }); + reconnect(a,b); + wait(function () { try { return B.isMaster().ismaster; } catch(e) { return false; } }); doItemsToRollBack(b); // a should not have the new data as it was in blind state. B.runCommand({ replSetTest: 1, blind: true }); + reconnect(a,b); + A.runCommand({ replSetTest: 1, blind: false }); - wait(function () { return !B.isMaster().ismaster; }); - wait(function () { return A.isMaster().ismaster; }); + reconnect(a,b); + + wait(function () { try { return !B.isMaster().ismaster; } catch(e) { return false; } }); + wait(function () { try { return A.isMaster().ismaster; } catch(e) { return false; } }); assert(a.bar.count() >= 1, "count check"); doWritesToKeep2(a); @@ -207,18 +212,34 @@ doTest = function (signal) { // bring B back online // as A is primary, B will roll back and then catch up B.runCommand({ replSetTest: 1, blind: false }); + reconnect(a,b); wait(function () { return B.isMaster().ismaster || B.isMaster().secondary; }); // everyone is up here... assert(A.isMaster().ismaster || A.isMaster().secondary, "A up"); assert(B.isMaster().ismaster || B.isMaster().secondary, "B up"); - + replTest.awaitReplication(); + assert( dbs_match(a,b), "server data sets do not match after rollback, something is wrong"); pause("rollback3.js SUCCESS"); replTest.stopSet(signal); -} +}; + + +var reconnect = function(a,b) { + wait(function() { + try { + a.bar.stats(); + b.bar.stats(); + return true; + } catch(e) { + print(e); + return false; + } + }); +}; print("rollback3.js"); doTest( 15 ); diff --git a/jstests/replsets/rslib.js b/jstests/replsets/rslib.js new file mode 100644 index 0000000..c072829 --- /dev/null +++ b/jstests/replsets/rslib.js @@ -0,0 +1,63 @@ + +var count = 0; +var w = 0; + +var wait = function(f) { + w++; + var n = 0; + while (!f()) { + if( n % 4 == 0 ) + print("waiting " + w); + if (++n == 4) { + print("" + f); + } + assert(n < 200, 'tried 200 times, giving up'); + sleep(1000); + } +}; + +/** + * Use this to do something once every 4 iterations. + * + *
+ * for (i=0; i<1000; i++) {
+ *   occasionally(function() { print("4 more iterations"); });
+ * }
+ * </pre>
+ */ +var occasionally = function(f, n) { + var interval = n || 4; + if (count % interval == 0) { + f(); + } + count++; +}; + +var reconnect = function(a) { + wait(function() { + try { + // make this work with either dbs or connections + if (typeof(a.getDB) == "function") { + a.getDB("foo").bar.stats(); + } + else { + a.bar.stats(); + } + return true; + } catch(e) { + print(e); + return false; + } + }); +}; + + +var getLatestOp = function(server) { + server.getDB("admin").getMongo().setSlaveOk(); + var log = server.getDB("local")['oplog.rs']; + var cursor = log.find({}).sort({'$natural': -1}).limit(1); + if (cursor.hasNext()) { + return cursor.next(); + } + return null; +}; diff --git a/jstests/replsets/slaveDelay2.js b/jstests/replsets/slaveDelay2.js new file mode 100644 index 0000000..2d9dd1f --- /dev/null +++ b/jstests/replsets/slaveDelay2.js @@ -0,0 +1,104 @@ + +var name = "slaveDelay2"; +var host = getHostName(); + +var waitForAllMembers = function(master) { + var ready = false; + + outer: + while (true) { + var state = master.getSisterDB("admin").runCommand({replSetGetStatus:1}); + + for (var m in state.members) { + if (state.members[m].state != 2 && state.members[m].state != 1) { + sleep(10000); + continue outer; + } + } + + printjson(state); + print("okay, everyone is primary or secondary"); + return; + } +}; + + +var initialize = function() { + var replTest = new ReplSetTest( {name: name, nodes: 1} ); + + var nodes = replTest.startSet(); + + replTest.initiate(); + + var master = replTest.getMaster().getDB(name); + + waitForAllMembers(master); + + return replTest; +}; + +var populate = function(master) { + // insert records + for (var i =0; i<1000; i++) { + master.foo.insert({_id:1}); + } + + master.runCommand({getlasterror:1}); +} + +doTest = function( signal ) { + var replTest = initialize(); + var master = replTest.getMaster().getDB(name); + populate(master); + var admin = master.getSisterDB("admin"); + + /** + * start a slave with a long delay (1 hour) and do some writes while it is + * initializing. Make sure it syncs all of these writes before going into + * syncDelay. 
+ */ + var conn = startMongodTest(31008, name + "-sd", 0, { useHostname: true, replSet: name }); + conn.setSlaveOk(); + + config = master.getSisterDB("local").system.replset.findOne(); + config.version++; + config.members.push({_id : 1, host : host+":31008",priority:0, slaveDelay:3600}); + var ok = admin.runCommand({replSetReconfig : config}); + assert.eq(ok.ok,1); + + // do inserts during initial sync + count = 0; + while (count < 10) { + for (var i = 100*count; i<100*(count+1); i++) { + master.foo.insert({x:i}); + } + + //check if initial sync is done + var state = master.getSisterDB("admin").runCommand({replSetGetStatus:1}); + printjson(state); + if (state.members[1].state == 2) { + break; + } + + count++; + } + + // throw out last 100 inserts, but make sure the others were applied + if (count == 0) { + print("NOTHING TO CHECK"); + replTest.stopSet(); + return; + } + + // wait a bit for the syncs to be applied + waitForAllMembers(master); + + for (var i=0; i<(100*count); i++) { + var obj = conn.getDB(name).foo.findOne({x : i}); + assert(obj); + } + + replTest.stopSet(); +} + +doTest(15); diff --git a/jstests/replsets/slavedelay1.js b/jstests/replsets/slavedelay1.js new file mode 100644 index 0000000..e549822 --- /dev/null +++ b/jstests/replsets/slavedelay1.js @@ -0,0 +1,127 @@ + +var waitForAllMembers = function(master) { + var ready = false; + + outer: + while (true) { + var state = master.getSisterDB("admin").runCommand({replSetGetStatus:1}); + printjson(state); + + for (var m in state.members) { + if (state.members[m].state != 2 && state.members[m].state != 1) { + sleep(10000); + continue outer; + } + } + return; + } +}; + + +doTest = function( signal ) { + + var name = "slaveDelay"; + var host = getHostName(); + + var replTest = new ReplSetTest( {name: name, nodes: 3} ); + + var nodes = replTest.startSet(); + + /* set slaveDelay to 30 seconds */ + var config = replTest.getReplSetConfig(); + config.members[2].priority = 0; + config.members[2].slaveDelay = 30; + + replTest.initiate(config); + + var master = replTest.getMaster().getDB(name); + var slaveConns = replTest.liveNodes.slaves; + var slave = []; + for (var i in slaveConns) { + var d = slaveConns[i].getDB(name); + d.getMongo().setSlaveOk(); + slave.push(d); + } + + waitForAllMembers(master); + + // insert a record + master.foo.insert({x:1}); + master.runCommand({getlasterror:1, w:2}); + + var doc = master.foo.findOne(); + assert.eq(doc.x, 1); + + // make sure slave has it + var doc = slave[0].foo.findOne(); + assert.eq(doc.x, 1); + + // make sure delayed slave doesn't have it + assert.eq(slave[1].foo.findOne(), null); + + // wait 35 seconds + sleep(35000); + + // now delayed slave should have it + assert.eq(slave[1].foo.findOne().x, 1); + + + /************* Part 2 *******************/ + + // how about non-initial sync? + + for (var i=0; i<100; i++) { + master.foo.insert({_id : i, "foo" : "bar"}); + } + master.runCommand({getlasterror:1,w:2}); + + assert.eq(master.foo.findOne({_id : 99}).foo, "bar"); + assert.eq(slave[0].foo.findOne({_id : 99}).foo, "bar"); + assert.eq(slave[1].foo.findOne({_id : 99}), null); + + sleep(35000); + + assert.eq(slave[1].foo.findOne({_id : 99}).foo, "bar"); + + /************* Part 3 *******************/ + + // how about if we add a new server? will it sync correctly? 
+ + var conn = startMongodTest( 31007 , name+"-part3" , 0 , {useHostname : true, replSet : name} ); + + config = master.getSisterDB("local").system.replset.findOne(); + printjson(config); + config.version++; + config.members.push({_id : 3, host : host+":31007",priority:0, slaveDelay:10}); + + var admin = master.getSisterDB("admin"); + try { + var ok = admin.runCommand({replSetReconfig : config}); + assert.eq(ok.ok,1); + } + catch(e) { + print(e); + } + + master = replTest.getMaster().getDB(name); + + waitForAllMembers(master); + + sleep(15000); + + // it should be all caught up now + + master.foo.insert({_id : 123, "x" : "foo"}); + master.runCommand({getlasterror:1,w:2}); + + conn.setSlaveOk(); + assert.eq(conn.getDB(name).foo.findOne({_id:123}), null); + + sleep(15000); + + assert.eq(conn.getDB(name).foo.findOne({_id:123}).x, "foo"); + + replTest.stopSet(); +} + +doTest(15); diff --git a/jstests/replsets/sync1.js b/jstests/replsets/sync1.js index e60d128..af16044 100644 --- a/jstests/replsets/sync1.js +++ b/jstests/replsets/sync1.js @@ -1,5 +1,7 @@ // test rollback of replica sets +load("jstests/replsets/rslib.js"); + var debugging=0; w = 0; @@ -50,7 +52,7 @@ doTest = function (signal) { dbs[0].bar.ensureIndex({ w: 1 }); var ok = false; - var inserts = 100000; + var inserts = 10000; print("\nsync1.js ********************************************************************** part 5"); @@ -62,7 +64,7 @@ doTest = function (signal) { do { sleep(1000); status = dbs[0].getSisterDB("admin").runCommand({ replSetGetStatus: 1 }); - } while (status.members[1].state != 2 && status.members[2].state != 2); + } while (status.members[1].state != 2 || status.members[2].state != 2); print("\nsync1.js ********************************************************************** part 6"); dbs[0].getSisterDB("admin").runCommand({ replSetTest: 1, blind: true }); @@ -125,12 +127,14 @@ doTest = function (signal) { try { printjson(dbs[1].isMaster()); printjson(dbs[1].bar.count()); + printjson(dbs[1].adminCommand({replSetGetStatus : 1})); } catch (e) { print(e); } print("dbs[2]:"); try { printjson(dbs[2].isMaster()); printjson(dbs[2].bar.count()); + printjson(dbs[2].adminCommand({replSetGetStatus : 1})); } catch (e) { print(e); } assert(false, "sync1.js too many exceptions, failing"); @@ -161,10 +165,22 @@ doTest = function (signal) { print("\nsync1.js ********************************************************************** part 10"); // now, let's see if rollback works - var result = dbs[0].getSisterDB("admin").runCommand({ replSetTest: 1, blind: false }); + wait(function() { + try { + dbs[0].adminCommand({ replSetTest: 1, blind: false }); + } + catch(e) { + print(e); + } + reconnect(dbs[0]); + reconnect(dbs[1]); + + var status = dbs[1].adminCommand({replSetGetStatus:1}); + return status.members[0].health == 1; + }); + + dbs[0].getMongo().setSlaveOk(); - - printjson(result); sleep(5000); // now this should resync @@ -192,6 +208,10 @@ doTest = function (signal) { count++; if (count == 100) { + printjson(dbs[0].isMaster()); + printjson(dbs[0].adminCommand({replSetGetStatus:1})); + printjson(dbs[1].isMaster()); + printjson(dbs[1].adminCommand({replSetGetStatus:1})); pause("FAIL part 11"); assert(false, "replsets/\nsync1.js fails timing out"); replTest.stopSet(signal); diff --git a/jstests/replsets/sync_passive.js b/jstests/replsets/sync_passive.js new file mode 100644 index 0000000..d3e8ef4 --- /dev/null +++ b/jstests/replsets/sync_passive.js @@ -0,0 +1,89 @@ +/** + * Test syncing from non-primaries. + * + * Start a set. 
+ * Inital sync. + * Kill member 1. + * Add some data. + * Kill member 0. + * Restart member 1. + * Check that it syncs. + * Add some data. + * Kill member 1. + * Restart member 0. + * Check that it syncs. + */ + +load("jstests/replsets/rslib.js"); + +var name = "sync_passive"; +var host = getHostName(); + +var replTest = new ReplSetTest( {name: name, nodes: 3} ); + +var nodes = replTest.startSet(); + +/* set slaveDelay to 30 seconds */ +var config = replTest.getReplSetConfig(); +config.members[2].priority = 0; + +replTest.initiate(config); + +var master = replTest.getMaster().getDB("test"); +var server0 = master; +var server1 = replTest.liveNodes.slaves[0]; + +print("Initial sync"); +for (var i=0;i<100;i++) { + master.foo.insert({x:i}); +} +replTest.awaitReplication(); + + +print("stop #1"); +replTest.stop(1); + + +print("add some data"); +for (var i=0;i<1000;i++) { + master.bar.insert({x:i}); +} +replTest.awaitReplication(); + + +print("stop #0"); +replTest.stop(0); + + +print("restart #1"); +replTest.restart(1); + + +print("check sync"); +replTest.awaitReplication(); + + +print("add data"); +reconnect(server1); +master = replTest.getMaster().getDB("test"); +for (var i=0;i<1000;i++) { + master.bar.insert({x:i}); +} +replTest.awaitReplication(); + + +print("kill #1"); +replTest.stop(1); + + +print("restart #0"); +replTest.restart(0); +reconnect(server0); + + +print("wait for sync"); +replTest.awaitReplication(); + + +print("bring #1 back up, make sure everything's okay"); +replTest.restart(1); diff --git a/jstests/replsets/sync_passive2.js b/jstests/replsets/sync_passive2.js new file mode 100644 index 0000000..230d71c --- /dev/null +++ b/jstests/replsets/sync_passive2.js @@ -0,0 +1,120 @@ +/** + * Test syncing from non-primaries. + */ + +load("jstests/replsets/rslib.js"); + +var name = "sync_passive2"; +var host = getHostName(); + +var replTest = new ReplSetTest( {name: name, nodes: 5} ); +var nodes = replTest.startSet(); + +// 0: master +// 1: arbiter +// 2: slave a +// 3: slave b +// 4: slave c +var config = replTest.getReplSetConfig(); +config.members[1].arbiterOnly = true; +for (i=2; i counts[0] , "counts 1 : " + tojson( counts ) ) diff --git a/jstests/sharding/bigMapReduce.js b/jstests/sharding/bigMapReduce.js index 1cc12f4..3cc1d66 100644 --- a/jstests/sharding/bigMapReduce.js +++ b/jstests/sharding/bigMapReduce.js @@ -7,11 +7,69 @@ db = s.getDB( "test" ); var str="" for (i=0;i<4*1024;i++) { str=str+"a"; } for (j=0; j<50; j++) for (i=0; i<512; i++){ db.foo.save({y:str})} +db.getLastError(); + +s.printChunks(); +s.printChangeLog(); function map() { emit('count', 1); } function reduce(key, values) { return Array.sum(values) } -out = db.foo.mapReduce(map, reduce) -printjson(out) // SERVER-1400 +gotAGoodOne = false; + +for ( iter=0; iter<5; iter++ ){ + try { + out = db.foo.mapReduce(map, reduce,"big_out") + gotAGoodOne = true + } + catch ( e ){ + if ( __mrerror__ && __mrerror__.cause && __mrerror__.cause.assertionCode == 13388 ){ + // TODO: SERVER-2396 + sleep( 1000 ); + continue; + } + printjson( __mrerror__ ); + throw e; + } +} +assert( gotAGoodOne , "no good for basic" ) + +gotAGoodOne = false; +// test output to a different DB +// do it multiple times so that primary shard changes +for (iter = 0; iter < 5; iter++) { + outCollStr = "mr_replace_col_" + iter; + outDbStr = "mr_db_" + iter; + + print("Testing mr replace into DB " + iter) + + try { + res = db.foo.mapReduce( map , reduce , { out : { replace: outCollStr, db: outDbStr } } ) + gotAGoodOne = true; + } + catch ( e ){ + if ( 
__mrerror__ && __mrerror__.cause && __mrerror__.cause.assertionCode == 13388 ){ + // TODO: SERVER-2396 + sleep( 1000 ); + continue; + } + printjson( __mrerror__ ); + throw e; + } + printjson(res); + + outDb = s.getDB(outDbStr); + outColl = outDb[outCollStr]; + + obj = outColl.convertToSingleObject("value"); + assert.eq( 25600 , obj.count , "Received wrong result " + obj.count ); + + print("checking result field"); + assert.eq(res.result.collection, outCollStr, "Wrong collection " + res.result.collection); + assert.eq(res.result.db, outDbStr, "Wrong db " + res.result.db); +} + +assert( gotAGoodOne , "no good for out db" ) s.stop() + diff --git a/jstests/sharding/count1.js b/jstests/sharding/count1.js index ed69d1f..cc3f712 100644 --- a/jstests/sharding/count1.js +++ b/jstests/sharding/count1.js @@ -27,14 +27,16 @@ db.foo.save( { _id : 6 , name : "allan" } ) assert.eq( 6 , db.foo.find().count() , "basic count" ); -s.adminCommand( { split : "test.foo" , find : { name : "joe" } } ); -s.adminCommand( { split : "test.foo" , find : { name : "joe" } } ); -s.adminCommand( { split : "test.foo" , find : { name : "joe" } } ); +s.adminCommand( { split : "test.foo" , find : { name : "joe" } } ); // [Minkey -> allan) , * [allan -> ..) +s.adminCommand( { split : "test.foo" , find : { name : "joe" } } ); // * [allan -> sara) , [sara -> Maxkey) +s.adminCommand( { split : "test.foo" , find : { name : "joe" } } ); // [alan -> joe) , [joe -> sara] + +s.printChunks() assert.eq( 6 , db.foo.find().count() , "basic count after split " ); assert.eq( 6 , db.foo.find().sort( { name : 1 } ).count() , "basic count after split sorted " ); -s.adminCommand( { movechunk : "test.foo" , find : { name : "joe" } , to : secondary.getMongo().name } ); +s.adminCommand( { movechunk : "test.foo" , find : { name : "allan" } , to : secondary.getMongo().name } ); assert.eq( 3 , primary.foo.find().toArray().length , "primary count" ); assert.eq( 3 , secondary.foo.find().toArray().length , "secondary count" ); diff --git a/jstests/sharding/cursor1.js b/jstests/sharding/cursor1.js index 2a30936..f6cb9e4 100644 --- a/jstests/sharding/cursor1.js +++ b/jstests/sharding/cursor1.js @@ -53,7 +53,7 @@ sleep( 6000 ) assert( cur.next() , "T3" ) assert( cur.next() , "T4" ); sleep( 22000 ) -assert.throws( function(){ cur.next(); } , "T5" ) +assert.throws( function(){ cur.next(); } , null , "T5" ) after = db.runCommand( { "cursorInfo" : 1 , "setTimeout" : 10000 } ) // 10 seconds gc(); gc() diff --git a/jstests/sharding/features1.js b/jstests/sharding/features1.js index 05b8b8c..c22f094 100644 --- a/jstests/sharding/features1.js +++ b/jstests/sharding/features1.js @@ -81,10 +81,10 @@ assert.eq( 1 , db.foo3.count() , "eval pre1" ); assert.eq( 1 , db.foo2.count() , "eval pre2" ); assert.eq( 8 , db.eval( function(){ return db.foo3.findOne().a; } ), "eval 1 " ); -assert.throws( function(){ db.eval( function(){ return db.foo2.findOne().a; } ) } , "eval 2" ) +assert.throws( function(){ db.eval( function(){ return db.foo2.findOne().a; } ) } , null , "eval 2" ) assert.eq( 1 , db.eval( function(){ return db.foo3.count(); } ), "eval 3 " ); -assert.throws( function(){ db.eval( function(){ return db.foo2.count(); } ) } , "eval 4" ) +assert.throws( function(){ db.eval( function(){ return db.foo2.count(); } ) } , null , "eval 4" ) // ---- unique shard key ---- @@ -105,6 +105,14 @@ assert.eq( 2 , b.foo4.getIndexes().length , "ub2" ); assert( a.foo4.getIndexes()[1].unique , "ua3" ); assert( b.foo4.getIndexes()[1].unique , "ub3" ); +assert.eq( 2 , db.foo4.count() , 
"uc1" ) +db.foo4.save( { num : 7 } ) +assert.eq( 3 , db.foo4.count() , "uc2" ) +db.foo4.save( { num : 7 } ) +gle = db.getLastErrorObj(); +assert( gle.err , "uc3" ) +assert.eq( 3 , db.foo4.count() , "uc4" ) + // --- don't let you convertToCapped ---- assert( ! db.foo4.isCapped() , "ca1" ); assert( ! a.foo4.isCapped() , "ca2" ); @@ -152,12 +160,22 @@ assert.throws( function(){ db.foo6.group( { key : { a : 1 } , initial : { count // ---- can't shard non-empty collection without index ----- db.foo8.save( { a : 1 } ); +db.getLastError(); assert( ! s.admin.runCommand( { shardcollection : "test.foo8" , key : { a : 1 } } ).ok , "non-empty collection" ); + +// ---- can't shard non-empty collection with null values in shard key ---- + +db.foo9.save( { b : 1 } ); +db.getLastError(); +db.foo9.ensureIndex( { a : 1 } ); +assert( ! s.admin.runCommand( { shardcollection : "test.foo9" , key : { a : 1 } } ).ok , "entry with null value" ); + + // --- listDatabases --- r = db.getMongo().getDBs() -assert.eq( 4 , r.databases.length , "listDatabases 1 : " + tojson( r ) ) +assert.eq( 3 , r.databases.length , "listDatabases 1 : " + tojson( r ) ) assert.lt( 10000 , r.totalSize , "listDatabases 2 : " + tojson( r ) ); s.stop() diff --git a/jstests/sharding/features2.js b/jstests/sharding/features2.js index dfb2883..b2070ea 100644 --- a/jstests/sharding/features2.js +++ b/jstests/sharding/features2.js @@ -92,8 +92,10 @@ r = function( key , values ){ doMR = function( n ){ print(n); - - var res = db.mr.mapReduce( m , r ); + + // on-disk + + var res = db.mr.mapReduce( m , r , "smr1_out" ); printjson( res ); assert.eq( new NumberLong(4) , res.counts.input , "MR T0 " + n ); @@ -103,11 +105,26 @@ doMR = function( n ){ var z = {}; x.find().forEach( function(a){ z[a._id] = a.value.count; } ); assert.eq( 3 , Object.keySet( z ).length , "MR T2 " + n ); - assert.eq( 2 , z.a , "MR T2 " + n ); - assert.eq( 3 , z.b , "MR T2 " + n ); - assert.eq( 3 , z.c , "MR T2 " + n ); + assert.eq( 2 , z.a , "MR T3 " + n ); + assert.eq( 3 , z.b , "MR T4 " + n ); + assert.eq( 3 , z.c , "MR T5 " + n ); x.drop(); + + // inline + + var res = db.mr.mapReduce( m , r , { out : { inline : 1 } } ); + printjson( res ); + assert.eq( new NumberLong(4) , res.counts.input , "MR T6 " + n ); + + var z = {}; + res.find().forEach( function(a){ z[a._id] = a.value.count; } ); + printjson( z ); + assert.eq( 3 , Object.keySet( z ).length , "MR T7 " + n ) ; + assert.eq( 2 , z.a , "MR T8 " + n ); + assert.eq( 3 , z.b , "MR T9 " + n ); + assert.eq( 3 , z.c , "MR TA " + n ); + } doMR( "before" ); @@ -124,7 +141,7 @@ s.adminCommand({movechunk:'test.mr', find:{x:3}, to: s.getServer('test').name } doMR( "after extra split" ); -cmd = { mapreduce : "mr" , map : "emit( " , reduce : "fooz + " }; +cmd = { mapreduce : "mr" , map : "emit( " , reduce : "fooz + " , out : "broken1" }; x = db.runCommand( cmd ); y = s._connections[0].getDB( "test" ).runCommand( cmd ); diff --git a/jstests/sharding/features3.js b/jstests/sharding/features3.js index b15ccd3..b28d88e 100644 --- a/jstests/sharding/features3.js +++ b/jstests/sharding/features3.js @@ -1,4 +1,3 @@ - s = new ShardingTest( "features3" , 2 , 1 , 1 ); s.adminCommand( { enablesharding : "test" } ); @@ -25,7 +24,7 @@ assert.eq( N / 2 , x.shards.shard0001.count , "count on shard0001" ) start = new Date() print( "about to fork shell: " + Date() ) -join = startParallelShell( "db.foo.find( function(){ x = \"\"; for ( i=0; i<10000; i++ ){ x+=i; } return true; } ).itcount()" ) +join = startParallelShell( "db.foo.find( function(){ x = ''; 
for ( i=0; i<10000; i++ ){ x+=i; } return true; } ).itcount()" ) print( "after forking shell: " + Date() ) function getMine( printInprog ){ diff --git a/jstests/sharding/geo_near_random1.js b/jstests/sharding/geo_near_random1.js new file mode 100644 index 0000000..6ffd4b2 --- /dev/null +++ b/jstests/sharding/geo_near_random1.js @@ -0,0 +1,37 @@ +// this tests all points using $near +load("jstests/libs/geo_near_random.js"); + +var testName = "geo_near_random1"; +var s = new ShardingTest( testName , 3 ); + +db = s.getDB("test"); // global db + +var test = new GeoNearRandomTest(testName); + +s.adminCommand({enablesharding:'test'}); +s.adminCommand({shardcollection: ('test.' + testName), key: {_id:1} }); + +test.insertPts(50); + +for (var i = (test.nPts/10); i < test.nPts; i+= (test.nPts/10)){ + s.adminCommand({split: ('test.' + testName), middle: {_id: i} }); + try { + s.adminCommand({moveChunk: ('test.' + testName), find: {_id: i-1}, to: ('shard000' + (i%3))}); + } catch (e) { + // ignore this error + if (! e.match(/that chunk is already on that shard/)){ + throw e; + } + } +} + +printShardingSizes() + +var opts = {sharded: true} +test.testPt([0,0], opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); + +s.stop() diff --git a/jstests/sharding/geo_near_random2.js b/jstests/sharding/geo_near_random2.js new file mode 100644 index 0000000..4871e1e --- /dev/null +++ b/jstests/sharding/geo_near_random2.js @@ -0,0 +1,44 @@ +// this tests 1% of all points using $near and $nearSphere +load("jstests/libs/geo_near_random.js"); + +var testName = "geo_near_random2"; +var s = new ShardingTest( testName , 3 ); + +db = s.getDB("test"); // global db + +var test = new GeoNearRandomTest(testName); + +s.adminCommand({enablesharding:'test'}); +s.adminCommand({shardcollection: ('test.' + testName), key: {_id:1} }); + +test.insertPts(5000); + +for (var i = (test.nPts/10); i < test.nPts; i+= (test.nPts/10)){ + s.adminCommand({split: ('test.' + testName), middle: {_id: i} }); + try { + s.adminCommand({moveChunk: ('test.' + testName), find: {_id: i-1}, to: ('shard000' + (i%3))}); + } catch (e) { + // ignore this error + if (! e.match(/that chunk is already on that shard/)){ + throw e; + } + } +} + +printShardingSizes() + +opts = {sphere:0, nToTest:test.nPts*0.01, sharded:true}; +test.testPt([0,0], opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); + +opts.sphere = 1 +test.testPt([0,0], opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); + +s.stop() diff --git a/jstests/sharding/key_many.js b/jstests/sharding/key_many.js index 1e0ba9d..3a8203f 100644 --- a/jstests/sharding/key_many.js +++ b/jstests/sharding/key_many.js @@ -20,7 +20,7 @@ s = new ShardingTest( "key_many" , 2 ); s.adminCommand( { enablesharding : "test" } ) db = s.getDB( "test" ); primary = s.getServer( "test" ).getDB( "test" ); -seconday = s.getOther( primary ).getDB( "test" ); +secondary = s.getOther( primary ).getDB( "test" ); function makeObjectDotted( v ){ var o = {}; @@ -97,12 +97,12 @@ for ( var i=0; i allan) , * [allan -> ..) 
+s.adminCommand( { split : "test.foo" , find : { name : "joe" } } ); // * [allan -> sara) , [sara -> Maxkey) +s.adminCommand( { split : "test.foo" , find : { name : "joe" } } ); // [alan -> joe) , [joe -> sara] -s.adminCommand( { movechunk : "test.foo" , find : { name : "joe" } , to : seconday.getMongo().name } ); +s.adminCommand( { movechunk : "test.foo" , find : { name : "allan" } , to : seconday.getMongo().name } ); s.printChunks(); @@ -39,6 +39,11 @@ assert.eq( 6 , db.foo.find().sort( { name : 1 } ).count() , "total count with co assert.eq( "allan,bob,eliot,joe,mark,sara" , db.foo.find().sort( { name : 1 } ).toArray().map( function(z){ return z.name; } ) , "sort 1" ); assert.eq( "sara,mark,joe,eliot,bob,allan" , db.foo.find().sort( { name : -1 } ).toArray().map( function(z){ return z.name; } ) , "sort 2" ); +// make sure we can't foce a split on an extreme key +// [allan->joe) +assert.throws( function(){ s.adminCommand( { split : "test.foo" , middle : { name : "allan" } } ) } ); +assert.throws( function(){ s.adminCommand( { split : "test.foo" , middle : { name : "joe" } } ) } ); + s.stop(); diff --git a/jstests/sharding/limit_push.js b/jstests/sharding/limit_push.js new file mode 100644 index 0000000..75ad271 --- /dev/null +++ b/jstests/sharding/limit_push.js @@ -0,0 +1,47 @@ +// This test is to ensure that limit() clauses are pushed down to the shards and evaluated +// See: http://jira.mongodb.org/browse/SERVER-1896 + +s = new ShardingTest( "limit_push", 2, 1, 1 ); + +db = s.getDB( "test" ); + +// Create some data +for (i=0; i < 100; i++) { db.limit_push.insert({ _id : i, x: i}); } +db.limit_push.ensureIndex( { x : 1 } ); +assert.eq( 100 , db.limit_push.find().length() , "Incorrect number of documents" ); + +// Shard the collection +s.adminCommand( { enablesharding : "test" } ); +s.adminCommand( { shardcollection : "test.limit_push" , key : { x : 1 } } ); + +// Now split the and move the data between the shards +s.adminCommand( { split : "test.limit_push", middle : { x : 50 }} ); +s.adminCommand( { moveChunk: "test.limit_push", find : { x : 51}, to : "shard0000" }) + +// Check that the chunck have split correctly +assert.eq( 2 , s.config.chunks.count() , "wrong number of chunks"); + +// The query is asking for the maximum value below a given value +// db.limit_push.find( { x : { $lt : 60} } ).sort( { x:-1} ).limit(1) +q = { x : { $lt : 60} }; + +// Make sure the basic queries are correct +assert.eq( 60 , db.limit_push.find( q ).count() , "Did not find 60 documents" ); +//rs = db.limit_push.find( q ).sort( { x:-1} ).limit(1) +//assert.eq( rs , { _id : "1" , x : 59 } , "Did not find document with value 59" ); + +// Now make sure that the explain shos that each shard is returning a single document as indicated +// by the "n" element for each shard +exp = db.limit_push.find( q ).sort( { x:-1} ).limit(1).explain(); +printjson( exp ) + +assert.eq("ParallelSort", exp.clusteredType, "Not a ParallelSort"); + +var k = 0; +for (var j in exp.shards) { + assert.eq( 1 , exp.shards[j][0].n, "'n' is not 1 from shard000" + k.toString()); + k++ +} + +s.stop(); + diff --git a/jstests/sharding/migrateBig.js b/jstests/sharding/migrateBig.js new file mode 100644 index 0000000..f6ba18a --- /dev/null +++ b/jstests/sharding/migrateBig.js @@ -0,0 +1,45 @@ + +s = new ShardingTest( "migrateBig" , 2 , 0 , 1 , { chunksize : 1 } ); + +s.adminCommand( { enablesharding : "test" } ); +s.adminCommand( { shardcollection : "test.foo" , key : { x : 1 } } ); + +db = s.getDB( "test" ) +coll = db.foo + +big = "" +while ( 
big.length < 10000 ) + big += "eliot" + +for ( x=0; x<100; x++ ) + coll.insert( { x : x , big : big } ) + +s.adminCommand( { split : "test.foo" , middle : { x : 33 } } ) +s.adminCommand( { split : "test.foo" , middle : { x : 66 } } ) +s.adminCommand( { movechunk : "test.foo" , find : { x : 90 } , to : s.getOther( s.getServer( "test" ) ).name } ) + +db.printShardingStatus() + +print( "YO : " + s.getServer( "test" ).host ) +direct = new Mongo( s.getServer( "test" ).host ) +print( "direct : " + direct ) + +directDB = direct.getDB( "test" ) + +for ( done=0; done<2*1024*1024; done+=big.length ){ + directDB.foo.insert( { x : 50 + Math.random() , big : big } ) + directDB.getLastError(); +} + +db.printShardingStatus() + +assert.throws( function(){ s.adminCommand( { movechunk : "test.foo" , find : { x : 50 } , to : s.getOther( s.getServer( "test" ) ).name } ); } , [] , "move should fail" ) + +for ( i=0; i<20; i+= 2 ) + s.adminCommand( { split : "test.foo" , middle : { x : i } } ) + +db.printShardingStatus() + +assert.soon( function(){ var x = s.chunkDiff( "foo" , "test" ); print( "chunk diff: " + x ); return x < 2; } , "no balance happened" , 120 * 1000 , 2000 ) + +s.stop() diff --git a/jstests/sharding/multi_mongos1.js b/jstests/sharding/multi_mongos1.js new file mode 100644 index 0000000..cf9ebde --- /dev/null +++ b/jstests/sharding/multi_mongos1.js @@ -0,0 +1,70 @@ +// multi_mongos.js + +// setup sharding with two mongos, s1 and s2 +s1 = new ShardingTest( "multi_mongos1" , 2 , 1 , 2 ); +s2 = s1._mongos[1]; + +s1.adminCommand( { enablesharding : "test" } ); +s1.adminCommand( { shardcollection : "test.foo" , key : { num : 1 } } ); + +s1.config.databases.find().forEach( printjson ) + +viaS1 = s1.getDB( "test" ).foo; +viaS2 = s2.getDB( "test" ).foo; + +primary = s1.getServer( "test" ).getDB( "test" ).foo; +secondary = s1.getOther( primary.name ).getDB( "test" ).foo; + +N = 4; +for (i=1; i<=N; i++) { + viaS1.save( { num : i } ); +} + +// initial checks + +// both mongos see all elements +assert.eq( N , viaS1.find().toArray().length , "normal A" ); +assert.eq( N , viaS2.find().toArray().length , "other A" ); + +// all elements are in one of the shards +assert.eq( N , primary.count() , "p1" ) +assert.eq( 0 , secondary.count() , "s1" ) +assert.eq( 1 , s1.onNumShards( "foo" ) , "on 1 shards" ); + +// +// STEP 1 (builds a bit of context so there should probably not be a step 2 in this same test) +// where we try to issue a move chunk from a mongos that's stale +// followed by a split on a valid chunk, albeit one with not the highest lastmod + +// split in [Minkey->1), [1->N), [N,Maxkey) +s1.adminCommand( { split : "test.foo" , middle : { num : 1 } } ); +s1.adminCommand( { split : "test.foo" , middle : { num : N } } ); + +// s2 is now stale w.r.t boundaries around { num: 1 } +res = s2.getDB( "admin" ).runCommand( { movechunk : "test.foo" , find : { num : 1 } , to : s1.getOther( s1.getServer( "test" ) ).name } ); +assert.eq( 0 , res.ok , "a move with stale boundaries should not have succeeded" + tojson(res) ); + +// s2 must have reloaded as a result of a failed move; retrying should work
res = s2.getDB( "admin" ).runCommand( { movechunk : "test.foo" , find : { num : 1 } , to : s1.getOther( s1.getServer( "test" ) ).name } ); +assert.eq( 1 , res.ok , "mongos did not reload after a failed migrate" + tojson(res) ); + +// s1 is not stale about the boundaries of [MinKey->1) +// but we'll try to split a chunk whose lastmod.major was not touched by the previous move +// in 1.6, that chunk would be with [Minkey->1)
(where { num: -1 } falls) +// after 1.6, it would be with [N->Maxkey] (where { num: N+1 } falls) +// s.printShardingStatus() +res = s1.getDB( "admin" ).runCommand( { split : "test.foo" , middle : { num : N+1 } } ); // replace with { num: -1 } instead in 1.6 +assert.eq( 1, res.ok , "split over accurate boundaries should have succeeded" + tojson(res) ); + +// { num : 4 } is on primary +// { num : 1 , 2 , 3 } are on secondary +assert.eq( 1 , primary.find().toArray().length , "wrong count on primary" ); +assert.eq( 3 , secondary.find().toArray().length , "wrong count on secondary" ); +assert.eq( N , primary.find().itcount() + secondary.find().itcount() , "wrong total count" ) + +assert.eq( N , viaS1.find().toArray().length , "normal B" ); +assert.eq( N , viaS2.find().toArray().length , "other B" ); + +printjson( primary._db._adminCommand( "shardingState" ) ); + +s1.stop(); \ No newline at end of file diff --git a/jstests/sharding/rename.js b/jstests/sharding/rename.js index aa6137d..fa27611 100644 --- a/jstests/sharding/rename.js +++ b/jstests/sharding/rename.js @@ -24,3 +24,4 @@ assert.eq(db.bar.findOne(), {_id:3}, '3.1'); assert.eq(db.bar.count(), 1, '3.2'); assert.eq(db.foo.count(), 0, '3.3'); +s.stop() \ No newline at end of file diff --git a/jstests/sharding/shard1.js b/jstests/sharding/shard1.js index 1783238..ae382e4 100644 --- a/jstests/sharding/shard1.js +++ b/jstests/sharding/shard1.js @@ -21,6 +21,7 @@ assert.eq( 3 , db.foo.find().length() , "after partitioning count failed" ); s.adminCommand( shardCommand ); cconfig = s.config.collections.findOne( { _id : "test.foo" } ); +assert( cconfig , "why no collection entry for test.foo" ) delete cconfig.lastmod delete cconfig.dropped assert.eq( cconfig , { _id : "test.foo" , key : { num : 1 } , unique : false } , "Sharded content" ); diff --git a/jstests/sharding/shard3.js b/jstests/sharding/shard3.js index e57dc1e..7132563 100644 --- a/jstests/sharding/shard3.js +++ b/jstests/sharding/shard3.js @@ -41,9 +41,10 @@ printjson( primary._db._adminCommand( "shardingState" ) ); // --- filtering --- -function doCounts( name , total ){ +function doCounts( name , total , onlyItCounts ){ total = total || ( primary.count() + secondary.count() ); - assert.eq( total , a.count() , name + " count" ); + if ( !
onlyItCounts ) + assert.eq( total , a.count() , name + " count" ); assert.eq( total , a.find().sort( { n : 1 } ).itcount() , name + " itcount - sort n" ); assert.eq( total , a.find().itcount() , name + " itcount" ); assert.eq( total , a.find().sort( { _id : 1 } ).itcount() , name + " itcount - sort _id" ); @@ -51,8 +52,12 @@ function doCounts( name , total ){ } var total = doCounts( "before wrong save" ) -//secondary.save( { num : -3 } ); -//doCounts( "after wrong save" , total ) +secondary.save( { num : -3 } ); +doCounts( "after wrong save" , total , true ) +e = a.find().explain(); +assert.eq( 3 , e.n , "ex1" ) +assert.eq( 4 , e.nscanned , "ex2" ) +assert.eq( 1 , e.nChunkSkips , "ex3" ) // --- move all to 1 --- print( "MOVE ALL TO 1" ); @@ -89,27 +94,18 @@ s.printCollectionInfo( "test.foo" , "after counts" ); assert.eq( 0 , primary.count() , "p count after drop" ) assert.eq( 0 , secondary.count() , "s count after drop" ) +// NOTE +// the following bypasses the sharding layer and writes straight to the servers +// this is not supported at all but we'd like to leave this backdoor for now primary.save( { num : 1 } ); secondary.save( { num : 4 } ); - assert.eq( 1 , primary.count() , "p count after drop and save" ) assert.eq( 1 , secondary.count() , "s count after drop and save " ) +print("*** makes sure that sharded access respects the drop command" ); -print("*** makes sure that sharding knows where things live" ); - -assert.eq( 1 , a.count() , "a count after drop and save" ) -s.printCollectionInfo( "test.foo" , "after a count" ); -assert.eq( 1 , b.count() , "b count after drop and save" ) -s.printCollectionInfo( "test.foo" , "after b count" ); - -assert( a.findOne( { num : 1 } ) , "a drop1" ); -assert.isnull( a.findOne( { num : 4 } ) , "a drop1" ); - -s.printCollectionInfo( "test.foo" , "after a findOne tests" ); - -assert( b.findOne( { num : 1 } ) , "b drop1" ); -assert.isnull( b.findOne( { num : 4 } ) , "b drop1" ); +assert.isnull( a.findOne() , "lookup via mongos 'a' accessed dropped data" ); +assert.isnull( b.findOne() , "lookup via mongos 'b' accessed dropped data" ); s.printCollectionInfo( "test.foo" , "after b findOne tests" ); @@ -130,6 +126,8 @@ s.printCollectionInfo( "test.foo" , "after dropDatabase setup3" ); print( "*** ready to call dropDatabase" ) res = s.getDB( "test" ).dropDatabase(); assert.eq( 1 , res.ok , "dropDatabase failed : " + tojson( res ) ); +// Waiting for SERVER-2253 +// assert.eq( 0 , s.config.databases.count( { _id: "test" } ) , "database 'test' was dropped but still appears in configDB" ); s.printShardingStatus(); s.printCollectionInfo( "test.foo" , "after dropDatabase call 1" ); diff --git a/jstests/sharding/shard_insert_getlasterror_w2.js b/jstests/sharding/shard_insert_getlasterror_w2.js new file mode 100644 index 0000000..c722f21 --- /dev/null +++ b/jstests/sharding/shard_insert_getlasterror_w2.js @@ -0,0 +1,89 @@ +// replica set as solo shard +// getLastError(2) fails on about every 170 inserts on my Macbook laptop -Tony +// TODO: Add assertion code that catches hang + +load('jstests/libs/grid.js') + +function go() { + + var N = 2000 + + // ~1KB string + var Text = '' + for (var i = 0; i < 40; i++) + Text += 'abcdefghijklmnopqrstuvwxyz' + + // Create replica set with 3 servers + var repset1 = new ReplicaSet('repset1', 3) .begin() + + // Add data to it + var conn1a = repset1.getMaster() + var db1a = conn1a.getDB('test') + for (var i = 0; i < N; i++) { + db1a['foo'].insert({x: i, text: Text}) + db1a.getLastError(2) // wait to be copied to at least one 
secondary + } + + // Create 3 sharding config servers + var configsetSpec = new ConfigSet(3) + var configsetConns = configsetSpec.begin() + + // Create sharding router (mongos) + var routerSpec = new Router(configsetSpec) + var routerConn = routerSpec.begin() + var dba = routerConn.getDB('admin') + var db = routerConn.getDB('test') + + // Add repset1 as only shard + addShard (routerConn, repset1.getURL()) + + // Enable sharding on test db and its collection foo + enableSharding (routerConn, 'test') + db['foo'].ensureIndex({x: 1}) + shardCollection (routerConn, 'test', 'foo', {x: 1}) + + sleep(30000) + printjson (db['foo'].stats()) + dba.printShardingStatus() + printjson (db['foo'].count()) + + // Test case where GLE should return an error + db.foo.insert({_id:'a', x:1}); + db.foo.insert({_id:'a', x:1}); + var x = db.getLastErrorObj(2, 30000) + assert.neq(x.err, null, tojson(x)); + + // Add more data + for (var i = N; i < 2*N; i++) { + db['foo'].insert({x: i, text: Text}) + var x = db.getLastErrorObj(2, 30000) // wait to be copied to at least one secondary + if (i % 30 == 0) print(i) + if (i % 100 == 0 || x.err != null) printjson(x); + assert.eq(x.err, null, tojson(x)); + } + + // take down the slave and make sure it fails over + repset1.stop(1); + repset1.stop(2); + db.getMongo().setSlaveOk(); + print("trying some queries"); + assert.soon(function() { try { + db.foo.find().next(); + } + catch(e) { + print(e); + return false; + } + return true; + }); + + // Done + routerSpec.end() + configsetSpec.end() + repset1.stopSet() + + print('shard_insert_getlasterror_w2.js SUCCESS') +} + +//Uncomment below to execute +go() diff --git a/jstests/sharding/sort1.js b/jstests/sharding/sort1.js index 0edb7a7..e2b287e 100644 --- a/jstests/sharding/sort1.js +++ b/jstests/sharding/sort1.js @@ -2,7 +2,7 @@ s = new ShardingTest( "sort1" , 2 , 0 , 2 ) s.adminCommand( { enablesharding : "test" } ); -s.adminCommand( { shardcollection : "test.data" , key : { num : 1 } } ); +s.adminCommand( { shardcollection : "test.data" , key : { 'sub.num' : 1 } } ); db = s.getDB( "test" ); @@ -11,16 +11,16 @@ N = 100 forward = [] backward = [] for ( i=0; i= 2 ) + break; + print("32bit.js PASS #" + pass); + pass++; + + t = mydb.colltest_32bit; + + print("seed=" + seed); + + t.insert({x:1}); + t.ensureIndex({a:1}); + t.ensureIndex({b:1}, true); + t.ensureIndex({x:1}); + if( Math.random() < 0.3 ) + t.ensureIndex({c:1}); + t.ensureIndex({d:1}); + t.ensureIndex({e:1}); + t.ensureIndex({f:1}); + + big = 'a b'; + big = big + big; + k = big; + big = big + big; + big = big + big; + big = big + big; + + a = 0; + c = 'kkk'; + var start = new Date(); + while( 1 ) { + b = Math.random(seed); + d = c + -a; + f = Math.random(seed) + a; + a++; + cc = big; + if( Math.random(seed) < .1 ) + cc = null; + t.insert({a:a,b:b,c:cc,d:d,f:f}); + if( Math.random(seed) < 0.01 ) { + + if( mydb.getLastError() ) { + /* presumably we have mmap error on 32 bit. 
try a few more manipulations attempting to break things */ + t.insert({a:33,b:44,c:55,d:66,f:66}); + t.insert({a:33,b:44000,c:55,d:66}); + t.insert({a:33,b:440000,c:55}); + t.insert({a:33,b:4400000}); + t.update({a:20},{'$set':{c:'abc'}}); + t.update({a:21},{'$set':{c:'aadsfbc'}}); + t.update({a:22},{'$set':{c:'c'}}); + t.update({a:23},{'$set':{b:cc}}); + t.remove({a:22}); + break; + } + + t.remove({a:a}); + t.remove({b:Math.random(seed)}); + t.insert({e:1}); + t.insert({f:'aaaaaaaaaa'}); + + if( Math.random() < 0.00001 ) { print("remove cc"); t.remove({c:cc}); } + if( Math.random() < 0.0001 ) { print("update cc"); t.update({c:cc},{'$set':{c:1}},false,true); } + if( Math.random() < 0.00001 ) { print("remove e"); t.remove({e:1}); } + } + if (a == 20000 ) { + var delta_ms = (new Date())-start; + // 2MM / 20000 = 100. 1000ms/sec. + var eta_secs = delta_ms * (100 / 1000); + print("32bit.js eta_secs:" + eta_secs); + if( eta_secs > 1000 ) { + print("32bit.js machine is slow, stopping early. a:" + a); + mydb.dropDatabase(); + return; + } + } + if( a % 100000 == 0 ) { + print(a); + // on 64 bit we won't error out, so artificially stop. on 32 bit we will hit mmap limit ~1.6MM but may + // vary by a factor of 2x by platform + if( a >= 2200000 ) { + mydb.dropDatabase(); + return; + } + } + } + print("count: " + t.count()); + + var res = t.validate(); + if( !res.valid ) { + print("32bit.js FAIL validating"); + print(res.result); + printjson(res); + //mydb.dropDatabase(); + throw "fail validating 32bit.js"; + } + + mydb.dropDatabase(); + } + + print("32bit.js SUCCESS"); +} + +if (!db._adminCommand("buildInfo").debug && !db.runCommand( { serverStatus : 1 , repl : 1 } ).repl ){ + /* this test is slow, so don't run during the day */ + print("\n32bit.js running - this test is slow so only runs at night."); + f(); +} +else { + print("32bit.js skipping this test - debug server build would be too slow"); +} diff --git a/jstests/slowNightly/btreedel.js b/jstests/slowNightly/btreedel.js new file mode 100644 index 0000000..824eb3e --- /dev/null +++ b/jstests/slowNightly/btreedel.js @@ -0,0 +1,43 @@ +// btreedel.js + +t = db.foo; +t.remove({}); + +for (var i = 0; i < 1000000; i++) { + t.insert({ _id: i, x: 'a b' }); +} + +print("1 insert done count: " + t.count()); + +var c = t.find({y:null}).sort({ _id: 1 }); +for (var j = 0; j < 400000; j++) { + c.next(); + if (j % 200000 == 0) + printjson(c.next()); +} +printjson(c.next()); + +var d = t.find({ _id: { $gt: 300000} }).sort({ _id: -1 }); +d.next(); + +print("2"); + +t.remove({ _id: { $gt: 200000, $lt: 600000} }); + +print("3"); +print(d.hasNext()); + +n = 0; +last = {}; +printjson(c.next()); +while (c.hasNext()) { + n++; + last = c.next(); +} + +print("4. 
n:" + n); +printjson(last); + +assert(n > 100000); + +print("btreedel.js success"); diff --git a/jstests/slowNightly/capped4.js b/jstests/slowNightly/capped4.js index 01af8f2..27d138c 100644 --- a/jstests/slowNightly/capped4.js +++ b/jstests/slowNightly/capped4.js @@ -31,4 +31,4 @@ assert( t.validate().valid, "G" ); db._adminCommand("closeAllDatabases"); -//assert( db.serverStatus().cursors.totalOpen == 0, "cursors open and shouldn't be"); +assert( db.serverStatus().cursors.totalOpen == 0, "cursors open and shouldn't be"); diff --git a/jstests/slowNightly/command_line_parsing.js b/jstests/slowNightly/command_line_parsing.js new file mode 100644 index 0000000..38c7324 --- /dev/null +++ b/jstests/slowNightly/command_line_parsing.js @@ -0,0 +1,9 @@ +// validate command line parameter parsing + +port = allocatePorts( 1 )[ 0 ]; +var baseName = "jstests_slowNightly_command_line_parsing"; + +// test notablescan +var m = startMongod( "--port", port, "--dbpath", "/data/db/" + baseName, "--notablescan" ); +m.getDB( baseName ).getCollection( baseName ).save( {a:1} ); +assert.throws( function() { m.getDB( baseName ).getCollection( baseName ).find( {a:1} ).toArray() } ); diff --git a/jstests/slowNightly/dur_big_atomic_update.js b/jstests/slowNightly/dur_big_atomic_update.js new file mode 100644 index 0000000..ffb0d83 --- /dev/null +++ b/jstests/slowNightly/dur_big_atomic_update.js @@ -0,0 +1,31 @@ +// @file dur_big_atomic_update.js +// +// this tests writing 1GB in an atomic update to make sure we commit periodically + +var path = "/data/db/dur_big_atomic_update"; + +conn = startMongodEmpty("--port", 30001, "--dbpath", path, "--dur", "--durOptions", 8); +d = conn.getDB("test"); +d.foo.drop(); + +for (var i=0; i<1024; i++){ + d.foo.insert({_id:i}); +} + +big_string = 'x'; +while (big_string.length < 1024*1024) { + big_string += big_string; +} + +d.foo.update({$atomic:1}, {$set: {big_string: big_string}}, false, /*multi*/true); +err = d.getLastErrorObj(); + +assert(err.err == null); +assert(err.n == 1024); + +// free up space +d.dropDatabase(); + +stopMongod(30001); + +print("dur big atomic update SUCCESS"); diff --git a/jstests/slowNightly/dur_passthrough.js b/jstests/slowNightly/dur_passthrough.js new file mode 100644 index 0000000..22482e0 --- /dev/null +++ b/jstests/slowNightly/dur_passthrough.js @@ -0,0 +1,89 @@ +// runs the toplevel jstests with --dur +// +// TODO(mathias) use paranoid mode (--durOptions 8) once we are reasonably sure it will pass + +// DEBUG : set this variable to debug by skipping to a specific test to start with and go from there +//var skippingTo = /null.js/; +var skippingTo = false; + +conn = startMongodEmpty("--port", 30100, "--dbpath", "/data/db/dur_passthrough", "--dur", "--smallfiles"); +db = conn.getDB("test"); + +function durPassThrough() { + + var runnerStart = new Date() + + var ran = {}; + + /** run a test. won't run more than once. logs if fails and then throws. 
+ */ + function runTest(x) { + function _run(x) { + if (/[\/\\]_/.test(x.name) || + !/\.js$/.test(x.name) || + /repair/.test(x.name) || +// /numberlong/.test(x.name) || + false // placeholder so all real tests end in || + ) { + print("dur_passthrough.js >>>> skipping " + x.name); + return; + } + print(); + print("dur_passthrough.js run " + x.name); + print("dur_passthrough.js end " + x.name + ' ' + Date.timeFunc(function () { load(x.name); }, 1) + "ms"); + print(); + } + if (ran[x.name]) + return; + ran[x.name] = true; + try { + _run(x); + } + catch (e) { + print("\n\n\n\ndur_passthrough.js FAIL " + x.name + "\n\n\n"); + throw e; + } + } + + var files = listFiles("jstests"); + + if( !skippingTo ) { + // run something that will almost surely pass and is fast just to make sure our framework + // here is really working + runTest({ name: 'jstests/basic1.js' }); + + // run "suspicious" tests early. these are tests that have ever failed in buildbot. we run them + // early and try to get a fail fast + runTest({ name: 'jstests/shellstartparallel.js' }); + runTest({ name: 'jstests/cursora.js' }); + + // run the shell-oriented tests early. if the shell is broken the other tests aren't meaningful + runTest({ name: 'jstests/run_program1.js' }); + runTest({ name: 'jstests/shellspawn.js' }); + runTest({ name: 'jstests/shellkillop.js' }); + } + + files = files.sort(compareOn('name')); + files.forEach( + function (x) { + if (skippingTo && !skippingTo.test(x.name)) { + print("dur_passthrough.js temp skip " + x.name); + return; + } + skippingTo = false; + + // to keep memory usage low on 32 bit: + db.adminCommand("closeAllDatabases"); + + runTest(x); + } + ); + + print("dur_passthrough.js stopMongod"); + stopMongod(30100); + var runnerEnd = new Date(); + print("dur_passthrough.js total runner time: " + ((runnerEnd.getTime() - runnerStart.getTime()) / 1000) + "secs") +} + +durPassThrough(); +print("dur_passthrough.js SUCCESS"); diff --git a/jstests/slowNightly/dur_remove_old_journals.js b/jstests/slowNightly/dur_remove_old_journals.js new file mode 100644 index 0000000..3c57c12 --- /dev/null +++ b/jstests/slowNightly/dur_remove_old_journals.js @@ -0,0 +1,53 @@ +// this test makes sure that old journal files are removed + +// tunables +STRING_SIZE = 1024*1024; +NUM_TO_INSERT = 2.5*1024; +PATH = "/data/db/dur_remove_old_journals"; +SYNC_DELAY = 5; // must be a number + +conn = startMongodEmpty("--port", 30001, "--dbpath", PATH, "--dur", "--smallfiles", "--syncdelay", ''+SYNC_DELAY); +db = conn.getDB("test"); + +longString = 'x'; +while (longString.length < STRING_SIZE) + longString += longString; + +numInserted = 0; +while (numInserted < NUM_TO_INSERT){ + db.foo.insert({_id: numInserted++, s:longString}); + + + if (numInserted % 100 == 0){ + print("numInserted: " + numInserted); + db.adminCommand({fsync:1}); + db.foo.remove(); + db.adminCommand({fsync:1}); + } +} + +sleepSecs = SYNC_DELAY + 15 // long enough for data file flushing and journal keep time +print("\nWaiting " + sleepSecs + " seconds...\n"); +sleep(sleepSecs*1000); + + +files = listFiles(PATH + "/journal") +printjson(files); + +var nfiles = 0; +files.forEach(function (file) { + assert.eq('string', typeof (file.name)); // sanity checking + if (/prealloc/.test(file.name)) { + ; + } + else { + nfiles++; + assert(!(/j\._[01]/.test(file.name)), "Old journal file still exists: " + file.name); + } +}) + +assert.eq(2, nfiles); // j._2 and lsn + +stopMongod(30001); + +print("*** success ***"); diff --git a/jstests/slowNightly/geo_near_random1.js 
b/jstests/slowNightly/geo_near_random1.js new file mode 100644 index 0000000..ad67bdc --- /dev/null +++ b/jstests/slowNightly/geo_near_random1.js @@ -0,0 +1,13 @@ +// this tests all points using $near +load("jstests/libs/geo_near_random.js"); + +var test = new GeoNearRandomTest("nightly.geo_near_random1"); + +test.insertPts(200); + +test.testPt([0,0]); +test.testPt(test.mkPt()); +test.testPt(test.mkPt()); +test.testPt(test.mkPt()); +test.testPt(test.mkPt()); + diff --git a/jstests/slowNightly/geo_near_random2.js b/jstests/slowNightly/geo_near_random2.js new file mode 100644 index 0000000..d7dbc97 --- /dev/null +++ b/jstests/slowNightly/geo_near_random2.js @@ -0,0 +1,21 @@ +// this tests 1% of all points using $near and $nearSphere +load("jstests/libs/geo_near_random.js"); + +var test = new GeoNearRandomTest("nightly.geo_near_random2"); + +test.insertPts(10000); + +opts = {sphere:0, nToTest:test.nPts*0.01}; +test.testPt([0,0], opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); + +opts.sphere = 1 +test.testPt([0,0], opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); + diff --git a/jstests/slowNightly/index_check9.js b/jstests/slowNightly/index_check9.js new file mode 100644 index 0000000..6634d06 --- /dev/null +++ b/jstests/slowNightly/index_check9.js @@ -0,0 +1,118 @@ +Random.setRandomSeed(); + +t = db.test_index_check9; + +function doIt() { + +t.drop(); + +function sort() { + var sort = {}; + for( var i = 0; i < n; ++i ) { + sort[ fields[ i ] ] = Random.rand() > 0.5 ? 1 : -1; + } + return sort; +} + +var fields = [ 'a', 'b', 'c', 'd', 'e' ]; +n = Random.randInt( 5 ) + 1; +var idx = sort(); + +var chars = "abcdefghijklmnopqrstuvwxyz"; +var alphas = [] +for( var i = 0; i < n; ++i ) { + alphas.push( Random.rand() > 0.5 ); +} + +t.ensureIndex( idx ); + +function obj() { + var ret = {}; + for( var i = 0; i < n; ++i ) { + ret[ fields[ i ] ] = r( alphas[ i ] ); + } + return ret; +} + +function r( alpha ) { + if ( !alpha ) { + return Random.randInt( 10 ); + } else { + var len = Random.randInt( 10 ); + buf = ""; + for( var i = 0; i < len; ++i ) { + buf += chars.charAt( Random.randInt( chars.length ) ); + } + return buf; + } +} + +function check() { + var v = t.validate(); + if ( !t.valid ) { + printjson( t ); + assert( t.valid ); + } + var spec = {}; + for( var i = 0; i < n; ++i ) { + if ( Random.rand() > 0.5 ) { + var bounds = [ r( alphas[ i ] ), r( alphas[ i ] ) ]; + if ( bounds[ 0 ] > bounds[ 1 ] ) { + bounds.reverse(); + } + var s = {}; + if ( Random.rand() > 0.5 ) { + s[ "$gte" ] = bounds[ 0 ]; + } else { + s[ "$gt" ] = bounds[ 0 ]; + } + if ( Random.rand() > 0.5 ) { + s[ "$lte" ] = bounds[ 1 ]; + } else { + s[ "$lt" ] = bounds[ 1 ]; + } + spec[ fields[ i ] ] = s; + } else { + var vals = [] + for( var j = 0; j < Random.randInt( 15 ); ++j ) { + vals.push( r( alphas[ i ] ) ); + } + spec[ fields[ i ] ] = { $in: vals }; + } + } + s = sort(); + c1 = t.find( spec, { _id:null } ).sort( s ).hint( idx ).toArray(); + c2 = t.find( spec ).sort( s ).explain().nscanned; + c3 = t.find( spec, { _id:null } ).sort( s ).hint( {$natural:1} ).toArray(); + // assert.eq( c1, c3, "spec: " + tojson( spec ) + ", sort: " + tojson( s ) ); + // assert.eq( c1.length, c2 ); + assert.eq( c1, c3 ); +} + +for( var i = 0; i < 10000; ++i ) { + t.save( obj() ); + if( Random.rand() > 0.999 ) { + print( i ); + check(); + } +} + +for( var i = 0; i 
< 100000; ++i ) { + if ( Random.rand() > 0.9 ) { + t.save( obj() ); + } else { + t.remove( obj() ); // improve + } + if( Random.rand() > 0.999 ) { + print( i ); + check(); + } +} + +check(); + +} + +for( var z = 0; z < 5; ++z ) { + doIt(); +} \ No newline at end of file diff --git a/jstests/slowNightly/large_chunk.js b/jstests/slowNightly/large_chunk.js new file mode 100644 index 0000000..6cf40e3 --- /dev/null +++ b/jstests/slowNightly/large_chunk.js @@ -0,0 +1,51 @@ +// Where we test operations dealing with large chunks + +// Starts a new sharding environment limiting the chunksize to 2GB. +// Note that early splitting will start with a 1/4 of max size currently. +s = new ShardingTest( "large_chunk" , 2 , 2 , 1 , { chunksize : 2000 } ); + +// take the balancer out of the equation +s.config.settings.update( { _id: "balancer" }, { $set : { stopped: true } } , true ); +s.config.settings.find().forEach( printjson ) +db = s.getDB( "test" ); + +// +// Step 1 - Test moving a large chunk +// + +// Turn on sharding on the 'test.foo' collection and generate a large chunk +s.adminCommand( { enablesharding : "test" } ); +s.adminCommand( { shardcollection : "test.foo" , key : { _id : 1 } } ); + +bigString = "" +while ( bigString.length < 10000 ) + bigString += "asdasdasdasdadasdasdasdasdasdasdasdasda"; + +inserted = 0; +num = 0; +while ( inserted < ( 400 * 1024 * 1024 ) ){ + db.foo.insert( { _id : num++ , s : bigString } ); + inserted += bigString.length; +} +db.getLastError(); +assert.eq( 1 , s.config.chunks.count() , "step 1 - need one large chunk" ); + +primary = s.getServer( "test" ).getDB( "test" ); +secondary = s.getOther( primary ).getDB( "test" ); + +// Make sure that we don't move that chunk if it goes past what we consider the maximum chunk size +print("Checkpoint 1a") +max = 200 * 1024 * 1024; +moveChunkCmd = { movechunk : "test.foo" , find : { _id : 1 } , to : secondary.getMongo().name , maxChunkSizeBytes : max }; +assert.throws( function() { s.adminCommand( moveChunkCmd ); } ); + +// Move the chunk +print("checkpoint 1b"); +before = s.config.chunks.find().toArray(); +s.adminCommand( { movechunk : "test.foo" , find : { _id : 1 } , to : secondary.getMongo().name } ); +after = s.config.chunks.find().toArray(); +assert.neq( before[0].shard , after[0].shard , "move chunk did not work" ); + +s.config.changelog.find().forEach( printjson ) + +s.stop(); \ No newline at end of file diff --git a/jstests/slowNightly/moveprimary-replset.js b/jstests/slowNightly/moveprimary-replset.js new file mode 100755 index 0000000..0b6a78b --- /dev/null +++ b/jstests/slowNightly/moveprimary-replset.js @@ -0,0 +1,67 @@ +// Move db between replica set shards -Tony + +load('jstests/libs/grid.js') + +function go() { + +var N = 10000 + +// Create replica set of one server +var repset1 = new ReplicaSet('repset1', 1) .begin() +var conn1a = repset1.getMaster() +var db1a = conn1a.getDB('test') + +// Add data to it +for (var i = 1; i <= N; i++) db1a['foo'].insert({x: i}) + +// Add another server to replica set +var conn1b = repset1.addServer() +conn1b.setSlaveOk() +var db1b = conn1b.getDB('test') + +// Check that new server received replicated data +assert (db1b['foo'].count() == N, 'data did not replicate') + +// Create sharding config servers +var configset = new ConfigSet(3) +configset.begin() + +// Create sharding router (mongos) +var router = new Router(configset) +var routerConn = router.begin() +var db = routerConn.getDB('test') + +// Add repset1 as only shard +addShard (routerConn, repset1.getURL()) + +// Add data 
via router and check it +db['foo'].update({}, {$set: {y: 'hello'}}, false, true) +assert (db['foo'].count({y: 'hello'}) == N, + 'updating and counting docs via router (mongos) failed') + +// Create another replica set +var repset2 = new ReplicaSet('repset2', 2) .begin() +var conn2a = repset2.getMaster() + +// Add repset2 as second shard +addShard (routerConn, repset2.getURL()) + +routerConn.getDB('admin').printShardingStatus() +printjson (conn2a.getDBs()) + +// Move test db from repset1 to repset2 +moveDB (routerConn, 'test', repset2.getURL()) + +routerConn.getDB('admin').printShardingStatus() +printjson (conn2a.getDBs()) + +//Done +router.end() +configset.end() +repset2.stopSet() +repset1.stopSet() + +print('moveprimary-replset.js SUCCESS') +} + +go() diff --git a/jstests/slowNightly/newcollection2.js b/jstests/slowNightly/newcollection2.js new file mode 100644 index 0000000..6bf2495 --- /dev/null +++ b/jstests/slowNightly/newcollection2.js @@ -0,0 +1,11 @@ +// Allocate collection forcing just a small size remainder in 2nd extent + +port = allocatePorts( 1 )[ 0 ] +var baseName = "jstests_disk_newcollection2"; +var m = startMongod( "--noprealloc", "--smallfiles", "--port", port, "--dbpath", "/data/db/" + baseName ); +db = m.getDB( "test" ); + +db.createCollection( baseName, {size:0x1FFC0000-0x10-8192} ); +var v = db[ baseName ].validate(); +printjson( v ); +assert( v.valid ); diff --git a/jstests/slowNightly/run_sharding_passthrough.js b/jstests/slowNightly/run_sharding_passthrough.js deleted file mode 100644 index fda982b..0000000 --- a/jstests/slowNightly/run_sharding_passthrough.js +++ /dev/null @@ -1,94 +0,0 @@ -s = new ShardingTest( "auto1" , 2 , 1 , 1 ); -s.adminCommand( { enablesharding : "test" } ); -db=s.getDB("test"); - -var files = listFiles("jstests"); - -var runnerStart = new Date() - -files.forEach( - function(x) { - -// /(basic|update).*\.js$/ - if ( /[\/\\]_/.test(x.name) || - ! /\.js$/.test(x.name ) ){ - print(" >>>>>>>>>>>>>>> skipping " + x.name); - return; - } - - // Notes: - - // apply_ops1: nothing works, dunno why yet. SERVER-1439 - - // copydb, copydb2: copyDatabase seems not to work at all in - // the ShardingTest setup. SERVER-1440 - - // cursor8: cursorInfo different/meaningless(?) in mongos - // closeAllDatabases may not work through mongos - // SERVER-1441 - // deal with cursorInfo in mongos SERVER-1442 - - // dbcase: Database names are case-insensitive under ShardingTest? - // SERVER-1443 - - // These are all SERVER-1444 - // count5: limit() and maybe skip() may be unreliable - // geo3: limit() not working, I think - // or4: skip() not working? - - // shellkillop: dunno yet. SERVER-1445 - - // These should simply not be run under sharding: - // dbadmin: Uncertain Cut-n-pasting its contents into mongo worked. - // error1: getpreverror not supported under sharding - // fsync, fsync2: isn't supported through mongos - // remove5: getpreverror, I think.
don't run - // update4: getpreverror don't run - - // Around July 20, command passthrough went away, and these - // commands weren't implemented: - // clean cloneCollectionAsCapped copydbgetnonce dataSize - // datasize dbstats deleteIndexes dropIndexes forceerror - // getnonce logout medianKey profile reIndex repairDatabase - // reseterror splitVector validate - - /* missing commands : - * forceerror and switchtoclienterrors - * cloneCollectionAsCapped - * splitvector - * profile (apitest_db, cursor6, evalb) - * copydbgetnonce - * dbhash - * medianKey - * clean (apitest_dbcollection) - * logout and getnonce - */ - if (/[\/\\](error3|capped.*|splitvector|apitest_db|cursor6|copydb-auth|profile1|dbhash|median|apitest_dbcollection|evalb|auth1|auth2)\.js$/.test(x.name)) { - print(" !!!!!!!!!!!!!!! skipping test that has failed under sharding but might not anymore " + x.name) - return; - } - // These are bugs (some might be fixed now): - if (/[\/\\](apply_ops1|count5|cursor8|or4|shellkillop|update4)\.js$/.test(x.name)) { - print(" !!!!!!!!!!!!!!! skipping test that has failed under sharding but might not anymore " + x.name) - return; - } - // These aren't supposed to get run under sharding: - if (/[\/\\](dbadmin|error1|fsync|fsync2|geo.*|indexh|remove5|update4)\.js$/.test(x.name)) { - print(" >>>>>>>>>>>>>>> skipping test that would fail under sharding " + x.name) - return; - } - - print(" *******************************************"); - print(" Test : " + x.name + " ..."); - print(" " + Date.timeFunc( - function() { - load(x.name); - }, 1) + "ms"); - - } -); - - -var runnerEnd = new Date() - -print( "total runner time: " + ( ( runnerEnd.getTime() - runnerStart.getTime() ) / 1000 ) + "secs" ) diff --git a/jstests/slowNightly/sharding_balance1.js b/jstests/slowNightly/sharding_balance1.js index 840aaff..9379c4f 100644 --- a/jstests/slowNightly/sharding_balance1.js +++ b/jstests/slowNightly/sharding_balance1.js @@ -1,7 +1,7 @@ // sharding_balance1.js -s = new ShardingTest( "slow_sharding_balance1" , 2 , 2 , 1 , { chunksize : 1 } ) +s = new ShardingTest( "slow_sharding_balance1" , 2 , 1 , 1 , { chunksize : 1 } ) s.adminCommand( { enablesharding : "test" } ); diff --git a/jstests/slowNightly/sharding_balance2.js b/jstests/slowNightly/sharding_balance2.js index c94e256..3296ff6 100644 --- a/jstests/slowNightly/sharding_balance2.js +++ b/jstests/slowNightly/sharding_balance2.js @@ -1,6 +1,6 @@ // sharding_balance2.js -s = new ShardingTest( "slow_sharding_balance2" , 2 , 2 , 1 , { chunksize : 1 , manualAddShard : true } ) +s = new ShardingTest( "slow_sharding_balance2" , 2 , 1 , 1 , { chunksize : 1 , manualAddShard : true } ) names = s.getConnNames(); for ( var i=0; i .99 ){ db.getLastError() - check(); // SERVER-1430 TODO + check( "random late check" ); // SERVER-1430 } - var x = dist(); + var x = s.chunkCounts( "foo" ) if ( Math.random() > .999 ) printjson( x ) return Math.max( x.shard0000 , x.shard0001 ) - Math.min( x.shard0000 , x.shard0001 ); } function sum(){ - var x = dist(); + var x = s.chunkCounts( "foo" ) return x.shard0000 + x.shard0001; } diff --git a/jstests/slowNightly/sharding_balance_randomorder1.js b/jstests/slowNightly/sharding_balance_randomorder1.js new file mode 100644 index 0000000..05eabc6 --- /dev/null +++ b/jstests/slowNightly/sharding_balance_randomorder1.js @@ -0,0 +1,54 @@ +// sharding_balance1.js + +s = new ShardingTest( "sharding_balance_randomorder1" , 2 , 2 , 1 , { chunksize : 1 } ) + +s.adminCommand( { enablesharding : "test" } ); + +s.config.settings.find().forEach( 
printjson ) + +db = s.getDB( "test" ); + +bigString = "" +while ( bigString.length < 10000 ) + bigString += "asdasdasdasdadasdasdasdasdasdasdasdasda"; + +inserted = 0; +num = 0; +while ( inserted < ( 20 * 1024 * 1024 ) ){ + db.foo.insert( { _id : Math.random() , s : bigString } ); + inserted += bigString.length; +} + +db.getLastError(); +s.adminCommand( { shardcollection : "test.foo" , key : { _id : 1 } } ); +assert.lt( 20 , s.config.chunks.count() , "setup2" ); + +function diff(){ + var x = s.chunkCounts( "foo" ); + printjson( x ) + return Math.max( x.shard0000 , x.shard0001 ) - Math.min( x.shard0000 , x.shard0001 ); +} + +function sum(){ + var x = s.chunkCounts( "foo" ); + return x.shard0000 + x.shard0001; +} + +assert.lt( 20 , diff() , "big differential here" ); +print( diff() ) + +assert.soon( function(){ + var d = diff(); + return d < 5; +} , "balance didn't happen" , 1000 * 60 * 3 , 5000 ); + +var chunkCount = sum(); +s.adminCommand( { removeshard: "shard0000" } ); + +assert.soon( function(){ + printjson(s.chunkCounts( "foo" )); + s.config.shards.find().forEach(function(z){printjson(z);}); + return chunkCount == s.config.chunks.count({shard: "shard0001"}); +} , "removeshard didn't happen" , 1000 * 60 * 3 , 5000 ); + +s.stop(); diff --git a/jstests/slowNightly/sharding_cursors1.js b/jstests/slowNightly/sharding_cursors1.js index 307e8d7..de59b0d 100644 --- a/jstests/slowNightly/sharding_cursors1.js +++ b/jstests/slowNightly/sharding_cursors1.js @@ -1,4 +1,4 @@ -s = new ShardingTest( "cursors1" , 2 , 0 , 1 , { chunksize : 1 } ) +s = new ShardingTest( "sharding_cursors1" , 2 , 0 , 1 , { chunksize : 1 } ) s.adminCommand( { enablesharding : "test" } ); @@ -17,6 +17,10 @@ toInsert = ( 1 * 1000 * 1000 ); for (var i=0; i < toInsert; i++ ){ db.foo.insert( { i: i, r: Math.random(), s: bigString } ); assert.eq(db.getLastError(), null, 'no error'); //SERVER-1541 + + if ( i % 1000 == 999 ) { + print( "already inserted " + ( i + 1 ) ); + } } inserted = toInsert; diff --git a/jstests/slowNightly/sharding_multiple_collections.js b/jstests/slowNightly/sharding_multiple_collections.js new file mode 100644 index 0000000..61d9911 --- /dev/null +++ b/jstests/slowNightly/sharding_multiple_collections.js @@ -0,0 +1,53 @@ +// multcollections.js + +s = new ShardingTest( "multcollections" , 2 , 1 , 1 , { chunksize : 1 } ); + +s.adminCommand( { enablesharding : "test" } ); + +db = s.getDB( "test" ) + +N = 100000 + +S = "" +while ( S.length < 500 ) + S += "123123312312"; + +for ( i=0; i>>>>>>>>>>>>>> skipping " + x.name); + return; + } + + // Notes: + + // apply_ops1: nothing works, dunno why yet. SERVER-1439 + + // copydb, copydb2: copyDatabase seems not to work at all in + // the ShardingTest setup. SERVER-1440 + + // cursor8: cursorInfo different/meaningless(?) in mongos + // closeAllDatabases may not work through mongos + // SERVER-1441 + // deal with cursorInfo in mongos SERVER-1442 + + // dbcase: Database names are case-insensitive under ShardingTest? + // SERVER-1443 + + // These are all SERVER-1444 + // count5: limit() and maybe skip() may be unreliable + // geo3: limit() not working, I think + // or4: skip() not working? + + // shellkillop: dunno yet. SERVER-1445 + + // These should simply not be run under sharding: + // dbadmin: Uncertain Cut-n-pasting its contents into mongo worked. + // error1: getpreverror not supported under sharding + // fsync, fsync2: isn't supported through mongos + // remove5: getpreverror, I think. 
don't run + // update4: getpreverror don't run + + // Around July 20, command passthrough went away, and these + // commands weren't implemented: + // clean cloneCollectionAsCapped copydbgetnonce dataSize + // datasize dbstats deleteIndexes dropIndexes forceerror + // getnonce logout medianKey profile reIndex repairDatabase + // reseterror splitVector validate + + /* missing commands : + * forceerror and switchtoclienterrors + * cloneCollectionAsCapped + * splitvector + * profile (apitest_db, cursor6, evalb) + * copydbgetnonce + * dbhash + * medianKey + * clean (apitest_dbcollection) + * logout and getnonce + */ + if (/[\/\\](error3|capped.*|splitvector|apitest_db|cursor6|copydb-auth|profile1|dbhash|median|apitest_dbcollection|evalb|evald|eval_nolock|auth1|auth2|unix_socket\d*)\.js$/.test(x.name)) { + print(" !!!!!!!!!!!!!!! skipping test that has failed under sharding but might not anymore " + x.name) + return; + } + // These are bugs (some might be fixed now): + if (/[\/\\](apply_ops1|count5|cursor8|or4|shellkillop|update4)\.js$/.test(x.name)) { + print(" !!!!!!!!!!!!!!! skipping test that has failed under sharding but might not anymore " + x.name) + return; + } + // These aren't supposed to get run under sharding: + if (/[\/\\](dbadmin|error1|fsync|fsync2|geo.*|indexh|remove5|update4|notablescan|check_shard_index|mr_replaceIntoDB)\.js$/.test(x.name)) { + print(" >>>>>>>>>>>>>>> skipping test that would fail under sharding " + x.name) + return; + } + + print(" *******************************************"); + print(" Test : " + x.name + " ..."); + print(" " + Date.timeFunc( + function() { + load(x.name); + }, 1) + "ms"); + + } +); + + +var runnerEnd = new Date() + +print( "total runner time: " + ( ( runnerEnd.getTime() - runnerStart.getTime() ) / 1000 ) + "secs" ) diff --git a/jstests/slowNightly/sharding_rs1.js b/jstests/slowNightly/sharding_rs1.js index b7d90ba..4ad126e 100644 --- a/jstests/slowNightly/sharding_rs1.js +++ b/jstests/slowNightly/sharding_rs1.js @@ -43,10 +43,19 @@ function diff(){ assert.lt( 20 , diff() , "big differential here" ); print( diff() ) +{ + // quick test for SERVER-2686 + var mydbs = db.getMongo().getDBs().databases; + for ( var i=0; i .5 ) + t.remove( { _id : i } ) + else + t.insert( { _id : i , s : s } ) + } + + //printjson( t.stats() ); + + assert.eq( orig.storageSize , t.stats().storageSize , "B" + j ) +} diff --git a/jstests/slowWeekly/dur_passthrough.js b/jstests/slowWeekly/dur_passthrough.js new file mode 100644 index 0000000..1840fb7 --- /dev/null +++ b/jstests/slowWeekly/dur_passthrough.js @@ -0,0 +1,44 @@ +// +// simple runner to run toplevel tests in jstests +// + +//TODO(mathias) add --master or make another test +//conn = startMongodEmpty("--port", 30200, "--dbpath", "/data/db/dur_passthrough", "--dur", "--smallfiles", "--durOptions", "24"); +conn = startMongodEmpty("--port", 30200, "--dbpath", "/data/db/dur_passthrough", "--dur", "--smallfiles", "--durOptions", "8"); +db = conn.getDB("test"); + +var files = listFiles("jstests"); +files = files.sort(compareOn('name')); + +var runnerStart = new Date() + +files.forEach( + function (x) { + + if (/[\/\\]_/.test(x.name) || + !/\.js$/.test(x.name) || + /repair/.test(x.name) || // fails on recovery + /shellkillop/.test(x.name) || // takes forever and don't test anything new + false // placeholder so all real tests end in || + ) + { + print(" >>>>>>>>>>>>>>> skipping " + x.name); + return; + } + + print(); + print(" *******************************************"); + print(" Test : " + x.name + " ..."); + 
print(" " + Date.timeFunc(function () { load(x.name); }, 1) + "ms"); + + } +); + +stopMongod(30200); + +var runnerEnd = new Date() + +print( "total runner time: " + ( ( runnerEnd.getTime() - runnerStart.getTime() ) / 1000 ) + "secs" ) + +//TODO(mathias): test recovery here + diff --git a/jstests/slowWeekly/geo_near_random1.js b/jstests/slowWeekly/geo_near_random1.js new file mode 100644 index 0000000..5ddfd26 --- /dev/null +++ b/jstests/slowWeekly/geo_near_random1.js @@ -0,0 +1,13 @@ +// this tests all points using $near +load("jstests/libs/geo_near_random.js"); + +var test = new GeoNearRandomTest("weekly.geo_near_random1"); + +test.insertPts(1000); + +test.testPt([0,0]); +test.testPt(test.mkPt()); +test.testPt(test.mkPt()); +test.testPt(test.mkPt()); +test.testPt(test.mkPt()); + diff --git a/jstests/slowWeekly/geo_near_random2.js b/jstests/slowWeekly/geo_near_random2.js new file mode 100644 index 0000000..9e93657 --- /dev/null +++ b/jstests/slowWeekly/geo_near_random2.js @@ -0,0 +1,21 @@ +// this tests 1% of all points using $near and $nearSphere +load("jstests/libs/geo_near_random.js"); + +var test = new GeoNearRandomTest("weekly.geo_near_random2"); + +test.insertPts(50000); + +opts = {sphere:0, nToTest:test.nPts*0.01}; +test.testPt([0,0], opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); +test.testPt(test.mkPt(), opts); + +opts.sphere = 1 +test.testPt([0,0], opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); +test.testPt(test.mkPt(0.8), opts); + diff --git a/jstests/slowWeekly/indexbg_dur.js b/jstests/slowWeekly/indexbg_dur.js new file mode 100644 index 0000000..5fbe0e7 --- /dev/null +++ b/jstests/slowWeekly/indexbg_dur.js @@ -0,0 +1,67 @@ +/** + * Kill mongod during a background index build and ensure that the bad index + * can be dropped on restart. + */ + +function countFields( x ) { + var count = 0; + for( var i in x ) { + ++count; + } + return count; +} + +size = 100000; +while( 1 ) { + print( "size: " + size ); + + var testname = "index_build"; + var path = "/data/db/" + testname+"_dur"; + conn = startMongodEmpty("--port", 30001, "--dbpath", path, "--dur", "--smallfiles", "--durOptions", 8); + t = conn.getDB( testname ).getCollection( testname ); + + for( var i = 0; i < size; ++i ) { + t.save( {i:i} ); + } + t.getDB().getLastError(); + x = startMongoProgramNoConnect( "mongo", "--eval", "db.getSisterDB( '" + testname + "' )." + testname + ".ensureIndex( {i:1}, {background:true} );", conn.host ); + sleep( 1000 ); + stopMongod( 30001, /* signal */ 9 ); + waitProgram( x ); + + conn = startMongodNoReset("--port", 30001, "--dbpath", path, "--dur", "--smallfiles", "--durOptions", 8); + t = conn.getDB( testname ).getCollection( testname ); + + var statsSize = countFields( t.stats().indexSizes ); + var nsSize = conn.getDB( testname ).system.indexes.count( {ns:testname+'.'+testname} ); + + // If index build completed before the kill, try again with more data. 
+ if ( !( statsSize == 1 && nsSize == 2 ) ) { + print( "statsSize: " + statsSize + ", nsSize: " + nsSize + ", retrying with more data" ); + stopMongod( 30001 ); + size *= 2; + continue; + } + + assert.eq( "index not found", t.dropIndex( "i_1" ).errmsg ); + + var statsSize = countFields( t.stats().indexSizes ); + var nsSize = conn.getDB( testname ).system.indexes.count( {ns:testname+'.'+testname} ); + + assert.eq( statsSize, nsSize ); + assert( t.validate().valid ); + // TODO check that index namespace is cleaned up as well once that is implemented + + t.ensureIndex( {i:1} ); + var statsSize = countFields( t.stats().indexSizes ); + var nsSize = conn.getDB( testname ).system.indexes.count( {ns:testname+'.'+testname} ); + + assert.eq( 2, statsSize ); + assert.eq( 2, nsSize ); + + exp = t.find( {i:20} ).explain(); + assert.eq( 1, exp.n ); + assert.eq( 'BtreeCursor i_1', exp.cursor ); + + break; +} diff --git a/jstests/slowWeekly/query_yield1.js b/jstests/slowWeekly/query_yield1.js index e996b53..1a95b87 100644 --- a/jstests/slowWeekly/query_yield1.js +++ b/jstests/slowWeekly/query_yield1.js @@ -2,10 +2,10 @@ t = db.query_yield1; t.drop() -N = 10000; +N = 20000; i = 0; -q = function(){ var x=this.n; for ( var i=0; i<500; i++ ){ x = x * 2; } return false; } +q = function(){ var x=this.n; for ( var i=0; i<250; i++ ){ x = x * 2; } return false; } while ( true ){ function fill(){ @@ -59,7 +59,7 @@ while ( ( (new Date()).getTime() - start ) < ( time * 2 ) ){ assert.eq( 1 , x.inprog.length , "nothing in prog" ); } - assert.gt( 50 , me ); + assert.gt( 200 , me , "took too long for me to run" ); if ( x.inprog.length == 0 ) break; diff --git a/jstests/slowWeekly/query_yield2.js b/jstests/slowWeekly/query_yield2.js index e13fabe..dd7e5d9 100644 --- a/jstests/slowWeekly/query_yield2.js +++ b/jstests/slowWeekly/query_yield2.js @@ -2,10 +2,10 @@ t = db.query_yield2; t.drop() -N = 100; +N = 200; i = 0; -q = function(){ var x=this.n; for ( var i=0; i<50000; i++ ){ x = x * 2; } return false; } +q = function(){ var x=this.n; for ( var i=0; i<25000; i++ ){ x = x * 2; } return false; } while ( true ){ function fill(){ @@ -59,7 +59,7 @@ while ( ( (new Date()).getTime() - start ) < ( time * 2 ) ){ assert.eq( 1 , x.inprog.length , "nothing in prog" ); } - assert.gt( 75 , me ); + assert.gt( 100 , me ); if ( x.inprog.length == 0 ) break; diff --git a/jstests/slowWeekly/update_yield1.js b/jstests/slowWeekly/update_yield1.js index 2e63690..7e95855 100644 --- a/jstests/slowWeekly/update_yield1.js +++ b/jstests/slowWeekly/update_yield1.js @@ -27,7 +27,7 @@ while ( true ){ timeUpdate(); time = timeUpdate(); print( N + "\t" + time ); - if ( time > 2000 ) + if ( time > 8000 ) break; N *= 2; @@ -47,13 +47,14 @@ num = 0; start = new Date(); while ( ( (new Date()).getTime() - start ) < ( time * 2 ) ){ var me = Date.timeFunc( function(){ t.findOne(); } ); + if (me > 50) print("time: " + me); if ( num++ == 0 ){ var x = db.currentOp() assert.eq( 1 , x.inprog.length , "nothing in prog" ); } - assert.gt( 50 , me ); + assert.gt( 2000 , me ); } join(); @@ -65,14 +66,16 @@ assert.eq( 0 , x.inprog.length , "weird 2" ); join = startParallelShell( "db.update_yield1.update( { $atomic : true } , { $inc : { n : 1 } } , false , true ); db.getLastError()" ); -assert.soon( - function(){ - return db.currentOp().inprog.length > 0; - } , "never doing update 2" -); +sleep(1000); // wait for shell startup ops to finish + +var x = db.currentOp(); +printjson(x); +assert.eq(1, x.inprog.length, "never doing update 2"); +assert.eq("update", 
x.inprog[0].op); + +t.findOne(); // should wait for update to finish -t.findOne(); var x = db.currentOp() -assert.eq( 0 , x.inprog.length , "should have been atomic" ); +assert.eq( [] , x.inprog , "should have been atomic" ); join(); diff --git a/jstests/sort2.js b/jstests/sort2.js index facd64c..1e21414 100644 --- a/jstests/sort2.js +++ b/jstests/sort2.js @@ -1,6 +1,6 @@ // test sorting, mainly a test ver simple with no index -t = db.sorrrt2; +t = db.sort2; t.drop(); t.save({x:1, y:{a:5,b:4}}); diff --git a/jstests/splitvector.js b/jstests/splitvector.js index 8d86319..da93486 100644 --- a/jstests/splitvector.js +++ b/jstests/splitvector.js @@ -11,7 +11,7 @@ // e.g. 20000 // @param maxChunkSize is in MBs. // -assertChunkSizes = function ( splitVec , numDocs , maxChunkSize ){ +assertChunkSizes = function ( splitVec , numDocs , maxChunkSize , msg ){ splitVec = [{ x: -1 }].concat( splitVec ); splitVec.push( { x: numDocs+1 } ); for ( i=0; i b.t ) + return 1; + + return a.i - b.i; +} + +for ( i=0; i +#include "boost/thread/once.hpp" #include #include #include @@ -107,16 +109,17 @@ namespace mongo { const int VERSION_MINOR = 5; enum ExitCode { - EXIT_CLEAN = 0 , - EXIT_BADOPTIONS = 2 , + EXIT_CLEAN = 0 , + EXIT_BADOPTIONS = 2 , EXIT_REPLICATION_ERROR = 3 , EXIT_NEED_UPGRADE = 4 , + EXIT_SHARDING_ERROR = 5 , EXIT_KILL = 12 , - EXIT_ABRUBT = 14 , + EXIT_ABRUPT = 14 , EXIT_NTSERVICE_ERROR = 20 , EXIT_JAVA = 21 , - EXIT_OOM_MALLOC = 42 , - EXIT_OOM_REALLOC = 43 , + EXIT_OOM_MALLOC = 42 , + EXIT_OOM_REALLOC = 43 , EXIT_FS = 45 , EXIT_CLOCK_SKEW = 47 , EXIT_NET_ERROR = 48 , @@ -126,7 +129,7 @@ namespace mongo { }; - void dbexit( ExitCode returnCode, const char *whyMsg = ""); + void dbexit( ExitCode returnCode, const char *whyMsg = "", bool tryToGetLock = false); /** this is here so you can't just type exit() to quit the program @@ -135,10 +138,7 @@ namespace mongo { */ void exit( ExitCode returnCode ); bool inShutdown(); - -} // namespace mongo -namespace mongo { using namespace boost::filesystem; void asserted(const char *msg, const char *file, unsigned line); } @@ -156,10 +156,6 @@ namespace mongo { void sayDbContext(const char *msg = 0); void rawOut( const string &s ); -} // namespace mongo - -namespace mongo { - typedef char _TCHAR; using boost::uint32_t; diff --git a/rpm/init.d-mongod b/rpm/init.d-mongod index 5ee8379..b7d4567 100644 --- a/rpm/init.d-mongod +++ b/rpm/init.d-mongod @@ -61,10 +61,11 @@ case "$1" in restart ;; condrestart) - [ -f /var/lock/subsys/mongodb ] && restart || : + [ -f /var/lock/subsys/mongod ] && restart || : ;; status) status $mongod + RETVAL=$? ;; *) echo "Usage: $0 {start|stop|status|restart|reload|force-reload|condrestart}" diff --git a/rpm/mongo.spec b/rpm/mongo.spec index 98f4d39..5ef543b 100644 --- a/rpm/mongo.spec +++ b/rpm/mongo.spec @@ -1,5 +1,5 @@ Name: mongo -Version: 1.6.5 +Version: 1.8.0 Release: mongodb_1%{?dist} Summary: mongo client shell and tools License: AGPL 3.0 @@ -105,6 +105,7 @@ fi %{_bindir}/mongoimport %{_bindir}/mongorestore %{_bindir}/mongostat +%{_bindir}/bsondump %{_mandir}/man1/mongo.1* %{_mandir}/man1/mongod.1* diff --git a/rpm/mongod.conf b/rpm/mongod.conf index 99346ef..1530199 100644 --- a/rpm/mongod.conf +++ b/rpm/mongod.conf @@ -78,14 +78,3 @@ dbpath=/var/lib/mongo # or #master = true #source = slave.example.com - -# Address of a server to pair with. -#pairwith = -# Address of arbiter server. -#arbiter = -# Automatically resync if slave data is stale -#autoresync -# Custom size for replication operation log. 
-#oplogSize = -# Size limit for in-memory storage of op ids. -#opIdMem = diff --git a/s/balance.cpp b/s/balance.cpp index 33cafdf..ee0c992 100644 --- a/s/balance.cpp +++ b/s/balance.cpp @@ -1,4 +1,4 @@ -// balance.cpp +//@file balance.cpp /** * Copyright (C) 2008 10gen Inc. @@ -31,10 +31,10 @@ #include "grid.h" namespace mongo { - + Balancer balancer; - Balancer::Balancer() : _balancedLastTime(0), _policy( new BalancerPolicy ){} + Balancer::Balancer() : _balancedLastTime(0), _policy( new BalancerPolicy ) {} Balancer::~Balancer() { delete _policy; @@ -43,15 +43,15 @@ namespace mongo { int Balancer::_moveChunks( const vector* candidateChunks ) { int movedCount = 0; - for ( vector::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ){ + for ( vector::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) { const CandidateChunk& chunkInfo = *it->get(); DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns ); assert( cfg ); - + ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns ); assert( cm ); - + const BSONObj& chunkToMove = chunkInfo.chunk; ChunkPtr c = cm->findChunk( chunkToMove["min"].Obj() ); if ( c->getMin().woCompare( chunkToMove["min"].Obj() ) || c->getMax().woCompare( chunkToMove["max"].Obj() ) ) { @@ -61,62 +61,65 @@ namespace mongo { c = cm->findChunk( chunkToMove["min"].Obj() ); if ( c->getMin().woCompare( chunkToMove["min"].Obj() ) || c->getMax().woCompare( chunkToMove["max"].Obj() ) ) { - log() << "chunk mismatch after reload, ignoring will retry issue cm: " + log() << "chunk mismatch after reload, ignoring will retry issue cm: " << c->getMin() << " min: " << chunkToMove["min"].Obj() << endl; continue; } } - - string errmsg; - if ( c->moveAndCommit( Shard::make( chunkInfo.to ) , errmsg ) ){ + + BSONObj res; + if ( c->moveAndCommit( Shard::make( chunkInfo.to ) , Chunk::MaxChunkSize , res ) ) { movedCount++; continue; } - log() << "MOVE FAILED **** " << errmsg << "\n" - << " from: " << chunkInfo.from << " to: " << chunkInfo.to << " chunk: " << chunkToMove << endl; + // the move requires acquiring the collection metadata's lock, which can fail + log() << "balacer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to + << " chunk: " << chunkToMove << endl; + + if ( res["chunkTooBig"].trueValue() ) { + // reload just to be safe + cm = cfg->getChunkManager( chunkInfo.ns ); + assert( cm ); + c = cm->findChunk( chunkToMove["min"].Obj() ); + + log() << "forcing a split because migrate failed for size reasons" << endl; + + res = BSONObj(); + c->singleSplit( true , res ); + log() << "forced split results: " << res << endl; + + // TODO: if the split fails, mark as jumbo SERVER-2571 + } } return movedCount; } - - void Balancer::_ping(){ - assert( _myid.size() && _started ); - try { - ScopedDbConnection conn( configServer.getPrimary() ); - _ping( conn.conn() ); - conn.done(); - } - catch ( std::exception& e ){ - log() << "bare ping failed: " << e.what() << endl; - } - - } - void Balancer::_ping( DBClientBase& conn ){ + void Balancer::_ping( DBClientBase& conn ) { WriteConcern w = conn.getWriteConcern(); conn.setWriteConcern( W_NONE ); - conn.update( ShardNS::mongos , - BSON( "_id" << _myid ) , - BSON( "$set" << BSON( "ping" << DATENOW << "up" << (int)(time(0)-_started) ) ) , - true ); + conn.update( ShardNS::mongos , + BSON( "_id" << _myid ) , + BSON( "$set" << BSON( "ping" << DATENOW << "up" << (int)(time(0)-_started) ) ) , + true ); conn.setWriteConcern( w); } - - bool Balancer::_checkOIDs(){ + + bool 
Balancer::_checkOIDs() { vector all; Shard::getAllShards( all ); - + map oids; - - for ( vector::iterator i=all.begin(); i!=all.end(); ++i ){ + + for ( vector::iterator i=all.begin(); i!=all.end(); ++i ) { Shard s = *i; BSONObj f = s.runCommand( "admin" , "features" ); - if ( f["oidMachine"].isNumber() ){ + if ( f["oidMachine"].isNumber() ) { int x = f["oidMachine"].numberInt(); - if ( oids.count(x) == 0 ){ + if ( oids.count(x) == 0 ) { oids[x] = s; } else { @@ -133,7 +136,7 @@ namespace mongo { return true; } - void Balancer::_doBalanceRound( DBClientBase& conn, vector* candidateChunks ){ + void Balancer::_doBalanceRound( DBClientBase& conn, vector* candidateChunks ) { assert( candidateChunks ); // @@ -143,8 +146,8 @@ namespace mongo { auto_ptr cursor = conn.query( ShardNS::collection , BSONObj() ); vector< string > collections; - while ( cursor->more() ){ - BSONObj col = cursor->next(); + while ( cursor->more() ) { + BSONObj col = cursor->nextSafe(); // sharded collections will have a shard "key". if ( ! col["key"].eoo() ) @@ -164,7 +167,7 @@ namespace mongo { // // TODO: skip unresponsive shards and mark information as stale. // - + vector allShards; Shard::getAllShards( allShards ); if ( allShards.size() < 2) { @@ -172,14 +175,16 @@ namespace mongo { return; } - map< string, BSONObj > shardLimitsMap; - for ( vector::const_iterator it = allShards.begin(); it != allShards.end(); ++it ){ + map< string, BSONObj > shardLimitsMap; + for ( vector::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) { const Shard& s = *it; ShardStatus status = s.getStatus(); - BSONObj limitsObj = BSON( ShardFields::maxSize( s.getMaxSize() ) << - ShardFields::currSize( status.mapped() ) << - ShardFields::draining( s.isDraining()) ); + BSONObj limitsObj = BSON( ShardFields::maxSize( s.getMaxSize() ) << + LimitsFields::currSize( status.mapped() ) << + ShardFields::draining( s.isDraining() ) << + LimitsFields::hasOpsQueued( status.hasOpsQueued() ) + ); shardLimitsMap[ s.getName() ] = limitsObj; } @@ -193,8 +198,8 @@ namespace mongo { map< string,vector > shardToChunksMap; cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) ); - while ( cursor->more() ){ - BSONObj chunk = cursor->next(); + while ( cursor->more() ) { + BSONObj chunk = cursor->nextSafe(); vector& chunks = shardToChunksMap[chunk["shard"].String()]; chunks.push_back( chunk.getOwned() ); } @@ -204,8 +209,8 @@ namespace mongo { log(1) << "skipping empty collection (" << ns << ")"; continue; } - - for ( vector::iterator i=allShards.begin(); i!=allShards.end(); ++i ){ + + for ( vector::iterator i=allShards.begin(); i!=allShards.end(); ++i ) { // this just makes sure there is an entry in shardToChunksMap for every shard Shard s = *i; shardToChunksMap[s.getName()].size(); @@ -216,75 +221,109 @@ namespace mongo { } } - void Balancer::run(){ + bool Balancer::_init() { + try { + + log() << "about to contact config servers and shards" << endl; + + // contact the config server and refresh shard information + // checks that each shard is indeed a different process (no hostname mixup) + // these checks are redundant in that they're redone at every new round but we want to do them initially here + // so to catch any problem soon + Shard::reloadShardInfo(); + _checkOIDs(); + + log() << "config servers and shards contacted successfully" << endl; - { // init stuff, don't want to do at static init StringBuilder buf; buf << getHostNameCached() << ":" << cmdLine.port; _myid = buf.str(); - log(1) << "balancer myid: " << _myid << endl; - 
_started = time(0); - Shard::reloadShardInfo(); + log() << "balancer id: " << _myid << " started at " << time_t_to_String_short(_started) << endl; + + return true; + } - - _ping(); - _checkOIDs(); + catch ( std::exception& ) { + log( LL_WARNING ) << "could not initialize balancer, please check that all shards and config servers are up" << endl; + return false; + + } + } + + void Balancer::run() { + + // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely + while ( ! inShutdown() ) { + + if ( ! _init() ) { + log() << "will retry to initialize balancer in one minute" << endl; + sleepsecs( 60 ); + continue; + } + + break; + } + + // getConnectioString and the constructor of a DistributedLock do not throw, which is what we expect on while + // on the balancer thread ConnectionString config = configServer.getConnectionString(); DistributedLock balanceLock( config , "balancer" ); - while ( ! inShutdown() ){ - + while ( ! inShutdown() ) { + try { + + // first make sure we should even be running + if ( ! grid.shouldBalance() ) { + log(1) << "skipping balancing round because balancing is disabled" << endl; + sleepsecs( 30 ); + continue; + } + + ScopedDbConnection conn( config ); - _ping( conn.conn() ); - if ( ! _checkOIDs() ){ + _ping( conn.conn() ); + if ( ! _checkOIDs() ) { uassert( 13258 , "oids broken after resetting!" , _checkOIDs() ); } - + // use fresh shard state - Shard::reloadShardInfo(); + Shard::reloadShardInfo(); dist_lock_try lk( &balanceLock , "doing balance round" ); - if ( ! lk.got() ){ - log(1) << "skipping balancing round during ongoing split or move activity." << endl; + if ( ! lk.got() ) { + log(1) << "skipping balancing round because another balancer is active" << endl; conn.done(); sleepsecs( 30 ); // no need to wake up soon continue; } - - if ( ! grid.shouldBalance() ) { - log(1) << "skipping balancing round because balancing is disabled" << endl;; - conn.done(); - - sleepsecs( 30 ); - continue; - } - log(1) << "*** start balancing round" << endl; + log(1) << "*** start balancing round" << endl; vector candidateChunks; _doBalanceRound( conn.conn() , &candidateChunks ); if ( candidateChunks.size() == 0 ) { log(1) << "no need to move any chunk" << endl; - } else { + } + else { _balancedLastTime = _moveChunks( &candidateChunks ); } - log(1) << "*** end of balancing round" << endl; + log(1) << "*** end of balancing round" << endl; conn.done(); sleepsecs( _balancedLastTime ? 5 : 10 ); } - catch ( std::exception& e ){ + catch ( std::exception& e ) { log() << "caught exception while doing balance: " << e.what() << endl; // Just to match the opening statement if in log level 1 - log(1) << "*** End of balancing round" << endl; + log(1) << "*** End of balancing round" << endl; sleepsecs( 30 ); // sleep a fair amount b/c of error continue; diff --git a/s/balance.h b/s/balance.h index cafae11..0ad2647 100644 --- a/s/balance.h +++ b/s/balance.h @@ -1,4 +1,4 @@ -// balance.h +//@file balance.h /** * Copyright (C) 2008 10gen Inc. @@ -24,7 +24,16 @@ #include "balancer_policy.h" namespace mongo { - + + /** + * The balancer is a background task that tries to keep the number of chunks across all servers of the cluster even. Although + * every mongos will have one balancer running, only one of them will be active at the any given point in time. The balancer + * uses a 'DistributedLock' for that coordination. + * + * The balancer does act continuously but in "rounds". 
At a given round, it would decide if there is an imbalance by + * checking the difference in chunks between the most and least loaded shards. It would issue a request for a chunk + * migration per round, if it found so. + */ class Balancer : public BackgroundJob { public: Balancer(); @@ -34,47 +43,63 @@ namespace mongo { virtual void run(); - virtual string name() { return "Balancer"; } + virtual string name() const { return "Balancer"; } private: typedef BalancerPolicy::ChunkInfo CandidateChunk; typedef shared_ptr CandidateChunkPtr; + // hostname:port of my mongos + string _myid; + + // time the Balancer started running + time_t _started; + + // number of moved chunks in last round + int _balancedLastTime; + + // decide which chunks to move; owned here. + BalancerPolicy* _policy; + + /** + * Checks that the balancer can connect to all servers it needs to do its job. + * + * @return true if balancing can be started + * + * This method throws on a network exception + */ + bool _init(); + /** - * Gathers all the necessary information about shards and chunks, and - * decides whether there are candidate chunks to be moved. + * Gathers all the necessary information about shards and chunks, and decides whether there are candidate chunks to + * be moved. + * + * @param conn is the connection with the config server(s) + * @param candidateChunks (IN/OUT) filled with candidate chunks, one per collection, that could possibly be moved */ void _doBalanceRound( DBClientBase& conn, vector* candidateChunks ); /** - * Execute the chunk migrations described in 'candidateChunks' and - * returns the number of chunks effectively moved. + * Issues chunk migration request, one at a time. + * + * @param candidateChunks possible chunks to move + * @return number of chunks effectively moved */ int _moveChunks( const vector* candidateChunks ); /** - * Check the health of the master configuration server + * Marks this balancer as being live on the config server(s). + * + * @param conn is the connection with the config server(s) */ - void _ping(); void _ping( DBClientBase& conn ); /** - * @return true if everything is ok + * @return true if all the servers listed in configdb as being shards are reachable and are distinct processes */ bool _checkOIDs(); - // internal state - - string _myid; // hostname:port of my mongos - time_t _started; // time Balancer starte running - int _balancedLastTime; // number of moved chunks in last round - BalancerPolicy* _policy; // decide which chunks to move; owned here. 
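Because the new Balancer::run() above is spread over several hunks, here is a condensed outline of one balancing round using the same names (grid, configServer, dist_lock_try, _doBalanceRound, _moveChunks). This is a simplified sketch of the control flow only; init retries, OID checks and shard-info reloads are omitted.

    // Condensed sketch of the per-round control flow in Balancer::run() above.
    DistributedLock balanceLock( configServer.getConnectionString() , "balancer" );
    while ( ! inShutdown() ) {
        if ( ! grid.shouldBalance() ) { sleepsecs( 30 ); continue; }      // balancing switched off

        ScopedDbConnection conn( configServer.getConnectionString() );
        _ping( conn.conn() );                                             // mark this balancer as live

        dist_lock_try lk( &balanceLock , "doing balance round" );
        if ( ! lk.got() ) { conn.done(); sleepsecs( 30 ); continue; }     // another mongos holds the lock

        vector<CandidateChunkPtr> candidateChunks;
        _doBalanceRound( conn.conn() , &candidateChunks );                // at most one candidate per collection
        _balancedLastTime = _moveChunks( &candidateChunks );              // migrations issued one at a time

        conn.done();
        sleepsecs( _balancedLastTime ? 5 : 10 );                          // nap less if something actually moved
    }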
- - // non-copyable, non-assignable - - Balancer(const Balancer&); - Balancer operator=(const Balancer&); }; - + extern Balancer balancer; } diff --git a/s/balancer_policy.cpp b/s/balancer_policy.cpp index 98619c0..2098a1f 100644 --- a/s/balancer_policy.cpp +++ b/s/balancer_policy.cpp @@ -28,54 +28,62 @@ namespace mongo { - BalancerPolicy::ChunkInfo* BalancerPolicy::balance( const string& ns, - const ShardToLimitsMap& shardToLimitsMap, - const ShardToChunksMap& shardToChunksMap, - int balancedLastTime ){ + // limits map fields + BSONField LimitsFields::currSize( "currSize" ); + BSONField LimitsFields::hasOpsQueued( "hasOpsQueued" ); + + BalancerPolicy::ChunkInfo* BalancerPolicy::balance( const string& ns, + const ShardToLimitsMap& shardToLimitsMap, + const ShardToChunksMap& shardToChunksMap, + int balancedLastTime ) { pair min("",numeric_limits::max()); pair max("",0); vector drainingShards; - - for (ShardToChunksIter i = shardToChunksMap.begin(); i!=shardToChunksMap.end(); ++i ){ - // Find whether this shard has reached its size cap or whether it is being removed. + for (ShardToChunksIter i = shardToChunksMap.begin(); i!=shardToChunksMap.end(); ++i ) { + + // Find whether this shard's capacity or availability are exhausted const string& shard = i->first; BSONObj shardLimits; ShardToLimitsIter it = shardToLimitsMap.find( shard ); if ( it != shardToLimitsMap.end() ) shardLimits = it->second; const bool maxedOut = isSizeMaxed( shardLimits ); const bool draining = isDraining( shardLimits ); + const bool opsQueued = hasOpsQueued( shardLimits ); - // Check whether this shard is a better chunk receiver then the current one. - // Maxed out shards or draining shards cannot be considered receivers. + // Is this shard a better chunk receiver then the current one? + // Shards that would be bad receiver candidates: + // + maxed out shards + // + draining shards + // + shards with operations queued for writeback const unsigned size = i->second.size(); - if ( ! maxedOut && ! draining ){ - if ( size < min.second ){ + if ( ! maxedOut && ! draining && ! opsQueued ) { + if ( size < min.second ) { min = make_pair( shard , size ); } } // Check whether this shard is a better chunk donor then the current one. // Draining shards take a lower priority than overloaded shards. - if ( size > max.second ){ - max = make_pair( shard , size ); + if ( size > max.second ) { + max = make_pair( shard , size ); } - if ( draining && (size > 0)){ + if ( draining && (size > 0)) { drainingShards.push_back( shard ); } } - // If there is no candidate chunk receiver -- they may have all been maxed out, - // draining, ... -- there's not much that the policy can do. - if ( min.second == numeric_limits::max() ){ + // If there is no candidate chunk receiver -- they may have all been maxed out, + // draining, ... -- there's not much that the policy can do. + if ( min.second == numeric_limits::max() ) { log() << "no availalable shards to take chunks" << endl; return NULL; } - + log(1) << "collection : " << ns << endl; log(1) << "donor : " << max.second << " chunks on " << max.first << endl; log(1) << "receiver : " << min.second << " chunks on " << min.first << endl; - if ( ! drainingShards.empty() ){ + if ( ! drainingShards.empty() ) { string drainingStr; joinStringDelim( drainingShards, &drainingStr, ',' ); log(1) << "draining : " << ! drainingShards.empty() << "(" << drainingShards.size() << ")" << endl; @@ -86,34 +94,36 @@ namespace mongo { const int imbalance = max.second - min.second; const int threshold = balancedLastTime ? 
2 : 8; string from, to; - if ( imbalance >= threshold ){ + if ( imbalance >= threshold ) { from = max.first; to = min.first; - } else if ( ! drainingShards.empty() ){ + } + else if ( ! drainingShards.empty() ) { from = drainingShards[ rand() % drainingShards.size() ]; to = min.first; - } else { - // Everything is balanced here! + } + else { + // Everything is balanced here! return NULL; } const vector& chunksFrom = shardToChunksMap.find( from )->second; const vector& chunksTo = shardToChunksMap.find( to )->second; BSONObj chunkToMove = pickChunk( chunksFrom , chunksTo ); - log() << "chose [" << from << "] to [" << to << "] " << chunkToMove << endl; + log() << "chose [" << from << "] to [" << to << "] " << chunkToMove << endl; return new ChunkInfo( ns, to, from, chunkToMove ); } - BSONObj BalancerPolicy::pickChunk( const vector& from, const vector& to ){ + BSONObj BalancerPolicy::pickChunk( const vector& from, const vector& to ) { // It is possible for a donor ('from') shard to have less chunks than a recevier one ('to') - // if the donor is in draining mode. - + // if the donor is in draining mode. + if ( to.size() == 0 ) return from[0]; - + if ( from[0]["min"].Obj().woCompare( to[to.size()-1]["max"].Obj() , BSONObj() , false ) == 0 ) return from[0]; @@ -123,174 +133,41 @@ namespace mongo { return from[0]; } - bool BalancerPolicy::isSizeMaxed( BSONObj limits ){ - // If there's no limit information for the shard, assume it can be a chunk receiver + bool BalancerPolicy::isSizeMaxed( BSONObj limits ) { + // If there's no limit information for the shard, assume it can be a chunk receiver // (i.e., there's not bound on space utilization) - if ( limits.isEmpty() ){ + if ( limits.isEmpty() ) { return false; } long long maxUsage = limits[ ShardFields::maxSize.name() ].Long(); - if ( maxUsage == 0 ){ + if ( maxUsage == 0 ) { return false; } - long long currUsage = limits[ ShardFields::currSize.name() ].Long(); - if ( currUsage < maxUsage ){ + long long currUsage = limits[ LimitsFields::currSize.name() ].Long(); + if ( currUsage < maxUsage ) { return false; } return true; } - bool BalancerPolicy::isDraining( BSONObj limits ){ + bool BalancerPolicy::isDraining( BSONObj limits ) { BSONElement draining = limits[ ShardFields::draining.name() ]; - if ( draining.eoo() || ! draining.Bool() ){ + if ( draining.eoo() || ! draining.trueValue() ) { return false; } return true; } - class PolicyObjUnitTest : public UnitTest { - public: - - typedef ShardFields sf; // convenience alias - - void caseSizeMaxedShard(){ - BSONObj shard0 = BSON( sf::maxSize(0LL) << sf::currSize(0LL) ); - assert( ! BalancerPolicy::isSizeMaxed( shard0 ) ); - - BSONObj shard1 = BSON( sf::maxSize(100LL) << sf::currSize(80LL) ); - assert( ! BalancerPolicy::isSizeMaxed( shard1 ) ); - - BSONObj shard2 = BSON( sf::maxSize(100LL) << sf::currSize(110LL) ); - assert( BalancerPolicy::isSizeMaxed( shard2 ) ); - - BSONObj empty; - assert( ! BalancerPolicy::isSizeMaxed( empty ) ); - } - - void caseDrainingShard(){ - BSONObj shard0 = BSON( sf::draining(true) ); - assert( BalancerPolicy::isDraining( shard0 ) ); - - BSONObj shard1 = BSON( sf::draining(false) ); - assert( ! BalancerPolicy::isDraining( shard1 ) ); - - BSONObj empty; - assert( ! 
BalancerPolicy::isDraining( empty ) ); - } - - void caseBalanceNormal(){ - // 2 chunks and 0 chunk shards - BalancerPolicy::ShardToChunksMap chunkMap; - vector chunks; - chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) << - "max" << BSON( "x" << 49 ))); - chunks.push_back(BSON( "min" << BSON( "x" << 49 ) << - "max" << BSON( "x" << BSON( "$maxkey"<<1 )))); - chunkMap["shard0"] = chunks; - chunks.clear(); - chunkMap["shard1"] = chunks; - - // no limits - BalancerPolicy::ShardToLimitsMap limitsMap; - BSONObj limits0 = BSON( sf::maxSize(0LL) << sf::currSize(2LL) << sf::draining(false) ); - BSONObj limits1 = BSON( sf::maxSize(0LL) << sf::currSize(0LL) << sf::draining(false) ); - limitsMap["shard0"] = limits0; - limitsMap["shard1"] = limits1; - - BalancerPolicy::ChunkInfo* c = NULL; - c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 1 ); - assert( c != NULL ); - } - - void caseBalanceDraining(){ - // one normal, one draining - // 2 chunks and 0 chunk shards - BalancerPolicy::ShardToChunksMap chunkMap; - vector chunks; - chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) << - "max" << BSON( "x" << 49 ))); - chunkMap["shard0"] = chunks; - chunks.clear(); - chunks.push_back(BSON( "min" << BSON( "x" << 49 ) << - "max" << BSON( "x" << BSON( "$maxkey"<<1 )))); - chunkMap["shard1"] = chunks; - - // shard0 is draining - BalancerPolicy::ShardToLimitsMap limitsMap; - BSONObj limits0 = BSON( sf::maxSize(0LL) << sf::currSize(2LL) << sf::draining(true) ); - BSONObj limits1 = BSON( sf::maxSize(0LL) << sf::currSize(0LL) << sf::draining(false) ); - limitsMap["shard0"] = limits0; - limitsMap["shard1"] = limits1; - - BalancerPolicy::ChunkInfo* c = NULL; - c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 ); - assert( c != NULL ); - assert( c->to == "shard1" ); - assert( c->from == "shard0" ); - assert( ! 
c->chunk.isEmpty() ); - } - - void caseBalanceEndedDraining(){ - // 2 chunks and 0 chunk (drain completed) shards - BalancerPolicy::ShardToChunksMap chunkMap; - vector chunks; - chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) << - "max" << BSON( "x" << 49 ))); - chunks.push_back(BSON( "min" << BSON( "x" << 49 ) << - "max" << BSON( "x" << BSON( "$maxkey"<<1 )))); - chunkMap["shard0"] = chunks; - chunks.clear(); - chunkMap["shard1"] = chunks; - - // no limits - BalancerPolicy::ShardToLimitsMap limitsMap; - BSONObj limits0 = BSON( sf::maxSize(0LL) << sf::currSize(2LL) << sf::draining(false) ); - BSONObj limits1 = BSON( sf::maxSize(0LL) << sf::currSize(0LL) << sf::draining(true) ); - limitsMap["shard0"] = limits0; - limitsMap["shard1"] = limits1; - - BalancerPolicy::ChunkInfo* c = NULL; - c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 ); - assert( c == NULL ); - } - - void caseBalanceImpasse(){ - // one maxed out, one draining - // 2 chunks and 0 chunk shards - BalancerPolicy::ShardToChunksMap chunkMap; - vector chunks; - chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) << - "max" << BSON( "x" << 49 ))); - chunkMap["shard0"] = chunks; - chunks.clear(); - chunks.push_back(BSON( "min" << BSON( "x" << 49 ) << - "max" << BSON( "x" << BSON( "$maxkey"<<1 )))); - chunkMap["shard1"] = chunks; - - // shard0 is draining, shard1 is maxed out - BalancerPolicy::ShardToLimitsMap limitsMap; - BSONObj limits0 = BSON( sf::maxSize(0LL) << sf::currSize(2LL) << sf::draining(true) ); - BSONObj limits1 = BSON( sf::maxSize(1LL) << sf::currSize(1LL) << sf::draining(false) ); - limitsMap["shard0"] = limits0; - limitsMap["shard1"] = limits1; - - BalancerPolicy::ChunkInfo* c = NULL; - c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 ); - assert( c == NULL ); - } - - void run(){ - caseSizeMaxedShard(); - caseDrainingShard(); - caseBalanceNormal(); - caseBalanceDraining(); - caseBalanceImpasse(); - log(1) << "policyObjUnitTest passed" << endl; + bool BalancerPolicy::hasOpsQueued( BSONObj limits ) { + BSONElement opsQueued = limits[ LimitsFields::hasOpsQueued.name() ]; + if ( opsQueued.eoo() || ! opsQueued.trueValue() ) { + return false; } - } policyObjUnitTest; + return true; + } } // namespace mongo diff --git a/s/balancer_policy.h b/s/balancer_policy.h index 3622edc..cef5aa6 100644 --- a/s/balancer_policy.h +++ b/s/balancer_policy.h @@ -1,4 +1,4 @@ -// balancer_policy.h +// @file balancer_policy.h /** * Copyright (C) 2010 10gen Inc. @@ -29,20 +29,20 @@ namespace mongo { /** * Returns a suggested chunk to move whithin a collection's shards, given information about - * space usage and number of chunks for that collection. If the policy doesn't recommend + * space usage and number of chunks for that collection. If the policy doesn't recommend * moving, it returns NULL. * * @param ns is the collections namepace. - * @param shardLimitMap is a map from shardId to an object that describes (for now) space + * @param shardLimitMap is a map from shardId to an object that describes (for now) space * cap and usage. E.g.: { "maxSize" : , "usedSize" : }. * @param shardToChunksMap is a map from shardId to chunks that live there. A chunk's format - * is { }. + * is { }. * @param balancedLastTime is the number of chunks effectively moved in the last round. * @returns NULL or ChunkInfo of the best move to make towards balacing the collection. 
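The PolicyObjUnitTest cases removed above double as usage documentation for balance(). Reconstructed from the deleted caseBalanceNormal scenario, a self-contained call looks roughly like this (the namespace "test.foo" and the shard names are illustrative only):

    // Illustrative call modeled on the removed caseBalanceNormal test:
    // shard0 holds 2 chunks, shard1 holds none, no size/draining limits.
    BalancerPolicy::ShardToChunksMap chunkMap;
    vector<BSONObj> chunks;
    chunks.push_back( BSON( "min" << BSON( "x" << BSON( "$minKey" << 1 ) ) <<
                            "max" << BSON( "x" << 49 ) ) );
    chunks.push_back( BSON( "min" << BSON( "x" << 49 ) <<
                            "max" << BSON( "x" << BSON( "$maxKey" << 1 ) ) ) );
    chunkMap["shard0"] = chunks;
    chunkMap["shard1"] = vector<BSONObj>();

    BalancerPolicy::ShardToLimitsMap limitsMap;    // empty limits => not maxed out, not draining, no ops queued
    limitsMap["shard0"] = BSONObj();
    limitsMap["shard1"] = BSONObj();

    // imbalance is 2 - 0 = 2; with balancedLastTime != 0 the threshold is 2, so a move is suggested
    BalancerPolicy::ChunkInfo* c = BalancerPolicy::balance( "test.foo" , limitsMap , chunkMap , 1 );
    assert( c != NULL && c->from == "shard0" && c->to == "shard1" );
    delete c;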
*/ typedef map< string,BSONObj > ShardToLimitsMap; typedef map< string,vector > ShardToChunksMap; - static ChunkInfo* balance( const string& ns, const ShardToLimitsMap& shardToLimitsMap, + static ChunkInfo* balance( const string& ns, const ShardToLimitsMap& shardToLimitsMap, const ShardToChunksMap& shardToChunksMap, int balancedLastTime ); // below exposed for testing purposes only -- treat it as private -- @@ -57,11 +57,16 @@ namespace mongo { static bool isSizeMaxed( BSONObj shardLimits ); /** - * Returns true if 'shardLimist' contains a field "draining". Expects the optional field + * Returns true if 'shardLimist' contains a field "draining". Expects the optional field * "isDraining" on 'shrdLimits'. */ static bool isDraining( BSONObj shardLimits ); + /** + * Returns true if a shard currently has operations in any of its writeback queues + */ + static bool hasOpsQueued( BSONObj shardLimits ); + private: // Convenience types typedef ShardToChunksMap::const_iterator ShardToChunksIter; @@ -76,7 +81,16 @@ namespace mongo { const BSONObj chunk; ChunkInfo( const string& a_ns , const string& a_to , const string& a_from , const BSONObj& a_chunk ) - : ns( a_ns ) , to( a_to ) , from( a_from ), chunk( a_chunk ){} + : ns( a_ns ) , to( a_to ) , from( a_from ), chunk( a_chunk ) {} + }; + + /** + * Field names used in the 'limits' map. + */ + struct LimitsFields { + // we use 'draining' and 'maxSize' from the 'shards' collection plus the following + static BSONField currSize; // currently used disk space in bytes + static BSONField hasOpsQueued; // writeback queue is not empty? }; } // namespace mongo diff --git a/s/chunk.cpp b/s/chunk.cpp index 87d7747..1c72535 100644 --- a/s/chunk.cpp +++ b/s/chunk.cpp @@ -1,4 +1,4 @@ -// shard.cpp +// @file chunk.cpp /** * Copyright (C) 2008 10gen Inc. 
@@ -17,63 +17,62 @@ */ #include "pch.h" -#include "chunk.h" -#include "config.h" -#include "grid.h" -#include "../util/unittest.h" + #include "../client/connpool.h" -#include "../client/distlock.h" #include "../db/queryutil.h" +#include "../util/unittest.h" + +#include "chunk.h" +#include "config.h" #include "cursors.h" +#include "grid.h" #include "strategy.h" +#include "client.h" namespace mongo { - inline bool allOfType(BSONType type, const BSONObj& o){ + inline bool allOfType(BSONType type, const BSONObj& o) { BSONObjIterator it(o); - while(it.more()){ + while(it.more()) { if (it.next().type() != type) return false; } return true; } - RWLock chunkSplitLock("rw:chunkSplitLock"); - // ------- Shard -------- - int Chunk::MaxChunkSize = 1024 * 1024 * 200; - - Chunk::Chunk( ChunkManager * manager ) - : _manager(manager), - _lastmod(0), _modified(false), _dataWritten(0) - {} + string Chunk::chunkMetadataNS = "config.chunks"; + + int Chunk::MaxChunkSize = 1024 * 1024 * 64; + + Chunk::Chunk( ChunkManager * manager ) : _manager(manager), _lastmod(0) { + _setDataWritten(); + } Chunk::Chunk(ChunkManager * info , const BSONObj& min, const BSONObj& max, const Shard& shard) - : _manager(info), _min(min), _max(max), _shard(shard), - _lastmod(0), _modified(false), _dataWritten(0) - {} + : _manager(info), _min(min), _max(max), _shard(shard), _lastmod(0) { + _setDataWritten(); + } + + void Chunk::_setDataWritten() { + _dataWritten = rand() % ( MaxChunkSize / 5 ); + } string Chunk::getns() const { assert( _manager ); - return _manager->getns(); + return _manager->getns(); } - void Chunk::setShard( const Shard& s ){ - _shard = s; - _manager->_migrationNotification(this); - _modified = true; - } - - bool Chunk::contains( const BSONObj& obj ) const{ + bool Chunk::contains( const BSONObj& obj ) const { return _manager->getShardKey().compare( getMin() , obj ) <= 0 && _manager->getShardKey().compare( obj , getMax() ) < 0; } bool ChunkRange::contains(const BSONObj& obj) const { - // same as Chunk method - return + // same as Chunk method + return _manager->getShardKey().compare( getMin() , obj ) <= 0 && _manager->getShardKey().compare( obj , getMax() ) < 0; } @@ -85,324 +84,288 @@ namespace mongo { bool Chunk::maxIsInf() const { return _manager->getShardKey().globalMax().woCompare( getMax() ) == 0; } - - BSONObj Chunk::pickSplitPoint() const{ - int sort = 0; - - if ( minIsInf() ){ - sort = 1; - } - else if ( maxIsInf() ){ - sort = -1; - } - - if ( sort ){ - ShardConnection conn( getShard().getConnString() , _manager->getns() ); - Query q; - if ( sort == 1 ) - q.sort( _manager->getShardKey().key() ); - else { - BSONObj k = _manager->getShardKey().key(); - BSONObjBuilder r; - - BSONObjIterator i(k); - while( i.more() ) { - BSONElement e = i.next(); - uassert( 10163 , "can only handle numbers here - which i think is correct" , e.isNumber() ); - r.append( e.fieldName() , -1 * e.number() ); - } - - q.sort( r.obj() ); - } - BSONObj end = conn->findOne( _manager->getns() , q ); - conn.done(); - if ( ! 
end.isEmpty() ) - return _manager->getShardKey().extractKey( end ); + BSONObj Chunk::_getExtremeKey( int sort ) const { + ShardConnection conn( getShard().getConnString() , _manager->getns() ); + Query q; + if ( sort == 1 ) { + q.sort( _manager->getShardKey().key() ); } - - BSONObj cmd = BSON( "medianKey" << _manager->getns() - << "keyPattern" << _manager->getShardKey().key() - << "min" << getMin() - << "max" << getMax() ); + else { + // need to invert shard key pattern to sort backwards + // TODO: make a helper in ShardKeyPattern? - ScopedDbConnection conn( getShard().getConnString() ); - BSONObj result; - if ( ! conn->runCommand( "admin" , cmd , result ) ){ - stringstream ss; - ss << "medianKey command failed: " << result; - uassert( 10164 , ss.str() , 0 ); + BSONObj k = _manager->getShardKey().key(); + BSONObjBuilder r; + + BSONObjIterator i(k); + while( i.more() ) { + BSONElement e = i.next(); + uassert( 10163 , "can only handle numbers here - which i think is correct" , e.isNumber() ); + r.append( e.fieldName() , -1 * e.number() ); + } + + q.sort( r.obj() ); } - BSONObj median = result.getObjectField( "median" ).getOwned(); + // find the extreme key + BSONObj end = conn->findOne( _manager->getns() , q ); conn.done(); + if ( end.isEmpty() ) + return BSONObj(); + + return _manager->getShardKey().extractKey( end ); + } - if (median == getMin()){ - Query q; - q.minKey(_min).maxKey(_max); - q.sort(_manager->getShardKey().key()); + void Chunk::pickMedianKey( BSONObj& medianKey ) const { + // Ask the mongod holding this chunk to figure out the split points. + ScopedDbConnection conn( getShard().getConnString() ); + BSONObj result; + BSONObjBuilder cmd; + cmd.append( "splitVector" , _manager->getns() ); + cmd.append( "keyPattern" , _manager->getShardKey().key() ); + cmd.append( "min" , getMin() ); + cmd.append( "max" , getMax() ); + cmd.appendBool( "force" , true ); + BSONObj cmdObj = cmd.obj(); - median = conn->findOne(_manager->getns(), q); - median = _manager->getShardKey().extractKey( median ); + if ( ! conn->runCommand( "admin" , cmdObj , result )) { + conn.done(); + ostringstream os; + os << "splitVector command (median key) failed: " << result; + uassert( 13503 , os.str() , 0 ); } - - if ( median < getMin() || median >= getMax() ){ - stringstream ss; - ss << "medianKey returned value out of range. " - << " cmd: " << cmd - << " result: " << result; - uasserted( 13394 , ss.str() ); + + BSONObjIterator it( result.getObjectField( "splitKeys" ) ); + if ( it.more() ) { + medianKey = it.next().Obj().getOwned(); } - - return median; + + conn.done(); } - void Chunk::pickSplitVector( vector* splitPoints ) const { + void Chunk::pickSplitVector( vector& splitPoints , int chunkSize /* bytes */, int maxPoints, int maxObjs ) const { // Ask the mongod holding this chunk to figure out the split points. ScopedDbConnection conn( getShard().getConnString() ); BSONObj result; BSONObjBuilder cmd; cmd.append( "splitVector" , _manager->getns() ); cmd.append( "keyPattern" , _manager->getShardKey().key() ); - cmd.append( "maxChunkSize" , Chunk::MaxChunkSize / (1<<20) ); + cmd.append( "min" , getMin() ); + cmd.append( "max" , getMax() ); + cmd.append( "maxChunkSizeBytes" , chunkSize ); + cmd.append( "maxSplitPoints" , maxPoints ); + cmd.append( "maxChunkObjects" , maxObjs ); BSONObj cmdObj = cmd.obj(); - if ( ! conn->runCommand( "admin" , cmdObj , result )){ + if ( ! 
conn->runCommand( "admin" , cmdObj , result )) { + conn.done(); ostringstream os; os << "splitVector command failed: " << result; uassert( 13345 , os.str() , 0 ); - } + } BSONObjIterator it( result.getObjectField( "splitKeys" ) ); - while ( it.more() ){ - splitPoints->push_back( it.next().Obj().getOwned() ); + while ( it.more() ) { + splitPoints.push_back( it.next().Obj().getOwned() ); } conn.done(); } - ChunkPtr Chunk::split(){ - vector splitPoints; - splitPoints.push_back( pickSplitPoint() ); - return multiSplit( splitPoints ); + ChunkPtr Chunk::singleSplit( bool force , BSONObj& res ) { + vector splitPoint; + + // if splitting is not obligatory we may return early if there are not enough data + // we cap the number of objects that would fall in the first half (before the split point) + // the rationale is we'll find a split point without traversing all the data + if ( ! force ) { + vector candidates; + const int maxPoints = 2; + const int maxObjs = 250000; + pickSplitVector( candidates , getManager()->getCurrentDesiredChunkSize() , maxPoints , maxObjs ); + if ( candidates.size() <= 1 ) { + // no split points means there isn't enough data to split on + // 1 split point means we have between half the chunk size to full chunk size + // so we shouldn't split + log(1) << "chunk not full enough to trigger auto-split" << endl; + return ChunkPtr(); + } + + splitPoint.push_back( candidates.front() ); + + } + else { + // if forcing a split, use the chunk's median key + BSONObj medianKey; + pickMedianKey( medianKey ); + if ( ! medianKey.isEmpty() ) + splitPoint.push_back( medianKey ); + } + + // We assume that if the chunk being split is the first (or last) one on the collection, this chunk is + // likely to see more insertions. Instead of splitting mid-chunk, we use the very first (or last) key + // as a split point. + if ( minIsInf() ) { + splitPoint.clear(); + BSONObj key = _getExtremeKey( 1 ); + if ( ! key.isEmpty() ) { + splitPoint.push_back( key ); + } + + } + else if ( maxIsInf() ) { + splitPoint.clear(); + BSONObj key = _getExtremeKey( -1 ); + if ( ! key.isEmpty() ) { + splitPoint.push_back( key ); + } + } + + // Normally, we'd have a sound split point here if the chunk is not empty. It's also a good place to + // sanity check. + if ( splitPoint.empty() || _min == splitPoint.front() || _max == splitPoint.front() ) { + log() << "want to split chunk, but can't find split point chunk " << toString() + << " got: " << ( splitPoint.empty() ? 
"" : splitPoint.front().toString() ) << endl; + return ChunkPtr(); + } + + return multiSplit( splitPoint , res ); } - - ChunkPtr Chunk::multiSplit( const vector& m ){ - const size_t maxSplitPoints = 256; + + ChunkPtr Chunk::multiSplit( const vector& m , BSONObj& res ) { + const size_t maxSplitPoints = 8192; uassert( 10165 , "can't split as shard doesn't have a manager" , _manager ); uassert( 13332 , "need a split key to split chunk" , !m.empty() ); uassert( 13333 , "can't split a chunk in that many parts", m.size() < maxSplitPoints ); - uassert( 13003 , "can't split a chunk with only one distinct value" , _min.woCompare(_max) ); + uassert( 13003 , "can't split a chunk with only one distinct value" , _min.woCompare(_max) ); - DistributedLock lockSetup( ConnectionString( modelServer() , ConnectionString::SYNC ) , getns() ); - dist_lock_try dlk( &lockSetup , string("split-") + toString() ); - uassert( 10166 , "locking namespace failed" , dlk.got() ); - - { - ShardChunkVersion onServer = getVersionOnConfigServer(); - ShardChunkVersion mine = _lastmod; - if ( onServer > mine ){ - stringstream ss; - ss << "mulitSplit failing because config not up to date" - << " onServer: " << onServer.toString() - << " mine: " << mine.toString(); - - //reload config - grid.getDBConfig(_manager->_ns)->getChunkManager(_manager->_ns, true); - - uasserted( 13387 , ss.str() ); - } - } + ScopedDbConnection conn( getShard().getConnString() ); - BSONObjBuilder detail; - appendShortVersion( "before" , detail ); - log(1) << "before split on " << m.size() << " points " << toString() << endl; + BSONObjBuilder cmd; + cmd.append( "splitChunk" , _manager->getns() ); + cmd.append( "keyPattern" , _manager->getShardKey().key() ); + cmd.append( "min" , getMin() ); + cmd.append( "max" , getMax() ); + cmd.append( "from" , getShard().getConnString() ); + cmd.append( "splitKeys" , m ); + cmd.append( "shardId" , genID() ); + cmd.append( "configdb" , configServer.modelServer() ); + BSONObj cmdObj = cmd.obj(); - // Iterate over the split points in 'm', splitting off a new chunk per entry. That chunk's range - // covers until the next entry in 'm' or _max . - vector newChunks; - vector::const_iterator i = m.begin(); - BSONObj nextPoint = i->getOwned(); - _modified = true; - do { - BSONObj splitPoint = nextPoint; - log(4) << "splitPoint: " << splitPoint << endl; - nextPoint = (++i != m.end()) ? i->getOwned() : _max.getOwned(); - log(4) << "nextPoint: " << nextPoint << endl; - - if ( nextPoint <= splitPoint) { - stringstream ss; - ss << "multiSplit failing because keys min: " << splitPoint << " and max: " << nextPoint - << " do not define a valid chunk"; - uasserted( 13395, ss.str() ); - } + if ( ! conn->runCommand( "admin" , cmdObj , res )) { + warning() << "splitChunk failed - cmd: " << cmdObj << " result: " << res << endl; + conn.done(); + + // reloading won't stricly solve all problems, e.g. the collection's metdata lock can be taken + // but we issue here so that mongos may refresh wihtout needing to be written/read against + _manager->_reload(); + + return ChunkPtr(); + } - ChunkPtr c( new Chunk( _manager, splitPoint , nextPoint , _shard) ); - c->_modified = true; - newChunks.push_back( c ); - } while ( i != m.end() ); + conn.done(); + _manager->_reload(); - // Have the chunk manager reflect the key change for the first chunk and create an entry for every - // new chunk spawned by it. + // The previous multisplit logic adjusted the boundaries of 'this' chunk. 
Any call to 'this' object hereafter + // will see a different _max for the chunk. + // TODO Untie this dependency since, for metadata purposes, the reload() above already fixed boundaries { rwlock lk( _manager->_lock , true ); setMax(m[0].getOwned()); DEV assert( shared_from_this() ); _manager->_chunkMap[_max] = shared_from_this(); - - for ( vector::const_iterator it = newChunks.begin(); it != newChunks.end(); ++it ){ - ChunkPtr s = *it; - _manager->_chunkMap[s->getMax()] = s; - } - } - - log(1) << "after split adjusted range: " << toString() << endl; - for ( vector::const_iterator it = newChunks.begin(); it != newChunks.end(); ++it ){ - ChunkPtr s = *it; - log(1) << "after split created new chunk: " << s->toString() << endl; - } - - // Save the new key boundaries in the configDB. - _manager->save( false ); - - // Log all these changes in the configDB's log. We log a simple split differently than a multi-split. - if ( newChunks.size() == 1) { - appendShortVersion( "left" , detail ); - newChunks[0]->appendShortVersion( "right" , detail ); - configServer.logChange( "split" , _manager->getns(), detail.obj() ); - - } else { - BSONObj beforeDetailObj = detail.obj(); - BSONObj firstDetailObj = beforeDetailObj.getOwned(); - const int newChunksSize = newChunks.size(); - - BSONObjBuilder firstDetail; - firstDetail.appendElements( beforeDetailObj ); - firstDetail.append( "number" , 0 ); - firstDetail.append( "of" , newChunksSize ); - appendShortVersion( "chunk" , firstDetail ); - configServer.logChange( "multi-split" , _manager->getns() , firstDetail.obj() ); - - for ( int i=0; i < newChunksSize; i++ ){ - BSONObjBuilder chunkDetail; - chunkDetail.appendElements( beforeDetailObj ); - chunkDetail.append( "number", i+1 ); - chunkDetail.append( "of" , newChunksSize ); - newChunks[i]->appendShortVersion( "chunk" , chunkDetail ); - configServer.logChange( "multi-split" , _manager->getns() , chunkDetail.obj() ); - } } - return newChunks[0]; + // return the second half, if a single split, or the first new chunk, if a multisplit. + return _manager->findChunk( m[0] ); } - bool Chunk::moveAndCommit( const Shard& to , string& errmsg ){ + bool Chunk::moveAndCommit( const Shard& to , long long chunkSize /* bytes */, BSONObj& res ) { uassert( 10167 , "can't move shard to its current location!" 
, getShard() != to ); - + log() << "moving chunk ns: " << _manager->getns() << " moving ( " << toString() << ") " << _shard.toString() << " -> " << to.toString() << endl; - + Shard from = _shard; - + ScopedDbConnection fromconn( from); - BSONObj res; bool worked = fromconn->runCommand( "admin" , - BSON( "moveChunk" << _manager->getns() << - "from" << from.getConnString() << - "to" << to.getConnString() << - "min" << _min << - "max" << _max << - "shardId" << genID() << - "configdb" << configServer.modelServer() - ) , + BSON( "moveChunk" << _manager->getns() << + "from" << from.getConnString() << + "to" << to.getConnString() << + "min" << _min << + "max" << _max << + "maxChunkSizeBytes" << chunkSize << + "shardId" << genID() << + "configdb" << configServer.modelServer() + ) , res - ); - + ); + fromconn.done(); - if ( worked ){ - _manager->_reload(); - return true; - } - - errmsg = res["errmsg"].String(); - errmsg += " " + res.toString(); - return false; + // if succeeded, needs to reload to pick up the new location + // if failed, mongos may be stale + // reload is excessive here as the failure could be simply because collection metadata is taken + _manager->_reload(); + + return worked; } - - bool Chunk::splitIfShould( long dataWritten ){ + + bool Chunk::splitIfShould( long dataWritten ) { LastError::Disabled d( lastError.get() ); + try { - return _splitIfShould( dataWritten ); - } - catch ( std::exception& e ){ - log( LL_ERROR ) << "splitIfShould failed: " << e.what() << endl; - return false; - } - } + _dataWritten += dataWritten; + int splitThreshold = getManager()->getCurrentDesiredChunkSize(); + if ( minIsInf() || maxIsInf() ) { + splitThreshold = (int) ((double)splitThreshold * .9); + } - bool Chunk::_splitIfShould( long dataWritten ){ - _dataWritten += dataWritten; - - // split faster in early chunks helps spread out an initial load better - int splitThreshold; - const int minChunkSize = 1 << 20; // 1 MBytes - int numChunks = getManager()->numChunks(); - if ( numChunks < 10 ){ - splitThreshold = max( MaxChunkSize / 4 , minChunkSize ); - } else if ( numChunks < 20 ){ - splitThreshold = max( MaxChunkSize / 2 , minChunkSize ); - } else { - splitThreshold = max( MaxChunkSize , minChunkSize ); - } - - if ( minIsInf() || maxIsInf() ){ - splitThreshold = (int) ((double)splitThreshold * .9); - } + if ( _dataWritten < splitThreshold / 5 ) + return false; - if ( _dataWritten < splitThreshold / 5 ) - return false; - - if ( ! 
chunkSplitLock.lock_try(0) ) - return false; - - rwlock lk( chunkSplitLock , 1 , true ); + log(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten << " splitThreshold: " << splitThreshold << endl; - log(3) << "\t splitIfShould : " << *this << endl; + _dataWritten = 0; // reset so we check often enough - _dataWritten = 0; - - BSONObj splitPoint = pickSplitPoint(); - if ( splitPoint.isEmpty() || _min == splitPoint || _max == splitPoint) { - log() << "SHARD PROBLEM** shard is too big, but can't split: " << toString() << endl; - return false; - } + BSONObj res; + ChunkPtr newShard = singleSplit( false /* does not force a split if not enough data */ , res ); + if ( newShard.get() == NULL ) { + // singleSplit would have issued a message if we got here + _dataWritten = 0; // this means there wasn't enough data to split, so don't want to try again until considerable more data + return false; + } - long size = getPhysicalSize(); - if ( size < splitThreshold ) - return false; - - log() << "autosplitting " << _manager->getns() << " size: " << size << " shard: " << toString() - << " on: " << splitPoint << "(splitThreshold " << splitThreshold << ")" << endl; + log() << "autosplitted " << _manager->getns() << " shard: " << toString() + << " on: " << newShard->getMax() << "(splitThreshold " << splitThreshold << ")" +#ifdef _DEBUG + << " size: " << getPhysicalSize() // slow - but can be usefule when debugging +#endif + << endl; - vector splitPoints; - splitPoints.push_back( splitPoint ); - ChunkPtr newShard = multiSplit( splitPoints ); + moveIfShould( newShard ); - moveIfShould( newShard ); - - return true; + return true; + + } + catch ( std::exception& e ) { + // if the collection lock is taken (e.g. we're migrating), it is fine for the split to fail. + warning() << "could have autosplit on collection: " << _manager->getns() << " but: " << e.what() << endl; + return false; + } } - bool Chunk::moveIfShould( ChunkPtr newChunk ){ + bool Chunk::moveIfShould( ChunkPtr newChunk ) { ChunkPtr toMove; - - if ( newChunk->countObjects(2) <= 1 ){ + + if ( newChunk->countObjects(2) <= 1 ) { toMove = newChunk; } - else if ( this->countObjects(2) <= 1 ){ + else if ( this->countObjects(2) <= 1 ) { DEV assert( shared_from_this() ); toMove = shared_from_this(); } @@ -412,45 +375,46 @@ namespace mongo { } assert( toMove ); - - Shard newLocation = Shard::pick(); - if ( getShard() == newLocation ){ - // if this is the best server, then we shouldn't do anything! - log(1) << "not moving chunk: " << toString() << " b/c would move to same place " << newLocation.toString() << " -> " << getShard().toString() << endl; + + Shard newLocation = Shard::pick( getShard() ); + if ( getShard() == newLocation ) { + // if this is the best shard, then we shouldn't do anything (Shard::pick already logged our shard). 
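A quick, hedged restatement of the autosplit arithmetic in splitIfShould() above, using the new 64 MB default for Chunk::MaxChunkSize; the values below only mirror the thresholds already visible in this patch and are not part of the change itself.

    // Roughly when does an autosplit attempt fire? (restating splitIfShould() above)
    const int maxChunkSize = 1024 * 1024 * 64;      // Chunk::MaxChunkSize default (64 MB)
    int splitThreshold     = maxChunkSize / 4;      // young collection (< 10 chunks): desired size quartered, ~16 MB
    // extreme (minKey/maxKey) chunks use 90% of that: (int)((double)splitThreshold * .9), ~14.4 MB
    long writesBeforeCheck = splitThreshold / 5;    // ~3.3 MB written to a chunk before split points are even requested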
+ log(1) << "recently split chunk: " << toString() << "already in the best shard" << endl; return 0; } log() << "moving chunk (auto): " << toMove->toString() << " to: " << newLocation.toString() << " #objects: " << toMove->countObjects() << endl; - string errmsg; - massert( 10412 , (string)"moveAndCommit failed: " + errmsg , - toMove->moveAndCommit( newLocation , errmsg ) ); - + BSONObj res; + massert( 10412 , + str::stream() << "moveAndCommit failed: " << res , + toMove->moveAndCommit( newLocation , MaxChunkSize , res ) ); + return true; } - long Chunk::getPhysicalSize() const{ + long Chunk::getPhysicalSize() const { ScopedDbConnection conn( getShard().getConnString() ); - + BSONObj result; - uassert( 10169 , "datasize failed!" , conn->runCommand( "admin" , - BSON( "datasize" << _manager->getns() - << "keyPattern" << _manager->getShardKey().key() - << "min" << getMin() - << "max" << getMax() - << "maxSize" << ( MaxChunkSize + 1 ) - << "estimate" << true - ) , result ) ); - + uassert( 10169 , "datasize failed!" , conn->runCommand( "admin" , + BSON( "datasize" << _manager->getns() + << "keyPattern" << _manager->getShardKey().key() + << "min" << getMin() + << "max" << getMax() + << "maxSize" << ( MaxChunkSize + 1 ) + << "estimate" << true + ) , result ) ); + conn.done(); return (long)result["size"].number(); } - int Chunk::countObjects(int maxCount) const { + int Chunk::countObjects(int maxCount) const { static const BSONObj fields = BSON("_id" << 1 ); ShardConnection conn( getShard() , _manager->getns() ); - + // not using regular count as this is more flexible and supports $min/$max Query q = Query().minKey(_min).maxKey(_max); int n; @@ -458,33 +422,33 @@ namespace mongo { auto_ptr c = conn->query(_manager->getns(), q, maxCount, 0, &fields); assert( c.get() ); n = c->itcount(); - } + } conn.done(); return n; } - void Chunk::appendShortVersion( const char * name , BSONObjBuilder& b ){ + void Chunk::appendShortVersion( const char * name , BSONObjBuilder& b ) { BSONObjBuilder bb( b.subobjStart( name ) ); bb.append( "min" , _min ); bb.append( "max" , _max ); bb.done(); } - - bool Chunk::operator==( const Chunk& s ) const{ - return + + bool Chunk::operator==( const Chunk& s ) const { + return _manager->getShardKey().compare( _min , s._min ) == 0 && _manager->getShardKey().compare( _max , s._max ) == 0 ; } - void Chunk::serialize(BSONObjBuilder& to,ShardChunkVersion myLastMod){ - + void Chunk::serialize(BSONObjBuilder& to,ShardChunkVersion myLastMod) { + to.append( "_id" , genID( _manager->getns() , _min ) ); - if ( myLastMod.isSet() ){ + if ( myLastMod.isSet() ) { to.appendTimestamp( "lastmod" , myLastMod ); } - else if ( _lastmod.isSet() ){ + else if ( _lastmod.isSet() ) { assert( _lastmod > 0 && _lastmod < 1000 ); to.appendTimestamp( "lastmod" , _lastmod ); } @@ -503,15 +467,15 @@ namespace mongo { buf << ns << "-"; BSONObjIterator i(o); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); buf << e.fieldName() << "_" << e.toString(false, true); } return buf.str(); } - - void Chunk::unserialize(const BSONObj& from){ + + void Chunk::unserialize(const BSONObj& from) { string ns = from.getStringField( "ns" ); _shard.reset( from.getStringField( "shard" ) ); @@ -520,15 +484,15 @@ namespace mongo { BSONElement e = from["minDotted"]; - if (e.eoo()){ + if (e.eoo()) { _min = from.getObjectField( "min" ).getOwned(); _max = from.getObjectField( "max" ).getOwned(); - } + } else { // TODO delete this case after giving people a chance to migrate _min = e.embeddedObject().getOwned(); _max = 
from.getObjectField( "maxDotted" ).getOwned(); } - + uassert( 10170 , "Chunk needs a ns" , ! ns.empty() ); uassert( 13327 , "Chunk ns must match server ns" , ns == _manager->getns() ); @@ -538,26 +502,13 @@ namespace mongo { uassert( 10173 , "Chunk needs a max" , ! _max.isEmpty() ); } - string Chunk::modelServer() const { - // TODO: this could move around? - return configServer.modelServer(); - } - - ShardChunkVersion Chunk::getVersionOnConfigServer() const { - ScopedDbConnection conn( modelServer() ); - BSONObj o = conn->findOne( ShardNS::chunk , BSON( "_id" << genID() ) ); - conn.done(); - return o["lastmod"]; - } - string Chunk::toString() const { stringstream ss; ss << "ns:" << _manager->getns() << " at: " << _shard.toString() << " lastmod: " << _lastmod.toString() << " min: " << _min << " max: " << _max; return ss.str(); } - - - ShardKeyPattern Chunk::skey() const{ + + ShardKeyPattern Chunk::skey() const { return _manager->getShardKey(); } @@ -565,75 +516,66 @@ namespace mongo { AtomicUInt ChunkManager::NextSequenceNumber = 1; - ChunkManager::ChunkManager( DBConfig * config , string ns , ShardKeyPattern pattern , bool unique ) : - _config( config ) , _ns( ns ) , - _key( pattern ) , _unique( unique ) , - _sequenceNumber( ++NextSequenceNumber ), _lock("rw:ChunkManager") - { - _reload_inlock(); - - if ( _chunkMap.empty() ){ - ChunkPtr c( new Chunk(this, _key.globalMin(), _key.globalMax(), config->getPrimary()) ); - c->setModified( true ); - - _chunkMap[c->getMax()] = c; - _chunkRanges.reloadAll(_chunkMap); - - _shards.insert(c->getShard()); - - save_inlock( true ); - log() << "no chunks for:" << ns << " so creating first: " << c->toString() << endl; - } + ChunkManager::ChunkManager( string ns , ShardKeyPattern pattern , bool unique ) : + _ns( ns ) , _key( pattern ) , _unique( unique ) , _lock("rw:ChunkManager"), + _nsLock( ConnectionString( configServer.modelServer() , ConnectionString::SYNC ) , ns ) { + _reload_inlock(); // will set _sequenceNumber } - - ChunkManager::~ChunkManager(){ + + ChunkManager::~ChunkManager() { _chunkMap.clear(); _chunkRanges.clear(); _shards.clear(); } - - void ChunkManager::_reload(){ + + void ChunkManager::_reload() { rwlock lk( _lock , true ); _reload_inlock(); } - void ChunkManager::_reload_inlock(){ + void ChunkManager::_reload_inlock() { int tries = 3; - while (tries--){ + while (tries--) { _chunkMap.clear(); _chunkRanges.clear(); _shards.clear(); _load(); - if (_isValid()){ + if (_isValid()) { _chunkRanges.reloadAll(_chunkMap); + + // The shard versioning mechanism hinges on keeping track of the number of times we reloaded ChunkManager's. + // Increasing this number here will prompt checkShardVersion() to refresh the connection-level versions to + // the most up to date value. + _sequenceNumber = ++NextSequenceNumber; + return; } - if (_chunkMap.size() < 10){ + if (_chunkMap.size() < 10) { _printChunks(); } + sleepmillis(10 * (3-tries)); - sleepsecs(10); } - msgasserted(13282, "Couldn't load a valid config for " + _ns + " after 3 tries. Giving up"); - + + msgasserted(13282, "Couldn't load a valid config for " + _ns + " after 3 attempts. Please try again."); + } - void ChunkManager::_load(){ - static Chunk temp(0); - - ScopedDbConnection conn( temp.modelServer() ); + void ChunkManager::_load() { + ScopedDbConnection conn( configServer.modelServer() ); - auto_ptr cursor = conn->query(temp.getNS(), QUERY("ns" << _ns).sort("lastmod",1), 0, 0, 0, 0, - (DEBUG_BUILD ? 2 : 1000000)); // batch size. 
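For reference, the _id produced by Chunk::genID() above concatenates the namespace and the chunk's min-key fields. A hypothetical example follows (namespace and key value made up, and assuming the two-argument overload used by serialize()):

    // genID( ns , min ) builds "<ns>-<field>_<value>[<field>_<value>...]".
    string id = Chunk::genID( "test.users" , BSON( "x" << 42 ) );
    // id is roughly "test.users-x_42"; the lowest chunk of the key space would look like "test.users-x_MinKey"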
Try to induce potential race conditions in debug builds + // TODO really need the sort? + auto_ptr cursor = conn->query( Chunk::chunkMetadataNS, QUERY("ns" << _ns).sort("lastmod",1), 0, 0, 0, 0, + (DEBUG_BUILD ? 2 : 1000000)); // batch size. Try to induce potential race conditions in debug builds assert( cursor.get() ); - while ( cursor->more() ){ + while ( cursor->more() ) { BSONObj d = cursor->next(); - if ( d["isMaxMarker"].trueValue() ){ + if ( d["isMaxMarker"].trueValue() ) { continue; } - + ChunkPtr c( new Chunk( this ) ); c->unserialize( d ); @@ -655,10 +597,10 @@ namespace mongo { ENSURE(allOfType(MaxKey, prior(_chunkMap.end())->second->getMax())); // Make sure there are no gaps or overlaps - for (ChunkMap::const_iterator it=boost::next(_chunkMap.begin()), end=_chunkMap.end(); it != end; ++it){ + for (ChunkMap::const_iterator it=boost::next(_chunkMap.begin()), end=_chunkMap.end(); it != end; ++it) { ChunkMap::const_iterator last = prior(it); - if (!(it->second->getMin() == last->second->getMax())){ + if (!(it->second->getMin() == last->second->getMax())) { PRINT(it->second->toString()); PRINT(it->second->getMin()); PRINT(last->second->getMax()); @@ -677,54 +619,101 @@ namespace mongo { } } - bool ChunkManager::hasShardKey( const BSONObj& obj ){ + bool ChunkManager::hasShardKey( const BSONObj& obj ) { return _key.hasShardKey( obj ); } - ChunkPtr ChunkManager::findChunk( const BSONObj & obj , bool retry ){ + void ChunkManager::createFirstChunk( const Shard& shard ) { + assert( _chunkMap.size() == 0 ); + + ChunkPtr c( new Chunk(this, _key.globalMin(), _key.globalMax(), shard ) ); + + // this is the first chunk; start the versioning from scratch + ShardChunkVersion version; + version.incMajor(); + + // build update for the chunk collection + BSONObjBuilder chunkBuilder; + c->serialize( chunkBuilder , version ); + BSONObj chunkCmd = chunkBuilder.obj(); + + log() << "about to create first chunk for: " << _ns << endl; + + ScopedDbConnection conn( configServer.modelServer() ); + BSONObj res; + conn->update( Chunk::chunkMetadataNS, QUERY( "_id" << c->genID() ), chunkCmd, true, false ); + + string errmsg = conn->getLastError(); + if ( errmsg.size() ) { + stringstream ss; + ss << "saving first chunk failed. cmd: " << chunkCmd << " result: " << errmsg; + log( LL_ERROR ) << ss.str() << endl; + msgasserted( 13592 , ss.str() ); // assert(13592) + } + + conn.done(); + + // every instance of ChunkManager has a unique sequence number; callers of ChunkManager may + // inquiry about whether there were changes in chunk configuration (see re/load() calls) since + // the last access to ChunkManager by checking the sequence number + _sequenceNumber = ++NextSequenceNumber; + + _chunkMap[c->getMax()] = c; + _chunkRanges.reloadAll(_chunkMap); + _shards.insert(c->getShard()); + c->setLastmod(version); + + // the ensure index will have the (desired) indirect effect of creating the collection on the + // assigned shard, as it sets up the index over the sharding keys. 
+ ensureIndex_inlock(); + + log() << "successfully created first chunk for " << c->toString() << endl; + } + + ChunkPtr ChunkManager::findChunk( const BSONObj & obj , bool retry ) { BSONObj key = _key.extractKey(obj); - + { - rwlock lk( _lock , false ); - + rwlock lk( _lock , false ); + BSONObj foo; ChunkPtr c; { ChunkMap::iterator it = _chunkMap.upper_bound(key); - if (it != _chunkMap.end()){ + if (it != _chunkMap.end()) { foo = it->first; c = it->second; } } - - if ( c ){ + + if ( c ) { if ( c->contains( obj ) ) return c; - + PRINT(foo); PRINT(*c); PRINT(key); - + _reload_inlock(); massert(13141, "Chunk map pointed to incorrect chunk", false); } } - if ( retry ){ + if ( retry ) { stringstream ss; ss << "couldn't find a chunk aftry retry which should be impossible extracted: " << key; throw UserException( 8070 , ss.str() ); } - + log() << "ChunkManager: couldn't find chunk for: " << key << " going to retry" << endl; _reload_inlock(); return findChunk( obj , true ); } ChunkPtr ChunkManager::findChunkOnServer( const Shard& shard ) const { - rwlock lk( _lock , false ); - - for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ){ + rwlock lk( _lock , false ); + + for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) { ChunkPtr c = i->second; if ( c->getShard() == shard ) return c; @@ -733,20 +722,33 @@ namespace mongo { return ChunkPtr(); } - void ChunkManager::getShardsForQuery( set& shards , const BSONObj& query ){ - rwlock lk( _lock , false ); + void ChunkManager::getShardsForQuery( set& shards , const BSONObj& query ) { + rwlock lk( _lock , false ); DEV PRINT(query); //TODO look into FieldRangeSetOr FieldRangeOrSet fros(_ns.c_str(), query, false); - uassert(13088, "no support for special queries yet", fros.getSpecial().empty()); + + const string special = fros.getSpecial(); + if (special == "2d") { + BSONForEach(field, query) { + if (getGtLtOp(field) == BSONObj::opNEAR) { + uassert(13501, "use geoNear command rather than $near query", false); + // TODO: convert to geoNear rather than erroring out + } + // $within queries are fine + } + } + else if (!special.empty()) { + uassert(13502, "unrecognized special query type: " + special, false); + } do { boost::scoped_ptr frs (fros.topFrs()); { // special case if most-significant field isn't in query FieldRange range = frs->range(_key.key().firstElement().fieldName()); - if ( !range.nontrivial() ){ + if ( !range.nontrivial() ) { DEV PRINT(range.nontrivial()); getAllShards(shards); return; @@ -754,7 +756,7 @@ namespace mongo { } BoundList ranges = frs->indexBounds(_key.key(), 1); - for (BoundList::const_iterator it=ranges.begin(), end=ranges.end(); it != end; ++it){ + for (BoundList::const_iterator it=ranges.begin(), end=ranges.end(); it != end; ++it) { BSONObj minObj = it->first.replaceFieldNames(_key.key()); BSONObj maxObj = it->second.replaceFieldNames(_key.key()); @@ -765,35 +767,36 @@ namespace mongo { min = _chunkRanges.upper_bound(minObj); max = _chunkRanges.upper_bound(maxObj); - assert(min != _chunkRanges.ranges().end()); + massert( 13507 , str::stream() << "invalid chunk config minObj: " << minObj , min != _chunkRanges.ranges().end()); // make max non-inclusive like end iterators if(max != _chunkRanges.ranges().end()) ++max; - for (ChunkRangeMap::const_iterator it=min; it != max; ++it){ + for (ChunkRangeMap::const_iterator it=min; it != max; ++it) { shards.insert(it->second->getShard()); } // once we know we need to visit all shards no need to keep looping //if (shards.size() == 
_shards.size()) - //return; + //return; } if (fros.moreOrClauses()) fros.popOrClause(); - } while (fros.moreOrClauses()); + } + while (fros.moreOrClauses()); } - void ChunkManager::getShardsForRange(set& shards, const BSONObj& min, const BSONObj& max){ + void ChunkManager::getShardsForRange(set& shards, const BSONObj& min, const BSONObj& max) { uassert(13405, "min must have shard key", hasShardKey(min)); uassert(13406, "max must have shard key", hasShardKey(max)); ChunkRangeMap::const_iterator it = _chunkRanges.upper_bound(min); ChunkRangeMap::const_iterator end = _chunkRanges.lower_bound(max); - for (; it!=end; ++ it){ + for (; it!=end; ++ it) { shards.insert(it->second->getShard()); // once we know we need to visit all shards no need to keep looping @@ -802,282 +805,165 @@ namespace mongo { } } - void ChunkManager::getAllShards( set& all ){ - rwlock lk( _lock , false ); + void ChunkManager::getAllShards( set& all ) { + rwlock lk( _lock , false ); all.insert(_shards.begin(), _shards.end()); } - - void ChunkManager::ensureIndex_inlock(){ + + void ChunkManager::ensureIndex_inlock() { //TODO in parallel? - for ( set::const_iterator i=_shards.begin(); i!=_shards.end(); ++i ){ + for ( set::const_iterator i=_shards.begin(); i!=_shards.end(); ++i ) { ScopedDbConnection conn( i->getConnString() ); - conn->ensureIndex( getns() , getShardKey().key() , _unique ); + conn->ensureIndex( getns() , getShardKey().key() , _unique , "" , false /* do not cache ensureIndex SERVER-1691 */ ); conn.done(); } } - - void ChunkManager::drop( ChunkManagerPtr me ){ - rwlock lk( _lock , true ); + + void ChunkManager::drop( ChunkManagerPtr me ) { + rwlock lk( _lock , true ); configServer.logChange( "dropCollection.start" , _ns , BSONObj() ); - - DistributedLock lockSetup( ConnectionString( configServer.modelServer() , ConnectionString::SYNC ) , getns() ); - dist_lock_try dlk( &lockSetup , "drop" ); - uassert( 13331 , "locking namespace failed" , dlk.got() ); - + + dist_lock_try dlk( &_nsLock , "drop" ); + uassert( 13331 , "collection's metadata is undergoing changes. Please try again." , dlk.got() ); + uassert( 10174 , "config servers not all up" , configServer.allUp() ); - + set seen; - + log(1) << "ChunkManager::drop : " << _ns << endl; // lock all shards so no one can do a split/migrate - for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ){ + for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) { ChunkPtr c = i->second; seen.insert( c->getShard() ); } - - log(1) << "ChunkManager::drop : " << _ns << "\t all locked" << endl; + + log(1) << "ChunkManager::drop : " << _ns << "\t all locked" << endl; // wipe my meta-data _chunkMap.clear(); _chunkRanges.clear(); _shards.clear(); - + // delete data from mongod - for ( set::iterator i=seen.begin(); i!=seen.end(); i++ ){ + for ( set::iterator i=seen.begin(); i!=seen.end(); i++ ) { ScopedDbConnection conn( *i ); conn->dropCollection( _ns ); conn.done(); } - - log(1) << "ChunkManager::drop : " << _ns << "\t removed shard data" << endl; - // clean up database meta-data - uassert( 10176 , "no sharding data?" 
, _config->removeSharding( _ns ) ); - + log(1) << "ChunkManager::drop : " << _ns << "\t removed shard data" << endl; + // remove chunk data - static Chunk temp(0); - ScopedDbConnection conn( temp.modelServer() ); - conn->remove( temp.getNS() , BSON( "ns" << _ns ) ); + ScopedDbConnection conn( configServer.modelServer() ); + conn->remove( Chunk::chunkMetadataNS , BSON( "ns" << _ns ) ); conn.done(); - log(1) << "ChunkManager::drop : " << _ns << "\t removed chunk data" << endl; - - for ( set::iterator i=seen.begin(); i!=seen.end(); i++ ){ + log(1) << "ChunkManager::drop : " << _ns << "\t removed chunk data" << endl; + + for ( set::iterator i=seen.begin(); i!=seen.end(); i++ ) { ScopedDbConnection conn( *i ); BSONObj res; if ( ! setShardVersion( conn.conn() , _ns , 0 , true , res ) ) - throw UserException( 8071 , (string)"OH KNOW, cleaning up after drop failed: " + res.toString() ); + throw UserException( 8071 , str::stream() << "cleaning up after drop failed: " << res ); conn.done(); } - log(1) << "ChunkManager::drop : " << _ns << "\t DONE" << endl; + log(1) << "ChunkManager::drop : " << _ns << "\t DONE" << endl; configServer.logChange( "dropCollection" , _ns , BSONObj() ); } - - void ChunkManager::save( bool major ){ - rwlock lk( _lock , true ); - save_inlock( major ); - } - - void ChunkManager::save_inlock( bool major ){ - - ShardChunkVersion a = getVersion_inlock(); - assert( a > 0 || _chunkMap.size() <= 1 ); - ShardChunkVersion nextChunkVersion = a; - nextChunkVersion.inc( major ); - - vector toFix; - vector newVersions; - - BSONObjBuilder cmdBuilder; - BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) ); - - - int numOps = 0; - for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ){ - ChunkPtr c = i->second; - if ( ! c->getModified() ) - continue; - - numOps++; - _sequenceNumber = ++NextSequenceNumber; - - ShardChunkVersion myVersion = nextChunkVersion; - nextChunkVersion.incMinor(); - toFix.push_back( c ); - newVersions.push_back( myVersion ); - - BSONObjBuilder op; - op.append( "op" , "u" ); - op.appendBool( "b" , true ); - op.append( "ns" , ShardNS::chunk ); - - BSONObjBuilder n( op.subobjStart( "o" ) ); - c->serialize( n , myVersion ); - n.done(); - - BSONObjBuilder q( op.subobjStart( "o2" ) ); - q.append( "_id" , c->genID() ); - q.done(); - - updates.append( op.obj() ); - } - - if ( numOps == 0 ) - return; - - updates.done(); - - if ( a > 0 || _chunkMap.size() > 1 ){ - BSONArrayBuilder temp( cmdBuilder.subarrayStart( "preCondition" ) ); - BSONObjBuilder b; - b.append( "ns" , ShardNS::chunk ); - b.append( "q" , BSON( "query" << BSON( "ns" << _ns ) << "orderby" << BSON( "lastmod" << -1 ) ) ); - { - BSONObjBuilder bb( b.subobjStart( "res" ) ); - bb.appendTimestamp( "lastmod" , a ); - bb.done(); - } - temp.append( b.obj() ); - temp.done(); - } - BSONObj cmd = cmdBuilder.obj(); - - log(7) << "ChunkManager::save update: " << cmd << endl; - - ScopedDbConnection conn( Chunk(0).modelServer() ); - BSONObj res; - bool ok = conn->runCommand( "config" , cmd , res ); - conn.done(); - - if ( ! ok ){ - stringstream ss; - ss << "saving chunks failed. 
cmd: " << cmd << " result: " << res; - log( LL_ERROR ) << ss.str() << endl; - msgasserted( 13327 , ss.str() ); - } - - for ( unsigned i=0; i_lastmod = newVersions[i]; - toFix[i]->setModified( false ); - } - - massert( 10417 , "how did version get smalled" , getVersion_inlock() >= a ); - - ensureIndex_inlock(); // TODO: this is too aggressive - but not really sooo bad - } - void ChunkManager::maybeChunkCollection() { uassert( 13346 , "can't pre-split already splitted collection" , (_chunkMap.size() == 1) ); ChunkPtr soleChunk = _chunkMap.begin()->second; vector splitPoints; - soleChunk->pickSplitVector( &splitPoints ); - if ( splitPoints.empty() ){ + soleChunk->pickSplitVector( splitPoints , Chunk::MaxChunkSize ); + if ( splitPoints.empty() ) { log(1) << "not enough data to warrant chunking " << getns() << endl; return; } - soleChunk->multiSplit( splitPoints ); - } - - ShardChunkVersion ChunkManager::getVersionOnConfigServer() const { - static Chunk temp(0); - - ScopedDbConnection conn( temp.modelServer() ); - - auto_ptr cursor = conn->query(temp.getNS(), QUERY("ns" << _ns).sort("lastmod",1), 1 ); - assert( cursor.get() ); - BSONObj o; - if ( cursor->more() ) - o = cursor->next(); - conn.done(); - - return o["lastmod"]; + BSONObj res; + ChunkPtr p; + p = soleChunk->multiSplit( splitPoints , res ); + if ( p.get() == NULL ) { + log( LL_WARNING ) << "could not split '" << getns() << "': " << res << endl; + return; + } } - ShardChunkVersion ChunkManager::getVersion( const Shard& shard ) const{ - rwlock lk( _lock , false ); + ShardChunkVersion ChunkManager::getVersion( const Shard& shard ) const { + rwlock lk( _lock , false ); // TODO: cache or something? - + ShardChunkVersion max = 0; - for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ){ + for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) { ChunkPtr c = i->second; DEV assert( c ); if ( c->getShard() != shard ) continue; - if ( c->_lastmod > max ) - max = c->_lastmod; - } + if ( c->getLastmod() > max ) + max = c->getLastmod(); + } return max; } - ShardChunkVersion ChunkManager::getVersion() const{ - rwlock lk( _lock , false ); - return getVersion_inlock(); - } - - ShardChunkVersion ChunkManager::getVersion_inlock() const{ + ShardChunkVersion ChunkManager::getVersion() const { + rwlock lk( _lock , false ); + ShardChunkVersion max = 0; - - for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ){ + + for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) { ChunkPtr c = i->second; - if ( c->_lastmod > max ) - max = c->_lastmod; - } + if ( c->getLastmod() > max ) + max = c->getLastmod(); + } return max; } string ChunkManager::toString() const { - rwlock lk( _lock , false ); + rwlock lk( _lock , false ); stringstream ss; ss << "ChunkManager: " << _ns << " key:" << _key.toString() << '\n'; - for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ){ + for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) { const ChunkPtr c = i->second; ss << "\t" << c->toString() << '\n'; } return ss.str(); } - void ChunkManager::_migrationNotification(Chunk* c){ - _chunkRanges.reloadRange(_chunkMap, c->getMin(), c->getMax()); - _shards.insert(c->getShard()); - } - - - void ChunkRangeManager::assertValid() const{ + void ChunkRangeManager::assertValid() const { if (_ranges.empty()) return; try { // No Nulls - for (ChunkRangeMap::const_iterator it=_ranges.begin(), end=_ranges.end(); it != end; ++it){ + for 
(ChunkRangeMap::const_iterator it=_ranges.begin(), end=_ranges.end(); it != end; ++it) { assert(it->second); } - + // Check endpoints assert(allOfType(MinKey, _ranges.begin()->second->getMin())); assert(allOfType(MaxKey, prior(_ranges.end())->second->getMax())); // Make sure there are no gaps or overlaps - for (ChunkRangeMap::const_iterator it=boost::next(_ranges.begin()), end=_ranges.end(); it != end; ++it){ + for (ChunkRangeMap::const_iterator it=boost::next(_ranges.begin()), end=_ranges.end(); it != end; ++it) { ChunkRangeMap::const_iterator last = prior(it); assert(it->second->getMin() == last->second->getMax()); } // Check Map keys - for (ChunkRangeMap::const_iterator it=_ranges.begin(), end=_ranges.end(); it != end; ++it){ + for (ChunkRangeMap::const_iterator it=_ranges.begin(), end=_ranges.end(); it != end; ++it) { assert(it->first == it->second->getMax()); } // Make sure we match the original chunks const ChunkMap chunks = _ranges.begin()->second->getManager()->_chunkMap; - for ( ChunkMap::const_iterator i=chunks.begin(); i!=chunks.end(); ++i ){ + for ( ChunkMap::const_iterator i=chunks.begin(); i!=chunks.end(); ++i ) { const ChunkPtr chunk = i->second; ChunkRangeMap::const_iterator min = _ranges.upper_bound(chunk->getMin()); @@ -1090,8 +976,9 @@ namespace mongo { assert(min->second->contains( chunk->getMin() )); assert(min->second->contains( chunk->getMax() ) || (min->second->getMax() == chunk->getMax())); } - - } catch (...) { + + } + catch (...) { log( LL_ERROR ) << "\t invalid ChunkRangeMap! printing ranges:" << endl; for (ChunkRangeMap::const_iterator it=_ranges.begin(), end=_ranges.end(); it != end; ++it) @@ -1101,15 +988,15 @@ namespace mongo { } } - void ChunkRangeManager::reloadRange(const ChunkMap& chunks, const BSONObj& min, const BSONObj& max){ - if (_ranges.empty()){ + void ChunkRangeManager::reloadRange(const ChunkMap& chunks, const BSONObj& min, const BSONObj& max) { + if (_ranges.empty()) { reloadAll(chunks); return; } - + ChunkRangeMap::iterator low = _ranges.upper_bound(min); ChunkRangeMap::iterator high = _ranges.lower_bound(max); - + assert(low != _ranges.end()); assert(high != _ranges.end()); assert(low->second); @@ -1135,10 +1022,10 @@ namespace mongo { // merge low-end if possible low = _ranges.upper_bound(min); assert(low != _ranges.end()); - if (low != _ranges.begin()){ + if (low != _ranges.begin()) { shared_ptr a = prior(low)->second; shared_ptr b = low->second; - if (a->getShard() == b->getShard()){ + if (a->getShard() == b->getShard()) { shared_ptr cr (new ChunkRange(*a, *b)); _ranges.erase(prior(low)); _ranges.erase(low); // invalidates low @@ -1150,10 +1037,10 @@ namespace mongo { // merge high-end if possible high = _ranges.lower_bound(max); - if (high != prior(_ranges.end())){ + if (high != prior(_ranges.end())) { shared_ptr a = high->second; shared_ptr b = boost::next(high)->second; - if (a->getShard() == b->getShard()){ + if (a->getShard() == b->getShard()) { shared_ptr cr (new ChunkRange(*a, *b)); _ranges.erase(boost::next(high)); _ranges.erase(high); //invalidates high @@ -1164,15 +1051,15 @@ namespace mongo { DEV assertValid(); } - void ChunkRangeManager::reloadAll(const ChunkMap& chunks){ + void ChunkRangeManager::reloadAll(const ChunkMap& chunks) { _ranges.clear(); _insertRange(chunks.begin(), chunks.end()); DEV assertValid(); } - void ChunkRangeManager::_insertRange(ChunkMap::const_iterator begin, const ChunkMap::const_iterator end){ - while (begin != end){ + void ChunkRangeManager::_insertRange(ChunkMap::const_iterator begin, const 
ChunkMap::const_iterator end) { + while (begin != end) { ChunkMap::const_iterator first = begin; Shard shard = first->second->getShard(); while (begin != end && (begin->second->getShard() == shard)) @@ -1182,32 +1069,50 @@ namespace mongo { _ranges[cr->getMax()] = cr; } } - + + int ChunkManager::getCurrentDesiredChunkSize() const { + // split faster in early chunks helps spread out an initial load better + const int minChunkSize = 1 << 20; // 1 MBytes + + int splitThreshold = Chunk::MaxChunkSize; + + int nc = numChunks(); + + if ( nc < 10 ) { + splitThreshold = max( splitThreshold / 4 , minChunkSize ); + } + else if ( nc < 20 ) { + splitThreshold = max( splitThreshold / 2 , minChunkSize ); + } + + return splitThreshold; + } + class ChunkObjUnitTest : public UnitTest { public: - void runShard(){ + void runShard() { ChunkPtr c; assert( ! c ); c.reset( new Chunk( 0 ) ); assert( c ); } - - void runShardChunkVersion(){ + + void runShardChunkVersion() { vector all; all.push_back( ShardChunkVersion(1,1) ); all.push_back( ShardChunkVersion(1,2) ); all.push_back( ShardChunkVersion(2,1) ); all.push_back( ShardChunkVersion(2,2) ); - - for ( unsigned i=0; i ChunkMap; typedef map,BSONObjCmp> ChunkRangeMap; - + + typedef shared_ptr ChunkManagerPtr; + /** config.chunks { ns : "alleyinsider.fs.chunks" , min : {} , max : {} , server : "localhost:30001" } - + x is in a shard iff min <= x < max - */ + */ class Chunk : boost::noncopyable, public boost::enable_shared_from_this { public: - Chunk( ChunkManager * info ); Chunk( ChunkManager * info , const BSONObj& min, const BSONObj& max, const Shard& shard); - - const BSONObj& getMin() const { return _min; } - const BSONObj& getMax() const { return _max; } - - void setMin(const BSONObj& o){ - _min = o; - } - void setMax(const BSONObj& o){ - _max = o; - } - - string getns() const; - Shard getShard() const { return _shard; } + // + // serialization support + // - void setShard( const Shard& shard ); - - bool contains( const BSONObj& obj ) const; + void serialize(BSONObjBuilder& to, ShardChunkVersion myLastMod=0); + void unserialize(const BSONObj& from); - string toString() const; + // + // chunk boundary support + // - friend ostream& operator << (ostream& out, const Chunk& c){ return (out << c.toString()); } + const BSONObj& getMin() const { return _min; } + const BSONObj& getMax() const { return _max; } + void setMin(const BSONObj& o) { _min = o; } + void setMax(const BSONObj& o) { _max = o; } - bool operator==(const Chunk& s) const; - - bool operator!=(const Chunk& s) const{ - return ! 
( *this == s ); - } - // if min/max key is pos/neg infinity bool minIsInf() const; bool maxIsInf() const; - BSONObj pickSplitPoint() const; - ChunkPtr split(); + bool contains( const BSONObj& obj ) const; - void pickSplitVector( vector* splitPoints ) const; - ChunkPtr multiSplit( const vector& splitPoints ); + string genID() const; + static string genID( const string& ns , const BSONObj& min ); + + // + // chunk version support + // + + void appendShortVersion( const char * name , BSONObjBuilder& b ); + + ShardChunkVersion getLastmod() const { return _lastmod; } + void setLastmod( ShardChunkVersion v ) { _lastmod = v; } + + // + // split support + // - /** - * @return size of shard in bytes - * talks to mongod to do this - */ - long getPhysicalSize() const; - - int countObjects(int maxcount=0) const; - /** * if the amount of data written nears the max size of a shard * then we check the real size, and if its too big, we split + * @return if something was split */ bool splitIfShould( long dataWritten ); - - /* + + /** + * Splits this chunk at a non-specificed split key to be chosen by the mongod holding this chunk. + * + * @param force if set to true, will split the chunk regardless if the split is really necessary size wise + * if set to false, will only split if the chunk has reached the currently desired maximum size + * @param res the object containing details about the split execution + * @return if found a key, return a pointer to the first chunk, otherwise return a null pointer + */ + ChunkPtr singleSplit( bool force , BSONObj& res ); + + /** + * Splits this chunk at the given key (or keys) + * + * @param splitPoints the vector of keys that should be used to divide this chunk + * @param res the object containing details about the split execution + * @return shared pointer to the first new Chunk or null pointer if failed + */ + ChunkPtr multiSplit( const vector& splitPoints , BSONObj& res ); + + /** + * Asks the mongod holding this chunk to find a key that approximately divides this chunk in two + * + * @param medianKey the key that divides this chunk, if there is one, or empty + */ + void pickMedianKey( BSONObj& medianKey ) const; + + /** + * @param splitPoints vector to be filled in + * @param chunkSize chunk size to target in bytes + * @param maxPoints limits the number of split points that are needed, zero is max (optional) + * @param maxObjs limits the number of objects in each chunk, zero is as max (optional) + */ + void pickSplitVector( vector& splitPoints , int chunkSize , int maxPoints = 0, int maxObjs = 0) const; + + // + // migration support + // + + /** * moves either this shard or newShard if it makes sense too + * * @return whether or not a shard was moved */ bool moveIfShould( ChunkPtr newShard = ChunkPtr() ); - bool moveAndCommit( const Shard& to , string& errmsg ); + /** + * Issues a migrate request for this chunk + * + * @param to shard to move this chunk to + * @param chunSize maximum number of bytes beyond which the migrate should no go trhough + * @param res the object containing details about the migrate execution + * @return true if move was successful + */ + bool moveAndCommit( const Shard& to , long long chunkSize , BSONObj& res ); - const char * getNS(){ return "config.chunks"; } - void serialize(BSONObjBuilder& to, ShardChunkVersion myLastMod=0); - void unserialize(const BSONObj& from); - string modelServer() const; - - void appendShortVersion( const char * name , BSONObjBuilder& b ); + /** + * @return size of shard in bytes + * talks to mongod to do this + 
*/ + long getPhysicalSize() const; + + // + // chunk size support + int countObjects(int maxcount=0) const; + + // + // public constants + // + + static string chunkMetadataNS; static int MaxChunkSize; - string genID() const; - static string genID( const string& ns , const BSONObj& min ); + // + // accessors and helpers + // - const ChunkManager* getManager() const { return _manager; } - - bool getModified() { return _modified; } - void setModified( bool modified ) { _modified = modified; } + string toString() const; - ShardChunkVersion getVersionOnConfigServer() const; - private: + friend ostream& operator << (ostream& out, const Chunk& c) { return (out << c.toString()); } + bool operator==(const Chunk& s) const; + bool operator!=(const Chunk& s) const { return ! ( *this == s ); } - bool _splitIfShould( long dataWritten ); + string getns() const; + const char * getNS() { return "config.chunks"; } + Shard getShard() const { return _shard; } + const ChunkManager* getManager() const { return _manager; } + private: // main shard info - + ChunkManager * _manager; - ShardKeyPattern skey() const; BSONObj _min; BSONObj _max; Shard _shard; ShardChunkVersion _lastmod; - bool _modified; - // transient stuff long _dataWritten; - + // methods, etc.. - - void _split( BSONObj& middle ); - friend class ChunkManager; - friend class ShardObjUnitTest; + /** + * if sort 1, return lowest key + * if sort -1, return highest key + * will return empty object if have none + */ + BSONObj _getExtremeKey( int sort ) const; + + /** initializes _dataWritten with a random value so that a mongos restart wouldn't cause delay in splitting */ + void _setDataWritten(); + + ShardKeyPattern skey() const; }; - class ChunkRange{ + class ChunkRange { public: - const ChunkManager* getManager() const{ return _manager; } - Shard getShard() const{ return _shard; } + const ChunkManager* getManager() const { return _manager; } + Shard getShard() const { return _shard; } const BSONObj& getMin() const { return _min; } const BSONObj& getMax() const { return _max; } @@ -181,11 +234,10 @@ namespace mongo { : _manager(begin->second->getManager()) , _shard(begin->second->getShard()) , _min(begin->second->getMin()) - , _max(prior(end)->second->getMax()) - { + , _max(prior(end)->second->getMax()) { assert( begin != end ); - DEV while (begin != end){ + DEV while (begin != end) { assert(begin->second->getManager() == _manager); assert(begin->second->getShard() == _shard); ++begin; @@ -197,14 +249,13 @@ namespace mongo { : _manager(min.getManager()) , _shard(min.getShard()) , _min(min.getMin()) - , _max(max.getMax()) - { + , _max(max.getMax()) { assert(min.getShard() == max.getShard()); assert(min.getManager() == max.getManager()); assert(min.getMax() == max.getMin()); } - friend ostream& operator<<(ostream& out, const ChunkRange& cr){ + friend ostream& operator<<(ostream& out, const ChunkRange& cr) { return (out << "ChunkRange(min=" << cr._min << ", max=" << cr._max << ", shard=" << cr._shard <<")"); } @@ -239,7 +290,7 @@ namespace mongo { }; /* config.sharding - { ns: 'alleyinsider.fs.chunks' , + { ns: 'alleyinsider.fs.chunks' , key: { ts : 1 } , shards: [ { min: 1, max: 100, server: a } , { min: 101, max: 200 , server : b } ] } @@ -247,75 +298,61 @@ namespace mongo { class ChunkManager { public: - ChunkManager( DBConfig * config , string ns , ShardKeyPattern pattern , bool unique ); + ChunkManager( string ns , ShardKeyPattern pattern , bool unique ); virtual ~ChunkManager(); string getns() const { return _ns; } - + int numChunks() const { 
rwlock lk( _lock , false ); return _chunkMap.size(); } bool hasShardKey( const BSONObj& obj ); + void createFirstChunk( const Shard& shard ); ChunkPtr findChunk( const BSONObj& obj , bool retry = false ); ChunkPtr findChunkOnServer( const Shard& shard ) const; - - ShardKeyPattern& getShardKey(){ return _key; } + const ShardKeyPattern& getShardKey() const { return _key; } - bool isUnique(){ return _unique; } + bool isUnique() const { return _unique; } void maybeChunkCollection(); - + void getShardsForQuery( set& shards , const BSONObj& query ); void getAllShards( set& all ); void getShardsForRange(set& shards, const BSONObj& min, const BSONObj& max); // [min, max) - void save( bool major ); - string toString() const; ShardChunkVersion getVersion( const Shard& shard ) const; ShardChunkVersion getVersion() const; - /** - * actually does a query on the server - * doesn't look at any local data - */ - ShardChunkVersion getVersionOnConfigServer() const; - /** * this is just an increasing number of how many ChunkManagers we have so we know if something has been updated */ - unsigned long long getSequenceNumber(){ - return _sequenceNumber; - } - - void getInfo( BSONObjBuilder& b ){ + unsigned long long getSequenceNumber() const { return _sequenceNumber; } + + void getInfo( BSONObjBuilder& b ) { b.append( "key" , _key.key() ); b.appendBool( "unique" , _unique ); } - + /** * @param me - so i don't get deleted before i'm done */ void drop( ChunkManagerPtr me ); void _printChunks() const; - + + int getCurrentDesiredChunkSize() const; + private: - void _reload(); void _reload_inlock(); void _load(); - void save_inlock( bool major ); - ShardChunkVersion getVersion_inlock() const; void ensureIndex_inlock(); - - DBConfig * _config; + string _ns; ShardKeyPattern _key; bool _unique; - - map _maxMarkers; ChunkMap _chunkMap; ChunkRangeManager _chunkRanges; @@ -323,11 +360,9 @@ namespace mongo { set _shards; unsigned long long _sequenceNumber; - - mutable RWLock _lock; - // This should only be called from Chunk after it has been migrated - void _migrationNotification(Chunk* c); + mutable RWLock _lock; + DistributedLock _nsLock; friend class Chunk; friend class ChunkRangeManager; // only needed for CRM::assertValid() @@ -362,12 +397,14 @@ namespace mongo { /* struct chunk_lock { chunk_lock( const Chunk* c ){ - + } - + Chunk _c; }; */ inline string Chunk::genID() const { return genID(_manager->getns(), _min); } + bool setShardVersion( DBClientBase & conn , const string& ns , ShardChunkVersion version , bool authoritative , BSONObj& result ); + } // namespace mongo diff --git a/s/client.cpp b/s/client.cpp new file mode 100644 index 0000000..b8559b6 --- /dev/null +++ b/s/client.cpp @@ -0,0 +1,292 @@ +// s/client.cpp + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
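Aside: the reworked ChunkManager::getCurrentDesiredChunkSize() above lowers the split threshold while a collection still has few chunks (a quarter of Chunk::MaxChunkSize below 10 chunks, half of it below 20, never under 1 MB), so an initial load spreads across shards sooner. A minimal standalone sketch of that heuristic follows; the function and parameter names are illustrative only and a hypothetical maxChunkSizeBytes stands in for Chunk::MaxChunkSize, this is not the mongos API itself.

    #include <algorithm>
    #include <iostream>

    // Illustrative only: mirrors the threshold logic described for
    // ChunkManager::getCurrentDesiredChunkSize(); not a mongod/mongos function.
    int desiredChunkSizeBytes( int numChunks , int maxChunkSizeBytes ) {
        const int minChunkSize = 1 << 20;              // 1 MB floor, as in the patch
        int splitThreshold = maxChunkSizeBytes;
        if ( numChunks < 10 )
            splitThreshold = std::max( splitThreshold / 4 , minChunkSize );
        else if ( numChunks < 20 )
            splitThreshold = std::max( splitThreshold / 2 , minChunkSize );
        return splitThreshold;
    }

    int main() {
        const int maxChunk = 64 * 1024 * 1024;         // assume a 64 MB max chunk size
        std::cout << desiredChunkSizeBytes( 3 , maxChunk )  << "\n";  // 16 MB while tiny
        std::cout << desiredChunkSizeBytes( 15 , maxChunk ) << "\n";  // 32 MB
        std::cout << desiredChunkSizeBytes( 50 , maxChunk ) << "\n";  // full 64 MB once grown
        return 0;
    }

The effect is that early chunks split aggressively and the threshold relaxes back to the configured maximum as the collection accumulates chunks.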
+ */ + +#include "pch.h" +#include "server.h" + +#include "../db/commands.h" +#include "../db/dbmessage.h" +#include "../db/stats/counters.h" + +#include "../client/connpool.h" + +#include "client.h" +#include "request.h" +#include "config.h" +#include "chunk.h" +#include "stats.h" +#include "cursors.h" +#include "grid.h" +#include "s/writeback_listener.h" + +namespace mongo { + + ClientInfo::ClientInfo( int clientId ) : _id( clientId ) { + _cur = &_a; + _prev = &_b; + _autoSplitOk = true; + newRequest(); + } + + ClientInfo::~ClientInfo() { + if ( _lastAccess ) { + scoped_lock lk( _clientsLock ); + Cache::iterator i = _clients.find( _id ); + if ( i != _clients.end() ) { + _clients.erase( i ); + } + } + } + + void ClientInfo::addShard( const string& shard ) { + _cur->insert( shard ); + _sinceLastGetError.insert( shard ); + } + + void ClientInfo::newRequest( AbstractMessagingPort* p ) { + + if ( p ) { + HostAndPort r = p->remote(); + if ( _remote.port() == -1 ) + _remote = r; + else if ( _remote != r ) { + stringstream ss; + ss << "remotes don't match old [" << _remote.toString() << "] new [" << r.toString() << "]"; + throw UserException( 13134 , ss.str() ); + } + } + + _lastAccess = (int) time(0); + + set * temp = _cur; + _cur = _prev; + _prev = temp; + _cur->clear(); + } + + void ClientInfo::disconnect() { + _lastAccess = 0; + } + + ClientInfo * ClientInfo::get( int clientId , bool create ) { + + if ( ! clientId ) + clientId = getClientId(); + + if ( ! clientId ) { + ClientInfo * info = _tlInfo.get(); + if ( ! info ) { + info = new ClientInfo( 0 ); + _tlInfo.reset( info ); + } + info->newRequest(); + return info; + } + + scoped_lock lk( _clientsLock ); + Cache::iterator i = _clients.find( clientId ); + if ( i != _clients.end() ) + return i->second; + if ( ! create ) + return 0; + ClientInfo * info = new ClientInfo( clientId ); + _clients[clientId] = info; + return info; + } + + void ClientInfo::disconnect( int clientId ) { + if ( ! 
clientId ) + return; + + scoped_lock lk( _clientsLock ); + Cache::iterator i = _clients.find( clientId ); + if ( i == _clients.end() ) + return; + + ClientInfo* ci = i->second; + ci->disconnect(); + delete ci; + _clients.erase( i ); + } + + void ClientInfo::_addWriteBack( vector& all , const BSONObj& gle ) { + BSONElement w = gle["writeback"]; + + if ( w.type() != jstOID ) + return; + + BSONElement cid = gle["connectionId"]; + + if ( cid.eoo() ) { + error() << "getLastError writeback can't work because of version mis-match" << endl; + return; + } + + all.push_back( WBInfo( cid.numberLong() , w.OID() ) ); + } + + vector ClientInfo::_handleWriteBacks( vector& all , bool fromWriteBackListener ) { + vector res; + + if ( fromWriteBackListener ) { + LOG(1) << "not doing recusrive writeback" << endl; + return res; + } + + if ( all.size() == 0 ) + return res; + + for ( unsigned i=0; i * shards = getPrev(); + + if ( shards->size() == 0 ) { + result.appendNull( "err" ); + return true; + } + + vector writebacks; + + // handle single server + if ( shards->size() == 1 ) { + string theShard = *(shards->begin() ); + + ShardConnection conn( theShard , "" ); + + BSONObj res; + bool ok = conn->runCommand( "admin" , options , res ); + res = res.getOwned(); + conn.done(); + + + _addWriteBack( writebacks , res ); + + // hit other machines just to block + for ( set::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) { + string temp = *i; + if ( temp == theShard ) + continue; + + ShardConnection conn( temp , "" ); + _addWriteBack( writebacks , conn->getLastErrorDetailed() ); + conn.done(); + } + clearSinceLastGetError(); + + if ( writebacks.size() ){ + vector v = _handleWriteBacks( writebacks , fromWriteBackListener ); + if ( v.size() == 0 && fromWriteBackListener ) { + // ok + } + else { + assert( v.size() == 1 ); + result.appendElements( v[0] ); + result.appendElementsUnique( res ); + result.append( "initialGLEHost" , theShard ); + } + } + else { + result.append( "singleShard" , theShard ); + result.appendElements( res ); + } + + return ok; + } + + BSONArrayBuilder bbb( result.subarrayStart( "shards" ) ); + + long long n = 0; + + // hit each shard + vector errors; + vector errorObjects; + for ( set::iterator i = shards->begin(); i != shards->end(); i++ ) { + string theShard = *i; + bbb.append( theShard ); + ShardConnection conn( theShard , "" ); + BSONObj res; + bool ok = conn->runCommand( "admin" , options , res ); + _addWriteBack( writebacks, res ); + + string temp = DBClientWithCommands::getLastErrorString( res ); + if ( conn->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) { + errors.push_back( temp ); + errorObjects.push_back( res ); + } + n += res["n"].numberLong(); + conn.done(); + } + + bbb.done(); + + result.appendNumber( "n" , n ); + + // hit other machines just to block + for ( set::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) { + string temp = *i; + if ( shards->count( temp ) ) + continue; + + ShardConnection conn( temp , "" ); + _addWriteBack( writebacks, conn->getLastErrorDetailed() ); + conn.done(); + } + clearSinceLastGetError(); + + if ( errors.size() == 0 ) { + result.appendNull( "err" ); + _handleWriteBacks( writebacks , fromWriteBackListener ); + return true; + } + + result.append( "err" , errors[0].c_str() ); + + { + // errs + BSONArrayBuilder all( result.subarrayStart( "errs" ) ); + for ( unsigned i=0; i ClientInfo::_tlInfo; + +} // namespace mongo diff --git a/s/client.h b/s/client.h new file mode 
100644 index 0000000..bd4295f --- /dev/null +++ b/s/client.h @@ -0,0 +1,120 @@ +// client.h + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "../pch.h" + +namespace mongo { + + /** + * holds information about a client connected to a mongos + * 1 per client socket + * currently implemented with a thread local + */ + class ClientInfo { + + typedef map Cache; + + public: + ClientInfo( int clientId ); + ~ClientInfo(); + + /** new request from client, adjusts internal state */ + void newRequest( AbstractMessagingPort* p = 0 ); + + /** client disconnected */ + void disconnect(); + + /** + * @return remote socket address of the client + */ + HostAndPort getRemote() const { return _remote; } + + /** + * notes that this client use this shard + * keeps track of all shards accessed this request + */ + void addShard( const string& shard ); + + /** + * gets shards used on the previous request + */ + set * getPrev() const { return _prev; }; + + /** + * gets all shards we've accessed since the last time we called clearSinceLastGetError + */ + const set& sinceLastGetError() const { return _sinceLastGetError; } + + /** + * clears list of shards we've talked to + */ + void clearSinceLastGetError() { _sinceLastGetError.clear(); } + + /** + * calls getLastError + * resets shards since get last error + * @return if the command was ok or if there was an error + */ + bool getLastError( const BSONObj& options , BSONObjBuilder& result , bool fromWriteBackListener = false ); + + /** @return if its ok to auto split from this client */ + bool autoSplitOk() const { return _autoSplitOk; } + + void noAutoSplit() { _autoSplitOk = false; } + + static ClientInfo * get( int clientId = 0 , bool create = true ); + static void disconnect( int clientId ); + + private: + + struct WBInfo { + WBInfo( ConnectionId c , OID o ) : connectionId( c ) , id( o ) {} + ConnectionId connectionId; + OID id; + }; + + // for getLastError + void _addWriteBack( vector& all , const BSONObj& o ); + vector _handleWriteBacks( vector& all , bool fromWriteBackListener ); + + + int _id; // unique client id + HostAndPort _remote; // server:port of remote socket end + + // we use _a and _b to store shards we've talked to on the current request and the previous + // we use 2 so we can flip for getLastError type operations + + set _a; // actual set for _cur or _prev + set _b; // " + + set * _cur; // pointer to _a or _b depending on state + set * _prev; // "" + + + set _sinceLastGetError; // all shards accessed since last getLastError + + int _lastAccess; + bool _autoSplitOk; + + static mongo::mutex _clientsLock; + static Cache& _clients; + static boost::thread_specific_ptr _tlInfo; + }; + + +} diff --git a/s/commands_admin.cpp b/s/commands_admin.cpp index 551b8a9..532161a 100644 --- a/s/commands_admin.cpp +++ b/s/commands_admin.cpp @@ -29,6 +29,7 @@ #include "../util/message.h" #include "../util/processinfo.h" #include "../util/stringutils.h" +#include "../util/version.h" 
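Aside: ClientInfo above keeps two shard sets and flips them on every new request, so that a later getLastError can consult the shards touched by the operation that just finished. A minimal standalone sketch of that double-buffering idea follows; the class and member names are illustrative, not the mongos types.

    #include <algorithm>
    #include <iostream>
    #include <set>
    #include <string>

    // Illustrative only: models the _a/_b swap in ClientInfo, where the "current"
    // set collects shards for the request in flight and the "previous" set holds
    // the shards of the request that just completed.
    class ShardTracker {
    public:
        ShardTracker() : _cur( &_a ) , _prev( &_b ) {}

        void newRequest() {                  // called at the start of each client request
            std::swap( _cur , _prev );       // what was current becomes "previous"
            _cur->clear();                   // start collecting for the new request
        }

        void addShard( const std::string& s ) { _cur->insert( s ); }

        const std::set<std::string>& getPrev() const { return *_prev; }

    private:
        std::set<std::string> _a , _b;
        std::set<std::string> *_cur , *_prev;
    };

    int main() {
        ShardTracker t;
        t.newRequest();
        t.addShard( "shard0000" );
        t.addShard( "shard0001" );
        t.newRequest();                      // a getLastError would now inspect getPrev()
        const std::set<std::string>& prev = t.getPrev();
        for ( std::set<std::string>::const_iterator i = prev.begin(); i != prev.end(); ++i )
            std::cout << *i << "\n";
        return 0;
    }

Two sets are enough because only the immediately preceding request matters for getLastError-style operations; anything older is covered separately by the sinceLastGetError set.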
#include "../client/connpool.h" @@ -41,6 +42,8 @@ #include "grid.h" #include "strategy.h" #include "stats.h" +#include "writeback_listener.h" +#include "client.h" namespace mongo { @@ -48,7 +51,7 @@ namespace mongo { class GridAdminCmd : public Command { public: - GridAdminCmd( const char * n ) : Command( n , false, tolowerString(n).c_str() ){ + GridAdminCmd( const char * n ) : Command( n , false, tolowerString(n).c_str() ) { } virtual bool slaveOk() const { return true; @@ -58,7 +61,7 @@ namespace mongo { } // all grid commands are designed not to lock - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } }; // --------------- misc commands ---------------------- @@ -69,31 +72,34 @@ namespace mongo { virtual void help( stringstream& help ) const { help << " shows status/reachability of servers in the cluster"; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { result.append("configserver", configServer.getPrimary().getConnString() ); result.append("isdbgrid", 1); return true; } } netstat; - + class ServerStatusCmd : public Command { public: - ServerStatusCmd() : Command( "serverStatus" , true ){ + ServerStatusCmd() : Command( "serverStatus" , true ) { _started = time(0); } - + virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return NONE; } - + virtual LockType locktype() const { return NONE; } + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + result.append( "host" , prettyHostName() ); + result.append("version", versionString); + result.append("process","mongos"); result.append("uptime",(double) (time(0)-_started)); result.appendDate( "localTime" , jsTime() ); { BSONObjBuilder t( result.subobjStart( "mem" ) ); - + ProcessInfo p; - if ( p.supported() ){ + if ( p.supported() ) { t.appendNumber( "resident" , p.getResidentSize() ); t.appendNumber( "virtual" , p.getVirtualMemorySize() ); t.appendBool( "supported" , true ); @@ -102,7 +108,7 @@ namespace mongo { result.append( "note" , "not all mem info support on this platform" ); t.appendBool( "supported" , false ); } - + t.done(); } @@ -112,7 +118,7 @@ namespace mongo { bb.append( "available" , connTicketHolder.available() ); bb.done(); } - + { BSONObjBuilder bb( result.subobjStart( "extra_info" ) ); bb.append("note", "fields vary by platform"); @@ -120,7 +126,7 @@ namespace mongo { p.getExtraInfo(bb); bb.done(); } - + result.append( "opcounters" , globalOpCounters.getObj() ); { BSONObjBuilder bb( result.subobjStart( "ops" ) ); @@ -130,7 +136,7 @@ namespace mongo { } result.append( "shardCursorType" , shardedCursorTypes.getObj() ); - + { BSONObjBuilder asserts( result.subobjStart( "asserts" ) ); asserts.append( "regular" , assertionCount.regular ); @@ -141,6 +147,13 @@ namespace mongo { asserts.done(); } + { + BSONObjBuilder bb( result.subobjStart( "network" ) ); + networkCounter.append( bb ); + bb.done(); + } + + return 1; } @@ -149,34 +162,34 @@ namespace mongo { class FsyncCommand : public GridAdminCmd { public: - FsyncCommand() : GridAdminCmd( "fsync" ){} - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ - if ( cmdObj["lock"].trueValue() ){ + FsyncCommand() : GridAdminCmd( "fsync" ) {} + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + if ( cmdObj["lock"].trueValue() ) { errmsg = "can't 
do lock through mongos"; return false; } - + BSONObjBuilder sub; bool ok = true; int numFiles = 0; - + vector shards; Shard::getAllShards( shards ); - for ( vector::iterator i=shards.begin(); i!=shards.end(); i++ ){ + for ( vector::iterator i=shards.begin(); i!=shards.end(); i++ ) { Shard s = *i; BSONObj x = s.runCommand( "admin" , "fsync" ); sub.append( s.getName() , x ); - if ( ! x["ok"].trueValue() ){ + if ( ! x["ok"].trueValue() ) { ok = false; errmsg = x["errmsg"].String(); } - + numFiles += x["numFiles"].numberInt(); } - + result.append( "numFiles" , numFiles ); result.append( "all" , sub.obj() ); return ok; @@ -192,43 +205,43 @@ namespace mongo { help << " example: { moveprimary : 'foo' , to : 'localhost:9999' }"; // TODO: locking? } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string dbname = cmdObj.firstElement().valuestrsafe(); - if ( dbname.size() == 0 ){ + if ( dbname.size() == 0 ) { errmsg = "no db"; return false; } - if ( dbname == "config" ){ + if ( dbname == "config" ) { errmsg = "can't move config db"; return false; } DBConfigPtr config = grid.getDBConfig( dbname , false ); - if ( ! config ){ + if ( ! config ) { errmsg = "can't find db!"; return false; } string to = cmdObj["to"].valuestrsafe(); - if ( ! to.size() ){ + if ( ! to.size() ) { errmsg = "you have to specify where you want to move it"; return false; } Shard s = Shard::make( to ); - if ( config->getPrimary() == s.getConnString() ){ + if ( config->getPrimary() == s.getConnString() ) { errmsg = "thats already the primary"; return false; } - if ( ! grid.knowAboutShard( s.getConnString() ) ){ + if ( ! grid.knowAboutShard( s.getConnString() ) ) { errmsg = "that server isn't known to me"; return false; } - - log() << "movePrimary: moving " << dbname << " primary from: " << config->getPrimary().toString() + + log() << "movePrimary: moving " << dbname << " primary from: " << config->getPrimary().toString() << " to: " << s.toString() << endl; // TODO LOCKING: this is not safe with multiple mongos @@ -241,7 +254,7 @@ namespace mongo { bool worked = toconn->runCommand( dbname.c_str() , BSON( "clone" << config->getPrimary().getConnString() ) , cloneRes ); toconn.done(); - if ( ! worked ){ + if ( ! worked ) { log() << "clone failed" << cloneRes << endl; errmsg = "clone failed"; return false; @@ -264,25 +277,25 @@ namespace mongo { class EnableShardingCmd : public GridAdminCmd { public: - EnableShardingCmd() : GridAdminCmd( "enableSharding" ){} + EnableShardingCmd() : GridAdminCmd( "enableSharding" ) {} virtual void help( stringstream& help ) const { help - << "Enable sharding for a db. (Use 'shardcollection' command afterwards.)\n" - << " { enablesharding : \"\" }\n"; + << "Enable sharding for a db. 
(Use 'shardcollection' command afterwards.)\n" + << " { enablesharding : \"\" }\n"; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string dbname = cmdObj.firstElement().valuestrsafe(); - if ( dbname.size() == 0 ){ + if ( dbname.size() == 0 ) { errmsg = "no db"; return false; } DBConfigPtr config = grid.getDBConfig( dbname ); - if ( config->isShardingEnabled() ){ + if ( config->isShardingEnabled() ) { errmsg = "already enabled"; return false; } - + log() << "enabling sharding on: " << dbname << endl; config->enableSharding(); @@ -295,46 +308,46 @@ namespace mongo { class ShardCollectionCmd : public GridAdminCmd { public: - ShardCollectionCmd() : GridAdminCmd( "shardCollection" ){} + ShardCollectionCmd() : GridAdminCmd( "shardCollection" ) {} virtual void help( stringstream& help ) const { help - << "Shard a collection. Requires key. Optional unique. Sharding must already be enabled for the database.\n" - << " { enablesharding : \"\" }\n"; + << "Shard a collection. Requires key. Optional unique. Sharding must already be enabled for the database.\n" + << " { enablesharding : \"\" }\n"; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string ns = cmdObj.firstElement().valuestrsafe(); - if ( ns.size() == 0 ){ + if ( ns.size() == 0 ) { errmsg = "no ns"; return false; } DBConfigPtr config = grid.getDBConfig( ns ); - if ( ! config->isShardingEnabled() ){ + if ( ! config->isShardingEnabled() ) { errmsg = "sharding not enabled for db"; return false; } - if ( config->isSharded( ns ) ){ + if ( config->isSharded( ns ) ) { errmsg = "already sharded"; return false; } BSONObj key = cmdObj.getObjectField( "key" ); - if ( key.isEmpty() ){ + if ( key.isEmpty() ) { errmsg = "no shard key"; return false; } - BSONForEach(e, key){ - if (!e.isNumber() || e.number() != 1.0){ + BSONForEach(e, key) { + if (!e.isNumber() || e.number() != 1.0) { errmsg = "shard keys must all be ascending"; return false; } } - if ( ns.find( ".system." ) != string::npos ){ + if ( ns.find( ".system." ) != string::npos ) { errmsg = "can't shard system namespaces"; return false; } @@ -344,10 +357,10 @@ namespace mongo { // 1. A unique index must have the sharding key as its prefix. Otherwise maintainig uniqueness would // require coordinated access to all shards. Trying to shard a collection with such an index is not // allowed. - // + // // 2. Sharding a collection requires an index over the sharding key. That index must be create upfront. // The rationale is that sharding a non-empty collection would need to create the index and that could - // be slow. Requiring the index upfront allows the admin to plan before sharding and perhaps use + // be slow. Requiring the index upfront allows the admin to plan before sharding and perhaps use // background index construction. One exception to the rule: empty collections. It's fairly easy to // create the index as part of the sharding process. 
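Aside: the rules spelled out above mean shardCollection only accepts key patterns whose fields are all ascending (value 1) and rejects unique indexes that do not have the shard key as their prefix. A minimal standalone sketch of those two checks follows, using plain field/direction pairs instead of BSON; the helper names are illustrative and not part of mongos.

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::vector< std::pair<std::string,int> > KeyPattern;  // field name -> direction

    // Rule from the patch: every shard key field must be ascending (1).
    bool allAscending( const KeyPattern& key ) {
        for ( size_t i = 0; i < key.size(); i++ )
            if ( key[i].second != 1 )
                return false;
        return true;
    }

    // Rule from the patch comment: a unique index is only acceptable if the shard
    // key is a prefix of that index; otherwise maintaining uniqueness would require
    // coordinated access to all shards.
    bool shardKeyIsPrefixOf( const KeyPattern& shardKey , const KeyPattern& uniqueIndex ) {
        if ( uniqueIndex.size() < shardKey.size() )
            return false;
        for ( size_t i = 0; i < shardKey.size(); i++ )
            if ( shardKey[i] != uniqueIndex[i] )
                return false;
        return true;
    }

    int main() {
        KeyPattern shardKey;
        shardKey.push_back( std::make_pair( std::string( "ts" ) , 1 ) );

        KeyPattern uniqueIdx;
        uniqueIdx.push_back( std::make_pair( std::string( "ts" ) , 1 ) );
        uniqueIdx.push_back( std::make_pair( std::string( "user" ) , 1 ) );

        std::cout << allAscending( shardKey ) << " "
                  << shardKeyIsPrefixOf( shardKey , uniqueIdx ) << std::endl;   // prints: 1 1
        return 0;
    }

The one exception noted in the comment, empty collections, is handled by mongos creating the shard-key index itself as part of the sharding process.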
// @@ -358,20 +371,20 @@ namespace mongo { bool hasShardIndex = false; ScopedDbConnection conn( config->getPrimary() ); - BSONObjBuilder b; - b.append( "ns" , ns ); + BSONObjBuilder b; + b.append( "ns" , ns ); auto_ptr cursor = conn->query( config->getName() + ".system.indexes" , b.obj() ); - while ( cursor->more() ){ + while ( cursor->more() ) { BSONObj idx = cursor->next(); // Is index key over the sharding key? Remember that. - if ( key.woCompare( idx["key"].embeddedObjectUserCheck() ) == 0 ){ + if ( key.woCompare( idx["key"].embeddedObjectUserCheck() ) == 0 ) { hasShardIndex = true; } // Not a unique index? Move on. - if ( idx["unique"].eoo() || ! idx["unique"].Bool() ) + if ( idx["unique"].eoo() || ! idx["unique"].trueValue() ) continue; // Shard key is prefix of unique index? Move on. @@ -384,17 +397,31 @@ namespace mongo { } BSONObj res = conn->findOne( config->getName() + ".system.namespaces" , BSON( "name" << ns ) ); - if ( res["options"].type() == Object && res["options"].embeddedObject()["capped"].trueValue() ){ + if ( res["options"].type() == Object && res["options"].embeddedObject()["capped"].trueValue() ) { errmsg = "can't shard capped collection"; conn.done(); return false; } - if ( ! hasShardIndex && ( conn->count( ns ) != 0 ) ){ + if ( hasShardIndex ) { + // make sure there are no null entries in the sharding index + BSONObjBuilder cmd; + cmd.append( "checkShardingIndex" , ns ); + cmd.append( "keyPattern" , key ); + BSONObj cmdObj = cmd.obj(); + if ( ! conn->runCommand( "admin" , cmdObj , res )) { + errmsg = res["errmsg"].str(); + conn.done(); + return false; + } + } + + if ( ! hasShardIndex && ( conn->count( ns ) != 0 ) ) { errmsg = "please create an index over the sharding key before sharding."; + conn.done(); return false; } - + conn.done(); } @@ -409,26 +436,26 @@ namespace mongo { class GetShardVersion : public GridAdminCmd { public: - GetShardVersion() : GridAdminCmd( "getShardVersion" ){} + GetShardVersion() : GridAdminCmd( "getShardVersion" ) {} virtual void help( stringstream& help ) const { help << " example: { getShardVersion : 'alleyinsider.foo' } "; } - - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string ns = cmdObj.firstElement().valuestrsafe(); - if ( ns.size() == 0 ){ + if ( ns.size() == 0 ) { errmsg = "need to speciy fully namespace"; return false; } - + DBConfigPtr config = grid.getDBConfig( ns ); - if ( ! config->isSharded( ns ) ){ + if ( ! config->isSharded( ns ) ) { errmsg = "ns not sharded."; return false; } - + ChunkManagerPtr cm = config->getChunkManager( ns ); - if ( ! cm ){ + if ( ! 
cm ) { errmsg = "no chunk manager?"; return false; } @@ -439,144 +466,141 @@ namespace mongo { } } getShardVersionCmd; - class SplitCollectionHelper : public GridAdminCmd { + class SplitCollectionCmd : public GridAdminCmd { public: - SplitCollectionHelper( const char * name ) : GridAdminCmd( name ) , _name( name ){} + SplitCollectionCmd() : GridAdminCmd( "split" ) {} virtual void help( stringstream& help ) const { help - << " example: { split : 'alleyinsider.blog.posts' , find : { ts : 1 } } - split the shard that contains give key \n" - << " example: { split : 'alleyinsider.blog.posts' , middle : { ts : 1 } } - split the shard that contains the key with this as the middle \n" - << " NOTE: this does not move move the chunks, it merely creates a logical seperation \n" - ; + << " example: - split the shard that contains give key \n" + << " { split : 'alleyinsider.blog.posts' , find : { ts : 1 } }\n" + << " example: - split the shard that contains the key with this as the middle \n" + << " { split : 'alleyinsider.blog.posts' , middle : { ts : 1 } }\n" + << " NOTE: this does not move move the chunks, it merely creates a logical seperation \n" + ; } - virtual bool _split( BSONObjBuilder& result , string&errmsg , const string& ns , ChunkManagerPtr manager , ChunkPtr old , BSONObj middle ) = 0; - - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { ShardConnection::sync(); string ns = cmdObj.firstElement().valuestrsafe(); - if ( ns.size() == 0 ){ + if ( ns.size() == 0 ) { errmsg = "no ns"; return false; } DBConfigPtr config = grid.getDBConfig( ns ); - if ( ! config->isSharded( ns ) ){ + if ( ! config->isSharded( ns ) ) { errmsg = "ns not sharded. 
have to shard before can split"; return false; } BSONObj find = cmdObj.getObjectField( "find" ); - if ( find.isEmpty() ){ + if ( find.isEmpty() ) { find = cmdObj.getObjectField( "middle" ); - if ( find.isEmpty() ){ + if ( find.isEmpty() ) { errmsg = "need to specify find or middle"; return false; } } - - ChunkManagerPtr info = config->getChunkManager( ns ); - ChunkPtr old = info->findChunk( find ); - - return _split( result , errmsg , ns , info , old , cmdObj.getObjectField( "middle" ) ); - } - - protected: - string _name; - }; - - class SplitValueCommand : public SplitCollectionHelper { - public: - SplitValueCommand() : SplitCollectionHelper( "splitValue" ){} - virtual bool _split( BSONObjBuilder& result , string& errmsg , const string& ns , ChunkManagerPtr manager , ChunkPtr old , BSONObj middle ){ - - result << "shardinfo" << old->toString(); - - result.appendBool( "auto" , middle.isEmpty() ); - - if ( middle.isEmpty() ) - middle = old->pickSplitPoint(); - result.append( "middle" , middle ); - - return true; - } + ChunkManagerPtr info = config->getChunkManager( ns ); + ChunkPtr chunk = info->findChunk( find ); + BSONObj middle = cmdObj.getObjectField( "middle" ); - } splitValueCmd; + assert( chunk.get() ); + log() << "splitting: " << ns << " shard: " << chunk << endl; + BSONObj res; + ChunkPtr p; + if ( middle.isEmpty() ) { + p = chunk->singleSplit( true /* force a split even if not enough data */ , res ); - class SplitCollection : public SplitCollectionHelper { - public: - SplitCollection() : SplitCollectionHelper( "split" ){} - virtual bool _split( BSONObjBuilder& result , string& errmsg , const string& ns , ChunkManagerPtr manager , ChunkPtr old , BSONObj middle ){ - assert( old.get() ); - log() << "splitting: " << ns << " shard: " << old << endl; - - if ( middle.isEmpty() ) - old->split(); + } else { + // sanity check if the key provided is a valid split point + if ( ( middle == chunk->getMin() ) || ( middle == chunk->getMax() ) ) { + errmsg = "cannot split on initial or final chunk's key"; + return false; + } + vector splitPoints; splitPoints.push_back( middle ); - old->multiSplit( splitPoints ); + p = chunk->multiSplit( splitPoints , res ); } + if ( p.get() == NULL ) { + errmsg = "split failed"; + result.append( "cause" , res ); + return false; + } + config->getChunkManager( ns , true ); return true; } - - } splitCollectionCmd; class MoveChunkCmd : public GridAdminCmd { public: - MoveChunkCmd() : GridAdminCmd( "moveChunk" ){} + MoveChunkCmd() : GridAdminCmd( "moveChunk" ) {} virtual void help( stringstream& help ) const { help << "{ movechunk : 'test.foo' , find : { num : 1 } , to : 'localhost:30001' }"; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { ShardConnection::sync(); Timer t; string ns = cmdObj.firstElement().valuestrsafe(); - if ( ns.size() == 0 ){ + if ( ns.size() == 0 ) { errmsg = "no ns"; return false; } DBConfigPtr config = grid.getDBConfig( ns ); - if ( ! config->isSharded( ns ) ){ + if ( ! config->isSharded( ns ) ) { errmsg = "ns not sharded. have to shard before can move a chunk"; return false; } BSONObj find = cmdObj.getObjectField( "find" ); - if ( find.isEmpty() ){ + if ( find.isEmpty() ) { errmsg = "need to specify find. see help"; return false; } string toString = cmdObj["to"].valuestrsafe(); - if ( ! toString.size() ){ + if ( ! 
toString.size() ) { errmsg = "you have to specify where you want to move the chunk"; return false; } - + Shard to = Shard::make( toString ); + // so far, chunk size serves test purposes; it may or may not become a supported parameter + long long maxChunkSizeBytes = cmdObj["maxChunkSizeBytes"].numberLong(); + if ( maxChunkSizeBytes == 0 ) { + maxChunkSizeBytes = Chunk::MaxChunkSize; + } + tlog() << "CMD: movechunk: " << cmdObj << endl; ChunkManagerPtr info = config->getChunkManager( ns ); ChunkPtr c = info->findChunk( find ); const Shard& from = c->getShard(); - if ( from == to ){ + if ( from == to ) { errmsg = "that chunk is already on that shard"; return false; } - - if ( ! c->moveAndCommit( to , errmsg ) ) + + BSONObj res; + if ( ! c->moveAndCommit( to , maxChunkSizeBytes , res ) ) { + errmsg = "move failed"; + result.append( "cause" , res ); return false; + } + + // pre-emptively reload the config to get new version info + config->getChunkManager( ns , true ); result.append( "millis" , t.millis() ); return true; @@ -591,12 +615,12 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "list all shards of the system"; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { ScopedDbConnection conn( configServer.getPrimary() ); vector all; auto_ptr cursor = conn->query( "config.shards" , BSONObj() ); - while ( cursor->more() ){ + while ( cursor->more() ) { BSONObj o = cursor->next(); all.push_back( o ); } @@ -608,27 +632,27 @@ namespace mongo { } } listShardsCmd; - /* a shard is a single mongod server or a replica pair. add it (them) to the cluster as a storage partition. */ + /* a shard is a single mongod server or a replica pair. add it (them) to the cluster as a storage partition. */ class AddShard : public GridAdminCmd { public: AddShard() : GridAdminCmd("addShard") { } virtual void help( stringstream& help ) const { help << "add a new shard to the system"; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { errmsg.clear(); // get replica set component hosts ConnectionString servers = ConnectionString::parse( cmdObj.firstElement().valuestrsafe() , errmsg ); - if ( ! errmsg.empty() ){ + if ( ! errmsg.empty() ) { log() << "addshard request " << cmdObj << " failed:" << errmsg << endl; return false; } // using localhost in server names implies every other process must use locahost addresses too vector serverAddrs = servers.getServers(); - for ( size_t i = 0 ; i < serverAddrs.size() ; i++ ){ - if ( serverAddrs[i].isLocalHost() != grid.allowLocalHost() ){ + for ( size_t i = 0 ; i < serverAddrs.size() ; i++ ) { + if ( serverAddrs[i].isLocalHost() != grid.allowLocalHost() ) { errmsg = "can't use localhost as a shard since all shards need to communicate. " "either use all shards and configdbs in localhost or all in actual IPs " ; log() << "addshard request " << cmdObj << " failed: attempt to mix localhosts and IPs" << endl; @@ -636,7 +660,7 @@ namespace mongo { } // it's fine if mongods of a set all use default port - if ( ! serverAddrs[i].hasPort() ){ + if ( ! 
serverAddrs[i].hasPort() ) { serverAddrs[i].setPort( CmdLine::ShardServerPort ); } } @@ -645,15 +669,15 @@ namespace mongo { string name = ""; if ( cmdObj["name"].type() == String ) { name = cmdObj["name"].valuestrsafe(); - } + } // maxSize is the space usage cap in a shard in MBs long long maxSize = 0; - if ( cmdObj[ ShardFields::maxSize.name() ].isNumber() ){ + if ( cmdObj[ ShardFields::maxSize.name() ].isNumber() ) { maxSize = cmdObj[ ShardFields::maxSize.name() ].numberLong(); } - - if ( ! grid.addShard( &name , servers , maxSize , errmsg ) ){ + + if ( ! grid.addShard( &name , servers , maxSize , errmsg ) ) { log() << "addshard request " << cmdObj << " failed: " << errmsg << endl; return false; } @@ -673,10 +697,10 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "remove a shard to the system."; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string target = cmdObj.firstElement().valuestrsafe(); Shard s = Shard::make( target ); - if ( ! grid.knowAboutShard( s.getConnString() ) ){ + if ( ! grid.knowAboutShard( s.getConnString() ) ) { errmsg = "unknown shard"; return false; } @@ -687,7 +711,7 @@ namespace mongo { BSONObj searchDoc = BSON( "_id" << s.getName() ); BSONObj drainingDoc = BSON( "_id" << s.getName() << ShardFields::draining(true) ); BSONObj shardDoc = conn->findOne( "config.shards", drainingDoc ); - if ( shardDoc.isEmpty() ){ + if ( shardDoc.isEmpty() ) { // TODO prevent move chunks to this shard. @@ -696,7 +720,7 @@ namespace mongo { conn->update( "config.shards" , searchDoc , newStatus, false /* do no upsert */); errmsg = conn->getLastError(); - if ( errmsg.size() ){ + if ( errmsg.size() ) { log() << "error starting remove shard: " << s.getName() << " err: " << errmsg << endl; return false; } @@ -704,7 +728,7 @@ namespace mongo { Shard::reloadShardInfo(); result.append( "msg" , "draining started successfully" ); - result.append( "state" , "started" ); + result.append( "state" , "started" ); result.append( "shard" , s.getName() ); conn.done(); return true; @@ -716,12 +740,12 @@ namespace mongo { long long chunkCount = conn->count( "config.chunks" , shardIDDoc ); BSONObj primaryDoc = BSON( "primary" << shardDoc[ "_id" ].str() ); long long dbCount = conn->count( "config.databases" , primaryDoc ); - if ( ( chunkCount == 0 ) && ( dbCount == 0 ) ){ - log() << "going to remove shard: " << s.getName() << endl; + if ( ( chunkCount == 0 ) && ( dbCount == 0 ) ) { + log() << "going to remove shard: " << s.getName() << endl; conn->remove( "config.shards" , searchDoc ); errmsg = conn->getLastError(); - if ( errmsg.size() ){ + if ( errmsg.size() ) { log() << "error concluding remove shard: " << s.getName() << " err: " << errmsg << endl; return false; } @@ -755,7 +779,7 @@ namespace mongo { class IsDbGridCmd : public Command { public: - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } @@ -769,7 +793,7 @@ namespace mongo { class CmdIsMaster : public Command { public: - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } virtual bool slaveOk() const { return true; @@ -777,10 +801,11 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "test if this is master half of a replica pair"; } - CmdIsMaster() : 
Command("ismaster") { } + CmdIsMaster() : Command("isMaster" , false , "ismaster") { } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { - result.append("ismaster", 1.0 ); + result.appendBool("ismaster", true ); result.append("msg", "isdbgrid"); + result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize); return true; } } ismaster; @@ -794,23 +819,23 @@ namespace mongo { virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } virtual void help( stringstream &help ) const { help << "{whatsmyuri:1}"; - } + } virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { result << "you" << ClientInfo::get()->getRemote(); return true; } } cmdWhatsMyUri; - + class CmdShardingGetPrevError : public Command { public: - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } virtual bool slaveOk() const { return true; @@ -827,7 +852,7 @@ namespace mongo { class CmdShardingGetLastError : public Command { public: - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } virtual bool slaveOk() const { return true; @@ -836,196 +861,147 @@ namespace mongo { help << "check for an error on the last command executed"; } CmdShardingGetLastError() : Command("getLastError" , false , "getlasterror") { } - - void addWriteBack( vector& all , const BSONObj& o ){ - BSONElement e = o["writeback"]; - if ( e.type() == jstOID ) - all.push_back( e.OID() ); - } - - void handleWriteBacks( vector& all ){ - if ( all.size() == 0 ) - return; - - for ( unsigned i=0; imsg.size() && le->nPrev == 1 ){ + if ( le->msg.size() && le->nPrev == 1 ) { le->appendSelf( result ); return true; } } - + ClientInfo * client = ClientInfo::get(); - set * shards = client->getPrev(); - - if ( shards->size() == 0 ){ - result.appendNull( "err" ); - return true; - } + return client->getLastError( cmdObj , result ); + } + } cmdGetLastError; - //log() << "getlasterror enter: " << shards->size() << endl; + } + class CmdShardingResetError : public Command { + public: + CmdShardingResetError() : Command( "resetError" , false , "reseterror" ) {} - vector writebacks; - - // handle single server - if ( shards->size() == 1 ){ - string theShard = *(shards->begin() ); - result.append( "theshard" , theShard.c_str() ); - ShardConnection conn( theShard , "" ); - BSONObj res; - bool ok = conn->runCommand( dbName , cmdObj , res ); - //log() << "\t" << res << endl; - result.appendElements( res ); - conn.done(); - result.append( "singleShard" , theShard ); - addWriteBack( writebacks , res ); - - // hit other machines just to block - for ( set::const_iterator i=client->sinceLastGetError().begin(); i!=client->sinceLastGetError().end(); ++i ){ - string temp = *i; - if ( temp == theShard ) - continue; - - ShardConnection conn( temp , "" ); - addWriteBack( writebacks , conn->getLastErrorDetailed() ); - conn.done(); - } - client->clearSinceLastGetError(); - handleWriteBacks( writebacks ); - return ok; - } - - BSONArrayBuilder bbb( result.subarrayStart( "shards" ) ); - - long long n = 0; - - // hit each shard - vector errors; - for ( set::iterator i = shards->begin(); i != shards->end(); i++ ){ - string theShard = *i; - bbb.append( theShard ); - ShardConnection 
conn( theShard , "" ); - BSONObj res; - bool ok = conn->runCommand( dbName , cmdObj , res ); - addWriteBack( writebacks, res ); - string temp = DBClientWithCommands::getLastErrorString( res ); - if ( ok == false || temp.size() ) - errors.push_back( temp ); - n += res["n"].numberLong(); - conn.done(); - } - - bbb.done(); - - result.appendNumber( "n" , n ); - - // hit other machines just to block - for ( set::const_iterator i=client->sinceLastGetError().begin(); i!=client->sinceLastGetError().end(); ++i ){ - string temp = *i; - if ( shards->count( temp ) ) - continue; - - ShardConnection conn( temp , "" ); - addWriteBack( writebacks, conn->getLastErrorDetailed() ); - conn.done(); - } - client->clearSinceLastGetError(); + virtual LockType locktype() const { return NONE; } + virtual bool requiresAuth() { return false; } + virtual bool slaveOk() const { + return true; + } - if ( errors.size() == 0 ){ - result.appendNull( "err" ); - handleWriteBacks( writebacks ); - return true; - } - - result.append( "err" , errors[0].c_str() ); - - BSONObjBuilder all; - for ( unsigned i=0; ireset(); + + ClientInfo * client = ClientInfo::get(); + set * shards = client->getPrev(); + + for ( set::iterator i = shards->begin(); i != shards->end(); i++ ) { + string theShard = *i; + ShardConnection conn( theShard , "" ); + BSONObj res; + conn->runCommand( dbName , cmdObj , res ); + conn.done(); } - } cmdGetLastError; - - } - + + return true; + } + } cmdShardingResetError; + class CmdListDatabases : public Command { public: - CmdListDatabases() : Command("listDatabases", false , "listdatabases" ) {} + CmdListDatabases() : Command("listDatabases", true , "listdatabases" ) {} virtual bool logTheOp() { return false; } virtual bool slaveOk() const { return true; } virtual bool slaveOverrideOk() { return true; } virtual bool adminOnly() const { return true; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual void help( stringstream& help ) const { help << "list databases on cluster"; } - + bool run(const string& , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { vector shards; Shard::getAllShards( shards ); - + map sizes; map< string,shared_ptr > dbShardInfo; - for ( vector::iterator i=shards.begin(); i!=shards.end(); i++ ){ + for ( vector::iterator i=shards.begin(); i!=shards.end(); i++ ) { Shard s = *i; BSONObj x = s.runCommand( "admin" , "listDatabases" ); BSONObjIterator j( x["databases"].Obj() ); - while ( j.more() ){ + while ( j.more() ) { BSONObj theDB = j.next().Obj(); - + string name = theDB["name"].String(); long long size = theDB["sizeOnDisk"].numberLong(); long long& totalSize = sizes[name]; - if ( size == 1 ){ + if ( size == 1 ) { if ( totalSize <= 1 ) totalSize = 1; } else totalSize += size; - + shared_ptr& bb = dbShardInfo[name]; if ( ! 
bb.get() ) bb.reset( new BSONObjBuilder() ); bb->appendNumber( s.getName() , size ); } - + } - + long long totalSize = 0; BSONArrayBuilder bb( result.subarrayStart( "databases" ) ); - for ( map::iterator i=sizes.begin(); i!=sizes.end(); ++i ){ + for ( map::iterator i=sizes.begin(); i!=sizes.end(); ++i ) { string name = i->first; + + if ( name == "local" ) { + // we don't return local + // since all shards have their own independant local + continue; + } + long long size = i->second; totalSize += size; - + BSONObjBuilder temp; temp.append( "name" , name ); - temp.appendNumber( "size" , size ); + temp.appendNumber( "sizeOnDisk" , size ); temp.appendBool( "empty" , size == 1 ); temp.append( "shards" , dbShardInfo[name]->obj() ); - + bb.append( temp.obj() ); } + + if ( sizes.find( "config" ) == sizes.end() ){ + ScopedDbConnection conn( configServer.getPrimary() ); + BSONObj x; + if ( conn->simpleCommand( "config" , &x , "dbstats" ) ){ + BSONObjBuilder b; + b.append( "name" , "config" ); + b.appendBool( "empty" , false ); + if ( x["fileSize"].type() ) + b.appendAs( x["fileSize"] , "sizeOnDisk" ); + else + b.append( "sizeOnDisk" , 1 ); + bb.append( b.obj() ); + } + else { + bb.append( BSON( "name" << "config" ) ); + } + conn.done(); + } + bb.done(); result.appendNumber( "totalSize" , totalSize ); result.appendNumber( "totalSizeMb" , totalSize / ( 1024 * 1024 ) ); - + return 1; } @@ -1038,9 +1014,9 @@ namespace mongo { virtual bool slaveOk() const { return true; } virtual bool slaveOverrideOk() { return true; } virtual bool adminOnly() const { return true; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual void help( stringstream& help ) const { help << "Not supported sharded"; } - + bool run(const string& , BSONObj& jsobj, string& errmsg, BSONObjBuilder& /*result*/, bool /*fromRepl*/) { errmsg = "closeAllDatabases isn't supported through mongos"; return false; @@ -1048,4 +1024,22 @@ namespace mongo { } cmdCloseAllDatabases; + class CmdReplSetGetStatus : public Command { + public: + CmdReplSetGetStatus() : Command("replSetGetStatus"){} + virtual bool logTheOp() { return false; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream& help ) const { help << "Not supported through mongos"; } + + bool run(const string& , BSONObj& jsobj, string& errmsg, BSONObjBuilder& /*result*/, bool /*fromRepl*/) { + if ( jsobj["forShell"].trueValue() ) + lastError.disableForCommand(); + + errmsg = "replSetGetStatus is not supported through mongos"; + return false; + } + } cmdReplSetGetStatus; + } // namespace mongo diff --git a/s/commands_public.cpp b/s/commands_public.cpp index 80d5cc9..02000a0 100644 --- a/s/commands_public.cpp +++ b/s/commands_public.cpp @@ -33,10 +33,10 @@ namespace mongo { namespace dbgrid_pub_cmds { - + class PublicGridCommand : public Command { public: - PublicGridCommand( const char* n, const char* oldname=NULL ) : Command( n, false, oldname ){ + PublicGridCommand( const char* n, const char* oldname=NULL ) : Command( n, false, oldname ) { } virtual bool slaveOk() const { return true; @@ -46,18 +46,18 @@ namespace mongo { } // all grid commands are designed not to lock - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } protected: - bool passthrough( DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ){ + bool passthrough( 
DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ) { return _passthrough(conf->getName(), conf, cmdObj, result); } - bool adminPassthrough( DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ){ + bool adminPassthrough( DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ) { return _passthrough("admin", conf, cmdObj, result); } - + private: - bool _passthrough(const string& db, DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ){ + bool _passthrough(const string& db, DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ) { ShardConnection conn( conf->getPrimary() , "" ); BSONObj res; bool ok = conn->runCommand( db , cmdObj , res ); @@ -75,33 +75,33 @@ namespace mongo { virtual bool adminOnly() const { return false; } // all grid commands are designed not to lock - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } // default impl uses all shards for DB - virtual void getShards(const string& dbName , BSONObj& cmdObj, set& shards){ + virtual void getShards(const string& dbName , BSONObj& cmdObj, set& shards) { DBConfigPtr conf = grid.getDBConfig( dbName , false ); conf->getAllShards(shards); } - + virtual void aggregateResults(const vector& results, BSONObjBuilder& output) {} // don't override - virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& output, bool){ + virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& output, bool) { set shards; getShards(dbName, cmdObj, shards); list< shared_ptr > futures; - for ( set::const_iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ){ + for ( set::const_iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) { futures.push_back( Future::spawnCommand( i->getConnString() , dbName , cmdObj ) ); } - + vector results; BSONObjBuilder subobj (output.subobjStart("raw")); BSONObjBuilder errors; - for ( list< shared_ptr >::iterator i=futures.begin(); i!=futures.end(); i++ ){ + for ( list< shared_ptr >::iterator i=futures.begin(); i!=futures.end(); i++ ) { shared_ptr res = *i; - if ( ! res->join() ){ + if ( ! res->join() ) { errors.appendAs(res->result()["errmsg"], res->getServer()); } results.push_back( res->result() ); @@ -111,11 +111,11 @@ namespace mongo { subobj.done(); BSONObj errobj = errors.done(); - if (! errobj.isEmpty()){ + if (! errobj.isEmpty()) { errmsg = errobj.toString(false, true); return false; } - + aggregateResults(results, output); return true; } @@ -126,39 +126,40 @@ namespace mongo { public: AllShardsCollectionCommand(const char* n, const char* oldname=NULL) : RunOnAllShardsCommand(n, oldname) {} - virtual void getShards(const string& dbName , BSONObj& cmdObj, set& shards){ + virtual void getShards(const string& dbName , BSONObj& cmdObj, set& shards) { string fullns = dbName + '.' + cmdObj.firstElement().valuestrsafe(); - + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! 
conf->isSharded( fullns ) ) { shards.insert(conf->getShard(fullns)); - } else { + } + else { conf->getChunkManager(fullns)->getAllShards(shards); } } }; - + class NotAllowedOnShardedCollectionCmd : public PublicGridCommand { public: - NotAllowedOnShardedCollectionCmd( const char * n ) : PublicGridCommand( n ){} + NotAllowedOnShardedCollectionCmd( const char * n ) : PublicGridCommand( n ) {} virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ) = 0; - - virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + + virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string fullns = getFullNS( dbName , cmdObj ); - + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { return passthrough( conf , cmdObj , result ); } errmsg = "can't do command: " + name + " on sharded collection"; return false; } }; - + // ---- class DropIndexesCmd : public AllShardsCollectionCommand { @@ -194,7 +195,7 @@ namespace mongo { long long indexSize = 0; long long fileSize = 0; - for (vector::const_iterator it(results.begin()), end(results.end()); it != end; ++it){ + for (vector::const_iterator it(results.begin()), end(results.end()); it != end; ++it) { const BSONObj& b = *it; objects += b["objects"].numberLong(); dataSize += b["dataSize"].numberLong(); @@ -219,23 +220,24 @@ namespace mongo { class DropCmd : public PublicGridCommand { public: - DropCmd() : PublicGridCommand( "drop" ){} - bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + DropCmd() : PublicGridCommand( "drop" ) {} + bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string collection = cmdObj.firstElement().valuestrsafe(); string fullns = dbName + "." + collection; - + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - + log() << "DROP: " << fullns << endl; - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { return passthrough( conf , cmdObj , result ); } - + ChunkManagerPtr cm = conf->getChunkManager( fullns ); massert( 10418 , "how could chunk manager be null!" , cm ); - + cm->drop( cm ); + uassert( 13512 , "drop collection attempted on non-sharded collection" , conf->removeSharding( fullns ) ); return 1; } @@ -243,25 +245,25 @@ namespace mongo { class DropDBCmd : public PublicGridCommand { public: - DropDBCmd() : PublicGridCommand( "dropDatabase" ){} - bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ - + DropDBCmd() : PublicGridCommand( "dropDatabase" ) {} + bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + BSONElement e = cmdObj.firstElement(); - - if ( ! e.isNumber() || e.number() != 1 ){ + + if ( ! e.isNumber() || e.number() != 1 ) { errmsg = "invalid params"; return 0; } - + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - + log() << "DROP DATABASE: " << dbName << endl; - if ( ! conf ){ + if ( ! conf ) { result.append( "info" , "database didn't exist" ); return true; } - + if ( ! 
conf->dropDatabase( errmsg ) ) return false; @@ -272,8 +274,8 @@ namespace mongo { class RenameCollectionCmd : public PublicGridCommand { public: - RenameCollectionCmd() : PublicGridCommand( "renameCollection" ){} - bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + RenameCollectionCmd() : PublicGridCommand( "renameCollection" ) {} + bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string fullnsFrom = cmdObj.firstElement().valuestrsafe(); string dbNameFrom = nsToDatabase( fullnsFrom.c_str() ); DBConfigPtr confFrom = grid.getDBConfig( dbNameFrom , false ); @@ -297,18 +299,19 @@ namespace mongo { class CopyDBCmd : public PublicGridCommand { public: - CopyDBCmd() : PublicGridCommand( "copydb" ){} - bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + CopyDBCmd() : PublicGridCommand( "copydb" ) {} + bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string todb = cmdObj.getStringField("todb"); uassert(13402, "need a todb argument", !todb.empty()); - + DBConfigPtr confTo = grid.getDBConfig( todb ); uassert(13398, "cant copy to sharded DB", !confTo->isShardingEnabled()); string fromhost = cmdObj.getStringField("fromhost"); - if (!fromhost.empty()){ + if (!fromhost.empty()) { return adminPassthrough( confTo , cmdObj , result ); - } else { + } + else { string fromdb = cmdObj.getStringField("fromdb"); uassert(13399, "need a fromdb argument", !fromdb.empty()); @@ -317,7 +320,7 @@ namespace mongo { uassert(13401, "cant copy from sharded DB", !confFrom->isShardingEnabled()); BSONObjBuilder b; - BSONForEach(e, cmdObj){ + BSONForEach(e, cmdObj) { if (strcmp(e.fieldName(), "fromhost") != 0) b.append(e); } @@ -328,67 +331,67 @@ namespace mongo { } } - }copyDBCmd; + } copyDBCmd; class CountCmd : public PublicGridCommand { public: CountCmd() : PublicGridCommand("count") { } - bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool l){ + bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool l) { string collection = cmdObj.firstElement().valuestrsafe(); string fullns = dbName + "." + collection; - + BSONObj filter; if ( cmdObj["query"].isABSONObj() ) filter = cmdObj["query"].Obj(); - + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { ShardConnection conn( conf->getPrimary() , fullns ); BSONObj temp; bool ok = conn->runCommand( dbName , cmdObj , temp ); conn.done(); - - if ( ok ){ + + if ( ok ) { result.append( temp["n"] ); return true; } - - if ( temp["code"].numberInt() != StaleConfigInContextCode ){ + + if ( temp["code"].numberInt() != StaleConfigInContextCode ) { errmsg = temp["errmsg"].String(); result.appendElements( temp ); return false; } - + // this collection got sharded ChunkManagerPtr cm = conf->getChunkManager( fullns , true ); - if ( ! cm ){ + if ( ! cm ) { errmsg = "should be sharded now"; result.append( "root" , temp ); return false; } } - + long long total = 0; map shardCounts; - + ChunkManagerPtr cm = conf->getChunkManager( fullns ); - while ( true ){ - if ( ! cm ){ + while ( true ) { + if ( ! 
cm ) { // probably unsharded now return run( dbName , cmdObj , errmsg , result , l ); } - + set shards; cm->getShardsForQuery( shards , filter ); assert( shards.size() ); - + bool hadToBreak = false; - for (set::iterator it=shards.begin(), end=shards.end(); it != end; ++it){ + for (set::iterator it=shards.begin(), end=shards.end(); it != end; ++it) { ShardConnection conn(*it, fullns); - if ( conn.setVersion() ){ + if ( conn.setVersion() ) { total = 0; shardCounts.clear(); cm = conf->getChunkManager( fullns ); @@ -396,19 +399,19 @@ namespace mongo { hadToBreak = true; break; } - + BSONObj temp; bool ok = conn->runCommand( dbName , BSON( "count" << collection << "query" << filter ) , temp ); conn.done(); - - if ( ok ){ + + if ( ok ) { long long mine = temp["n"].numberLong(); total += mine; shardCounts[it->getName()] = mine; continue; } - - if ( StaleConfigInContextCode == temp["code"].numberInt() ){ + + if ( StaleConfigInContextCode == temp["code"].numberInt() ) { // my version is old total = 0; shardCounts.clear(); @@ -425,7 +428,7 @@ namespace mongo { if ( ! hadToBreak ) break; } - + total = applySkipLimit( total , cmdObj ); result.appendNumber( "n" , total ); BSONObjBuilder temp( result.subobjStart( "shards" ) ); @@ -439,13 +442,13 @@ namespace mongo { class CollectionStats : public PublicGridCommand { public: CollectionStats() : PublicGridCommand("collStats", "collstats") { } - bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string collection = cmdObj.firstElement().valuestrsafe(); string fullns = dbName + "." + collection; - + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { result.append( "ns" , fullns ); result.appendBool("sharded", false); result.append( "primary" , conf->getPrimary().getName() ); @@ -458,17 +461,17 @@ namespace mongo { set servers; cm->getAllShards(servers); - + BSONObjBuilder shardStats; long long count=0; long long size=0; long long storageSize=0; int nindexes=0; bool warnedAboutIndexes = false; - for ( set::iterator i=servers.begin(); i!=servers.end(); i++ ){ + for ( set::iterator i=servers.begin(); i!=servers.end(); i++ ) { ScopedDbConnection conn( *i ); BSONObj res; - if ( ! conn->runCommand( dbName , cmdObj , res ) ){ + if ( ! conn->runCommand( dbName , cmdObj , res ) ) { errmsg = "failed on shard: " + res.toString(); return false; } @@ -480,19 +483,19 @@ namespace mongo { int myIndexes = res["nindexes"].numberInt(); - if ( nindexes == 0 ){ + if ( nindexes == 0 ) { nindexes = myIndexes; } - else if ( nindexes == myIndexes ){ + else if ( nindexes == myIndexes ) { // no-op } else { // hopefully this means we're building an index - + if ( myIndexes > nindexes ) nindexes = myIndexes; - - if ( ! warnedAboutIndexes ){ + + if ( ! 
warnedAboutIndexes ) { result.append( "warning" , "indexes don't all match - ok if ensureIndex is running" ); warnedAboutIndexes = true; } @@ -510,7 +513,7 @@ namespace mongo { result.append("nchunks", cm->numChunks()); result.append("shards", shardStats.obj()); - + return true; } } collectionStatsCmd; @@ -518,19 +521,19 @@ namespace mongo { class FindAndModifyCmd : public PublicGridCommand { public: FindAndModifyCmd() : PublicGridCommand("findAndModify", "findandmodify") { } - bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string collection = cmdObj.firstElement().valuestrsafe(); string fullns = dbName + "." + collection; - + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { return passthrough( conf , cmdObj , result); } - + ChunkManagerPtr cm = conf->getChunkManager( fullns ); massert( 13002 , "how could chunk manager be null!" , cm ); - + BSONObj filter = cmdObj.getObjectField("query"); uassert(13343, "query for sharded findAndModify must have shardkey", cm->hasShardKey(filter)); @@ -542,11 +545,11 @@ namespace mongo { bool ok = conn->runCommand( conf->getName() , cmdObj , res ); conn.done(); - if (ok || (strcmp(res["errmsg"].valuestrsafe(), "No matching object found") != 0)){ + if (ok || (strcmp(res["errmsg"].valuestrsafe(), "No matching object found") != 0)) { result.appendElements(res); return ok; } - + return true; } @@ -555,18 +558,18 @@ namespace mongo { class DataSizeCmd : public PublicGridCommand { public: DataSizeCmd() : PublicGridCommand("dataSize", "datasize") { } - bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string fullns = cmdObj.firstElement().String(); - + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { return passthrough( conf , cmdObj , result); } - + ChunkManagerPtr cm = conf->getChunkManager( fullns ); massert( 13407 , "how could chunk manager be null!" , cm ); - + BSONObj min = cmdObj.getObjectField( "min" ); BSONObj max = cmdObj.getObjectField( "max" ); BSONObj keyPattern = cmdObj.getObjectField( "keyPattern" ); @@ -580,13 +583,13 @@ namespace mongo { set shards; cm->getShardsForRange(shards, min, max); - for ( set::iterator i=shards.begin(), end=shards.end() ; i != end; ++i ){ + for ( set::iterator i=shards.begin(), end=shards.end() ; i != end; ++i ) { ScopedDbConnection conn( *i ); BSONObj res; bool ok = conn->runCommand( conf->getName() , cmdObj , res ); conn.done(); - - if ( ! ok ){ + + if ( ! ok ) { result.appendElements( res ); return false; } @@ -607,64 +610,64 @@ namespace mongo { class ConvertToCappedCmd : public NotAllowedOnShardedCollectionCmd { public: - ConvertToCappedCmd() : NotAllowedOnShardedCollectionCmd("convertToCapped"){} - - virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ){ + ConvertToCappedCmd() : NotAllowedOnShardedCollectionCmd("convertToCapped") {} + + virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ) { return dbName + "." 
+ cmdObj.firstElement().valuestrsafe(); } - + } convertToCappedCmd; class GroupCmd : public NotAllowedOnShardedCollectionCmd { public: - GroupCmd() : NotAllowedOnShardedCollectionCmd("group"){} - - virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ){ + GroupCmd() : NotAllowedOnShardedCollectionCmd("group") {} + + virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ) { return dbName + "." + cmdObj.firstElement().embeddedObjectUserCheck()["ns"].valuestrsafe(); } - + } groupCmd; class DistinctCmd : public PublicGridCommand { public: - DistinctCmd() : PublicGridCommand("distinct"){} + DistinctCmd() : PublicGridCommand("distinct") {} virtual void help( stringstream &help ) const { help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }"; } - bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string collection = cmdObj.firstElement().valuestrsafe(); string fullns = dbName + "." + collection; DBConfigPtr conf = grid.getDBConfig( dbName , false ); - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { return passthrough( conf , cmdObj , result ); } - + ChunkManagerPtr cm = conf->getChunkManager( fullns ); massert( 10420 , "how could chunk manager be null!" , cm ); BSONObj query = getQuery(cmdObj); set shards; cm->getShardsForQuery(shards, query); - + set all; int size = 32; - - for ( set::iterator i=shards.begin(), end=shards.end() ; i != end; ++i ){ + + for ( set::iterator i=shards.begin(), end=shards.end() ; i != end; ++i ) { ShardConnection conn( *i , fullns ); BSONObj res; bool ok = conn->runCommand( conf->getName() , cmdObj , res ); conn.done(); - - if ( ! ok ){ + + if ( ! ok ) { result.appendElements( res ); return false; } - + BSONObjIterator it( res["values"].embeddedObject() ); - while ( it.more() ){ + while ( it.more() ) { BSONElement nxt = it.next(); BSONObjBuilder temp(32); temp.appendAs( nxt , "" ); @@ -672,13 +675,13 @@ namespace mongo { } } - + BSONObjBuilder b( size ); int n=0; - for ( set::iterator i = all.begin() ; i != all.end(); i++ ){ - b.appendAs( i->firstElement() , b.numStr( n++ ).c_str() ); + for ( set::iterator i = all.begin() ; i != all.end(); i++ ) { + b.appendAs( i->firstElement() , b.numStr( n++ ) ); } - + result.appendArray( "values" , b.obj() ); return true; } @@ -686,11 +689,11 @@ namespace mongo { class FileMD5Cmd : public PublicGridCommand { public: - FileMD5Cmd() : PublicGridCommand("filemd5"){} + FileMD5Cmd() : PublicGridCommand("filemd5") {} virtual void help( stringstream &help ) const { help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }"; } - bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string fullns = dbName; fullns += "."; { @@ -702,17 +705,17 @@ namespace mongo { fullns += ".chunks"; DBConfigPtr conf = grid.getDBConfig( dbName , false ); - - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { return passthrough( conf , cmdObj , result ); } - + ChunkManagerPtr cm = conf->getChunkManager( fullns ); massert( 13091 , "how could chunk manager be null!" 
, cm ); uassert( 13092 , "GridFS chunks collection can only be sharded on files_id", cm->getShardKey().key() == BSON("files_id" << 1)); ChunkPtr chunk = cm->findChunk( BSON("files_id" << cmdObj.firstElement()) ); - + ShardConnection conn( chunk->getShard() , fullns ); BSONObj res; bool ok = conn->runCommand( conf->getName() , cmdObj , res ); @@ -723,104 +726,254 @@ namespace mongo { } } fileMD5Cmd; + class Geo2dFindNearCmd : public PublicGridCommand { + public: + Geo2dFindNearCmd() : PublicGridCommand( "geoNear" ) {} + void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; } + + bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + string collection = cmdObj.firstElement().valuestrsafe(); + string fullns = dbName + "." + collection; + + DBConfigPtr conf = grid.getDBConfig( dbName , false ); + + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { + return passthrough( conf , cmdObj , result ); + } + + ChunkManagerPtr cm = conf->getChunkManager( fullns ); + massert( 13500 , "how could chunk manager be null!" , cm ); + + BSONObj query = getQuery(cmdObj); + set shards; + cm->getShardsForQuery(shards, query); + + int limit = 100; + if (cmdObj["num"].isNumber()) + limit = cmdObj["num"].numberInt(); + + list< shared_ptr > futures; + BSONArrayBuilder shardArray; + for ( set::const_iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) { + futures.push_back( Future::spawnCommand( i->getConnString() , dbName , cmdObj ) ); + shardArray.append(i->getName()); + } + + multimap results; // TODO: maybe use merge-sort instead + string nearStr; + double time = 0; + double btreelocs = 0; + double nscanned = 0; + double objectsLoaded = 0; + for ( list< shared_ptr >::iterator i=futures.begin(); i!=futures.end(); i++ ) { + shared_ptr res = *i; + if ( ! 
res->join() ) { + errmsg = res->result()["errmsg"].String(); + return false; + } + + nearStr = res->result()["near"].String(); + time += res->result()["stats"]["time"].Number(); + btreelocs += res->result()["stats"]["btreelocs"].Number(); + nscanned += res->result()["stats"]["nscanned"].Number(); + objectsLoaded += res->result()["stats"]["objectsLoaded"].Number(); + + BSONForEach(obj, res->result()["results"].embeddedObject()) { + results.insert(make_pair(obj["dis"].Number(), obj.embeddedObject().getOwned())); + } + + // TODO: maybe shrink results if size() > limit + } + + result.append("ns" , fullns); + result.append("near", nearStr); + + int outCount = 0; + double totalDistance = 0; + double maxDistance = 0; + { + BSONArrayBuilder sub (result.subarrayStart("results")); + for (multimap::const_iterator it(results.begin()), end(results.end()); it!= end && outCount < limit; ++it, ++outCount) { + totalDistance += it->first; + maxDistance = it->first; // guaranteed to be highest so far + + sub.append(it->second); + } + sub.done(); + } + + { + BSONObjBuilder sub (result.subobjStart("stats")); + sub.append("time", time); + sub.append("btreelocs", btreelocs); + sub.append("nscanned", nscanned); + sub.append("objectsLoaded", objectsLoaded); + sub.append("avgDistance", totalDistance / outCount); + sub.append("maxDistance", maxDistance); + sub.append("shards", shardArray.arr()); + sub.done(); + } + + return true; + } + } geo2dFindNearCmd; + class MRCmd : public PublicGridCommand { public: - MRCmd() : PublicGridCommand( "mapreduce" ){} - - string getTmpName( const string& coll ){ + MRCmd() : PublicGridCommand( "mapreduce" ) {} + + string getTmpName( const string& coll ) { static int inc = 1; stringstream ss; ss << "tmp.mrs." << coll << "_" << time(0) << "_" << inc++; return ss.str(); } - BSONObj fixForShards( const BSONObj& orig , const string& output ){ + BSONObj fixForShards( const BSONObj& orig , const string& output, BSONObj& customOut , string& badShardedField ) { BSONObjBuilder b; BSONObjIterator i( orig ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); string fn = e.fieldName(); - if ( fn == "map" || - fn == "mapreduce" || - fn == "reduce" || - fn == "query" || - fn == "sort" || - fn == "scope" || - fn == "verbose" ){ + if ( fn == "map" || + fn == "mapreduce" || + fn == "mapparams" || + fn == "reduce" || + fn == "query" || + fn == "sort" || + fn == "scope" || + fn == "verbose" ) { b.append( e ); } - else if ( fn == "keeptemp" || - fn == "out" || - fn == "finalize" ){ + else if ( fn == "out" || + fn == "finalize" ) { // we don't want to copy these + if (fn == "out" && e.type() == Object) { + // check if there is a custom output + BSONObj out = e.embeddedObject(); + if (out.hasField("db")) + customOut = out; + } } else { - uassert( 10177 , (string)"don't know mr field: " + fn , 0 ); + badShardedField = fn; + return BSONObj(); } } b.append( "out" , output ); return b.obj(); } - - bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + + bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { Timer t; string collection = cmdObj.firstElement().valuestrsafe(); string fullns = dbName + "." + collection; + const string shardedOutputCollection = getTmpName( collection ); + + string badShardedField; + BSONObj customOut; + BSONObj shardedCommand = fixForShards( cmdObj , shardedOutputCollection, customOut , badShardedField ); + + bool customOutDB = ! 
customOut.isEmpty() && customOut.hasField( "db" ); + DBConfigPtr conf = grid.getDBConfig( dbName , false ); - if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ){ + if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) { + if ( customOutDB ) { + errmsg = "can't use out 'db' with non-sharded db"; + return false; + } return passthrough( conf , cmdObj , result ); } - + + if ( badShardedField.size() ) { + errmsg = str::stream() << "unknown m/r field for sharding: " << badShardedField; + return false; + } + BSONObjBuilder timingBuilder; ChunkManagerPtr cm = conf->getChunkManager( fullns ); BSONObj q; - if ( cmdObj["query"].type() == Object ){ + if ( cmdObj["query"].type() == Object ) { q = cmdObj["query"].embeddedObjectUserCheck(); } - + set shards; cm->getShardsForQuery( shards , q ); - - const string shardedOutputCollection = getTmpName( collection ); - - BSONObj shardedCommand = fixForShards( cmdObj , shardedOutputCollection ); - + + BSONObjBuilder finalCmd; finalCmd.append( "mapreduce.shardedfinish" , cmdObj ); finalCmd.append( "shardedOutputCollection" , shardedOutputCollection ); - list< shared_ptr > futures; - - for ( set::iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ){ - futures.push_back( Future::spawnCommand( i->getConnString() , dbName , shardedCommand ) ); - } - BSONObjBuilder shardresults; - for ( list< shared_ptr >::iterator i=futures.begin(); i!=futures.end(); i++ ){ - shared_ptr res = *i; - if ( ! res->join() ){ - errmsg = "mongod mr failed: "; - errmsg += res->result().toString(); - return 0; + { + // we need to use our connections to the shard + // so filtering is done correctly for un-owned docs + // so we allocate them in our thread + // and hand off + + vector< shared_ptr > shardConns; + + list< shared_ptr > futures; + + for ( set::iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) { + shared_ptr temp( new ShardConnection( i->getConnString() , fullns ) ); + assert( temp->get() ); + futures.push_back( Future::spawnCommand( i->getConnString() , dbName , shardedCommand , temp->get() ) ); + shardConns.push_back( temp ); + } + + bool failed = false; + + BSONObjBuilder shardresults; + for ( list< shared_ptr >::iterator i=futures.begin(); i!=futures.end(); i++ ) { + shared_ptr res = *i; + if ( ! res->join() ) { + error() << "sharded m/r failed on shard: " << res->getServer() << " error: " << res->result() << endl; + result.append( "cause" , res->result() ); + errmsg = "mongod mr failed: "; + errmsg += res->result().toString(); + failed = true; + continue; + } + shardresults.append( res->getServer() , res->result() ); } - shardresults.append( res->getServer() , res->result() ); + + for ( unsigned i=0; idone(); + + if ( failed ) + return 0; + + finalCmd.append( "shards" , shardresults.obj() ); + timingBuilder.append( "shards" , t.millis() ); } - - finalCmd.append( "shards" , shardresults.obj() ); - timingBuilder.append( "shards" , t.millis() ); Timer t2; - ShardConnection conn( conf->getPrimary() , fullns ); + // by default the target database is same as input + Shard outServer = conf->getPrimary(); + string outns = fullns; + if ( customOutDB ) { + // have to figure out shard for the output DB + BSONElement elmt = customOut.getField("db"); + string outdb = elmt.valuestrsafe(); + outns = outdb + "." 
+ collection; + DBConfigPtr conf2 = grid.getDBConfig( outdb , true ); + outServer = conf2->getPrimary(); + } + log() << "customOut: " << customOut << " outServer: " << outServer << endl; + + ShardConnection conn( outServer , outns ); BSONObj finalResult; bool ok = conn->runCommand( dbName , finalCmd.obj() , finalResult ); conn.done(); - if ( ! ok ){ + if ( ! ok ) { errmsg = "final reduce failed: "; errmsg += finalResult.toString(); return 0; @@ -830,22 +983,22 @@ namespace mongo { result.appendElements( finalResult ); result.append( "timeMillis" , t.millis() ); result.append( "timing" , timingBuilder.obj() ); - + return 1; } } mrCmd; - + class ApplyOpsCmd : public PublicGridCommand { public: - ApplyOpsCmd() : PublicGridCommand( "applyOps" ){} - - virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + ApplyOpsCmd() : PublicGridCommand( "applyOps" ) {} + + virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { errmsg = "applyOps not allowed through mongos"; return false; } - + } applyOpsCmd; - + } } diff --git a/s/config.cpp b/s/config.cpp index 1ad15d5..35a3be2 100644 --- a/s/config.cpp +++ b/s/config.cpp @@ -25,17 +25,17 @@ #include "../db/pdfile.h" #include "../db/cmdline.h" -#include "server.h" -#include "config.h" #include "chunk.h" +#include "config.h" #include "grid.h" +#include "server.h" namespace mongo { int ConfigServer::VERSION = 3; Shard Shard::EMPTY; - string ShardNS::shard = "config.shards"; + string ShardNS::shard = "config.shards"; string ShardNS::database = "config.databases"; string ShardNS::collection = "config.collections"; string ShardNS::chunk = "config.chunks"; @@ -45,42 +45,41 @@ namespace mongo { BSONField ShardFields::draining("draining"); BSONField ShardFields::maxSize ("maxSize"); - BSONField ShardFields::currSize("currSize"); OID serverID; /* --- DBConfig --- */ - DBConfig::CollectionInfo::CollectionInfo( DBConfig * db , const BSONObj& in ){ + DBConfig::CollectionInfo::CollectionInfo( const BSONObj& in ) { _dirty = false; _dropped = in["dropped"].trueValue(); if ( in["key"].isABSONObj() ) - shard( db , in["_id"].String() , in["key"].Obj() , in["unique"].trueValue() ); + shard( in["_id"].String() , in["key"].Obj() , in["unique"].trueValue() ); } - void DBConfig::CollectionInfo::shard( DBConfig * db , const string& ns , const ShardKeyPattern& key , bool unique ){ - _cm.reset( new ChunkManager( db, ns , key , unique ) ); + void DBConfig::CollectionInfo::shard( const string& ns , const ShardKeyPattern& key , bool unique ) { + _cm.reset( new ChunkManager( ns , key , unique ) ); _dirty = true; _dropped = false; } - void DBConfig::CollectionInfo::unshard(){ + void DBConfig::CollectionInfo::unshard() { _cm.reset(); _dropped = true; _dirty = true; } - - void DBConfig::CollectionInfo::save( const string& ns , DBClientBase* conn ){ + + void DBConfig::CollectionInfo::save( const string& ns , DBClientBase* conn ) { BSONObj key = BSON( "_id" << ns ); - + BSONObjBuilder val; val.append( "_id" , ns ); val.appendDate( "lastmod" , time(0) ); val.appendBool( "dropped" , _dropped ); if ( _cm ) _cm->getInfo( val ); - + conn->update( ShardNS::collection , key , val.obj() , true ); string err = conn->getLastError(); uassert( 13473 , (string)"failed to save collection (" + ns + "): " + err , err.size() == 0 ); @@ -88,14 +87,14 @@ namespace mongo { _dirty = false; } - bool DBConfig::isSharded( const string& ns ){ + bool DBConfig::isSharded( const string& ns ) { if ( ! 
_shardingEnabled ) return false; scoped_lock lk( _lock ); return _isSharded( ns ); } - bool DBConfig::_isSharded( const string& ns ){ + bool DBConfig::_isSharded( const string& ns ) { if ( ! _shardingEnabled ) return false; Collections::iterator i = _collections.find( ns ); @@ -105,25 +104,28 @@ namespace mongo { } - const Shard& DBConfig::getShard( const string& ns ){ + const Shard& DBConfig::getShard( const string& ns ) { if ( isSharded( ns ) ) return Shard::EMPTY; - + uassert( 10178 , "no primary!" , _primary.ok() ); return _primary; } - - void DBConfig::enableSharding(){ + + void DBConfig::enableSharding() { if ( _shardingEnabled ) return; + + assert( _name != "config" ); + scoped_lock lk( _lock ); - _shardingEnabled = true; + _shardingEnabled = true; _save(); } - - ChunkManagerPtr DBConfig::shardCollection( const string& ns , ShardKeyPattern fieldsAndOrder , bool unique ){ + + ChunkManagerPtr DBConfig::shardCollection( const string& ns , ShardKeyPattern fieldsAndOrder , bool unique ) { uassert( 8042 , "db doesn't have sharding enabled" , _shardingEnabled ); - + scoped_lock lk( _lock ); CollectionInfo& ci = _collections[ns]; @@ -131,35 +133,48 @@ namespace mongo { log() << "enable sharding on: " << ns << " with shard key: " << fieldsAndOrder << endl; - ci.shard( this , ns , fieldsAndOrder , unique ); - ci.getCM()->maybeChunkCollection(); - + // From this point on, 'ns' is going to be treated as a sharded collection. We assume this is the first + // time it is seen by the sharded system and thus create the first chunk for the collection. All the remaining + // chunks will be created as a by-product of splitting. + ci.shard( ns , fieldsAndOrder , unique ); + ChunkManagerPtr cm = ci.getCM(); + uassert( 13449 , "collections already sharded" , (cm->numChunks() == 0) ); + cm->createFirstChunk( getPrimary() ); _save(); - return ci.getCM(); + + try { + cm->maybeChunkCollection(); + } + catch ( UserException& e ) { + // failure to chunk is not critical enough to abort the command (and undo the _save()'d configDB state) + log() << "couldn't chunk recently created collection: " << ns << " " << e << endl; + } + + return cm; } - bool DBConfig::removeSharding( const string& ns ){ - if ( ! _shardingEnabled ){ + bool DBConfig::removeSharding( const string& ns ) { + if ( ! _shardingEnabled ) { return false; } - + scoped_lock lk( _lock ); - + Collections::iterator i = _collections.find( ns ); if ( i == _collections.end() ) return false; - + CollectionInfo& ci = _collections[ns]; if ( ! 
ci.isSharded() ) return false; - + ci.unshard(); _save(); return true; } - - ChunkManagerPtr DBConfig::getChunkManager( const string& ns , bool shouldReload ){ + + ChunkManagerPtr DBConfig::getChunkManager( const string& ns , bool shouldReload ) { scoped_lock lk( _lock ); if ( shouldReload ) @@ -170,93 +185,80 @@ namespace mongo { return ci.getCM(); } - void DBConfig::setPrimary( string s ){ + void DBConfig::setPrimary( string s ) { scoped_lock lk( _lock ); _primary.reset( s ); _save(); } - - void DBConfig::serialize(BSONObjBuilder& to){ + + void DBConfig::serialize(BSONObjBuilder& to) { to.append("_id", _name); to.appendBool("partitioned", _shardingEnabled ); to.append("primary", _primary.getName() ); } - - bool DBConfig::unserialize(const BSONObj& from){ + + void DBConfig::unserialize(const BSONObj& from) { log(1) << "DBConfig unserialize: " << _name << " " << from << endl; assert( _name == from["_id"].String() ); _shardingEnabled = from.getBoolField("partitioned"); _primary.reset( from.getStringField("primary") ); - // this is a temporary migration thing + // In the 1.5.x series, we used to have collection metadata nested in the database entry. The 1.6.x series + // had migration code that ported that info to where it belongs now: the 'collections' collection. We now + // just assert that we're not migrating from a 1.5.x directly into a 1.7.x without first converting. BSONObj sharded = from.getObjectField( "sharded" ); - if ( sharded.isEmpty() ) - return false; - - BSONObjIterator i(sharded); - while ( i.more() ){ - BSONElement e = i.next(); - uassert( 10182 , "sharded things have to be objects" , e.type() == Object ); - - BSONObj c = e.embeddedObject(); - uassert( 10183 , "key has to be an object" , c["key"].type() == Object ); - - _collections[e.fieldName()].shard( this , e.fieldName() , c["key"].Obj() , c["unique"].trueValue() ); - } - return true; + if ( ! sharded.isEmpty() ) + uasserted( 13509 , "can't migrate from 1.5.x release to the current one; need to upgrade to 1.6.x first"); } - bool DBConfig::load(){ + bool DBConfig::load() { scoped_lock lk( _lock ); return _load(); } - bool DBConfig::_load(){ + bool DBConfig::_load() { ScopedDbConnection conn( configServer.modelServer() ); - - BSONObj o = conn->findOne( ShardNS::database , BSON( "_id" << _name ) ); + BSONObj o = conn->findOne( ShardNS::database , BSON( "_id" << _name ) ); - if ( o.isEmpty() ){ + if ( o.isEmpty() ) { conn.done(); return false; } - - if ( unserialize( o ) ) - _save(); - + + unserialize( o ); + BSONObjBuilder b; b.appendRegex( "_id" , (string)"^" + _name + "." ); - auto_ptr cursor = conn->query( ShardNS::collection ,b.obj() ); assert( cursor.get() ); - while ( cursor->more() ){ + while ( cursor->more() ) { BSONObj o = cursor->next(); - _collections[o["_id"].String()] = CollectionInfo( this , o ); + _collections[o["_id"].String()] = CollectionInfo( o ); } - - conn.done(); + + conn.done(); return true; } - void DBConfig::_save(){ + void DBConfig::_save() { ScopedDbConnection conn( configServer.modelServer() ); - + BSONObj n; { BSONObjBuilder b; serialize(b); n = b.obj(); } - + conn->update( ShardNS::database , BSON( "_id" << _name ) , n , true ); string err = conn->getLastError(); uassert( 13396 , (string)"DBConfig save failed: " + err , err.size() == 0 ); - - for ( Collections::iterator i=_collections.begin(); i!=_collections.end(); ++i ){ + + for ( Collections::iterator i=_collections.begin(); i!=_collections.end(); ++i ) { if ( ! 
i->second.isDirty() ) continue; i->second.save( i->first , conn.get() ); @@ -265,18 +267,17 @@ namespace mongo { conn.done(); } - - bool DBConfig::reload(){ + bool DBConfig::reload() { scoped_lock lk( _lock ); return _reload(); } - - bool DBConfig::_reload(){ + + bool DBConfig::_reload() { // TODO: i don't think is 100% correct return _load(); } - - bool DBConfig::dropDatabase( string& errmsg ){ + + bool DBConfig::dropDatabase( string& errmsg ) { /** * 1) make sure everything is up * 2) update config server @@ -287,81 +288,88 @@ namespace mongo { log() << "DBConfig::dropDatabase: " << _name << endl; configServer.logChange( "dropDatabase.start" , _name , BSONObj() ); - + // 1 - if ( ! configServer.allUp( errmsg ) ){ + if ( ! configServer.allUp( errmsg ) ) { log(1) << "\t DBConfig::dropDatabase not all up" << endl; return 0; } - + // 2 grid.removeDB( _name ); { ScopedDbConnection conn( configServer.modelServer() ); conn->remove( ShardNS::database , BSON( "_id" << _name ) ); + errmsg = conn->getLastError(); + if ( ! errmsg.empty() ) { + log() << "could not drop '" << _name << "': " << errmsg << endl; + conn.done(); + return false; + } + conn.done(); } - if ( ! configServer.allUp( errmsg ) ){ + if ( ! configServer.allUp( errmsg ) ) { log() << "error removing from config server even after checking!" << endl; return 0; } log(1) << "\t removed entry from config server for: " << _name << endl; - + set allServers; // 3 - while ( true ){ - int num; + while ( true ) { + int num = 0; if ( ! _dropShardedCollections( num , allServers , errmsg ) ) return 0; log() << " DBConfig::dropDatabase: " << _name << " dropped sharded collections: " << num << endl; if ( num == 0 ) break; } - + // 4 { ScopedDbConnection conn( _primary ); BSONObj res; - if ( ! conn->dropDatabase( _name , &res ) ){ + if ( ! conn->dropDatabase( _name , &res ) ) { errmsg = res.toString(); return 0; } conn.done(); } - + // 5 - for ( set::iterator i=allServers.begin(); i!=allServers.end(); i++ ){ + for ( set::iterator i=allServers.begin(); i!=allServers.end(); i++ ) { ScopedDbConnection conn( *i ); BSONObj res; - if ( ! conn->dropDatabase( _name , &res ) ){ + if ( ! 
conn->dropDatabase( _name , &res ) ) { errmsg = res.toString(); return 0; } - conn.done(); + conn.done(); } - + log(1) << "\t dropped primary db for: " << _name << endl; configServer.logChange( "dropDatabase" , _name , BSONObj() ); return true; } - bool DBConfig::_dropShardedCollections( int& num, set& allServers , string& errmsg ){ + bool DBConfig::_dropShardedCollections( int& num, set& allServers , string& errmsg ) { num = 0; set seen; - while ( true ){ + while ( true ) { Collections::iterator i = _collections.begin(); - for ( ; i != _collections.end(); ++i ){ + for ( ; i != _collections.end(); ++i ) { if ( i->second.isSharded() ) break; } - + if ( i == _collections.end() ) break; - if ( seen.count( i->first ) ){ + if ( seen.count( i->first ) ) { errmsg = "seen a collection twice!"; return false; } @@ -371,19 +379,20 @@ namespace mongo { i->second.getCM()->getAllShards( allServers ); i->second.getCM()->drop( i->second.getCM() ); - + uassert( 10176 , str::stream() << "shard state missing for " << i->first , removeSharding( i->first ) ); + num++; uassert( 10184 , "_dropShardedCollections too many collections - bailing" , num < 100000 ); log(2) << "\t\t dropped " << num << " so far" << endl; } - + return true; } - - void DBConfig::getAllShards(set& shards) const{ + + void DBConfig::getAllShards(set& shards) const { shards.insert(getPrimary()); - for (Collections::const_iterator it(_collections.begin()), end(_collections.end()); it != end; ++it){ - if (it->second.isSharded()){ + for (Collections::const_iterator it(_collections.begin()), end(_collections.end()); it != end; ++it) { + if (it->second.isSharded()) { it->second.getCM()->getAllShards(shards); } // TODO: handle collections on non-primary shard } @@ -391,20 +400,20 @@ namespace mongo { /* --- ConfigServer ---- */ - ConfigServer::ConfigServer() : DBConfig( "config" ){ + ConfigServer::ConfigServer() : DBConfig( "config" ) { _shardingEnabled = false; } - + ConfigServer::~ConfigServer() { } - bool ConfigServer::init( string s ){ + bool ConfigServer::init( string s ) { vector configdbs; splitStringDelim( s, &configdbs, ',' ); return init( configdbs ); } - bool ConfigServer::init( vector configHosts ){ + bool ConfigServer::init( vector configHosts ) { uassert( 10187 , "need configdbs" , configHosts.size() ); string hn = getHostName(); @@ -412,19 +421,19 @@ namespace mongo { sleepsecs(5); dbexit( EXIT_BADOPTIONS ); } - + set hosts; - for ( size_t i=0; i::iterator i=hosts.begin(); i!=hosts.end(); i++ ){ + + for ( set::iterator i=hosts.begin(); i!=hosts.end(); i++ ) { string host = *i; bool ok = false; - for ( int x=10; x>0; x-- ){ - if ( ! hostbyname( host.c_str() ).empty() ){ + for ( int x=10; x>0; x-- ) { + if ( ! 
hostbyname( host.c_str() ).empty() ) { ok = true; break; } @@ -436,10 +445,10 @@ namespace mongo { } _config = configHosts; - + string fullString; joinStringDelim( configHosts, &fullString, ',' ); - _primary.setAddress( fullString , true ); + _primary.setAddress( ConnectionString( fullString , ConnectionString::SYNC ) ); log(1) << " config string : " << fullString << endl; return true; @@ -448,14 +457,14 @@ namespace mongo { bool ConfigServer::checkConfigServersConsistent( string& errmsg , int tries ) const { if ( _config.size() == 1 ) return true; - + if ( tries <= 0 ) return false; - + unsigned firstGood = 0; int up = 0; vector res; - for ( unsigned i=0; i<_config.size(); i++ ){ + for ( unsigned i=0; i<_config.size(); i++ ) { BSONObj x; try { ScopedDbConnection conn( _config[i] ); @@ -469,125 +478,125 @@ namespace mongo { } conn.done(); } - catch ( std::exception& ){ - log(LL_WARNING) << " couldn't check on config server:" << _config[i] << " ok for now" << endl; + catch ( SocketException& e ) { + warning() << " couldn't check on config server:" << _config[i] << " ok for now : " << e.toString() << endl; } res.push_back(x); } - if ( up == 0 ){ + if ( up == 0 ) { errmsg = "no config servers reachable"; return false; } - if ( up == 1 ){ + if ( up == 1 ) { log( LL_WARNING ) << "only 1 config server reachable, continuing" << endl; return true; } BSONObj base = res[firstGood]; - for ( unsigned i=firstGood+1; igetLastError(); conn.done(); return true; } - catch ( DBException& ){ + catch ( DBException& ) { log() << "ConfigServer::allUp : " << _primary.toString() << " seems down!" << endl; errmsg = _primary.toString() + " seems down"; return false; } - + } - - int ConfigServer::dbConfigVersion(){ + + int ConfigServer::dbConfigVersion() { ScopedDbConnection conn( _primary ); int version = dbConfigVersion( conn.conn() ); conn.done(); return version; } - - int ConfigServer::dbConfigVersion( DBClientBase& conn ){ + + int ConfigServer::dbConfigVersion( DBClientBase& conn ) { auto_ptr c = conn.query( "config.version" , BSONObj() ); int version = 0; - if ( c->more() ){ + if ( c->more() ) { BSONObj o = c->next(); version = o["version"].numberInt(); uassert( 10189 , "should only have 1 thing in config.version" , ! c->more() ); } else { - if ( conn.count( ShardNS::shard ) || conn.count( ShardNS::database ) ){ + if ( conn.count( ShardNS::shard ) || conn.count( ShardNS::database ) ) { version = 1; } } - + return version; } - - void ConfigServer::reloadSettings(){ + + void ConfigServer::reloadSettings() { set got; - + ScopedDbConnection conn( _primary ); auto_ptr c = conn->query( ShardNS::settings , BSONObj() ); assert( c.get() ); - while ( c->more() ){ + while ( c->more() ) { BSONObj o = c->next(); string name = o["_id"].valuestrsafe(); got.insert( name ); - if ( name == "chunksize" ){ + if ( name == "chunksize" ) { log(1) << "MaxChunkSize: " << o["value"] << endl; Chunk::MaxChunkSize = o["value"].numberInt() * 1024 * 1024; } - else if ( name == "balancer" ){ + else if ( name == "balancer" ) { // ones we ignore here } else { @@ -595,12 +604,12 @@ namespace mongo { } } - if ( ! got.count( "chunksize" ) ){ + if ( ! 
got.count( "chunksize" ) ) { conn->insert( ShardNS::settings , BSON( "_id" << "chunksize" << "value" << (Chunk::MaxChunkSize / ( 1024 * 1024 ) ) ) ); } - - + + // indexes try { conn->ensureIndex( ShardNS::chunk , BSON( "ns" << 1 << "min" << 1 ) , true ); @@ -608,66 +617,86 @@ namespace mongo { conn->ensureIndex( ShardNS::chunk , BSON( "ns" << 1 << "lastmod" << 1 ) , true ); conn->ensureIndex( ShardNS::shard , BSON( "host" << 1 ) , true ); } - catch ( std::exception& e ){ + catch ( std::exception& e ) { log( LL_WARNING ) << "couldn't create indexes on config db: " << e.what() << endl; } conn.done(); } - string ConfigServer::getHost( string name , bool withPort ){ - if ( name.find( ":" ) != string::npos ){ + string ConfigServer::getHost( string name , bool withPort ) { + if ( name.find( ":" ) != string::npos ) { if ( withPort ) return name; return name.substr( 0 , name.find( ":" ) ); } - if ( withPort ){ + if ( withPort ) { stringstream ss; ss << name << ":" << CmdLine::ConfigServerPort; return ss.str(); } - + return name; } - void ConfigServer::logChange( const string& what , const string& ns , const BSONObj& detail ){ - assert( _primary.ok() ); + /* must never throw */ + void ConfigServer::logChange( const string& what , const string& ns , const BSONObj& detail ) { + string changeID; - static bool createdCapped = false; - static AtomicUInt num; - - ScopedDbConnection conn( _primary ); - - if ( ! createdCapped ){ - try { - conn->createCollection( "config.changelog" , 1024 * 1024 * 10 , true ); - } - catch ( UserException& e ){ - log(1) << "couldn't create changelog (like race condition): " << e << endl; - // don't care + try { + // get this entry's ID so we can use on the exception code path too + stringstream id; + static AtomicUInt num; + id << getHostNameCached() << "-" << terseCurrentTime() << "-" << num++; + changeID = id.str(); + + // send a copy of the message to the log in case it doesn't manage to reach config.changelog + Client* c = currentClient.get(); + BSONObj msg = BSON( "_id" << changeID << "server" << getHostNameCached() << "clientAddr" << (c ? c->clientAddress(true) : "N/A") + << "time" << DATENOW << "what" << what << "ns" << ns << "details" << detail ); + log() << "about to log metadata event: " << msg << endl; + + assert( _primary.ok() ); + + ScopedDbConnection conn( _primary ); + + static bool createdCapped = false; + if ( ! 
createdCapped ) { + try { + conn->createCollection( "config.changelog" , 1024 * 1024 * 10 , true ); + } + catch ( UserException& e ) { + log(1) << "couldn't create changelog (like race condition): " << e << endl; + // don't care + } + createdCapped = true; } - createdCapped = true; + + conn->insert( "config.changelog" , msg ); + + conn.done(); + } - - stringstream id; - id << getHostNameCached() << "-" << terseCurrentTime() << "-" << num++; - BSONObj msg = BSON( "_id" << id.str() << "server" << getHostNameCached() << "time" << DATENOW << - "what" << what << "ns" << ns << "details" << detail ); - log() << "config change: " << msg << endl; + catch ( std::exception& e ) { + // if we got here, it means the config change is only in the log; it didn't make it to config.changelog + log() << "not logging config change: " << changeID << " " << e.what() << endl; + } + } + void ConfigServer::replicaSetChange( const ReplicaSetMonitor * monitor ) { try { - conn->insert( "config.changelog" , msg ); + ScopedDbConnection conn( configServer.getConnectionString() ); + conn->update( ShardNS::shard , BSON( "_id" << monitor->getName() ) , BSON( "$set" << BSON( "host" << monitor->getServerAddress() ) ) ); + conn.done(); } - catch ( std::exception& e ){ - log() << "not logging config change: " << e.what() << endl; + catch ( DBException & ) { + error() << "RSChangeWatcher: could not update config db for set: " << monitor->getName() << " to: " << monitor->getServerAddress() << endl; } - - conn.done(); } - DBConfigPtr configServerPtr (new ConfigServer()); - ConfigServer& configServer = dynamic_cast(*configServerPtr); + DBConfigPtr configServerPtr (new ConfigServer()); + ConfigServer& configServer = dynamic_cast(*configServerPtr); -} +} diff --git a/s/config.h b/s/config.h index 5bff03f..0636835 100644 --- a/s/config.h +++ b/s/config.h @@ -26,14 +26,16 @@ #include "../db/namespace.h" #include "../client/dbclient.h" #include "../client/model.h" -#include "shardkey.h" + +#include "chunk.h" #include "shard.h" +#include "shardkey.h" namespace mongo { struct ShardNS { static string shard; - + static string database; static string collection; static string chunk; @@ -46,11 +48,10 @@ namespace mongo { * Field names used in the 'shards' collection. */ struct ShardFields { - static BSONField draining; - static BSONField maxSize; - static BSONField currSize; + static BSONField draining; // is it draining chunks? 
+ static BSONField maxSize; // max allowed disk space usage }; - + class ConfigServer; class DBConfig; @@ -59,93 +60,95 @@ namespace mongo { extern DBConfigPtr configServerPtr; extern ConfigServer& configServer; - class ChunkManager; - typedef shared_ptr ChunkManagerPtr; - /** * top level configuration for a database */ class DBConfig { struct CollectionInfo { - CollectionInfo(){ + CollectionInfo() { _dirty = false; _dropped = false; } - - CollectionInfo( DBConfig * db , const BSONObj& in ); - + + CollectionInfo( const BSONObj& in ); + bool isSharded() const { return _cm.get(); } - + ChunkManagerPtr getCM() const { return _cm; } - void shard( DBConfig * db , const string& ns , const ShardKeyPattern& key , bool unique ); + void shard( const string& ns , const ShardKeyPattern& key , bool unique ); void unshard(); bool isDirty() const { return _dirty; } bool wasDropped() const { return _dropped; } - + void save( const string& ns , DBClientBase* conn ); - + private: ChunkManagerPtr _cm; bool _dirty; bool _dropped; }; - + typedef map Collections; - + public: - DBConfig( string name ) - : _name( name ) , - _primary("config","") , - _shardingEnabled(false), - _lock("DBConfig"){ + DBConfig( string name ) + : _name( name ) , + _primary("config","") , + _shardingEnabled(false), + _lock("DBConfig") { assert( name.size() ); } - virtual ~DBConfig(){} - - string getName(){ return _name; }; + virtual ~DBConfig() {} + + string getName() { return _name; }; /** * @return if anything in this db is partitioned or not */ - bool isShardingEnabled(){ + bool isShardingEnabled() { return _shardingEnabled; } - + void enableSharding(); ChunkManagerPtr shardCollection( const string& ns , ShardKeyPattern fieldsAndOrder , bool unique ); - + + /** + @return true if there was sharding info to remove + */ + bool removeSharding( const string& ns ); + /** * @return whether or not the 'ns' collection is partitioned */ bool isSharded( const string& ns ); - + ChunkManagerPtr getChunkManager( const string& ns , bool reload = false ); - + /** * @return the correct for shard for the ns * if the namespace is sharded, will return NULL */ const Shard& getShard( const string& ns ); - + const Shard& getPrimary() const { uassert( 8041 , (string)"no primary shard configured for db: " + _name , _primary.ok() ); return _primary; } - + void setPrimary( string s ); bool load(); bool reload(); - + bool dropDatabase( string& errmsg ); // model stuff @@ -153,16 +156,13 @@ namespace mongo { // lockless loading void serialize(BSONObjBuilder& to); - /** - * if i need save in new format - */ - bool unserialize(const BSONObj& from); + void unserialize(const BSONObj& from); void getAllShards(set& shards) const; protected: - /** + /** lockless */ bool _isSharded( const string& ns ); @@ -173,24 +173,16 @@ namespace mongo { bool _reload(); void _save(); - - /** - @return true if there was sharding info to remove - */ - bool removeSharding( const string& ns ); - string _name; // e.g. "alleyinsider" Shard _primary; // e.g. localhost , mongo.foo.com:9999 bool _shardingEnabled; - + //map _sharded; // { "alleyinsider.blog.posts" : { ts : 1 } , ... ] - all ns that are sharded //map _shards; // this will only have entries for things that have been looked at Collections _collections; mongo::mutex _lock; // TODO: change to r/w lock ?? 
- - friend class ChunkManager; }; class ConfigServer : public DBConfig { @@ -198,38 +190,42 @@ namespace mongo { ConfigServer(); ~ConfigServer(); - + bool ok( bool checkConsistency = false ); - - virtual string modelServer(){ + + virtual string modelServer() { uassert( 10190 , "ConfigServer not setup" , _primary.ok() ); return _primary.getConnString(); } - + /** - call at startup, this will initiate connection to the grid db + call at startup, this will initiate connection to the grid db */ bool init( vector configHosts ); - + bool init( string s ); bool allUp(); bool allUp( string& errmsg ); - + int dbConfigVersion(); int dbConfigVersion( DBClientBase& conn ); - + void reloadSettings(); /** * @return 0 = ok, otherwise error # */ int checkConfigVersion( bool upgrade ); - + /** - * log a change to config.changes + * Create a metadata change log entry in the config.changelog collection. + * * @param what e.g. "split" , "migrate" - * @param msg any more info + * @param ns to which collection the metadata change is being applied + * @param msg additional info about the metadata change + * + * This call is guaranteed never to throw. */ void logChange( const string& what , const string& ns , const BSONObj& detail = BSONObj() ); @@ -237,8 +233,10 @@ namespace mongo { return ConnectionString( _primary.getConnString() , ConnectionString::SYNC ); } + void replicaSetChange( const ReplicaSetMonitor * monitor ); + static int VERSION; - + /** * check to see if all config servers have the same state diff --git a/s/config_migrate.cpp b/s/config_migrate.cpp index 1a42144..57890a0 100644 --- a/s/config_migrate.cpp +++ b/s/config_migrate.cpp @@ -30,12 +30,12 @@ namespace mongo { - int ConfigServer::checkConfigVersion( bool upgrade ){ + int ConfigServer::checkConfigVersion( bool upgrade ) { int cur = dbConfigVersion(); if ( cur == VERSION ) return 0; - - if ( cur == 0 ){ + + if ( cur == 0 ) { ScopedDbConnection conn( _primary ); conn->insert( "config.version" , BSON( "_id" << 1 << "version" << VERSION ) ); pool.flush(); @@ -43,20 +43,20 @@ namespace mongo { conn.done(); return 0; } - - if ( cur == 2 ){ + + if ( cur == 2 ) { // need to upgrade assert( VERSION == 3 ); - if ( ! upgrade ){ + if ( ! 
upgrade ) { log() << "newer version of mongo meta data\n" << "need to --upgrade after shutting all mongos down" << endl; return -9; } - + ScopedDbConnection conn( _primary ); - + // do a backup string backupName; { @@ -67,20 +67,20 @@ namespace mongo { log() << "backing up config to: " << backupName << endl; conn->copyDatabase( "config" , backupName ); - map hostToShard; + map hostToShard; set shards; // shards { unsigned n = 0; auto_ptr c = conn->query( ShardNS::shard , BSONObj() ); - while ( c->more() ){ + while ( c->more() ) { BSONObj o = c->next(); string host = o["host"].String(); string name = ""; - + BSONElement id = o["_id"]; - if ( id.type() == String ){ + if ( id.type() == String ) { name = id.String(); } else { @@ -88,18 +88,18 @@ namespace mongo { ss << "shard" << hostToShard.size(); name = ss.str(); } - + hostToShard[host] = name; shards.insert( name ); n++; } - + assert( n == hostToShard.size() ); assert( n == shards.size() ); - + conn->remove( ShardNS::shard , BSONObj() ); - - for ( map::iterator i=hostToShard.begin(); i != hostToShard.end(); i++ ){ + + for ( map::iterator i=hostToShard.begin(); i != hostToShard.end(); i++ ) { conn->insert( ShardNS::shard , BSON( "_id" << i->second << "host" << i->first ) ); } } @@ -109,27 +109,27 @@ namespace mongo { auto_ptr c = conn->query( ShardNS::database , BSONObj() ); map newDBs; unsigned n = 0; - while ( c->more() ){ + while ( c->more() ) { BSONObj old = c->next(); n++; - - if ( old["name"].eoo() ){ + + if ( old["name"].eoo() ) { // already done newDBs[old["_id"].String()] = old; continue; } - + BSONObjBuilder b(old.objsize()); b.appendAs( old["name"] , "_id" ); - + BSONObjIterator i(old); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); if ( strcmp( "_id" , e.fieldName() ) == 0 || - strcmp( "name" , e.fieldName() ) == 0 ){ + strcmp( "name" , e.fieldName() ) == 0 ) { continue; } - + b.append( e ); } @@ -139,45 +139,45 @@ namespace mongo { } assert( n == newDBs.size() ); - + conn->remove( ShardNS::database , BSONObj() ); - - for ( map::iterator i=newDBs.begin(); i!=newDBs.end(); i++ ){ + + for ( map::iterator i=newDBs.begin(); i!=newDBs.end(); i++ ) { conn->insert( ShardNS::database , i->second ); } - + } - + // chunks { unsigned num = 0; map chunks; auto_ptr c = conn->query( ShardNS::chunk , BSONObj() ); - while ( c->more() ){ + while ( c->more() ) { BSONObj x = c->next(); BSONObjBuilder b; string id = Chunk::genID( x["ns"].String() , x["min"].Obj() ); b.append( "_id" , id ); - + BSONObjIterator i(x); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); if ( strcmp( e.fieldName() , "_id" ) == 0 ) continue; b.append( e ); } - + BSONObj n = b.obj(); log() << x << "\n\t" << n << endl; chunks[id] = n; num++; } - + assert( num == chunks.size() ); - + conn->remove( ShardNS::chunk , BSONObj() ); - for ( map::iterator i=chunks.begin(); i!=chunks.end(); i++ ){ + for ( map::iterator i=chunks.begin(); i!=chunks.end(); i++ ) { conn->insert( ShardNS::chunk , i->second ); } @@ -188,7 +188,7 @@ namespace mongo { pool.flush(); return 1; } - + log() << "don't know how to upgrade " << cur << " to " << VERSION << endl; return -8; } diff --git a/s/cursors.cpp b/s/cursors.cpp index 6dd7a20..cf2735b 100644 --- a/s/cursors.cpp +++ b/s/cursors.cpp @@ -21,90 +21,90 @@ #include "../client/connpool.h" #include "../db/queryutil.h" #include "../db/commands.h" -#include "../util/background.h" +#include "../util/concurrency/task.h" namespace mongo { - + // -------- ShardedCursor ----------- - 
ShardedClientCursor::ShardedClientCursor( QueryMessage& q , ClusteredCursor * cursor ){ + ShardedClientCursor::ShardedClientCursor( QueryMessage& q , ClusteredCursor * cursor ) { assert( cursor ); _cursor = cursor; - + _skip = q.ntoskip; _ntoreturn = q.ntoreturn; - + _totalSent = 0; _done = false; _id = 0; - - if ( q.queryOptions & QueryOption_NoCursorTimeout ){ + + if ( q.queryOptions & QueryOption_NoCursorTimeout ) { _lastAccessMillis = 0; } - else + else _lastAccessMillis = Listener::getElapsedTimeMillis(); } - ShardedClientCursor::~ShardedClientCursor(){ + ShardedClientCursor::~ShardedClientCursor() { assert( _cursor ); delete _cursor; _cursor = 0; } - long long ShardedClientCursor::getId(){ - if ( _id <= 0 ){ + long long ShardedClientCursor::getId() { + if ( _id <= 0 ) { _id = cursorCache.genId(); assert( _id >= 0 ); } return _id; } - void ShardedClientCursor::accessed(){ + void ShardedClientCursor::accessed() { if ( _lastAccessMillis > 0 ) _lastAccessMillis = Listener::getElapsedTimeMillis(); } - long long ShardedClientCursor::idleTime( long long now ){ + long long ShardedClientCursor::idleTime( long long now ) { if ( _lastAccessMillis == 0 ) return 0; return now - _lastAccessMillis; } - bool ShardedClientCursor::sendNextBatch( Request& r , int ntoreturn ){ + bool ShardedClientCursor::sendNextBatch( Request& r , int ntoreturn ) { uassert( 10191 , "cursor already done" , ! _done ); - + int maxSize = 1024 * 1024; if ( _totalSent > 0 ) maxSize *= 3; - + BufBuilder b(32768); - + int num = 0; bool sendMore = true; - while ( _cursor->more() ){ + while ( _cursor->more() ) { BSONObj o = _cursor->next(); b.appendBuf( (void*)o.objdata() , o.objsize() ); num++; - - if ( b.len() > maxSize ){ + + if ( b.len() > maxSize ) { break; } - if ( num == ntoreturn ){ + if ( num == ntoreturn ) { // soft limit aka batch size break; } - if ( ntoreturn != 0 && ( -1 * num + _totalSent ) == ntoreturn ){ + if ( ntoreturn != 0 && ( -1 * num + _totalSent ) == ntoreturn ) { // hard limit - total to send sendMore = false; break; } - if ( ntoreturn == 0 && _totalSent == 0 && num > 100 ){ + if ( ntoreturn == 0 && _totalSent == 0 && num > 100 ) { // first batch should be max 100 unless batch size specified break; } @@ -112,123 +112,141 @@ namespace mongo { bool hasMore = sendMore && _cursor->more(); log(6) << "\t hasMore:" << hasMore << " wouldSendMoreIfHad: " << sendMore << " id:" << getId() << " totalSent: " << _totalSent << endl; - + replyToQuery( 0 , r.p() , r.m() , b.buf() , b.len() , num , _totalSent , hasMore ? getId() : 0 ); _totalSent += num; _done = ! hasMore; - + return hasMore; } // ---- CursorCache ----- - + long long CursorCache::TIMEOUT = 600000; CursorCache::CursorCache() - :_mutex( "CursorCache" ), _shardedTotal(0){ + :_mutex( "CursorCache" ), _shardedTotal(0) { } - CursorCache::~CursorCache(){ + CursorCache::~CursorCache() { // TODO: delete old cursors? 
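// Illustrative sketch (not part of the patch): the batch cut-off rules used by
// ShardedClientCursor::sendNextBatch above, restated as a standalone helper.
// Names here (maxBatchBytes, shouldCloseBatch) are hypothetical.
#include <cstdint>

// 1 MB for the first batch, 3 MB once something has already been sent.
inline int64_t maxBatchBytes(int64_t totalSent) {
    return (totalSent > 0 ? 3 : 1) * 1024LL * 1024LL;
}

// bytes: reply size so far; num: docs in this batch; totalSent: docs sent in earlier
// batches; ntoreturn: requested number to return (0 means no explicit limit).
inline bool shouldCloseBatch(int64_t bytes, int num, int64_t totalSent, int ntoreturn, bool& sendMore) {
    if (bytes > maxBatchBytes(totalSent)) return true;               // reply size cap
    if (num == ntoreturn) return true;                               // soft limit, i.e. batch size
    if (ntoreturn != 0 && (-1 * num + totalSent) == ntoreturn) {     // hard limit - total to send
        sendMore = false;
        return true;
    }
    if (ntoreturn == 0 && totalSent == 0 && num > 100) return true;  // default first batch: max 100 docs
    return false;
}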
int logLevel = 1; if ( _cursors.size() || _refs.size() ) logLevel = 0; log( logLevel ) << " CursorCache at shutdown - " - << " sharded: " << _cursors.size() + << " sharded: " << _cursors.size() << " passthrough: " << _refs.size() << endl; } - ShardedClientCursorPtr CursorCache::get( long long id ){ + ShardedClientCursorPtr CursorCache::get( long long id ) const { + LOG(_myLogLevel) << "CursorCache::get id: " << id << endl; scoped_lock lk( _mutex ); - MapSharded::iterator i = _cursors.find( id ); - if ( i == _cursors.end() ){ + MapSharded::const_iterator i = _cursors.find( id ); + if ( i == _cursors.end() ) { OCCASIONALLY log() << "Sharded CursorCache missing cursor id: " << id << endl; return ShardedClientCursorPtr(); } i->second->accessed(); return i->second; } - - void CursorCache::store( ShardedClientCursorPtr cursor ){ + + void CursorCache::store( ShardedClientCursorPtr cursor ) { + LOG(_myLogLevel) << "CursorCache::store cursor " << " id: " << cursor->getId() << endl; assert( cursor->getId() ); scoped_lock lk( _mutex ); _cursors[cursor->getId()] = cursor; _shardedTotal++; } - void CursorCache::remove( long long id ){ + void CursorCache::remove( long long id ) { assert( id ); scoped_lock lk( _mutex ); _cursors.erase( id ); } - - void CursorCache::storeRef( const string& server , long long id ){ + + void CursorCache::storeRef( const string& server , long long id ) { + LOG(_myLogLevel) << "CursorCache::storeRef server: " << server << " id: " << id << endl; assert( id ); scoped_lock lk( _mutex ); _refs[id] = server; } - - long long CursorCache::genId(){ - while ( true ){ + + string CursorCache::getRef( long long id ) const { + LOG(_myLogLevel) << "CursorCache::getRef id: " << id << endl; + assert( id ); + scoped_lock lk( _mutex ); + MapNormal::const_iterator i = _refs.find( id ); + if ( i == _refs.end() ) + return ""; + return i->second; + } + + + long long CursorCache::genId() { + while ( true ) { long long x = security.getNonce(); if ( x == 0 ) continue; if ( x < 0 ) x *= -1; - + scoped_lock lk( _mutex ); MapSharded::iterator i = _cursors.find( x ); if ( i != _cursors.end() ) continue; - + MapNormal::iterator j = _refs.find( x ); if ( j != _refs.end() ) continue; - + return x; } } - void CursorCache::gotKillCursors(Message& m ){ + void CursorCache::gotKillCursors(Message& m ) { int *x = (int *) m.singleData()->_data; x++; // reserved int n = *x++; - if ( n > 2000 ){ + if ( n > 2000 ) { log( n < 30000 ? 
LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl; } uassert( 13286 , "sent 0 cursors to kill" , n >= 1 ); uassert( 13287 , "too many cursors to kill" , n < 30000 ); - + long long * cursors = (long long *)x; - for ( int i=0; isecond; _refs.erase( j ); } - + + LOG(_myLogLevel) << "CursorCache::found gotKillCursors id: " << id << " server: " << server << endl; + assert( server.size() ); ScopedDbConnection conn( server ); conn->killCursor( id ); @@ -236,7 +254,7 @@ namespace mongo { } } - void CursorCache::appendInfo( BSONObjBuilder& result ){ + void CursorCache::appendInfo( BSONObjBuilder& result ) const { scoped_lock lk( _mutex ); result.append( "sharded" , (int)_cursors.size() ); result.appendNumber( "shardedEver" , _shardedTotal ); @@ -244,12 +262,12 @@ namespace mongo { result.append( "totalOpen" , (int)(_cursors.size() + _refs.size() ) ); } - void CursorCache::doTimeouts(){ + void CursorCache::doTimeouts() { long long now = Listener::getElapsedTimeMillis(); scoped_lock lk( _mutex ); - for ( MapSharded::iterator i=_cursors.begin(); i!=_cursors.end(); ++i ){ + for ( MapSharded::iterator i=_cursors.begin(); i!=_cursors.end(); ++i ) { long long idleFor = i->second->idleTime( now ); - if ( idleFor < TIMEOUT ){ + if ( idleFor < TIMEOUT ) { continue; } log() << "killing old cursor " << i->second->getId() << " idle for: " << idleFor << "ms" << endl; // TODO: make log(1) @@ -258,18 +276,19 @@ namespace mongo { } CursorCache cursorCache; - - class CursorTimeoutThread : public PeriodicBackgroundJob { + + int CursorCache::_myLogLevel = 3; + + class CursorTimeoutTask : public task::Task { public: - CursorTimeoutThread() : PeriodicBackgroundJob( 4000 ){} - virtual string name() { return "cursorTimeout"; } - virtual void runLoop(){ + virtual string name() const { return "cursorTimeout"; } + virtual void doWork() { cursorCache.doTimeouts(); } - } cursorTimeoutThread; + } cursorTimeoutTask; - void CursorCache::startTimeoutThread(){ - cursorTimeoutThread.go(); + void CursorCache::startTimeoutThread() { + task::repeat( &cursorTimeoutTask , 400 ); } class CmdCursorInfo : public Command { @@ -280,7 +299,7 @@ namespace mongo { help << " example: { cursorInfo : 1 }"; } virtual LockType locktype() const { return NONE; } - bool run(const string&, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string&, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { cursorCache.appendInfo( result ); if ( jsobj["setTimeout"].isNumber() ) CursorCache::TIMEOUT = jsobj["setTimeout"].numberLong(); diff --git a/s/cursors.h b/s/cursors.h index 53c5b64..7b54af6 100644 --- a/s/cursors.h +++ b/s/cursors.h @@ -16,7 +16,7 @@ */ -#pragma once +#pragma once #include "../pch.h" @@ -35,21 +35,21 @@ namespace mongo { virtual ~ShardedClientCursor(); long long getId(); - + /** * @return whether there is more data left */ - bool sendNextBatch( Request& r ){ return sendNextBatch( r , _ntoreturn ); } + bool sendNextBatch( Request& r ) { return sendNextBatch( r , _ntoreturn ); } bool sendNextBatch( Request& r , int ntoreturn ); - + void accessed(); /** @return idle time in ms */ long long idleTime( long long now ); protected: - + ClusteredCursor * _cursor; - + int _skip; int _ntoreturn; @@ -62,10 +62,10 @@ namespace mongo { }; typedef boost::shared_ptr ShardedClientCursorPtr; - + class CursorCache { public: - + static long long TIMEOUT; typedef map MapSharded; @@ -73,29 +73,34 @@ namespace mongo { CursorCache(); ~CursorCache(); - - ShardedClientCursorPtr get( long 
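// Illustrative sketch (not part of the patch): the idle-cursor sweep performed by
// CursorCache::doTimeouts above, written against a plain std::map. The types and
// the sweepIdleCursors name are stand-ins, not the mongos types.
#include <cstdint>
#include <map>

struct IdleInfo { int64_t lastAccessMillis; };   // stand-in for per-cursor access state

// Erase every entry idle for at least timeoutMillis; lastAccessMillis == 0 means "never times out".
inline void sweepIdleCursors(std::map<int64_t, IdleInfo>& cursors,
                             int64_t nowMillis, int64_t timeoutMillis) {
    for (std::map<int64_t, IdleInfo>::iterator it = cursors.begin(); it != cursors.end(); ) {
        int64_t idleFor = it->second.lastAccessMillis == 0 ? 0 : nowMillis - it->second.lastAccessMillis;
        if (idleFor < timeoutMillis)
            ++it;                    // recently used (or pinned): keep
        else
            cursors.erase(it++);     // timed out: drop the cursor entry
    }
}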
long id ); + + ShardedClientCursorPtr get( long long id ) const; void store( ShardedClientCursorPtr cursor ); void remove( long long id ); void storeRef( const string& server , long long id ); - void gotKillCursors(Message& m ); - - void appendInfo( BSONObjBuilder& result ); + /** @return the server for id or "" */ + string getRef( long long id ) const ; + void gotKillCursors(Message& m ); + + void appendInfo( BSONObjBuilder& result ) const ; + long long genId(); void doTimeouts(); void startTimeoutThread(); private: - mutex _mutex; + mutable mongo::mutex _mutex; MapSharded _cursors; MapNormal _refs; - + long long _shardedTotal; + + static int _myLogLevel; }; - + extern CursorCache cursorCache; } diff --git a/s/d_chunk_manager.cpp b/s/d_chunk_manager.cpp new file mode 100644 index 0000000..d4fea30 --- /dev/null +++ b/s/d_chunk_manager.cpp @@ -0,0 +1,328 @@ +// @file d_chunk_manager.cpp + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#include "pch.h" + +#include "../client/connpool.h" +#include "../client/dbclientmockcursor.h" +#include "../db/instance.h" + +#include "d_chunk_manager.h" + +namespace mongo { + + ShardChunkManager::ShardChunkManager( const string& configServer , const string& ns , const string& shardName ) { + + // have to get a connection to the config db + // special case if i'm the configdb since i'm locked and if i connect to myself + // its a deadlock + scoped_ptr scoped; + scoped_ptr direct; + DBClientBase * conn; + if ( configServer.empty() ) { + direct.reset( new DBDirectClient() ); + conn = direct.get(); + } + else { + scoped.reset( new ScopedDbConnection( configServer ) ); + conn = scoped->get(); + } + + // get this collection's sharding key + BSONObj collectionDoc = conn->findOne( "config.collections", BSON( "_id" << ns ) ); + uassert( 13539 , str::stream() << ns << " does not exist" , !collectionDoc.isEmpty() ); + uassert( 13540 , str::stream() << ns << " collection config entry corrupted" , collectionDoc["dropped"].type() ); + uassert( 13541 , str::stream() << ns << " dropped. Re-shard collection first." 
, !collectionDoc["dropped"].Bool() ); + _fillCollectionKey( collectionDoc ); + + // query for all the chunks for 'ns' that live in this shard, sorting so we can efficiently bucket them + BSONObj q = BSON( "ns" << ns << "shard" << shardName ); + auto_ptr cursor = conn->query( "config.chunks" , Query(q).sort( "min" ) ); + _fillChunks( cursor.get() ); + _fillRanges(); + + if ( scoped.get() ) + scoped->done(); + + if ( _chunksMap.empty() ) + log() << "no chunk for collection " << ns << " on shard " << shardName << endl; + } + + ShardChunkManager::ShardChunkManager( const BSONObj& collectionDoc , const BSONArray& chunksArr ) { + _fillCollectionKey( collectionDoc ); + + scoped_ptr c ( new DBClientMockCursor( chunksArr ) ); + _fillChunks( c.get() ); + _fillRanges(); + } + + void ShardChunkManager::_fillCollectionKey( const BSONObj& collectionDoc ) { + BSONElement e = collectionDoc["key"]; + uassert( 13542 , str::stream() << "collection doesn't have a key: " << collectionDoc , ! e.eoo() && e.isABSONObj() ); + + BSONObj keys = e.Obj().getOwned(); + BSONObjBuilder b; + BSONForEach( key , keys ) { + b.append( key.fieldName() , 1 ); + } + _key = b.obj(); + } + + void ShardChunkManager::_fillChunks( DBClientCursorInterface* cursor ) { + assert( cursor ); + + ShardChunkVersion version; + while ( cursor->more() ) { + BSONObj d = cursor->next(); + _chunksMap.insert( make_pair( d["min"].Obj().getOwned() , d["max"].Obj().getOwned() ) ); + + ShardChunkVersion currVersion( d["lastmod"] ); + if ( currVersion > version ) { + version = currVersion; + } + } + _version = version; + } + + void ShardChunkManager::_fillRanges() { + if ( _chunksMap.empty() ) + return; + + // load the chunk information, coallesceing their ranges + // the version for this shard would be the highest version for any of the chunks + RangeMap::const_iterator it = _chunksMap.begin(); + BSONObj min,max; + while ( it != _chunksMap.end() ) { + BSONObj currMin = it->first; + BSONObj currMax = it->second; + ++it; + + // coallesce the chunk's bounds in ranges if they are adjacent chunks + if ( min.isEmpty() ) { + min = currMin; + max = currMax; + continue; + } + if ( max == currMin ) { + max = currMax; + continue; + } + + _rangesMap.insert( make_pair( min , max ) ); + + min = currMin; + max = currMax; + } + assert( ! min.isEmpty() ); + + _rangesMap.insert( make_pair( min , max ) ); + } + + static bool contains( const BSONObj& min , const BSONObj& max , const BSONObj& point ) { + return point.woCompare( min ) >= 0 && point.woCompare( max ) < 0; + } + + bool ShardChunkManager::belongsToMe( const BSONObj& obj ) const { + if ( _rangesMap.size() == 0 ) + return false; + + BSONObj x = obj.extractFields(_key); + + RangeMap::const_iterator it = _rangesMap.upper_bound( x ); + if ( it != _rangesMap.begin() ) + it--; + + bool good = contains( it->first , it->second , x ); + +#if 0 + if ( ! 
good ) { + log() << "bad: " << x << " " << it->first << " " << x.woCompare( it->first ) << " " << x.woCompare( it->second ) << endl; + for ( RangeMap::const_iterator i=_rangesMap.begin(); i!=_rangesMap.end(); ++i ) { + log() << "\t" << i->first << "\t" << i->second << "\t" << endl; + } + } +#endif + + return good; + } + + bool ShardChunkManager::getNextChunk( const BSONObj& lookupKey, BSONObj* foundMin , BSONObj* foundMax ) const { + assert( foundMin ); + assert( foundMax ); + *foundMin = BSONObj(); + *foundMax = BSONObj(); + + if ( _chunksMap.empty() ) { + return true; + } + + RangeMap::const_iterator it; + if ( lookupKey.isEmpty() ) { + it = _chunksMap.begin(); + *foundMin = it->first; + *foundMax = it->second; + return _chunksMap.size() == 1; + } + + it = _chunksMap.upper_bound( lookupKey ); + if ( it != _chunksMap.end() ) { + *foundMin = it->first; + *foundMax = it->second; + return false; + } + + return true; + } + + void ShardChunkManager::_assertChunkExists( const BSONObj& min , const BSONObj& max ) const { + RangeMap::const_iterator it = _chunksMap.find( min ); + if ( it == _chunksMap.end() ) { + uasserted( 13586 , str::stream() << "couldn't find chunk " << min << "->" << max ); + } + + if ( it->second.woCompare( max ) != 0 ) { + ostringstream os; + os << "ranges differ, " + << "requested: " << min << " -> " << max << " " + << "existing: " << (it == _chunksMap.end()) ? "" : it->first.toString() + " -> " + it->second.toString(); + uasserted( 13587 , os.str() ); + } + } + + ShardChunkManager* ShardChunkManager::cloneMinus( const BSONObj& min, const BSONObj& max, const ShardChunkVersion& version ) { + + // check that we have the exact chunk that'll be subtracted + _assertChunkExists( min , max ); + + auto_ptr p( new ShardChunkManager ); + p->_key = this->_key; + + if ( _chunksMap.size() == 1 ) { + // if left with no chunks, just reset version + uassert( 13590 , str::stream() << "setting version to " << version << " on removing last chunk", version == 0 ); + + p->_version = 0; + + } + else { + // can't move version backwards when subtracting chunks + // this is what guarantees that no read or write would be taken once we subtract data from the current shard + if ( version <= _version ) { + uasserted( 13585 , str::stream() << "version " << version.toString() << " not greater than " << _version.toString() ); + } + + p->_chunksMap = this->_chunksMap; + p->_chunksMap.erase( min ); + p->_version = version; + p->_fillRanges(); + } + + return p.release(); + } + + static bool overlap( const BSONObj& l1 , const BSONObj& h1 , const BSONObj& l2 , const BSONObj& h2 ) { + return ! ( ( h1.woCompare( l2 ) <= 0 ) || ( h2.woCompare( l1 ) <= 0 ) ); + } + + ShardChunkManager* ShardChunkManager::clonePlus( const BSONObj& min , const BSONObj& max , const ShardChunkVersion& version ) { + + // it is acceptable to move version backwards (e.g., undoing a migration that went bad during commit) + // but only cloning away the last chunk may reset the version to 0 + uassert( 13591 , "version can't be set to zero" , version > 0 ); + + if ( ! 
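// Illustrative sketch (not part of the patch): the ownership lookup in
// ShardChunkManager::belongsToMe above -- one upper_bound on the coalesced
// range map, then a step back. Plain integers; ownsKey is a hypothetical name.
#include <map>

inline bool ownsKey(const std::map<int, int>& ranges, int key) {
    if (ranges.empty())
        return false;
    std::map<int, int>::const_iterator it = ranges.upper_bound(key);  // first range with min > key
    if (it != ranges.begin())
        --it;                                                         // candidate range: the one just before it
    return key >= it->first && key < it->second;                      // half-open [min, max) containment
}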
_chunksMap.empty() ) { + + // check that there isn't any chunk on the interval to be added + RangeMap::const_iterator it = _chunksMap.lower_bound( max ); + if ( it != _chunksMap.begin() ) { + --it; + } + if ( overlap( min , max , it->first , it->second ) ) { + ostringstream os; + os << "ranges overlap, " + << "requested: " << min << " -> " << max << " " + << "existing: " << it->first.toString() + " -> " + it->second.toString(); + uasserted( 13588 , os.str() ); + } + } + + auto_ptr p( new ShardChunkManager ); + + p->_key = this->_key; + p->_chunksMap = this->_chunksMap; + p->_chunksMap.insert( make_pair( min.getOwned() , max.getOwned() ) ); + p->_version = version; + p->_fillRanges(); + + return p.release(); + } + + ShardChunkManager* ShardChunkManager::cloneSplit( const BSONObj& min , const BSONObj& max , const vector& splitKeys , + const ShardChunkVersion& version ) { + + // the version required in both resulting chunks could be simply an increment in the minor portion of the current version + // however, we are enforcing uniqueness over the attributes of the configdb collection 'chunks' + // so in practice, a migrate somewhere may force this split to pick up a version that has the major portion higher + // than the one that this shard has been using + // + // TODO drop the uniqueness constraint and tigthen the check below so that only the minor portion of version changes + if ( version <= _version ) { + uasserted( 13592 , str::stream() << "version " << version.toString() << " not greater than " << _version.toString() ); + } + + // check that we have the exact chunk that'll be split and that the split point is valid + _assertChunkExists( min , max ); + for ( vector::const_iterator it = splitKeys.begin() ; it != splitKeys.end() ; ++it ) { + if ( ! contains( min , max , *it ) ) { + uasserted( 13593 , str::stream() << "can split " << min << " -> " << max << " on " << *it ); + } + } + + auto_ptr p( new ShardChunkManager ); + + p->_key = this->_key; + p->_chunksMap = this->_chunksMap; + p->_version = version; // will increment second, third, ... chunks below + + BSONObj startKey = min; + for ( vector::const_iterator it = splitKeys.begin() ; it != splitKeys.end() ; ++it ) { + BSONObj split = *it; + p->_chunksMap[min] = split.getOwned(); + p->_chunksMap.insert( make_pair( split.getOwned() , max.getOwned() ) ); + p->_version.incMinor(); + startKey = split; + } + p->_fillRanges(); + + return p.release(); + } + + string ShardChunkManager::toString() const { + StringBuilder ss; + ss << " ShardChunkManager version: " << _version << " key: " << _key; + bool first = true; + for ( RangeMap::const_iterator i=_rangesMap.begin(); i!=_rangesMap.end(); ++i ) { + if ( first ) first = false; + else ss << " , "; + + ss << i->first << " -> " << i->second; + } + return ss.str(); + } + +} // namespace mongo diff --git a/s/d_chunk_manager.h b/s/d_chunk_manager.h new file mode 100644 index 0000000..9fb95e7 --- /dev/null +++ b/s/d_chunk_manager.h @@ -0,0 +1,150 @@ +// @file d_chunk_manager.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. 
+* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "../pch.h" + +#include "../db/jsobj.h" +#include "util.h" + +namespace mongo { + + /** + * Controls the boundaries of all the chunks for a given collection that live in this shard. + * + * ShardChunkManager instances never change after construction. There are methods provided that would generate a + * new manager if new chunks are added, subtracted, or split. + * + * TODO + * The responsibility of maintaining the version for a shard is still shared between this class and its caller. The + * manager does check corner cases (e.g. cloning out the last chunk generates a manager with version 0) but ultimately + * still cannot be responsible to set all versions. Currently, they are a function of the global state as opposed to + * the per-shard one. + */ + class ShardChunkManager : public boost::noncopyable { + public: + + /** + * Loads the ShardChunkManager with all boundaries for chunks of a given collection that live in an given + * shard. + * + * @param configServer name of the server where the configDB currently is. Can be empty to indicate + * that the configDB is running locally + * @param ns namespace for the collections whose chunks we're interested + * @param shardName name of the shard that this chunk matcher should track + * + * This constructor throws if collection is dropped/malformed and on connectivity errors + */ + ShardChunkManager( const string& configServer , const string& ns , const string& shardName ); + + /** + * Same as the regular constructor but used in unittest (no access to configDB required). + * + * @param collectionDoc simulates config.collection's entry for one colleciton + * @param chunksDocs simulates config.chunks' entries for one collection's shard + */ + ShardChunkManager( const BSONObj& collectionDoc , const BSONArray& chunksDoc ); + + ~ShardChunkManager() {} + + /** + * Generates a new manager based on 'this's state minus a given chunk. + * + * @param min max chunk boundaries for the chunk to subtract + * @param version that the resulting manager should be at. The version has to be higher than the current one. + * When cloning away the last chunk, verstion must be 0. + * @return a new ShardChunkManager, to be owned by the caller + */ + ShardChunkManager* cloneMinus( const BSONObj& min , const BSONObj& max , const ShardChunkVersion& version ); + + /** + * Generates a new manager based on 'this's state plus a given chunk. + * + * @param min max chunk boundaries for the chunk to add + * @param version that the resulting manager should be at. It can never be 0, though (see CloneMinus). + * @return a new ShardChunkManager, to be owned by the caller + */ + ShardChunkManager* clonePlus( const BSONObj& min , const BSONObj& max , const ShardChunkVersion& version ); + + /** + * Generates a new manager by splitting an existing chunk at one or more points. + * + * @param min max boundaries of chunk to be split + * @param splitKeys points to split original chunk at + * @param version to be used in first chunk. The subsequent chunks would increment the minor version. + * @return a new ShardChunkManager with the chunk split, to be owned by the caller + */ + ShardChunkManager* cloneSplit( const BSONObj& min , const BSONObj& max , const vector& splitKeys , + const ShardChunkVersion& version ); + + /** + * Checks whether a document belongs to this shard. 
+ * + * @param obj document containing sharding keys (and, optionally, other attributes) + * @return true if shards hold the object + */ + bool belongsToMe( const BSONObj& obj ) const; + + /** + * Given a chunk's min key (or empty doc), gets the boundary of the chunk following that one (the first). + * + * @param lookupKey is the min key for a previously obtained chunk or the empty document + * @param foundMin IN/OUT min for chunk following the one starting at lookupKey + * @param foundMax IN/OUT max for the above chunk + * @return true if the chunk returned is the last one + */ + bool getNextChunk( const BSONObj& lookupKey, BSONObj* foundMin , BSONObj* foundMax ) const; + + // accessors + + ShardChunkVersion getVersion() const { return _version; } + BSONObj getKey() const { return _key.getOwned(); } + unsigned getNumChunks() const { return _chunksMap.size(); } + + string toString() const; + private: + // highest ShardChunkVersion for which this ShardChunkManager's information is accurate + ShardChunkVersion _version; + + // key pattern for chunks under this range + BSONObj _key; + + // a map from a min key into the chunk's (or range's) max boundary + typedef map< BSONObj, BSONObj , BSONObjCmp > RangeMap; + RangeMap _chunksMap; + + // a map from a min key into a range or continguous chunks + // redundant but we expect high chunk continguity, expecially in small installations + RangeMap _rangesMap; + + /** constructors helpers */ + void _fillCollectionKey( const BSONObj& collectionDoc ); + void _fillChunks( DBClientCursorInterface* cursor ); + void _fillRanges(); + + /** throws if the exact chunk is not in the chunks' map */ + void _assertChunkExists( const BSONObj& min , const BSONObj& max ) const; + + /** can only be used in the cloning calls */ + ShardChunkManager() {} + }; + + typedef shared_ptr ShardChunkManagerPtr; + +} // namespace mongo diff --git a/s/d_logic.cpp b/s/d_logic.cpp index 62288ed..c032883 100644 --- a/s/d_logic.cpp +++ b/s/d_logic.cpp @@ -1,4 +1,4 @@ -// d_logic.cpp +// @file d_logic.cpp /** * Copyright (C) 2008 10gen Inc. @@ -37,32 +37,32 @@ #include "shard.h" #include "d_logic.h" +#include "d_writeback.h" using namespace std; namespace mongo { - bool handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ){ - if ( ! 
shardingState.enabled() ) - return false; + bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) { + DEV assert( shardingState.enabled() ); int op = m.operation(); - if ( op < 2000 - || op >= 3000 - || op == dbGetMore // cursors are weird - ) + if ( op < 2000 + || op >= 3000 + || op == dbGetMore // cursors are weird + ) return false; - - DbMessage d(m); + + DbMessage d(m); const char *ns = d.getns(); string errmsg; - if ( shardVersionOk( ns , opIsWrite( op ) , errmsg ) ){ + if ( shardVersionOk( ns , opIsWrite( op ) , errmsg ) ) { return false; } log(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl; - - if ( doesOpGetAResponse( op ) ){ + + if ( doesOpGetAResponse( op ) ) { assert( dbresponse ); BufBuilder b( 32768 ); b.skip( sizeof( QueryResult ) ); @@ -70,7 +70,7 @@ namespace mongo { BSONObj obj = BSON( "$err" << errmsg ); b.appendBuf( obj.objdata() , obj.objsize() ); } - + QueryResult *qr = (QueryResult*)b.buf(); qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale; qr->len = b.len(); @@ -82,19 +82,19 @@ namespace mongo { Message * resp = new Message(); resp->setData( qr , true ); - + dbresponse->response = resp; dbresponse->responseTo = m.header()->id; return true; } - + OID writebackID; writebackID.init(); lastError.getSafe()->writeback( writebackID ); const OID& clientID = ShardedConnectionInfo::get(false)->getID(); massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() ); - + log(1) << "got write with an old config - writing back ns: " << ns << endl; if ( logLevel ) log(1) << debugString( m ) << endl; @@ -102,11 +102,12 @@ namespace mongo { b.appendBool( "writeBack" , true ); b.append( "ns" , ns ); b.append( "id" , writebackID ); + b.append( "connectionId" , cc().getConnectionId() ); b.appendTimestamp( "version" , shardingState.getVersion( ns ) ); b.appendTimestamp( "yourVersion" , ShardedConnectionInfo::get( true )->getVersion( ns ) ); b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) ); log(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl; - queueWriteBack( clientID.str() , b.obj() ); + writeBackManager.queueWriteBack( clientID.str() , b.obj() ); return true; } diff --git a/s/d_logic.h b/s/d_logic.h index a000f6b..718836c 100644 --- a/s/d_logic.h +++ b/s/d_logic.h @@ -1,4 +1,4 @@ -// d_logic.h +// @file d_logic.h /* * Copyright (C) 2010 10gen Inc. 
* @@ -19,38 +19,20 @@ #pragma once #include "../pch.h" + #include "../db/jsobj.h" + +#include "d_chunk_manager.h" #include "util.h" namespace mongo { - - class ShardingState; - - typedef ShardChunkVersion ConfigVersion; - typedef map NSVersionMap; - - // ----------- - class ChunkMatcher { - typedef map,BSONObjCmp> MyMap; - public: - - bool belongsToMe( const BSONObj& key , const DiskLoc& loc ) const; + class Database; + class DiskLoc; - private: - ChunkMatcher( ConfigVersion version ); - - void gotRange( const BSONObj& min , const BSONObj& max ); - - ConfigVersion _version; - BSONObj _key; - MyMap _map; - - friend class ShardingState; - }; + typedef ShardChunkVersion ConfigVersion; + typedef map NSVersionMap; - typedef shared_ptr ChunkMatcherPtr; - // -------------- // --- global state --- // -------------- @@ -58,100 +40,182 @@ namespace mongo { class ShardingState { public: ShardingState(); - + bool enabled() const { return _enabled; } const string& getConfigServer() const { return _configServer; } void enable( const string& server ); void gotShardName( const string& name ); - void gotShardHost( const string& host ); - + void gotShardHost( string host ); + + /** Reverts back to a state where this mongod is not sharded. */ + void resetShardingState(); + + // versioning support + bool hasVersion( const string& ns ); bool hasVersion( const string& ns , ConfigVersion& version ); - ConfigVersion& getVersion( const string& ns ); // TODO: this is dangeroues - void setVersion( const string& ns , const ConfigVersion& version ); - + const ConfigVersion getVersion( const string& ns ) const; + + /** + * Uninstalls the manager for a given collection. This should be used when the collection is dropped. + * + * NOTE: + * An existing collection with no chunks on this shard will have a manager on version 0, which is different than a + * a dropped collection, which will not have a manager. + * + * TODO + * When sharding state is enabled, absolutely all collections should have a manager. (The non-sharded ones are + * a be degenerate case of one-chunk collections). + * For now, a dropped collection and an non-sharded one are indistinguishable (SERVER-1849) + * + * @param ns the collection to be dropped + */ + void resetVersion( const string& ns ); + + /** + * Requests to access a collection at a certain version. If the collection's manager is not at that version it + * will try to update itself to the newest version. The request is only granted if the version is the current or + * the newest one. + * + * @param ns collection to be accessed + * @param version (IN) the client belive this collection is on and (OUT) the version the manager is actually in + * @return true if the access can be allowed at the provided version + */ + bool trySetVersion( const string& ns , ConfigVersion& version ); + void appendInfo( BSONObjBuilder& b ); - - ChunkMatcherPtr getChunkMatcher( const string& ns ); - + + // querying support + + bool needShardChunkManager( const string& ns ) const; + ShardChunkManagerPtr getShardChunkManager( const string& ns ); + + // chunk migrate and split support + + /** + * Creates and installs a new chunk manager for a given collection by "forgetting" about one of its chunks. + * The new manager uses the provided version, which has to be higher than the current manager's. + * One exception: if the forgotten chunk is the last one in this shard for the collection, version has to be 0. + * + * If it runs successfully, clients need to grab the new version to access the collection. 
+ * + * @param ns the collection + * @param min max the chunk to eliminate from the current manager + * @param version at which the new manager should be at + */ + void donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ); + + /** + * Creates and installs a new chunk manager for a given collection by reclaiming a previously donated chunk. + * The previous manager's version has to be provided. + * + * If it runs successfully, clients that became stale by the previous donateChunk will be able to access the + * collection again. + * + * @param ns the collection + * @param min max the chunk to reclaim and add to the current manager + * @param version at which the new manager should be at + */ + void undoDonateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ); + + /** + * Creates and installs a new chunk manager for a given collection by splitting one of its chunks in two or more. + * The version for the first split chunk should be provided. The subsequent chunks' version would be the latter with the + * minor portion incremented. + * + * The effect on clients will depend on the version used. If the major portion is the same as the current shards, + * clients shouldn't perceive the split. + * + * @param ns the collection + * @param min max the chunk that should be split + * @param splitKeys point in which to split + * @param version at which the new manager should be at + */ + void splitChunk( const string& ns , const BSONObj& min , const BSONObj& max , const vector& splitKeys , + ShardChunkVersion version ); + bool inCriticalMigrateSection(); + private: - bool _enabled; - + string _configServer; - + string _shardName; string _shardHost; - mongo::mutex _mutex; - NSVersionMap _versions; - map _chunks; + // protects state below + mutable mongo::mutex _mutex; + + // map from a namespace into the ensemble of chunk ranges that are stored in this mongod + // a ShardChunkManager carries all state we need for a collection at this shard, including its version information + typedef map ChunkManagersMap; + ChunkManagersMap _chunks; }; - + extern ShardingState shardingState; - // -------------- - // --- per connection --- - // -------------- - + /** + * one per connection from mongos + * holds version state for each namesapce + */ class ShardedConnectionInfo { public: ShardedConnectionInfo(); - + const OID& getID() const { return _id; } bool hasID() const { return _id.isSet(); } void setID( const OID& id ); - - ConfigVersion& getVersion( const string& ns ); // TODO: this is dangeroues + + const ConfigVersion getVersion( const string& ns ) const; void setVersion( const string& ns , const ConfigVersion& version ); - + static ShardedConnectionInfo* get( bool create ); static void reset(); - - bool inForceMode() const { - return _forceMode; + + bool inForceVersionOkMode() const { + return _forceVersionOk; } - - void enterForceMode(){ _forceMode = true; } - void leaveForceMode(){ _forceMode = false; } + + void enterForceVersionOkMode() { _forceVersionOk = true; } + void leaveForceVersionOkMode() { _forceVersionOk = false; } private: - + OID _id; NSVersionMap _versions; - bool _forceMode; + bool _forceVersionOk; // if this is true, then chunk version #s aren't check, and all ops are allowed static boost::thread_specific_ptr _tl; }; - struct ShardForceModeBlock { - ShardForceModeBlock(){ + struct ShardForceVersionOkModeBlock { + ShardForceVersionOkModeBlock() { info = ShardedConnectionInfo::get( false ); if ( info ) - 
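// Illustrative sketch (not part of the patch): how the major/minor parts of a
// chunk version are expected to move for the operations documented above.
// ToyChunkVersion is a stand-in, not the real ShardChunkVersion; resetting the
// minor part on incMajor is an assumption made only for this sketch.
#include <cassert>
#include <cstdint>

struct ToyChunkVersion {
    uint32_t major, minor;
    ToyChunkVersion() : major(0), minor(0) {}
    void incMajor() { ++major; minor = 0; }    // assumption: a migrate bumps the major part
    void incMinor() { ++minor; }               // a split bumps only the minor part
    bool operator>(const ToyChunkVersion& o) const {
        return major != o.major ? major > o.major : minor > o.minor;
    }
};

int main() {
    ToyChunkVersion current;                 // version currently installed for a collection
    current.major = 3;

    ToyChunkVersion donated = current;       // donateChunk: the new manager must carry a higher version,
    donated.incMajor();                      // forcing stale clients to refresh (see moveChunk further below)
    assert(donated > current);

    ToyChunkVersion split = donated;         // splitChunk: second, third, ... pieces only increment the
    split.incMinor();                        // minor part, so readers at the same major are unaffected
    assert(split > donated);
    return 0;
}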
info->enterForceMode(); + info->enterForceVersionOkMode(); } - ~ShardForceModeBlock(){ + ~ShardForceVersionOkModeBlock() { if ( info ) - info->leaveForceMode(); + info->leaveForceVersionOkMode(); } ShardedConnectionInfo * info; }; - + // ----------------- // --- core --- // ----------------- unsigned long long extractVersion( BSONElement e , string& errmsg ); - + /** * @return true if we have any shard info for the ns */ bool haveLocalShardingInfo( const string& ns ); - + /** * @return true if the current threads shard version is ok, or not in sharded version */ @@ -160,15 +224,18 @@ namespace mongo { /** * @return true if we took care of the message and nothing else should be done */ - bool handlePossibleShardedMessage( Message &m, DbResponse * dbresponse ); + struct DbResponse; - void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ); + bool _handlePossibleShardedMessage( Message &m, DbResponse * dbresponse ); - // ----------------- - // --- writeback --- - // ----------------- + /** What does this do? document please? */ + inline bool handlePossibleShardedMessage( Message &m, DbResponse * dbresponse ) { + if( !shardingState.enabled() ) + return false; + return _handlePossibleShardedMessage(m, dbresponse); + } - /* queue a write back on a remote server for a failed write */ - void queueWriteBack( const string& remote , const BSONObj& o ); + void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ); + void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ); } diff --git a/s/d_migrate.cpp b/s/d_migrate.cpp index 8e9584c..2878276 100644 --- a/s/d_migrate.cpp +++ b/s/d_migrate.cpp @@ -25,18 +25,24 @@ #include "pch.h" #include #include +#include #include "../db/commands.h" #include "../db/jsobj.h" #include "../db/dbmessage.h" #include "../db/query.h" #include "../db/cmdline.h" +#include "../db/queryoptimizer.h" +#include "../db/btree.h" +#include "../db/repl_block.h" +#include "../db/dur.h" #include "../client/connpool.h" #include "../client/distlock.h" #include "../util/queue.h" #include "../util/unittest.h" +#include "../util/processinfo.h" #include "shard.h" #include "d_logic.h" @@ -49,131 +55,185 @@ namespace mongo { class MoveTimingHelper { public: - MoveTimingHelper( const string& where , const string& ns , BSONObj min , BSONObj max ) - : _where( where ) , _ns( ns ){ - _next = 1; + MoveTimingHelper( const string& where , const string& ns , BSONObj min , BSONObj max , int total ) + : _where( where ) , _ns( ns ) , _next( 0 ) , _total( total ) { + _nextNote = 0; _b.append( "min" , min ); _b.append( "max" , max ); } - ~MoveTimingHelper(){ - configServer.logChange( (string)"moveChunk." + _where , _ns, _b.obj() ); + ~MoveTimingHelper() { + // even if logChange doesn't throw, bson does + // sigh + try { + if ( _next != _total ) { + note( "aborted" ); + } + configServer.logChange( (string)"moveChunk." 
+ _where , _ns, _b.obj() ); + } + catch ( const std::exception& e ) { + log( LL_WARNING ) << "couldn't record timing for moveChunk '" << _where << "': " << e.what() << endl; + } } - - void done( int step ){ - assert( step == _next++ ); - + + void done( int step ) { + assert( step == ++_next ); + assert( step <= _total ); + stringstream ss; ss << "step" << step; string s = ss.str(); - + CurOp * op = cc().curop(); if ( op ) op->setMessage( s.c_str() ); - else + else log( LL_WARNING ) << "op is null in MoveTimingHelper::done" << endl; - + _b.appendNumber( s , _t.millis() ); _t.reset(); + +#if 0 + // debugging for memory leak? + ProcessInfo pi; + ss << " v:" << pi.getVirtualMemorySize() + << " r:" << pi.getResidentSize(); + log() << ss.str() << endl; +#endif } - - + + + void note( const string& s ) { + string field = "note"; + if ( _nextNote > 0 ) { + StringBuilder buf; + buf << "note" << _nextNote; + field = buf.str(); + } + _nextNote++; + + _b.append( field , s ); + } + private: Timer _t; string _where; string _ns; - + int _next; - + int _total; // expected # of steps + int _nextNote; + BSONObjBuilder _b; + }; struct OldDataCleanup { + static AtomicUInt _numThreads; // how many threads are doing async cleanusp + string ns; BSONObj min; BSONObj max; set initial; - void doRemove(){ - ShardForceModeBlock sf; + + OldDataCleanup(){ + _numThreads++; + } + OldDataCleanup( const OldDataCleanup& other ) { + ns = other.ns; + min = other.min.getOwned(); + max = other.max.getOwned(); + initial = other.initial; + _numThreads++; + } + ~OldDataCleanup(){ + _numThreads--; + } + + void doRemove() { + ShardForceVersionOkModeBlock sf; writelock lk(ns); RemoveSaver rs("moveChunk",ns,"post-cleanup"); long long num = Helpers::removeRange( ns , min , max , true , false , cmdLine.moveParanoia ? 
&rs : 0 ); log() << "moveChunk deleted: " << num << endl; } + }; + AtomicUInt OldDataCleanup::_numThreads = 0; + static const char * const cleanUpThreadName = "cleanupOldData"; - - void _cleanupOldData( OldDataCleanup cleanup ){ + + void _cleanupOldData( OldDataCleanup cleanup ) { Client::initThread( cleanUpThreadName ); log() << " (start) waiting to cleanup " << cleanup.ns << " from " << cleanup.min << " -> " << cleanup.max << " # cursors:" << cleanup.initial.size() << endl; int loops = 0; Timer t; - while ( t.seconds() < 900 ){ // 15 minutes + while ( t.seconds() < 900 ) { // 15 minutes assert( dbMutex.getState() == 0 ); sleepmillis( 20 ); - + set now; - ClientCursor::find( cleanup.ns , now ); - + ClientCursor::find( cleanup.ns , now ); + set left; - for ( set::iterator i=cleanup.initial.begin(); i!=cleanup.initial.end(); ++i ){ + for ( set::iterator i=cleanup.initial.begin(); i!=cleanup.initial.end(); ++i ) { CursorId id = *i; if ( now.count(id) ) left.insert( id ); } - + if ( left.size() == 0 ) break; cleanup.initial = left; - - if ( ( loops++ % 200 ) == 0 ){ + + if ( ( loops++ % 200 ) == 0 ) { log() << " (looping " << loops << ") waiting to cleanup " << cleanup.ns << " from " << cleanup.min << " -> " << cleanup.max << " # cursors:" << cleanup.initial.size() << endl; - + stringstream ss; - for ( set::iterator i=cleanup.initial.begin(); i!=cleanup.initial.end(); ++i ){ + for ( set::iterator i=cleanup.initial.begin(); i!=cleanup.initial.end(); ++i ) { CursorId id = *i; ss << id << " "; } log() << " cursors: " << ss.str() << endl; } } - + cleanup.doRemove(); cc().shutdown(); } - void cleanupOldData( OldDataCleanup cleanup ){ + void cleanupOldData( OldDataCleanup cleanup ) { try { _cleanupOldData( cleanup ); } - catch ( std::exception& e ){ + catch ( std::exception& e ) { log() << " error cleaning old data:" << e.what() << endl; } - catch ( ... ){ + catch ( ... ) { log() << " unknown error cleaning old data" << endl; } } class ChunkCommandHelper : public Command { public: - ChunkCommandHelper( const char * name ) - : Command( name ){ + ChunkCommandHelper( const char * name ) + : Command( name ) { } - + virtual void help( stringstream& help ) const { - help << "internal should not be calling this directly" << endl; + help << "internal - should not be called directly" << endl; } virtual bool slaveOk() const { return false; } virtual bool adminOnly() const { return true; } - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } }; - bool isInRange( const BSONObj& obj , const BSONObj& min , const BSONObj& max ){ + bool isInRange( const BSONObj& obj , const BSONObj& min , const BSONObj& max ) { BSONObj k = obj.extractFields( min, true ); return k.woCompare( min ) >= 0 && k.woCompare( max ) < 0; @@ -182,48 +242,57 @@ namespace mongo { class MigrateFromStatus { public: - - MigrateFromStatus() - : _mutex( "MigrateFromStatus" ){ + + MigrateFromStatus() : _m("MigrateFromStatus") { _active = false; _inCriticalSection = false; + _memoryUsed = 0; } - void start( string ns , const BSONObj& min , const BSONObj& max ){ + void start( string ns , const BSONObj& min , const BSONObj& max ) { + scoped_lock l(_m); // reads and writes _active + assert( ! _active ); - + assert( ! min.isEmpty() ); assert( ! 
max.isEmpty() ); assert( ns.size() ); - + _ns = ns; _min = min; _max = max; - - _deleted.clear(); - _reload.clear(); - + + assert( _cloneLocs.size() == 0 ); + assert( _deleted.size() == 0 ); + assert( _reload.size() == 0 ); + assert( _memoryUsed == 0 ); + _active = true; } - - void done(){ - if ( ! _active ) - return; - _active = false; - _inCriticalSection = false; - scoped_lock lk( _mutex ); + void done() { + readlock lk( _ns ); + _deleted.clear(); _reload.clear(); + _cloneLocs.clear(); + _memoryUsed = 0; + + scoped_lock l(_m); + _active = false; + _inCriticalSection = false; } - - void logOp( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ){ - if ( ! _active ) + + void logOp( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) { + if ( ! _getActive() ) return; if ( _ns != ns ) return; - + + // no need to log if this is not an insertion, an update, or an actual deletion + // note: opstr 'db' isn't a deletion but a mention that a database exists (for replication + // machinery mostly) char op = opstr[0]; if ( op == 'n' || op =='c' || ( op == 'd' && opstr[1] == 'b' ) ) return; @@ -231,68 +300,68 @@ namespace mongo { BSONElement ide; if ( patt ) ide = patt->getField( "_id" ); - else + else ide = obj["_id"]; - - if ( ide.eoo() ){ + + if ( ide.eoo() ) { log( LL_WARNING ) << "logOpForSharding got mod with no _id, ignoring obj: " << obj << endl; return; } - + BSONObj it; - switch ( opstr[0] ){ - + switch ( opstr[0] ) { + case 'd': { - - if ( getThreadName() == cleanUpThreadName ){ + + if ( getThreadName() == cleanUpThreadName ) { // we don't want to xfer things we're cleaning // as then they'll be deleted on TO // which is bad return; } - + // can't filter deletes :( - scoped_lock lk( _mutex ); _deleted.push_back( ide.wrap() ); + _memoryUsed += ide.size() + 5; return; } - - case 'i': + + case 'i': it = obj; break; - - case 'u': - if ( ! Helpers::findById( cc() , _ns.c_str() , ide.wrap() , it ) ){ + + case 'u': + if ( ! Helpers::findById( cc() , _ns.c_str() , ide.wrap() , it ) ) { log( LL_WARNING ) << "logOpForSharding couldn't find: " << ide << " even though should have" << endl; return; } break; - + } - + if ( ! isInRange( it , _min , _max ) ) return; - - scoped_lock lk( _mutex ); + _reload.push_back( ide.wrap() ); + _memoryUsed += ide.size() + 5; } - void xfer( list * l , BSONObjBuilder& b , const char * name , long long& size , bool explode ){ + void xfer( list * l , BSONObjBuilder& b , const char * name , long long& size , bool explode ) { const long long maxSize = 1024 * 1024; - + if ( l->size() == 0 || size > maxSize ) return; - + BSONArrayBuilder arr(b.subarrayStart(name)); - - list::iterator i = l->begin(); - - while ( i != l->end() && size < maxSize ){ + + list::iterator i = l->begin(); + + while ( i != l->end() && size < maxSize ) { BSONObj t = *i; - if ( explode ){ + if ( explode ) { BSONObj it; - if ( Helpers::findById( cc() , _ns.c_str() , t, it ) ){ + if ( Helpers::findById( cc() , _ns.c_str() , t, it ) ) { arr.append( it ); size += it.objsize(); } @@ -303,12 +372,16 @@ namespace mongo { i = l->erase( i ); size += t.objsize(); } - + arr.done(); } - bool transferMods( string& errmsg , BSONObjBuilder& b ){ - if ( ! _active ){ + /** + * called from the dest of a migrate + * transfers mods from src to dest + */ + bool transferMods( string& errmsg , BSONObjBuilder& b ) { + if ( ! 
_getActive() ) { errmsg = "no active migration!"; return false; } @@ -318,8 +391,7 @@ { readlock rl( _ns ); Client::Context cx( _ns ); - - scoped_lock lk( _mutex ); + xfer( &_deleted , b , "deleted" , size , false ); xfer( &_reload , b , "reload" , size , true ); } @@ -329,45 +401,201 @@ return true; } - bool _inCriticalSection; + /** + * Get the disklocs that belong to the chunk migrated and sort them in _cloneLocs (to avoid seeking disk later) + * + * @param maxChunkSize number of bytes beyond which a chunk's base data (no indices) is considered too large to move + * @param errmsg filled with textual description of error if this call returns false + * @return false if approximate chunk size is too big to move or true otherwise + */ + bool storeCurrentLocs( long long maxChunkSize , string& errmsg , BSONObjBuilder& result ) { + readlock l( _ns ); + Client::Context ctx( _ns ); + NamespaceDetails *d = nsdetails( _ns.c_str() ); + if ( ! d ) { + errmsg = "ns not found, should be impossible"; + return false; + } + + BSONObj keyPattern; + // the copies are needed because the indexDetailsForRange destroys the input + BSONObj min = _min.copy(); + BSONObj max = _max.copy(); + IndexDetails *idx = indexDetailsForRange( _ns.c_str() , errmsg , min , max , keyPattern ); + if ( idx == NULL ) { + errmsg = "can't find index in storeCurrentLocs"; + return false; + } + + scoped_ptr cc( new ClientCursor( QueryOption_NoCursorTimeout , + shared_ptr( new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 ) ) , + _ns ) ); + + // use the average object size to estimate how many objects a full chunk would carry + // do that while traversing the chunk's range using the sharding index, below + // there's a fair amount of slack before we determine a chunk is too large because object sizes will vary + unsigned long long maxRecsWhenFull; + long long avgRecSize; + const long long totalRecs = d->stats.nrecords; + if ( totalRecs > 0 ) { + avgRecSize = d->stats.datasize / totalRecs; + maxRecsWhenFull = maxChunkSize / avgRecSize; + maxRecsWhenFull = 130 * maxRecsWhenFull / 100; // slack + } + else { + avgRecSize = 0; + maxRecsWhenFull = numeric_limits::max(); + } + + // do a full traversal of the chunk and don't stop even if we think it is a large chunk + // we want the full record count so we can report it better in that case + bool isLargeChunk = false; + unsigned long long recCount = 0; + while ( cc->ok() ) { + DiskLoc dl = cc->currLoc(); + if ( ! isLargeChunk ) { + _cloneLocs.insert( dl ); + } + cc->advance(); + + // we can afford to yield here because any change to the base data that we might miss is already being + // queued and will be migrated in the 'transferMods' stage + if ( ! cc->yieldSometimes() ) { + break; + } + + if ( ++recCount > maxRecsWhenFull ) { + isLargeChunk = true; + } + } + + if ( isLargeChunk ) { + warning() << "can't move chunk of size (approx) " << recCount * avgRecSize + << " because maximum size allowed to move is " << maxChunkSize + << " ns: " << _ns << " " << _min << " -> " << _max + << endl; + result.appendBool( "chunkTooBig" , true ); + result.appendNumber( "chunkSize" , (long long)(recCount * avgRecSize) ); + errmsg = "chunk too big to move"; + return false; + } + + log() << "moveChunk number of documents: " << _cloneLocs.size() << endl; + return true; + } + + bool clone( string& errmsg , BSONObjBuilder& result ) { + if ( !
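// Illustrative sketch (not part of the patch): the "chunk too big to move" estimate
// computed in storeCurrentLocs above, isolated as plain arithmetic. The
// chunkTooBigToMove name and the example figures are hypothetical.
#include <cstdint>
#include <limits>

inline bool chunkTooBigToMove(uint64_t totalRecs, uint64_t collDataBytes,
                              uint64_t maxChunkSizeBytes, uint64_t recsInChunk) {
    uint64_t maxRecsWhenFull;
    if (totalRecs > 0) {
        uint64_t avgRecSize = collDataBytes / totalRecs;          // average document size in the collection
        maxRecsWhenFull = maxChunkSizeBytes / avgRecSize;         // documents a "full" chunk would hold
        maxRecsWhenFull = 130 * maxRecsWhenFull / 100;            // 30% slack, as in the code above
    }
    else {
        maxRecsWhenFull = std::numeric_limits<uint64_t>::max();   // empty collection: never too big
    }
    return recsInChunk > maxRecsWhenFull;
}

// e.g. 1,048,576 docs totalling 2 GiB -> avg 2 KiB/doc; with a 64 MiB max chunk size
// the cut-off is 130% of 32,768, i.e. about 42,600 documents in the chunk being moved.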
_getActive() ) { + errmsg = "not active"; + return false; + } + + readlock l( _ns ); + Client::Context ctx( _ns ); + + NamespaceDetails *d = nsdetails( _ns.c_str() ); + assert( d ); + + BSONArrayBuilder a( std::min( BSONObjMaxUserSize , (int)( ( 12 + d->averageObjectSize() )* _cloneLocs.size() ) ) ); + + set::iterator i = _cloneLocs.begin(); + for ( ; i!=_cloneLocs.end(); ++i ) { + DiskLoc dl = *i; + BSONObj o = dl.obj(); + + // use the builder size instead of accumulating 'o's size so that we take into consideration + // the overhead of BSONArray indices + if ( a.len() + o.objsize() + 1024 > BSONObjMaxUserSize ) { + break; + } + a.append( o ); + } + + result.appendArray( "objects" , a.arr() ); + _cloneLocs.erase( _cloneLocs.begin() , i ); + return true; + } + + void aboutToDelete( const Database* db , const DiskLoc& dl ) { + dbMutex.assertWriteLocked(); + + if ( ! _getActive() ) + return; + + if ( ! db->ownsNS( _ns ) ) + return; + + _cloneLocs.erase( dl ); + } + + long long mbUsed() const { return _memoryUsed / ( 1024 * 1024 ); } + + bool getInCriticalSection() const { scoped_lock l(_m); return _inCriticalSection; } + void setInCriticalSection( bool b ) { scoped_lock l(_m); _inCriticalSection = b; } + + bool isActive() const { return _getActive(); } private: - + mutable mongo::mutex _m; // protect _inCriticalSection and _active + bool _inCriticalSection; bool _active; string _ns; BSONObj _min; BSONObj _max; - list _reload; - list _deleted; + // disk locs yet to be transferred from here to the other side + // no locking needed because build by 1 thread in a read lock + // depleted by 1 thread in a read lock + // updates applied by 1 thread in a write lock + set _cloneLocs; + + list _reload; // objects that were modified that must be recloned + list _deleted; // objects deleted during clone that should be deleted later + long long _memoryUsed; // bytes in _reload + _deleted + + bool _getActive() const { scoped_lock l(_m); return _active; } + void _setActive( bool b ) { scoped_lock l(_m); _active = b; } - mongo::mutex _mutex; - } migrateFromStatus; - + struct MigrateStatusHolder { - MigrateStatusHolder( string ns , const BSONObj& min , const BSONObj& max ){ + MigrateStatusHolder( string ns , const BSONObj& min , const BSONObj& max ) { migrateFromStatus.start( ns , min , max ); } - ~MigrateStatusHolder(){ + ~MigrateStatusHolder() { migrateFromStatus.done(); } }; - void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ){ + void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) { migrateFromStatus.logOp( opstr , ns , obj , patt ); } - class TransferModsCommand : public ChunkCommandHelper{ + void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ) { + migrateFromStatus.aboutToDelete( db , dl ); + } + + class TransferModsCommand : public ChunkCommandHelper { public: - TransferModsCommand() : ChunkCommandHelper( "_transferMods" ){} + TransferModsCommand() : ChunkCommandHelper( "_transferMods" ) {} - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { return migrateFromStatus.transferMods( errmsg, result ); } } transferModsCommand; + + class InitialCloneCommand : public ChunkCommandHelper { + public: + InitialCloneCommand() : ChunkCommandHelper( "_migrateClone" ) {} + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + return 
migrateFromStatus.clone( errmsg, result ); + } + } initialCloneCommand; + + /** * this is the main entry for moveChunk * called to initial a move @@ -376,20 +604,22 @@ namespace mongo { */ class MoveChunkCommand : public Command { public: - MoveChunkCommand() : Command( "moveChunk" ){} + MoveChunkCommand() : Command( "moveChunk" ) {} virtual void help( stringstream& help ) const { help << "should not be calling this directly" << endl; } virtual bool slaveOk() const { return false; } virtual bool adminOnly() const { return true; } - virtual LockType locktype() const { return NONE; } - - - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + virtual LockType locktype() const { return NONE; } + + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { // 1. parse options // 2. make sure my view is complete and lock // 3. start migrate + // in a read lock, get all DiskLoc and sort so we can do as little seeking as possible + // tell to start transferring // 4. pause till migrate caught up // 5. LOCK // a) update my config, essentially locking @@ -398,10 +628,9 @@ namespace mongo { // d) logChange to config server // 6. wait for all current cursors to expire // 7. remove data locally - + // ------------------------------- - - + // 1. string ns = cmdObj.firstElement().str(); string to = cmdObj["to"].str(); @@ -409,38 +638,45 @@ namespace mongo { BSONObj min = cmdObj["min"].Obj(); BSONObj max = cmdObj["max"].Obj(); BSONElement shardId = cmdObj["shardId"]; - - if ( ns.empty() ){ + BSONElement maxSizeElem = cmdObj["maxChunkSizeBytes"]; + + if ( ns.empty() ) { errmsg = "need to specify namespace in command"; return false; } - - if ( to.empty() ){ - errmsg = "need to specify server to move shard to"; + + if ( to.empty() ) { + errmsg = "need to specify server to move chunk to"; return false; } - if ( from.empty() ){ - errmsg = "need to specify server to move shard from (redundat i know)"; + if ( from.empty() ) { + errmsg = "need to specify server to move chunk from"; return false; } - - if ( min.isEmpty() ){ + + if ( min.isEmpty() ) { errmsg = "need to specify a min"; return false; } - if ( max.isEmpty() ){ + if ( max.isEmpty() ) { errmsg = "need to specify a max"; return false; } - - if ( shardId.eoo() ){ + + if ( shardId.eoo() ) { errmsg = "need shardId"; return false; } - - if ( ! shardingState.enabled() ){ - if ( cmdObj["configdb"].type() != String ){ + + if ( maxSizeElem.eoo() || ! maxSizeElem.isNumber() ) { + errmsg = "need to specify maxChunkSizeBytes"; + return false; + } + const long long maxChunkSize = maxSizeElem.numberLong(); // in bytes + + if ( ! shardingState.enabled() ) { + if ( cmdObj["configdb"].type() != String ) { errmsg = "sharding not enabled"; return false; } @@ -449,78 +685,107 @@ namespace mongo { configServer.init( configdb ); } - MoveTimingHelper timing( "from" , ns , min , max ); + MoveTimingHelper timing( "from" , ns , min , max , 6 /* steps */); Shard fromShard( from ); Shard toShard( to ); - - log() << "got movechunk: " << cmdObj << endl; + + log() << "received moveChunk request: " << cmdObj << endl; timing.done(1); - // 2. - + + // 2. DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC ) , ns ); dist_lock_try dlk( &lockSetup , (string)"migrate-" + min.toString() ); - if ( ! dlk.got() ){ - errmsg = "someone else has the lock"; + if ( ! 
dlk.got() ) { + errmsg = "the collection's metadata lock is taken"; result.append( "who" , dlk.other() ); return false; } + BSONObj chunkInfo = BSON("min" << min << "max" << max << "from" << fromShard.getName() << "to" << toShard.getName()); + configServer.logChange( "moveChunk.start" , ns , chunkInfo ); + ShardChunkVersion maxVersion; string myOldShard; { ScopedDbConnection conn( shardingState.getConfigServer() ); - + BSONObj x = conn->findOne( ShardNS::chunk , Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) ); maxVersion = x["lastmod"]; - x = conn->findOne( ShardNS::chunk , shardId.wrap( "_id" ) ); - assert( x["shard"].type() ); - myOldShard = x["shard"].String(); - - if ( myOldShard != fromShard.getName() ){ - errmsg = "i'm out of date"; + BSONObj currChunk = conn->findOne( ShardNS::chunk , shardId.wrap( "_id" ) ); + assert( currChunk["shard"].type() ); + assert( currChunk["min"].type() ); + assert( currChunk["max"].type() ); + myOldShard = currChunk["shard"].String(); + conn.done(); + + BSONObj currMin = currChunk["min"].Obj(); + BSONObj currMax = currChunk["max"].Obj(); + if ( currMin.woCompare( min ) || currMax.woCompare( max ) ) { + errmsg = "boundaries are outdated (likely a split occurred)"; + result.append( "currMin" , currMin ); + result.append( "currMax" , currMax ); + result.append( "requestedMin" , min ); + result.append( "requestedMax" , max ); + + log( LL_WARNING ) << "aborted moveChunk because" << errmsg << ": " << min << "->" << max + << " is now " << currMin << "->" << currMax << endl; + return false; + } + + if ( myOldShard != fromShard.getName() ) { + errmsg = "location is outdated (likely balance or migrate occurred)"; result.append( "from" , fromShard.getName() ); result.append( "official" , myOldShard ); + + log( LL_WARNING ) << "aborted moveChunk because " << errmsg << ": chunk is at " << myOldShard + << " and not at " << fromShard.getName() << endl; return false; } - - if ( maxVersion < shardingState.getVersion( ns ) ){ - errmsg = "official version less than mine?";; + + if ( maxVersion < shardingState.getVersion( ns ) ) { + errmsg = "official version less than mine?"; result.appendTimestamp( "officialVersion" , maxVersion ); result.appendTimestamp( "myVersion" , shardingState.getVersion( ns ) ); + + log( LL_WARNING ) << "aborted moveChunk because " << errmsg << ": official " << maxVersion + << " mine: " << shardingState.getVersion(ns) << endl; return false; } - conn.done(); + // since this could be the first call that enable sharding we also make sure to have the chunk manager up to date + shardingState.gotShardName( myOldShard ); + ShardChunkVersion shardVersion; + shardingState.trySetVersion( ns , shardVersion /* will return updated */ ); + + log() << "moveChunk request accepted at version " << shardVersion << endl; } - + timing.done(2); - + // 3. MigrateStatusHolder statusHolder( ns , min , max ); { - dblock lk; - // this makes sure there wasn't a write inside the .cpp code we can miss - } - - { - - ScopedDbConnection conn( to ); - BSONObj res; - bool ok = conn->runCommand( "admin" , - BSON( "_recvChunkStart" << ns << - "from" << from << - "min" << min << - "max" << max << - "configServer" << configServer.modelServer() - ) , - res ); - conn.done(); + // this gets a read lock, so we know we have a checkpoint for mods + if ( ! migrateFromStatus.storeCurrentLocs( maxChunkSize , errmsg , result ) ) + return false; - if ( ! 
ok ){ - errmsg = "_recvChunkStart failed: "; + ScopedDbConnection connTo( to ); + BSONObj res; + bool ok = connTo->runCommand( "admin" , + BSON( "_recvChunkStart" << ns << + "from" << from << + "min" << min << + "max" << max << + "configServer" << configServer.modelServer() + ) , + res ); + connTo.done(); + + if ( ! ok ) { + errmsg = "moveChunk failed to engage TO-shard in the data transfer: "; assert( res["errmsg"].type() ); errmsg += res["errmsg"].String(); result.append( "cause" , res ); @@ -529,118 +794,275 @@ namespace mongo { } timing.done( 3 ); - - // 4. - for ( int i=0; i<86400; i++ ){ // don't want a single chunk move to take more than a day + + // 4. + for ( int i=0; i<86400; i++ ) { // don't want a single chunk move to take more than a day assert( dbMutex.getState() == 0 ); - sleepsecs( 1 ); + sleepsecs( 1 ); ScopedDbConnection conn( to ); BSONObj res; bool ok = conn->runCommand( "admin" , BSON( "_recvChunkStatus" << 1 ) , res ); res = res.getOwned(); conn.done(); - - log(0) << "_recvChunkStatus : " << res << endl; - - if ( ! ok || res["state"].String() == "fail" ){ - log( LL_ERROR ) << "_recvChunkStatus error : " << res << endl; - errmsg = "_recvChunkStatus error"; - result.append( "cause" ,res ); + + log(0) << "moveChunk data transfer progress: " << res << " my mem used: " << migrateFromStatus.mbUsed() << endl; + + if ( ! ok || res["state"].String() == "fail" ) { + log( LL_WARNING ) << "moveChunk error transfering data caused migration abort: " << res << endl; + errmsg = "data transfer error"; + result.append( "cause" , res ); return false; } if ( res["state"].String() == "steady" ) break; + if ( migrateFromStatus.mbUsed() > (500 * 1024 * 1024) ) { + // this is too much memory for us to use for this + // so we're going to abort the migrate + ScopedDbConnection conn( to ); + BSONObj res; + conn->runCommand( "admin" , BSON( "_recvChunkAbort" << 1 ) , res ); + res = res.getOwned(); + conn.done(); + error() << "aborting migrate because too much memory used res: " << res << endl; + errmsg = "aborting migrate because too much memory used"; + result.appendBool( "split" , true ); + return false; + } + killCurrentOp.checkForInterrupt(); } timing.done(4); // 5. - { + { // 5.a - migrateFromStatus._inCriticalSection = true; - ShardChunkVersion myVersion = maxVersion; + // we're under the collection lock here, so no other migrate can change maxVersion or ShardChunkManager state + migrateFromStatus.setInCriticalSection( true ); + ShardChunkVersion currVersion = maxVersion; + ShardChunkVersion myVersion = currVersion; myVersion.incMajor(); - + { - dblock lk; + writelock lk( ns ); assert( myVersion > shardingState.getVersion( ns ) ); - shardingState.setVersion( ns , myVersion ); - assert( myVersion == shardingState.getVersion( ns ) ); - log() << "moveChunk locking myself to: " << myVersion << endl; + + // bump the chunks manager's version up and "forget" about the chunk being moved + // this is not the commit point but in practice the state in this shard won't until the commit it done + shardingState.donateChunk( ns , min , max , myVersion ); } - + log() << "moveChunk setting version to: " << myVersion << endl; + // 5.b + // we're under the collection lock here, too, so we can undo the chunk donation because no other state change + // could be ongoing { BSONObj res; - ScopedDbConnection conn( to ); - bool ok = conn->runCommand( "admin" , - BSON( "_recvChunkCommit" << 1 ) , - res ); - conn.done(); - log() << "moveChunk commit result: " << res << endl; - if ( ! 
ok ){ - log() << "_recvChunkCommit failed: " << res << endl; + ScopedDbConnection connTo( to ); + bool ok = connTo->runCommand( "admin" , + BSON( "_recvChunkCommit" << 1 ) , + res ); + connTo.done(); + + if ( ! ok ) { + { + writelock lk( ns ); + + // revert the chunk manager back to the state before "forgetting" about the chunk + shardingState.undoDonateChunk( ns , min , max , currVersion ); + } + + log() << "movChunk migrate commit not accepted by TO-shard: " << res + << " resetting shard version to: " << currVersion << endl; + errmsg = "_recvChunkCommit failed!"; result.append( "cause" , res ); return false; } + + log() << "moveChunk migrate commit accepted by TO-shard: " << res << endl; } - + // 5.c - ScopedDbConnection conn( shardingState.getConfigServer() ); - - BSONObjBuilder temp; - temp.append( "shard" , toShard.getName() ); - temp.appendTimestamp( "lastmod" , myVersion ); - - conn->update( ShardNS::chunk , shardId.wrap( "_id" ) , BSON( "$set" << temp.obj() ) ); - - { - // update another random chunk - BSONObj x = conn->findOne( ShardNS::chunk , Query( BSON( "ns" << ns << "shard" << myOldShard ) ).sort( BSON( "lastmod" << -1 ) ) ); - if ( ! x.isEmpty() ){ - - BSONObjBuilder temp2; - myVersion.incMinor(); - - temp2.appendTimestamp( "lastmod" , myVersion ); - - shardingState.setVersion( ns , myVersion ); - - conn->update( ShardNS::chunk , x["_id"].wrap() , BSON( "$set" << temp2.obj() ) ); - - log() << "moveChunk updating self to: " << myVersion << endl; + + // version at which the next highest lastmod will be set + // if the chunk being moved is the last in the shard, nextVersion is that chunk's lastmod + // otherwise the highest version is from the chunk being bumped on the FROM-shard + ShardChunkVersion nextVersion; + + // we want to go only once to the configDB but perhaps change two chunks, the one being migrated and another + // local one (so to bump version for the entire shard) + // we use the 'applyOps' mechanism to group the two updates and make them safer + // TODO pull config update code to a module + + BSONObjBuilder cmdBuilder; + + BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) ); + { + // update for the chunk being moved + BSONObjBuilder op; + op.append( "op" , "u" ); + op.appendBool( "b" , false /* no upserting */ ); + op.append( "ns" , ShardNS::chunk ); + + BSONObjBuilder n( op.subobjStart( "o" ) ); + n.append( "_id" , Chunk::genID( ns , min ) ); + n.appendTimestamp( "lastmod" , myVersion /* same as used on donateChunk */ ); + n.append( "ns" , ns ); + n.append( "min" , min ); + n.append( "max" , max ); + n.append( "shard" , toShard.getName() ); + n.done(); + + BSONObjBuilder q( op.subobjStart( "o2" ) ); + q.append( "_id" , Chunk::genID( ns , min ) ); + q.done(); + + updates.append( op.obj() ); + } + + nextVersion = myVersion; + + // if we have chunks left on the FROM shard, update the version of one of them as well + // we can figure that out by grabbing the chunkManager installed on 5.a + // TODO expose that manager when installing it + + ShardChunkManagerPtr chunkManager = shardingState.getShardChunkManager( ns ); + if( chunkManager->getNumChunks() > 0 ) { + + // get another chunk on that shard + BSONObj lookupKey; + BSONObj bumpMin, bumpMax; + do { + chunkManager->getNextChunk( lookupKey , &bumpMin , &bumpMax ); + lookupKey = bumpMin; + } + while( bumpMin == min ); + + BSONObjBuilder op; + op.append( "op" , "u" ); + op.appendBool( "b" , false ); + op.append( "ns" , ShardNS::chunk ); + + nextVersion.incMinor(); // same as used on donateChunk + 
BSONObjBuilder n( op.subobjStart( "o" ) ); + n.append( "_id" , Chunk::genID( ns , bumpMin ) ); + n.appendTimestamp( "lastmod" , nextVersion ); + n.append( "ns" , ns ); + n.append( "min" , bumpMin ); + n.append( "max" , bumpMax ); + n.append( "shard" , fromShard.getName() ); + n.done(); + + BSONObjBuilder q( op.subobjStart( "o2" ) ); + q.append( "_id" , Chunk::genID( ns , bumpMin ) ); + q.done(); + + updates.append( op.obj() ); + + log() << "moveChunk updating self version to: " << nextVersion << " through " + << bumpMin << " -> " << bumpMax << " for collection '" << ns << "'" << endl; + + } + else { + + log() << "moveChunk moved last chunk out for collection '" << ns << "'" << endl; + } + + updates.done(); + + BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) ); + { + BSONObjBuilder b; + b.append( "ns" , ShardNS::chunk ); + b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) ); + { + BSONObjBuilder bb( b.subobjStart( "res" ) ); + bb.appendTimestamp( "lastmod" , maxVersion ); + bb.done(); } - else { - //++myVersion; - shardingState.setVersion( ns , 0 ); + preCond.append( b.obj() ); + } + + preCond.done(); + + BSONObj cmd = cmdBuilder.obj(); + log(7) << "moveChunk update: " << cmd << endl; + + bool ok = false; + BSONObj cmdResult; + try { + ScopedDbConnection conn( shardingState.getConfigServer() ); + ok = conn->runCommand( "config" , cmd , cmdResult ); + conn.done(); + } + catch ( DBException& e ) { + ok = false; + BSONObjBuilder b; + e.getInfo().append( b ); + cmdResult = b.obj(); + } + + if ( ! ok ) { + + // this could be a blip in the connectivity + // wait out a few seconds and check if the commit request made it + // + // if the commit made it to the config, we'll see the chunk in the new shard and there's no action + // if the commit did not make it, currently the only way to fix this state is to bounce the mongod so + // that the old state (before migrating) be brought in + + warning() << "moveChunk commit outcome ongoing: " << cmd << " for command :" << cmdResult << endl; + sleepsecs( 10 ); + + try { + ScopedDbConnection conn( shardingState.getConfigServer() ); + + // look for the chunk in this shard whose version got bumped + // we assume that if that mod made it to the config, the applyOps was successful + BSONObj doc = conn->findOne( ShardNS::chunk , Query(BSON( "ns" << ns )).sort( BSON("lastmod" << -1))); + ShardChunkVersion checkVersion = doc["lastmod"]; + + if ( checkVersion == nextVersion ) { + log() << "moveChunk commit confirmed" << endl; + + } + else { + error() << "moveChunk commit failed: version is at" + << checkVersion << " instead of " << nextVersion << endl; + error() << "TERMINATING" << endl; + dbexit( EXIT_SHARDING_ERROR ); + } + + conn.done(); - log() << "moveChunk now i'm empty" << endl; + } + catch ( ... ) { + error() << "moveChunk failed to get confirmation of commit" << endl; + error() << "TERMINATING" << endl; + dbexit( EXIT_SHARDING_ERROR ); } } - conn.done(); - migrateFromStatus._inCriticalSection = false; + migrateFromStatus.setInCriticalSection( false ); + // 5.d - configServer.logChange( "moveChunk" , ns , BSON( "min" << min << "max" << max << - "from" << fromShard.getName() << - "to" << toShard.getName() ) ); + configServer.logChange( "moveChunk.commit" , ns , chunkInfo ); } - + migrateFromStatus.done(); timing.done(5); - - { // 6. + { + // 6. 
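// steps 6 and 7 below hand the donated range over to cleanup: ClientCursor::find() gathers the
// cursors currently open on this ns into c.initial; if any are open, cleanupOldData is forked on a
// background thread, presumably so the range is removed only after those cursors expire (per the
// step 6/7 comments earlier in this command), otherwise c.doRemove() deletes the migrated documents inline.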
OldDataCleanup c; c.ns = ns; c.min = min.getOwned(); c.max = max.getOwned(); ClientCursor::find( ns , c.initial ); - if ( c.initial.size() ){ + if ( c.initial.size() ) { log() << "forking for cleaning up chunk data" << endl; boost::thread t( boost::bind( &cleanupOldData , c ) ); } @@ -649,24 +1071,24 @@ namespace mongo { // 7. c.doRemove(); } - - + + } - timing.done(6); + timing.done(6); return true; - + } - + } moveChunkCmd; - bool ShardingState::inCriticalMigrateSection(){ - return migrateFromStatus._inCriticalSection; + bool ShardingState::inCriticalMigrateSection() { + return migrateFromStatus.getInCriticalSection(); } /* ----- below this are the "to" side commands - + command to initiate worker thread does initial clone @@ -679,71 +1101,74 @@ namespace mongo { class MigrateStatus { public: - - MigrateStatus(){ - active = false; - } - void prepare(){ + MigrateStatus() : m_active("MigrateStatus") { active = false; } + + void prepare() { + scoped_lock l(m_active); // reading and writing 'active' + assert( ! active ); state = READY; errmsg = ""; numCloned = 0; + clonedBytes = 0; numCatchup = 0; numSteady = 0; active = true; } - void go(){ + void go() { try { _go(); } - catch ( std::exception& e ){ + catch ( std::exception& e ) { state = FAIL; errmsg = e.what(); log( LL_ERROR ) << "migrate failed: " << e.what() << endl; } - catch ( ... ){ + catch ( ... ) { state = FAIL; errmsg = "UNKNOWN ERROR"; log( LL_ERROR ) << "migrate failed with unknown exception" << endl; } - active = false; + setActive( false ); } - - void _go(){ - assert( active ); + + void _go() { + assert( getActive() ); assert( state == READY ); assert( ! min.isEmpty() ); assert( ! max.isEmpty() ); - - MoveTimingHelper timing( "to" , ns , min , max ); - + + MoveTimingHelper timing( "to" , ns , min , max , 5 /* steps */ ); + ScopedDbConnection conn( from ); conn->getLastError(); // just test connection - { // 1. copy indexes + { + // 1. copy indexes auto_ptr indexes = conn->getIndexes( ns ); vector all; - while ( indexes->more() ){ + while ( indexes->more() ) { all.push_back( indexes->next().getOwned() ); } - + writelock lk( ns ); Client::Context ct( ns ); - + string system_indexes = cc().database()->name + ".system.indexes"; - for ( unsigned i=0; i cursor = conn->query( ns , Query().minKey( min ).maxKey( max ) , /* QueryOption_Exhaust */ 0 ); - assert( cursor.get() ); - while ( cursor->more() ){ - BSONObj o = cursor->next().getOwned(); - { - writelock lk( ns ); - Helpers::upsert( ns , o ); + + while ( true ) { + BSONObj res; + if ( ! conn->runCommand( "admin" , BSON( "_migrateClone" << 1 ) , res ) ) { + state = FAIL; + errmsg = "_migrateClone failed: "; + errmsg += res.toString(); + error() << errmsg << endl; + conn.done(); + return; + } + + BSONObj arr = res["objects"].Obj(); + int thisTime = 0; + + BSONObjIterator i( arr ); + while( i.more() ) { + BSONObj o = i.next().Obj(); + { + writelock lk( ns ); + Helpers::upsert( ns , o ); + } + thisTime++; + numCloned++; + clonedBytes += o.objsize(); } - numCloned++; + + if ( thisTime == 0 ) + break; } timing.done(3); } - - { // 4. do bulk of mods + + // if running on a replicated system, we'll need to flush the docs we cloned to the secondaries + ReplTime lastOpApplied; + + { + // 4. do bulk of mods state = CATCHUP; - while ( true ){ + while ( true ) { BSONObj res; - if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ){ + if ( ! 
conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) { state = FAIL; errmsg = "_transferMods failed: "; errmsg += res.toString(); @@ -784,18 +1234,26 @@ namespace mongo { } if ( res["size"].number() == 0 ) break; - - apply( res ); + + apply( res , &lastOpApplied ); + + if ( state == ABORT ) { + timing.note( "aborted" ); + return; + } } timing.done(4); } - - { // 5. wait for commit + + { + // 5. wait for commit + Timer timeWaitingForCommit; + state = STEADY; - while ( state == STEADY || state == COMMIT_START ){ + while ( state == STEADY || state == COMMIT_START ) { BSONObj res; - if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ){ + if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) { log() << "_transferMods failed in STEADY state: " << res << endl; errmsg = res.toString(); state = FAIL; @@ -803,36 +1261,48 @@ namespace mongo { return; } - if ( res["size"].number() > 0 && apply( res ) ) + if ( res["size"].number() > 0 && apply( res , &lastOpApplied ) ) continue; - - if ( state == COMMIT_START ) + + if ( state == COMMIT_START && flushPendingWrites( lastOpApplied ) ) break; sleepmillis( 10 ); } - + + if ( state == ABORT ) { + timing.note( "aborted" ); + return; + } + + if ( timeWaitingForCommit.seconds() > 86400 ) { + state = FAIL; + errmsg = "timed out waiting for commit"; + return; + } + timing.done(5); } - + state = DONE; conn.done(); } - void status( BSONObjBuilder& b ){ - b.appendBool( "active" , active ); + void status( BSONObjBuilder& b ) { + b.appendBool( "active" , getActive() ); b.append( "ns" , ns ); b.append( "from" , from ); b.append( "min" , min ); b.append( "max" , max ); - + b.append( "state" , stateString() ); if ( state == FAIL ) b.append( "errmsg" , errmsg ); { BSONObjBuilder bb( b.subobjStart( "counts" ) ); bb.append( "cloned" , numCloned ); + bb.append( "clonedBytes" , clonedBytes ); bb.append( "catchup" , numCatchup ); bb.append( "steady" , numSteady ); bb.done(); @@ -841,17 +1311,22 @@ namespace mongo { } - bool apply( const BSONObj& xfer ){ + bool apply( const BSONObj& xfer , ReplTime* lastOpApplied ) { + ReplTime dummy; + if ( lastOpApplied == NULL ) { + lastOpApplied = &dummy; + } + bool didAnything = false; - - if ( xfer["deleted"].isABSONObj() ){ + + if ( xfer["deleted"].isABSONObj() ) { writelock lk(ns); Client::Context cx(ns); - + RemoveSaver rs( "moveChunk" , ns , "removedDuring" ); BSONObjIterator i( xfer["deleted"].Obj() ); - while ( i.more() ){ + while ( i.more() ) { BSONObj id = i.next().Obj(); // do not apply deletes if they do not belong to the chunk being migrated @@ -865,27 +1340,56 @@ namespace mongo { } Helpers::removeRange( ns , id , id, false , true , cmdLine.moveParanoia ? 
&rs : 0 ); + + *lastOpApplied = cx.getClient()->getLastOp(); didAnything = true; } } - - if ( xfer["reload"].isABSONObj() ){ + + if ( xfer["reload"].isABSONObj() ) { writelock lk(ns); Client::Context cx(ns); BSONObjIterator i( xfer["reload"].Obj() ); - while ( i.more() ){ + while ( i.more() ) { BSONObj it = i.next().Obj(); + Helpers::upsert( ns , it ); + + *lastOpApplied = cx.getClient()->getLastOp(); didAnything = true; } } return didAnything; } - - string stateString(){ - switch ( state ){ + + bool flushPendingWrites( const ReplTime& lastOpApplied ) { + // if replication is on, try to force enough secondaries to catch up + // TODO opReplicatedEnough should eventually honor priorities and geo-awareness + // for now, we try to replicate to a sensible number of secondaries + const int slaveCount = getSlaveCount() / 2 + 1; + if ( ! opReplicatedEnough( lastOpApplied , slaveCount ) ) { + log( LL_WARNING ) << "migrate commit attempt timed out contacting " << slaveCount + << " slaves for '" << ns << "' " << min << " -> " << max << endl; + return false; + } + log() << "migrate commit succeeded flushing to secondaries for '" << ns << "' " << min << " -> " << max << endl; + + { + readlock lk(ns); // commitNow() currently requires it + + // if durability is on, force a write to journal + if ( getDur().commitNow() ) { + log() << "migrate commit flushed to journal for '" << ns << "' " << min << " -> " << max << endl; + } + } + + return true; + } + + string stateString() { + switch ( state ) { case READY: return "ready"; case CLONE: return "clone"; case CATCHUP: return "catchup"; @@ -893,17 +1397,18 @@ namespace mongo { case COMMIT_START: return "commitStart"; case DONE: return "done"; case FAIL: return "fail"; + case ABORT: return "abort"; } assert(0); return ""; } - bool startCommit(){ + bool startCommit() { if ( state != STEADY ) return false; state = COMMIT_START; - - for ( int i=0; i<86400; i++ ){ + + for ( int i=0; i<86400; i++ ) { sleepmillis(1); if ( state == DONE ) return true; @@ -912,42 +1417,60 @@ namespace mongo { return false; } + void abort() { + state = ABORT; + errmsg = "aborted"; + } + + bool getActive() const { scoped_lock l(m_active); return active; } + void setActive( bool b ) { scoped_lock l(m_active); active = b; } + + mutable mongo::mutex m_active; bool active; - + string ns; string from; - + BSONObj min; BSONObj max; - + long long numCloned; + long long clonedBytes; long long numCatchup; long long numSteady; - enum State { READY , CLONE , CATCHUP , STEADY , COMMIT_START , DONE , FAIL } state; + enum State { READY , CLONE , CATCHUP , STEADY , COMMIT_START , DONE , FAIL , ABORT } state; string errmsg; - + } migrateStatus; - - void migrateThread(){ + + void migrateThread() { Client::initThread( "migrateThread" ); migrateStatus.go(); cc().shutdown(); } - + class RecvChunkStartCommand : public ChunkCommandHelper { public: - RecvChunkStartCommand() : ChunkCommandHelper( "_recvChunkStart" ){} + RecvChunkStartCommand() : ChunkCommandHelper( "_recvChunkStart" ) {} virtual LockType locktype() const { return WRITE; } // this is so don't have to do locking internally - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ - - if ( migrateStatus.active ){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + + if ( migrateStatus.getActive() ) { errmsg = "migrate already in progress"; return false; } + if ( OldDataCleanup::_numThreads > 0 ) { + errmsg = + str::stream() + << "still waiting for a previous migrates data 
to get cleaned, can't accept new chunks, num threads: " + << OldDataCleanup::_numThreads; + return false; + } + if ( ! configServer.ok() ) configServer.init( cmdObj["configServer"].String() ); @@ -957,9 +1480,9 @@ namespace mongo { migrateStatus.from = cmdObj["from"].String(); migrateStatus.min = cmdObj["min"].Obj().getOwned(); migrateStatus.max = cmdObj["max"].Obj().getOwned(); - + boost::thread m( migrateThread ); - + result.appendBool( "started" , true ); return true; } @@ -968,20 +1491,20 @@ namespace mongo { class RecvChunkStatusCommand : public ChunkCommandHelper { public: - RecvChunkStatusCommand() : ChunkCommandHelper( "_recvChunkStatus" ){} + RecvChunkStatusCommand() : ChunkCommandHelper( "_recvChunkStatus" ) {} - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { migrateStatus.status( result ); return 1; } - + } recvChunkStatusCommand; class RecvChunkCommitCommand : public ChunkCommandHelper { public: - RecvChunkCommitCommand() : ChunkCommandHelper( "_recvChunkCommit" ){} - - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + RecvChunkCommitCommand() : ChunkCommandHelper( "_recvChunkCommit" ) {} + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { bool ok = migrateStatus.startCommit(); migrateStatus.status( result ); return ok; @@ -989,10 +1512,22 @@ namespace mongo { } recvChunkCommitCommand; + class RecvChunkAbortCommand : public ChunkCommandHelper { + public: + RecvChunkAbortCommand() : ChunkCommandHelper( "_recvChunkAbort" ) {} + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + migrateStatus.abort(); + migrateStatus.status( result ); + return true; + } + + } recvChunkAboortCommand; + class IsInRangeTest : public UnitTest { public: - void run(){ + void run() { BSONObj min = BSON( "x" << 1 ); BSONObj max = BSON( "x" << 5 ); @@ -1002,6 +1537,8 @@ namespace mongo { assert( isInRange( BSON( "x" << 4 ) , min , max ) ); assert( ! isInRange( BSON( "x" << 5 ) , min , max ) ); assert( ! isInRange( BSON( "x" << 6 ) , min , max ) ); + + log(1) << "isInRangeTest passed" << endl; } } isInRangeTest; } diff --git a/s/d_split.cpp b/s/d_split.cpp index fdefc7e..0896803 100644 --- a/s/d_split.cpp +++ b/s/d_split.cpp @@ -1,4 +1,4 @@ -// d_split.cpp +// @file d_split.cpp /** * Copyright (C) 2008 10gen Inc. @@ -27,6 +27,13 @@ #include "../db/query.h" #include "../db/queryoptimizer.h" +#include "../client/connpool.h" +#include "../client/distlock.h" + +#include "chunk.h" // for static genID only +#include "config.h" +#include "d_logic.h" + namespace mongo { // TODO: Fold these checks into each command. 
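For orientation, a minimal client-side sketch of driving the splitVector command added below; the field values mirror the command's help text, while the connection target and the use of the "admin" database are assumptions of this sketch rather than anything this patch prescribes:

    // illustrative sketch only
    DBClientConnection conn;
    string connErr;
    if ( ! conn.connect( "shardhost:27018" , connErr ) )    // hypothetical shard address
        cout << "couldn't connect: " << connErr << endl;
    BSONObj res;
    conn.runCommand( "admin" ,                               // assumed invocation database
                     BSON( "splitVector" << "blog.post"
                           << "keyPattern" << BSON( "x" << 1 )
                           << "min" << BSON( "x" << 10 )
                           << "max" << BSON( "x" << 20 )
                           << "maxChunkSize" << 200 ) ,      // in MB, per the help text
                     res );
    // res["splitKeys"] holds the suggested split points, already formatted with field names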
@@ -43,19 +50,19 @@ namespace mongo { public: CmdMedianKey() : Command( "medianKey" ) {} virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } virtual void help( stringstream &help ) const { - help << - "Internal command.\n" - "example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n" - "NOTE: This command may take a while to run"; + help << + "Internal command.\n" + "example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n" + "NOTE: This command may take a while to run"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { const char *ns = jsobj.getStringField( "medianKey" ); BSONObj min = jsobj.getObjectField( "min" ); BSONObj max = jsobj.getObjectField( "max" ); BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); - + Client::Context ctx( ns ); IndexDetails *id = cmdIndexDetailsForRange( ns, errmsg, min, max, keyPattern ); @@ -66,22 +73,22 @@ namespace mongo { int num = 0; NamespaceDetails *d = nsdetails(ns); int idxNo = d->idxNo(*id); - + // only yielding on firt half for now // after this it should be in ram, so 2nd should be fast { shared_ptr c( new BtreeCursor( d, idxNo, *id, min, max, false, 1 ) ); scoped_ptr cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); - while ( c->ok() ){ + while ( c->ok() ) { num++; c->advance(); if ( ! cc->yieldSometimes() ) break; } } - + num /= 2; - + BtreeCursor c( d, idxNo, *id, min, max, false, 1 ); for( ; num; c.advance(), --num ); @@ -99,15 +106,15 @@ namespace mongo { int x = median.woCompare( min , BSONObj() , false ); int y = median.woCompare( max , BSONObj() , false ); - if ( x == 0 || y == 0 ){ + if ( x == 0 || y == 0 ) { // its on an edge, ok } - else if ( x < 0 && y < 0 ){ + else if ( x < 0 && y < 0 ) { log( LL_ERROR ) << "median error (1) min: " << min << " max: " << max << " median: " << median << endl; errmsg = "median error 1"; return false; } - else if ( x > 0 && y > 0 ){ + else if ( x > 0 && y > 0 ) { log( LL_ERROR ) << "median error (2) min: " << min << " max: " << max << " median: " << median << endl; errmsg = "median error 2"; return false; @@ -117,95 +124,662 @@ namespace mongo { } } cmdMedianKey; - class SplitVector : public Command { - public: - SplitVector() : Command( "splitVector" , false ){} + class CheckShardingIndex : public Command { + public: + CheckShardingIndex() : Command( "checkShardingIndex" , false ) {} virtual bool slaveOk() const { return false; } virtual LockType locktype() const { return READ; } virtual void help( stringstream &help ) const { - help << - "Internal command.\n" - "example: { splitVector : \"myLargeCollection\" , keyPattern : {x:1} , maxChunkSize : 200 }\n" - "maxChunkSize unit in MBs\n" - "NOTE: This command may take a while to run"; + help << "Internal command.\n"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - const char* ns = jsobj.getStringField( "splitVector" ); + + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + + const char* ns = jsobj.getStringField( "checkShardingIndex" ); BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); - long long maxChunkSize = 0; - BSONElement maxSizeElem = jsobj[ "maxChunkSize" ]; - if ( ! 
maxSizeElem.eoo() ){ - maxChunkSize = maxSizeElem.numberLong() * 1<<20; - } else { - errmsg = "need to specify the desired max chunk size"; + // If min and max are not provided use the "minKey" and "maxKey" for the sharding key pattern. + BSONObj min = jsobj.getObjectField( "min" ); + BSONObj max = jsobj.getObjectField( "max" ); + if ( min.isEmpty() && max.isEmpty() ) { + BSONObjBuilder minBuilder; + BSONObjBuilder maxBuilder; + BSONForEach(key, keyPattern) { + minBuilder.appendMinKey( key.fieldName() ); + maxBuilder.appendMaxKey( key.fieldName() ); + } + min = minBuilder.obj(); + max = maxBuilder.obj(); + } + else if ( min.isEmpty() || max.isEmpty() ) { + errmsg = "either provide both min and max or leave both empty"; return false; } - - Client::Context ctx( ns ); - BSONObjBuilder minBuilder; - BSONObjBuilder maxBuilder; - BSONForEach(key, keyPattern){ - minBuilder.appendMinKey( key.fieldName() ); - maxBuilder.appendMaxKey( key.fieldName() ); + Client::Context ctx( ns ); + NamespaceDetails *d = nsdetails( ns ); + if ( ! d ) { + errmsg = "ns not found"; + return false; } - BSONObj min = minBuilder.obj(); - BSONObj max = maxBuilder.obj(); IndexDetails *idx = cmdIndexDetailsForRange( ns , errmsg , min , max , keyPattern ); - if ( idx == NULL ){ + if ( idx == NULL ) { errmsg = "couldn't find index over splitting key"; return false; } - NamespaceDetails *d = nsdetails( ns ); - BtreeCursor c( d , d->idxNo(*idx) , *idx , min , max , false , 1 ); + BtreeCursor * bc = new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 ); + shared_ptr c( bc ); + scoped_ptr cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); + if ( ! cc->ok() ) { + // range is empty + return true; + } - // We'll use the average object size and number of object to find approximately how many keys - // each chunk should have. We'll split a little smaller than the specificied by 'maxSize' - // assuming a recently sharded collectio is still going to grow. + // for now, the only check is that all shard keys are filled + // null is ok, + // TODO if $exist for nulls were picking the index, it could be used instead efficiently + while ( cc->ok() ) { + BSONObj currKey = c->currKey(); + + BSONObjIterator i( currKey ); + int n = 0; + while ( i.more() ) { + BSONElement key = i.next(); + n++; - const long long dataSize = d->datasize; - const long long recCount = d->nrecords; - long long keyCount = 0; - if (( dataSize > 0 ) && ( recCount > 0 )){ - const long long avgRecSize = dataSize / recCount; - keyCount = 90 * maxChunkSize / (100 * avgRecSize); + if ( key.type() && key.type() != jstNULL ) + continue; + + BSONObj obj = c->current(); + BSONObjIterator j( keyPattern ); + BSONElement real; + for ( int x=0; xprettyKey( currKey ) << " for doc: " << real["_id"]; + log() << "checkShardingIndex for '" << ns << "' failed: " << os.str() << endl; + + errmsg = os.str(); + return false; + } + cc->advance(); } - // We traverse the index and add the keyCount-th key to the result vector. If that key - // appeared in the vector before, we omit it. The assumption here is that all the - // instances of a key value live in the same chunk. + return true; + } + } cmdCheckShardingIndex; - Timer timer; - long long currCount = 0; - vector splitKeys; - BSONObj currKey; - while ( c.ok() ){ - currCount++; - if ( currCount > keyCount ){ - if ( ! 
currKey.isEmpty() && (currKey.woCompare( c.currKey() ) == 0 ) ) - continue; - - currKey = c.currKey(); - splitKeys.push_back( c.prettyKey( currKey ) ); - currCount = 0; + class SplitVector : public Command { + public: + SplitVector() : Command( "splitVector" , false ) {} + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return READ; } + virtual void help( stringstream &help ) const { + help << + "Internal command.\n" + "examples:\n" + " { splitVector : \"blog.post\" , keyPattern:{x:1} , min:{x:10} , max:{x:20}, maxChunkSize:200 }\n" + " maxChunkSize unit in MBs\n" + " May optionally specify 'maxSplitPoints' and 'maxChunkObjects' to avoid traversing the whole chunk\n" + " \n" + " { splitVector : \"blog.post\" , keyPattern:{x:1} , min:{x:10} , max:{x:20}, force: true }\n" + " 'force' will produce one split point even if data is small; defaults to false\n" + "NOTE: This command may take a while to run"; + } + + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + + // + // 1.a We'll parse the parameters in two steps. First, make sure the we can use the split index to get + // a good approximation of the size of the chunk -- without needing to access the actual data. + // + + const char* ns = jsobj.getStringField( "splitVector" ); + BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); + + // If min and max are not provided use the "minKey" and "maxKey" for the sharding key pattern. + BSONObj min = jsobj.getObjectField( "min" ); + BSONObj max = jsobj.getObjectField( "max" ); + if ( min.isEmpty() && max.isEmpty() ) { + BSONObjBuilder minBuilder; + BSONObjBuilder maxBuilder; + BSONForEach(key, keyPattern) { + minBuilder.appendMinKey( key.fieldName() ); + maxBuilder.appendMaxKey( key.fieldName() ); } - c.advance(); + min = minBuilder.obj(); + max = maxBuilder.obj(); + } + else if ( min.isEmpty() || max.isEmpty() ) { + errmsg = "either provide both min and max or leave both empty"; + return false; } - ostringstream os; - os << "Finding the split vector for " << ns << " over "<< keyPattern; - logIfSlow( timer , os.str() ); + long long maxSplitPoints = 0; + BSONElement maxSplitPointsElem = jsobj[ "maxSplitPoints" ]; + if ( maxSplitPointsElem.isNumber() ) { + maxSplitPoints = maxSplitPointsElem.numberLong(); + } - // Warning: we are sending back an array of keys but are currently limited to - // 4MB work of 'result' size. This should be okay for now. + long long maxChunkObjects = 0; + BSONElement MaxChunkObjectsElem = jsobj[ "maxChunkObjects" ]; + if ( MaxChunkObjectsElem.isNumber() ) { + maxChunkObjects = MaxChunkObjectsElem.numberLong(); + } + + vector splitKeys; + + { + // Get the size estimate for this namespace + Client::Context ctx( ns ); + NamespaceDetails *d = nsdetails( ns ); + if ( ! d ) { + errmsg = "ns not found"; + return false; + } + + IndexDetails *idx = cmdIndexDetailsForRange( ns , errmsg , min , max , keyPattern ); + if ( idx == NULL ) { + errmsg = "couldn't find index over splitting key"; + return false; + } + + const long long recCount = d->stats.nrecords; + const long long dataSize = d->stats.datasize; + + // + // 1.b Now that we have the size estimate, go over the remaining parameters and apply any maximum size + // restrictions specified there. 
+ // + + // 'force'-ing a split is equivalent to having maxChunkSize be the size of the current chunk, i.e., the + // logic below will split that chunk in half + long long maxChunkSize = 0; + bool force = false; + { + BSONElement maxSizeElem = jsobj[ "maxChunkSize" ]; + BSONElement forceElem = jsobj[ "force" ]; + + if ( forceElem.trueValue() ) { + force = true; + maxChunkSize = dataSize; + + } + else if ( maxSizeElem.isNumber() ) { + maxChunkSize = maxSizeElem.numberLong() * 1<<20; + + } + else { + maxSizeElem = jsobj["maxChunkSizeBytes"]; + if ( maxSizeElem.isNumber() ) { + maxChunkSize = maxSizeElem.numberLong(); + } + } + + if ( maxChunkSize <= 0 ) { + errmsg = "need to specify the desired max chunk size (maxChunkSize or maxChunkSizeBytes)"; + return false; + } + } + + + // If there's not enough data for more than one chunk, no point continuing. + if ( dataSize < maxChunkSize || recCount == 0 ) { + vector emptyVector; + result.append( "splitKeys" , emptyVector ); + return true; + } + + log() << "request split points lookup for chunk " << ns << " " << min << " -->> " << max << endl; + + // We'll use the average object size and number of object to find approximately how many keys + // each chunk should have. We'll split at half the maxChunkSize or maxChunkObjects, if + // provided. + const long long avgRecSize = dataSize / recCount; + long long keyCount = maxChunkSize / (2 * avgRecSize); + if ( maxChunkObjects && ( maxChunkObjects < keyCount ) ) { + log() << "limiting split vector to " << maxChunkObjects << " (from " << keyCount << ") objects " << endl; + keyCount = maxChunkObjects; + } + + // + // 2. Traverse the index and add the keyCount-th key to the result vector. If that key + // appeared in the vector before, we omit it. The invariant here is that all the + // instances of a given key value live in the same chunk. + // + + Timer timer; + long long currCount = 0; + long long numChunks = 0; + + BtreeCursor * bc = new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 ); + shared_ptr c( bc ); + scoped_ptr cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); + if ( ! cc->ok() ) { + errmsg = "can't open a cursor for splitting (desired range is possibly empty)"; + return false; + } + + // Use every 'keyCount'-th key as a split point. We add the initial key as a sentinel, to be removed + // at the end. If a key appears more times than entries allowed on a chunk, we issue a warning and + // split on the following key. + set tooFrequentKeys; + splitKeys.push_back( c->currKey().getOwned() ); + while ( 1 ) { + while ( cc->ok() ) { + currCount++; + BSONObj currKey = c->currKey(); + + DEV assert( currKey.woCompare( max ) <= 0 ); + + if ( currCount > keyCount ) { + // Do not use this split key if it is the same used in the previous split point. + if ( currKey.woCompare( splitKeys.back() ) == 0 ) { + tooFrequentKeys.insert( currKey.getOwned() ); + + } + else { + splitKeys.push_back( currKey.getOwned() ); + currCount = 0; + numChunks++; + + LOG(4) << "picked a split key: " << bc->prettyKey( currKey ) << endl; + } + + } + + cc->advance(); + + // Stop if we have enough split points. + if ( maxSplitPoints && ( numChunks >= maxSplitPoints ) ) { + log() << "max number of requested split points reached (" << numChunks + << ") before the end of chunk " << ns << " " << min << " -->> " << max + << endl; + break; + } + + if ( ! 
cc->yieldSometimes() ) { + // we were near the end and got pushed to it + // i think returning the splits we've already found is fine + + // don't use the btree cursor pointer to access keys beyond this point but ok + // to use it to format the keys we've got already + + break; + } + } + + if ( splitKeys.size() > 1 || ! force ) + break; + + force = false; + keyCount = currCount / 2; + currCount = 0; + log() << "splitVector doing another cycle because of force, keyCount now: " << keyCount << endl; + + c.reset( new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 ) ); + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); + } + + // + // 3. Format the result and issue any warnings about the data we gathered while traversing the + // index + // + + // Warn for keys that are more numerous than maxChunkSize allows. + for ( set<BSONObj>::const_iterator it = tooFrequentKeys.begin(); it != tooFrequentKeys.end(); ++it ) { + warning() << "chunk is larger than " << maxChunkSize + << " bytes because of key " << bc->prettyKey( *it ) << endl; + } + + // Remove the sentinel at the beginning before returning and add fieldnames. + splitKeys.erase( splitKeys.begin() ); + for ( vector<BSONObj>::iterator it = splitKeys.begin(); it != splitKeys.end() ; ++it ) { + *it = bc->prettyKey( *it ); + } + + if ( timer.millis() > cmdLine.slowMS ) { + warning() << "Finding the split vector for " << ns << " over "<< keyPattern + << " keyCount: " << keyCount << " numSplits: " << splitKeys.size() + << " lookedAt: " << currCount << " took " << timer.millis() << "ms" + << endl; + } + + // Warning: we are sending back an array of keys but are currently limited to + // 4MB worth of 'result' size. This should be okay for now. + + } result.append( "splitKeys" , splitKeys ); + return true; } } cmdSplitVector; + // ** temporary ** 2010-10-22 + // ChunkInfo is a helper to collect and log information about the chunks generated in splitChunk. + // It should hold the chunk state for this module only, while we don't have min/max key info per chunk on the + // mongod side. Do not build on this; it will go away. + struct ChunkInfo { + BSONObj min; + BSONObj max; + ShardChunkVersion lastmod; + + ChunkInfo() { } + ChunkInfo( BSONObj aMin , BSONObj aMax , ShardChunkVersion aVersion ) : min(aMin) , max(aMax) , lastmod(aVersion) {} + void appendShortVersion( const char* name, BSONObjBuilder& b ) const; + string toString() const; + }; + + void ChunkInfo::appendShortVersion( const char * name , BSONObjBuilder& b ) const { + BSONObjBuilder bb( b.subobjStart( name ) ); + bb.append( "min" , min ); + bb.append( "max" , max ); + bb.appendTimestamp( "lastmod" , lastmod ); + bb.done(); + } + + string ChunkInfo::toString() const { + ostringstream os; + os << "lastmod: " << lastmod.toString() << " min: " << min << " max: " << max << endl; + return os.str(); + } + // ** end temporary ** + + class SplitChunkCommand : public Command { + public: + SplitChunkCommand() : Command( "splitChunk" ) {} + virtual void help( stringstream& help ) const { + help << + "internal command usage only\n" + "example:\n" + " { splitChunk:\"db.foo\" , keyPattern: {a:1} , min : {a:100} , max: {a:200} , splitKeys : [ {a:150} , ... ] }"; + } + + virtual bool slaveOk() const { return false; } + virtual bool adminOnly() const { return true; } + virtual LockType locktype() const { return NONE; } + + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + + // + // 1. 
check whether parameters passed to splitChunk are sound + // + + const string ns = cmdObj.firstElement().str(); + if ( ns.empty() ) { + errmsg = "need to specify namespace in command"; + return false; + } + + BSONObj keyPattern = cmdObj["keyPattern"].Obj(); + if ( keyPattern.isEmpty() ) { + errmsg = "need to specify the key pattern the collection is sharded over"; + return false; + } + + BSONObj min = cmdObj["min"].Obj(); + if ( min.isEmpty() ) { + errmsg = "need to specify the min key for the chunk"; + return false; + } + + BSONObj max = cmdObj["max"].Obj(); + if ( max.isEmpty() ) { + errmsg = "need to specify the max key for the chunk"; + return false; + } + + string from = cmdObj["from"].str(); + if ( from.empty() ) { + errmsg = "need to specify the server to split the chunk at"; + return false; + } + + BSONObj splitKeysElem = cmdObj["splitKeys"].Obj(); + if ( splitKeysElem.isEmpty() ) { + errmsg = "need to provide the split points to chunk over"; + return false; + } + vector<BSONObj> splitKeys; + BSONObjIterator it( splitKeysElem ); + while ( it.more() ) { + splitKeys.push_back( it.next().Obj().getOwned() ); + } + + BSONElement shardId = cmdObj["shardId"]; + if ( shardId.eoo() ) { + errmsg = "need to provide shardId"; + return false; + } + + // It is possible that this is the first sharded command this mongod is asked to perform. If so, + // start the sharding apparatus. We'd still be missing some more shard-related info but we'll get it + // in step 2. below. + if ( ! shardingState.enabled() ) { + if ( cmdObj["configdb"].type() != String ) { + errmsg = "sharding not enabled"; + return false; + } + string configdb = cmdObj["configdb"].String(); + shardingState.enable( configdb ); + configServer.init( configdb ); + } + + Shard myShard( from ); + + log() << "received splitChunk request: " << cmdObj << endl; + + // + // 2. lock the collection's metadata and get highest version for the current shard + // + + DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC) , ns ); + dist_lock_try dlk( &lockSetup, string("split-") + min.toString() ); + if ( ! dlk.got() ) { + errmsg = "the collection's metadata lock is taken"; + result.append( "who" , dlk.other() ); + return false; + } + + // TODO This is a check migrate does to the letter. Factor it out and share. 
2010-10-22 + + ShardChunkVersion maxVersion; + string shard; + ChunkInfo origChunk; + { + ScopedDbConnection conn( shardingState.getConfigServer() ); + + BSONObj x = conn->findOne( ShardNS::chunk , Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) ); + maxVersion = x["lastmod"]; + + BSONObj currChunk = conn->findOne( ShardNS::chunk , shardId.wrap( "_id" ) ).getOwned(); + assert( currChunk["shard"].type() ); + assert( currChunk["min"].type() ); + assert( currChunk["max"].type() ); + shard = currChunk["shard"].String(); + conn.done(); + + BSONObj currMin = currChunk["min"].Obj(); + BSONObj currMax = currChunk["max"].Obj(); + if ( currMin.woCompare( min ) || currMax.woCompare( max ) ) { + errmsg = "chunk boundaries are outdated (likely a split occurred)"; + result.append( "currMin" , currMin ); + result.append( "currMax" , currMax ); + result.append( "requestedMin" , min ); + result.append( "requestedMax" , max ); + + log( LL_WARNING ) << "aborted split because " << errmsg << ": " << min << "->" << max + << " is now " << currMin << "->" << currMax << endl; + return false; + } + + if ( shard != myShard.getName() ) { + errmsg = "location is outdated (likely balance or migrate occurred)"; + result.append( "from" , myShard.getName() ); + result.append( "official" , shard ); + + log( LL_WARNING ) << "aborted split because " << errmsg << ": chunk is at " << shard + << " and not at " << myShard.getName() << endl; + return false; + } + + if ( maxVersion < shardingState.getVersion( ns ) ) { + errmsg = "official version less than mine?"; + result.appendTimestamp( "officialVersion" , maxVersion ); + result.appendTimestamp( "myVersion" , shardingState.getVersion( ns ) ); + + log( LL_WARNING ) << "aborted split because " << errmsg << ": official " << maxVersion + << " mine: " << shardingState.getVersion(ns) << endl; + return false; + } + + origChunk.min = currMin.getOwned(); + origChunk.max = currMax.getOwned(); + origChunk.lastmod = currChunk["lastmod"]; + + // since this could be the first call that enable sharding we also make sure to have the chunk manager up to date + shardingState.gotShardName( shard ); + ShardChunkVersion shardVersion; + shardingState.trySetVersion( ns , shardVersion /* will return updated */ ); + + log() << "splitChunk accepted at version " << shardVersion << endl; + + } + + // + // 3. create the batch of updates to metadata ( the new chunks ) to be applied via 'applyOps' command + // + + BSONObjBuilder logDetail; + origChunk.appendShortVersion( "before" , logDetail ); + log(1) << "before split on " << origChunk << endl; + vector newChunks; + + ShardChunkVersion myVersion = maxVersion; + BSONObj startKey = min; + splitKeys.push_back( max ); // makes it easier to have 'max' in the next loop. remove later. 
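// the builders below assemble a single applyOps command for the config server, roughly of the form:
//   { applyOps : [ { op: "u" , b: true , ns: "config.chunks" ,
//                    o  : { _id: Chunk::genID(ns,startKey) , lastmod: <bumped version> ,
//                           ns: <ns> , min: <startKey> , max: <endKey> , shard: <shard> } ,
//                    o2 : { _id: Chunk::genID(ns,startKey) } } ,
//                  ... one entry per new chunk ... ] ,
//     preCondition : [ { ns: "config.chunks" ,
//                        q  : { query: { ns: <ns> } , orderby: { lastmod: -1 } } ,
//                        res: { lastmod: <maxVersion> } } ] }
// so all the chunk rewrites land together, and only if nobody else bumped the collection's highest
// lastmod in the meantime.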
+ + BSONObjBuilder cmdBuilder; + BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) ); + + for ( vector::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it ) { + BSONObj endKey = *it; + + // splits only update the 'minor' portion of version + myVersion.incMinor(); + + // build an update operation against the chunks collection of the config database with + // upsert true + BSONObjBuilder op; + op.append( "op" , "u" ); + op.appendBool( "b" , true ); + op.append( "ns" , ShardNS::chunk ); + + // add the modified (new) chunk infomation as the update object + BSONObjBuilder n( op.subobjStart( "o" ) ); + n.append( "_id" , Chunk::genID( ns , startKey ) ); + n.appendTimestamp( "lastmod" , myVersion ); + n.append( "ns" , ns ); + n.append( "min" , startKey ); + n.append( "max" , endKey ); + n.append( "shard" , shard ); + n.done(); + + // add the chunk's _id as the query part of the update statement + BSONObjBuilder q( op.subobjStart( "o2" ) ); + q.append( "_id" , Chunk::genID( ns , startKey ) ); + q.done(); + + updates.append( op.obj() ); + + // remember this chunk info for logging later + newChunks.push_back( ChunkInfo( startKey , endKey, myVersion ) ); + + startKey = endKey; + } + + updates.done(); + + { + BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) ); + BSONObjBuilder b; + b.append( "ns" , ShardNS::chunk ); + b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) ); + { + BSONObjBuilder bb( b.subobjStart( "res" ) ); + bb.appendTimestamp( "lastmod" , maxVersion ); + bb.done(); + } + preCond.append( b.obj() ); + preCond.done(); + } + + // + // 4. apply the batch of updates to metadata and to the chunk manager + // + + BSONObj cmd = cmdBuilder.obj(); + + LOG(1) << "splitChunk update: " << cmd << endl; + + bool ok; + BSONObj cmdResult; + { + ScopedDbConnection conn( shardingState.getConfigServer() ); + ok = conn->runCommand( "config" , cmd , cmdResult ); + conn.done(); + } + + if ( ! ok ) { + stringstream ss; + ss << "saving chunks failed. cmd: " << cmd << " result: " << cmdResult; + error() << ss.str() << endl; + msgasserted( 13593 , ss.str() ); // assert(13593) + } + + // install a chunk manager with knowledge about newly split chunks in this shard's state + splitKeys.pop_back(); // 'max' was used as sentinel + maxVersion.incMinor(); + shardingState.splitChunk( ns , min , max , splitKeys , maxVersion ); + + // + // 5. logChanges + // + + // single splits are logged different than multisplits + if ( newChunks.size() == 2 ) { + newChunks[0].appendShortVersion( "left" , logDetail ); + newChunks[1].appendShortVersion( "right" , logDetail ); + configServer.logChange( "split" , ns , logDetail.obj() ); + + } + else { + BSONObj beforeDetailObj = logDetail.obj(); + BSONObj firstDetailObj = beforeDetailObj.getOwned(); + const int newChunksSize = newChunks.size(); + + for ( int i=0; i < newChunksSize; i++ ) { + BSONObjBuilder chunkDetail; + chunkDetail.appendElements( beforeDetailObj ); + chunkDetail.append( "number", i ); + chunkDetail.append( "of" , newChunksSize ); + newChunks[i].appendShortVersion( "chunk" , chunkDetail ); + configServer.logChange( "multi-split" , ns , chunkDetail.obj() ); + } + } + + return true; + } + } cmdSplitChunk; + } // namespace mongo diff --git a/s/d_state.cpp b/s/d_state.cpp index 3f13b79..11fbcef 100644 --- a/s/d_state.cpp +++ b/s/d_state.cpp @@ -1,4 +1,4 @@ -// d_state.cpp +// @file d_state.cpp /** * Copyright (C) 2008 10gen Inc. 
@@ -44,12 +44,12 @@ using namespace std; namespace mongo { // -----ShardingState START ---- - + ShardingState::ShardingState() - : _enabled(false) , _mutex( "ShardingState" ){ + : _enabled(false) , _mutex( "ShardingState" ) { } - - void ShardingState::enable( const string& server ){ + + void ShardingState::enable( const string& server ) { _enabled = true; assert( server.size() ); if ( _configServer.size() == 0 ) @@ -58,69 +58,177 @@ namespace mongo { assert( server == _configServer ); } } - - void ShardingState::gotShardName( const string& name ){ - if ( _shardName.size() == 0 ){ + + void ShardingState::gotShardName( const string& name ) { + scoped_lock lk(_mutex); + if ( _shardName.size() == 0 ) { + // TODO SERVER-2299 verify the name is sound w.r.t IPs _shardName = name; return; } - + if ( _shardName == name ) return; stringstream ss; - ss << "gotShardName different than what i had before " - << " before [" << _shardName << "] " - << " got [" << name << "] " - ; + ss << "gotShardName different than what i had before " + << " before [" << _shardName << "] " + << " got [" << name << "] " + ; uasserted( 13298 , ss.str() ); } - - void ShardingState::gotShardHost( const string& host ){ - if ( _shardHost.size() == 0 ){ + + void ShardingState::gotShardHost( string host ) { + scoped_lock lk(_mutex); + size_t slash = host.find( '/' ); + if ( slash != string::npos ) + host = host.substr( 0 , slash ); + + if ( _shardHost.size() == 0 ) { _shardHost = host; return; } - + if ( _shardHost == host ) return; stringstream ss; - ss << "gotShardHost different than what i had before " - << " before [" << _shardHost << "] " - << " got [" << host << "] " - ; + ss << "gotShardHost different than what i had before " + << " before [" << _shardHost << "] " + << " got [" << host << "] " + ; uasserted( 13299 , ss.str() ); } - - bool ShardingState::hasVersion( const string& ns ){ + + void ShardingState::resetShardingState() { scoped_lock lk(_mutex); - NSVersionMap::const_iterator i = _versions.find(ns); - return i != _versions.end(); + + _enabled = false; + _configServer.clear(); + _shardName.clear(); + _shardHost.clear(); + _chunks.clear(); } - - bool ShardingState::hasVersion( const string& ns , ConfigVersion& version ){ + + // TODO we shouldn't need three ways for checking the version. Fix this. 
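// note on the methods below: per-namespace state lives in _chunks ( ns -> ShardChunkManagerPtr ),
// guarded by _mutex. donateChunk / undoDonateChunk / splitChunk never mutate a manager in place;
// they install a fresh clone built by cloneMinus / clonePlus / cloneSplit, so a caller holding an
// older ShardChunkManagerPtr keeps a consistent snapshot. getVersion() reports the installed
// manager's version, or 0 when no manager exists for the namespace yet.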
+ bool ShardingState::hasVersion( const string& ns ) { scoped_lock lk(_mutex); - NSVersionMap::const_iterator i = _versions.find(ns); - if ( i == _versions.end() ) + + ChunkManagersMap::const_iterator it = _chunks.find(ns); + return it != _chunks.end(); + } + + bool ShardingState::hasVersion( const string& ns , ConfigVersion& version ) { + scoped_lock lk(_mutex); + + ChunkManagersMap::const_iterator it = _chunks.find(ns); + if ( it == _chunks.end() ) return false; - version = i->second; + + ShardChunkManagerPtr p = it->second; + version = p->getVersion(); return true; } - - ConfigVersion& ShardingState::getVersion( const string& ns ){ + + const ConfigVersion ShardingState::getVersion( const string& ns ) const { scoped_lock lk(_mutex); - return _versions[ns]; + + ChunkManagersMap::const_iterator it = _chunks.find( ns ); + if ( it != _chunks.end() ) { + ShardChunkManagerPtr p = it->second; + return p->getVersion(); + } + else { + return 0; + } } - - void ShardingState::setVersion( const string& ns , const ConfigVersion& version ){ - scoped_lock lk(_mutex); - ConfigVersion& me = _versions[ns]; - assert( version == 0 || version > me ); - me = version; + + void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) { + scoped_lock lk( _mutex ); + + ChunkManagersMap::const_iterator it = _chunks.find( ns ); + assert( it != _chunks.end() ) ; + ShardChunkManagerPtr p = it->second; + + // empty shards should have version 0 + version = ( p->getNumChunks() > 1 ) ? version : ShardChunkVersion( 0 , 0 ); + + ShardChunkManagerPtr cloned( p->cloneMinus( min , max , version ) ); + _chunks[ns] = cloned; + } + + void ShardingState::undoDonateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) { + scoped_lock lk( _mutex ); + + ChunkManagersMap::const_iterator it = _chunks.find( ns ); + assert( it != _chunks.end() ) ; + ShardChunkManagerPtr p( it->second->clonePlus( min , max , version ) ); + _chunks[ns] = p; + } + + void ShardingState::splitChunk( const string& ns , const BSONObj& min , const BSONObj& max , const vector& splitKeys , + ShardChunkVersion version ) { + scoped_lock lk( _mutex ); + + ChunkManagersMap::const_iterator it = _chunks.find( ns ); + assert( it != _chunks.end() ) ; + ShardChunkManagerPtr p( it->second->cloneSplit( min , max , splitKeys , version ) ); + _chunks[ns] = p; } - void ShardingState::appendInfo( BSONObjBuilder& b ){ + void ShardingState::resetVersion( const string& ns ) { + scoped_lock lk( _mutex ); + + _chunks.erase( ns ); + } + + bool ShardingState::trySetVersion( const string& ns , ConfigVersion& version /* IN-OUT */ ) { + + // fast path - requested version is at the same version as this chunk manager + // + // cases: + // + this shard updated the version for a migrate's commit (FROM side) + // a client reloaded chunk state from config and picked the newest version + // + two clients reloaded + // one triggered the 'slow path' (below) + // when the second's request gets here, the version is already current + { + scoped_lock lk( _mutex ); + ChunkManagersMap::const_iterator it = _chunks.find( ns ); + if ( it != _chunks.end() && it->second->getVersion() == version ) + return true; + } + + // slow path - requested version is different than the current chunk manager's, if one exists, so must check for + // newest version in the config server + // + // cases: + // + a chunk moved TO here + // (we don't bump up the version on the TO side but the commit to config does use higher 
version) + // a client reloads from config an issued the request + // + there was a take over from a secondary + // the secondary had no state (managers) at all, so every client request will fall here + // + a stale client request a version that's not current anymore + + const string c = (_configServer == _shardHost) ? "" /* local */ : _configServer; + ShardChunkManagerPtr p( new ShardChunkManager( c , ns , _shardName ) ); + { + scoped_lock lk( _mutex ); + + // since we loaded the chunk manager unlocked, other thread may have done the same + // make sure we keep the freshest config info only + ChunkManagersMap::const_iterator it = _chunks.find( ns ); + if ( it == _chunks.end() || p->getVersion() >= it->second->getVersion() ) { + _chunks[ns] = p; + } + + ShardChunkVersion oldVersion = version; + version = p->getVersion(); + return oldVersion == version; + } + } + + void ShardingState::appendInfo( BSONObjBuilder& b ) { b.appendBool( "enabled" , _enabled ); if ( ! _enabled ) return; @@ -131,117 +239,56 @@ namespace mongo { { BSONObjBuilder bb( b.subobjStart( "versions" ) ); - + scoped_lock lk(_mutex); - for ( NSVersionMap::iterator i=_versions.begin(); i!=_versions.end(); ++i ){ - bb.appendTimestamp( i->first.c_str() , i->second ); + + for ( ChunkManagersMap::iterator it = _chunks.begin(); it != _chunks.end(); ++it ) { + ShardChunkManagerPtr p = it->second; + bb.appendTimestamp( it->first , p->getVersion() ); } bb.done(); } } - ChunkMatcherPtr ShardingState::getChunkMatcher( const string& ns ){ + bool ShardingState::needShardChunkManager( const string& ns ) const { if ( ! _enabled ) - return ChunkMatcherPtr(); - - if ( ! ShardedConnectionInfo::get( false ) ) - return ChunkMatcherPtr(); + return false; - ConfigVersion version; - { - scoped_lock lk( _mutex ); - version = _versions[ns]; - - if ( ! version ) - return ChunkMatcherPtr(); - - ChunkMatcherPtr p = _chunks[ns]; - if ( p && p->_version >= version ) - return p; - } + if ( ! ShardedConnectionInfo::get( false ) ) + return false; - BSONObj q; - { - BSONObjBuilder b; - b.append( "ns" , ns.c_str() ); - b.append( "shard" , BSON( "$in" << BSON_ARRAY( _shardHost << _shardName ) ) ); - q = b.obj(); - } + return true; + } - auto_ptr scoped; - auto_ptr direct; - - DBClientBase * conn; + ShardChunkManagerPtr ShardingState::getShardChunkManager( const string& ns ) { + scoped_lock lk( _mutex ); - if ( _configServer == _shardHost ){ - direct.reset( new DBDirectClient() ); - conn = direct.get(); + ChunkManagersMap::const_iterator it = _chunks.find( ns ); + if ( it == _chunks.end() ) { + return ShardChunkManagerPtr(); } else { - scoped.reset( new ScopedDbConnection( _configServer ) ); - conn = scoped->get(); + return it->second; } - - auto_ptr cursor = conn->query( "config.chunks" , Query(q).sort( "min" ) ); - assert( cursor.get() ); - if ( ! cursor->more() ){ - if ( scoped.get() ) - scoped->done(); - return ChunkMatcherPtr(); - } - - ChunkMatcherPtr p( new ChunkMatcher( version ) ); - - BSONObj min,max; - while ( cursor->more() ){ - BSONObj d = cursor->next(); - - if ( min.isEmpty() ){ - min = d["min"].Obj().getOwned(); - max = d["max"].Obj().getOwned(); - continue; - } - - if ( max == d["min"].Obj() ){ - max = d["max"].Obj().getOwned(); - continue; - } - - p->gotRange( min.getOwned() , max.getOwned() ); - min = d["min"].Obj().getOwned(); - max = d["max"].Obj().getOwned(); - } - assert( ! 
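trySetVersion above is effectively a double-checked reload: the fast path compares the requested version with the cached manager under _mutex, and only on a mismatch is a fresh ShardChunkManager loaded from the config server, outside the lock; the result is installed only if it is at least as new as whatever another thread may have loaded in the meantime. A rough standalone sketch of that shape, with the config-server load reduced to a callback and versions to plain integers (names here are illustrative):

    #include <functional>
    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>

    struct Snapshot { int version; };

    class VersionedCache {
    public:
        explicit VersionedCache(std::function<std::shared_ptr<Snapshot>(const std::string&)> load)
            : _load(std::move(load)) {}

        // Returns true if, after a possible reload, the cached version matches 'version'.
        bool trySetVersion(const std::string& ns, int& version /* in-out */) {
            {   // fast path: the requested version is already current
                std::lock_guard<std::mutex> lk(_mutex);
                auto it = _cache.find(ns);
                if (it != _cache.end() && it->second->version == version)
                    return true;
            }

            // slow path: reload without holding the lock (this may be expensive)
            std::shared_ptr<Snapshot> fresh = _load(ns);

            std::lock_guard<std::mutex> lk(_mutex);
            // another thread may have reloaded too; keep only the freshest info
            auto it = _cache.find(ns);
            if (it == _cache.end() || fresh->version >= it->second->version)
                _cache[ns] = fresh;

            int requested = version;
            version = fresh->version;   // report what the cache actually holds now
            return requested == version;
        }
    private:
        std::function<std::shared_ptr<Snapshot>(const std::string&)> _load;
        std::mutex _mutex;
        std::map<std::string, std::shared_ptr<Snapshot>> _cache;
    };

Returning the freshest version through the in-out parameter mirrors how the caller learns which version the shard actually ended up with.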
min.isEmpty() ); - p->gotRange( min.getOwned() , max.getOwned() ); - - if ( scoped.get() ) - scoped->done(); - - { - scoped_lock lk( _mutex ); - _chunks[ns] = p; - } - - return p; } ShardingState shardingState; // -----ShardingState END ---- - + // -----ShardedConnectionInfo START ---- boost::thread_specific_ptr ShardedConnectionInfo::_tl; - ShardedConnectionInfo::ShardedConnectionInfo(){ - _forceMode = false; + ShardedConnectionInfo::ShardedConnectionInfo() { + _forceVersionOk = false; _id.clear(); } - - ShardedConnectionInfo* ShardedConnectionInfo::get( bool create ){ + + ShardedConnectionInfo* ShardedConnectionInfo::get( bool create ) { ShardedConnectionInfo* info = _tl.get(); - if ( ! info && create ){ + if ( ! info && create ) { log(1) << "entering shard mode for connection" << endl; info = new ShardedConnectionInfo(); _tl.reset( info ); @@ -249,44 +296,50 @@ namespace mongo { return info; } - void ShardedConnectionInfo::reset(){ + void ShardedConnectionInfo::reset() { _tl.reset(); } - ConfigVersion& ShardedConnectionInfo::getVersion( const string& ns ){ - return _versions[ns]; + const ConfigVersion ShardedConnectionInfo::getVersion( const string& ns ) const { + NSVersionMap::const_iterator it = _versions.find( ns ); + if ( it != _versions.end() ) { + return it->second; + } + else { + return 0; + } } - - void ShardedConnectionInfo::setVersion( const string& ns , const ConfigVersion& version ){ + + void ShardedConnectionInfo::setVersion( const string& ns , const ConfigVersion& version ) { _versions[ns] = version; } - void ShardedConnectionInfo::setID( const OID& id ){ + void ShardedConnectionInfo::setID( const OID& id ) { _id = id; } // -----ShardedConnectionInfo END ---- - unsigned long long extractVersion( BSONElement e , string& errmsg ){ - if ( e.eoo() ){ + unsigned long long extractVersion( BSONElement e , string& errmsg ) { + if ( e.eoo() ) { errmsg = "no version"; return 0; } - + if ( e.isNumber() ) return (unsigned long long)e.number(); - + if ( e.type() == Date || e.type() == Timestamp ) return e._numberLong(); - + errmsg = "version is not a numeric type"; return 0; } class MongodShardCommand : public Command { public: - MongodShardCommand( const char * n ) : Command( n ){ + MongodShardCommand( const char * n ) : Command( n ) { } virtual bool slaveOk() const { return false; @@ -295,12 +348,12 @@ namespace mongo { return true; } }; - - - bool haveLocalShardingInfo( const string& ns ){ + + + bool haveLocalShardingInfo( const string& ns ) { if ( ! shardingState.enabled() ) return false; - + if ( ! 
shardingState.hasVersion( ns ) ) return false; @@ -309,266 +362,332 @@ namespace mongo { class UnsetShardingCommand : public MongodShardCommand { public: - UnsetShardingCommand() : MongodShardCommand("unsetSharding"){} + UnsetShardingCommand() : MongodShardCommand("unsetSharding") {} virtual void help( stringstream& help ) const { help << " example: { unsetSharding : 1 } "; } - - virtual LockType locktype() const { return NONE; } - - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + + virtual LockType locktype() const { return NONE; } + + virtual bool slaveOk() const { return true; } + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { ShardedConnectionInfo::reset(); return true; - } - + } + } unsetShardingCommand; - class SetShardVersion : public MongodShardCommand { public: - SetShardVersion() : MongodShardCommand("setShardVersion"){} + SetShardVersion() : MongodShardCommand("setShardVersion") {} virtual void help( stringstream& help ) const { help << " example: { setShardVersion : 'alleyinsider.foo' , version : 1 , configdb : '' } "; } - - virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock - - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ - lastError.disableForCommand(); - ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); - - bool authoritative = cmdObj.getBoolField( "authoritative" ); - string configdb = cmdObj["configdb"].valuestrsafe(); - { // configdb checking - if ( configdb.size() == 0 ){ - errmsg = "no configdb"; - return false; - } + virtual LockType locktype() const { return NONE; } + + bool checkConfigOrInit( const string& configdb , bool authoritative , string& errmsg , BSONObjBuilder& result , bool locked=false ) const { + if ( configdb.size() == 0 ) { + errmsg = "no configdb"; + return false; + } + + if ( shardingState.enabled() ) { + if ( configdb == shardingState.getConfigServer() ) + return true; - if ( shardingState.enabled() ){ - if ( configdb != shardingState.getConfigServer() ){ - errmsg = "specified a different configdb!"; - return false; - } - } - else { - if ( ! authoritative ){ - result.appendBool( "need_authoritative" , true ); - errmsg = "first setShardVersion"; - return false; - } - shardingState.enable( configdb ); - configServer.init( configdb ); - } + result.append( "configdb" , BSON( "stored" << shardingState.getConfigServer() << + "given" << configdb ) ); + errmsg = "specified a different configdb!"; + return false; } - if ( cmdObj["shard"].type() == String ){ - shardingState.gotShardName( cmdObj["shard"].String() ); - shardingState.gotShardHost( cmdObj["shardHost"].String() ); + if ( ! authoritative ) { + result.appendBool( "need_authoritative" , true ); + errmsg = "first setShardVersion"; + return false; + } + + if ( locked ) { + shardingState.enable( configdb ); + configServer.init( configdb ); + return true; } - { // setting up ids - if ( cmdObj["serverID"].type() != jstOID ){ - // TODO: fix this - //errmsg = "need serverID to be an OID"; - //return 0; - } - else { - OID clientId = cmdObj["serverID"].__oid(); - if ( ! 
info->hasID() ){ - info->setID( clientId ); - } - else if ( clientId != info->getID() ){ - errmsg = "server id has changed!"; - return 0; - } - } + dblock lk; + return checkConfigOrInit( configdb , authoritative , errmsg , result , true ); + } + + bool checkMongosID( ShardedConnectionInfo* info, const BSONElement& id, string errmsg ) { + if ( id.type() != jstOID ) { + // TODO: fix this + //errmsg = "need serverID to be an OID"; + //return 0; + return true; + } + + OID clientId = id.__oid(); + if ( ! info->hasID() ) { + info->setID( clientId ); + return true; } - unsigned long long version = extractVersion( cmdObj["version"] , errmsg ); + if ( clientId != info->getID() ) { + errmsg = "server id has changed!"; + return false; + } + + return true; + } + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + + // Steps + // 1. check basic config + // 2. extract params from command + // 3. fast check + // 4. slow check (LOCKS) + + // step 1 - if ( errmsg.size() ){ + lastError.disableForCommand(); + ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); + + bool authoritative = cmdObj.getBoolField( "authoritative" ); + + // check config server is ok or enable sharding + if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) ) return false; + + // check shard name/hosts are correct + if ( cmdObj["shard"].type() == String ) { + shardingState.gotShardName( cmdObj["shard"].String() ); + shardingState.gotShardHost( cmdObj["shardHost"].String() ); } + // make sure we have the mongos id for writebacks + if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) ) + return false; + + // step 2 + string ns = cmdObj["setShardVersion"].valuestrsafe(); - if ( ns.size() == 0 ){ - errmsg = "need to speciy fully namespace"; + if ( ns.size() == 0 ) { + errmsg = "need to speciy namespace"; return false; } + + const ConfigVersion version = extractVersion( cmdObj["version"] , errmsg ); + if ( errmsg.size() ) + return false; + + // step 3 + + const ConfigVersion oldVersion = info->getVersion(ns); + const ConfigVersion globalVersion = shardingState.getVersion(ns); + + result.appendTimestamp( "oldVersion" , oldVersion ); - ConfigVersion& oldVersion = info->getVersion(ns); - ConfigVersion& globalVersion = shardingState.getVersion(ns); + if ( globalVersion > 0 && version > 0 ) { + // this means there is no reset going on an either side + // so its safe to make some assuptions + + if ( version == globalVersion ) { + // mongos and mongod agree! + if ( oldVersion != version ) { + assert( oldVersion < globalVersion ); + info->setVersion( ns , version ); + } + return true; + } + + } + + // step 4 + dblock setShardVersionLock; // TODO: can we get rid of this?? - if ( oldVersion > 0 && globalVersion == 0 ){ + if ( oldVersion > 0 && globalVersion == 0 ) { // this had been reset - oldVersion = 0; + info->setVersion( ns , 0 ); } - if ( version == 0 && globalVersion == 0 ){ + if ( version == 0 && globalVersion == 0 ) { // this connection is cleaning itself - oldVersion = 0; - return 1; + info->setVersion( ns , 0 ); + return true; } - if ( version == 0 && globalVersion > 0 ){ - if ( ! authoritative ){ + if ( version == 0 && globalVersion > 0 ) { + if ( ! 
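checkConfigOrInit above is first invoked without the database lock; only when sharding state still has to be enabled does it take the dblock and call itself again with locked=true, so the common already-initialized case never touches the heavy lock. A small sketch of that check-then-lock-then-recheck shape under assumed names (LazyConfig, with _globalLock standing in for the mongod dblock):

    #include <atomic>
    #include <mutex>
    #include <string>

    class LazyConfig {
    public:
        bool checkOrInit(const std::string& configdb, std::string& errmsg, bool locked = false) {
            if (configdb.empty()) {
                errmsg = "no configdb";
                return false;
            }
            if (_enabled.load()) {                       // fast path: already initialized
                std::lock_guard<std::mutex> lk(_mutex);  // brief lock just to read the stored value
                if (configdb == _configServer)
                    return true;
                errmsg = "specified a different configdb!";
                return false;
            }
            if (!locked) {                               // escalate: take the big lock, then re-check
                std::lock_guard<std::mutex> lk(_globalLock);
                return checkOrInit(configdb, errmsg, /*locked=*/true);
            }
            std::lock_guard<std::mutex> lk(_mutex);      // one-time initialization
            _configServer = configdb;
            _enabled.store(true);
            return true;
        }
    private:
        std::mutex _globalLock;        // stands in for the mongod dblock
        std::mutex _mutex;             // protects _configServer
        std::atomic<bool> _enabled{false};
        std::string _configServer;
    };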
authoritative ) { result.appendBool( "need_authoritative" , true ); + result.append( "ns" , ns ); result.appendTimestamp( "globalVersion" , globalVersion ); - result.appendTimestamp( "oldVersion" , oldVersion ); errmsg = "dropping needs to be authoritative"; - return 0; + return false; } log() << "wiping data for: " << ns << endl; result.appendTimestamp( "beforeDrop" , globalVersion ); // only setting global version on purpose // need clients to re-find meta-data - globalVersion = 0; - oldVersion = 0; - return 1; + shardingState.resetVersion( ns ); + info->setVersion( ns , 0 ); + return true; } - if ( version < oldVersion ){ - errmsg = "you already have a newer version"; - result.appendTimestamp( "oldVersion" , oldVersion ); + if ( version < oldVersion ) { + errmsg = "you already have a newer version of collection '" + ns + "'"; + result.append( "ns" , ns ); result.appendTimestamp( "newVersion" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } - - if ( version < globalVersion ){ - while ( shardingState.inCriticalMigrateSection() ){ + + if ( version < globalVersion ) { + while ( shardingState.inCriticalMigrateSection() ) { dbtemprelease r; sleepmillis(2); - log() << "waiting till out of critical section" << endl; + OCCASIONALLY log() << "waiting till out of critical section" << endl; } - errmsg = "going to older version for global"; + errmsg = "going to older version for global for collection '" + ns + "'"; + result.append( "ns" , ns ); result.appendTimestamp( "version" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } - - if ( globalVersion == 0 && ! cmdObj.getBoolField( "authoritative" ) ){ + + if ( globalVersion == 0 && ! authoritative ) { // need authoritative for first look - result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); - errmsg = "first time for this ns"; + result.appendBool( "need_authoritative" , true ); + errmsg = "first time for collection '" + ns + "'"; return false; } + Timer relockTime; { dbtemprelease unlock; - shardingState.getChunkMatcher( ns ); + + ShardChunkVersion currVersion = version; + if ( ! 
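When the requested version lags the shard's global version, the code above waits for the shard to leave its critical migration section, temporarily giving up the database lock (dbtemprelease), sleeping two milliseconds per iteration and logging only occasionally. A compact sketch of that wait-with-lock-released loop on standard primitives; the unique_lock and atomic flag stand in for the mongod lock machinery:

    #include <atomic>
    #include <chrono>
    #include <iostream>
    #include <mutex>
    #include <thread>

    // Wait until 'inCriticalSection' clears, giving up 'dbLock' (held on entry) while
    // sleeping so the migration commit we are waiting for can actually make progress.
    inline void waitOutCriticalSection(std::unique_lock<std::mutex>& dbLock,
                                       const std::atomic<bool>& inCriticalSection) {
        int iterations = 0;
        while (inCriticalSection.load()) {
            dbLock.unlock();                              // analogous to dbtemprelease
            std::this_thread::sleep_for(std::chrono::milliseconds(2));
            if (++iterations % 100 == 0)                  // throttled, like OCCASIONALLY
                std::cout << "waiting till out of critical section" << std::endl;
            dbLock.lock();
        }
    }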
shardingState.trySetVersion( ns , currVersion ) ) { + errmsg = str::stream() << "client version differs from config's for colleciton '" << ns << "'"; + result.append( "ns" , ns ); + result.appendTimestamp( "version" , version ); + result.appendTimestamp( "globalVersion" , currVersion ); + return false; + } } + if ( relockTime.millis() >= ( cmdLine.slowMS - 10 ) ) { + log() << "setShardVersion - relocking slow: " << relockTime.millis() << endl; + } + + info->setVersion( ns , version ); + return true; + } - result.appendTimestamp( "oldVersion" , oldVersion ); - oldVersion = version; - globalVersion = version; + } setShardVersionCmd; - result.append( "ok" , 1 ); - return 1; - } - - } setShardVersion; - class GetShardVersion : public MongodShardCommand { public: - GetShardVersion() : MongodShardCommand("getShardVersion"){} + GetShardVersion() : MongodShardCommand("getShardVersion") {} virtual void help( stringstream& help ) const { help << " example: { getShardVersion : 'alleyinsider.foo' } "; } - - virtual LockType locktype() const { return NONE; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + virtual LockType locktype() const { return NONE; } + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string ns = cmdObj["getShardVersion"].valuestrsafe(); - if ( ns.size() == 0 ){ + if ( ns.size() == 0 ) { errmsg = "need to speciy fully namespace"; return false; } - + result.append( "configServer" , shardingState.getConfigServer() ); result.appendTimestamp( "global" , shardingState.getVersion(ns) ); - + ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); if ( info ) result.appendTimestamp( "mine" , info->getVersion(ns) ); - else + else result.appendTimestamp( "mine" , 0 ); - + return true; } - + } getShardVersion; class ShardingStateCmd : public MongodShardCommand { public: - ShardingStateCmd() : MongodShardCommand( "shardingState" ){} + ShardingStateCmd() : MongodShardCommand( "shardingState" ) {} virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { shardingState.appendInfo( result ); return true; } - + } shardingStateCmd; /** * @ return true if not in sharded mode or if version for this client is ok */ - bool shardVersionOk( const string& ns , bool isWriteOp , string& errmsg ){ + bool shardVersionOk( const string& ns , bool isWriteOp , string& errmsg ) { if ( ! shardingState.enabled() ) return true; ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); - if ( ! info ){ + if ( ! info ) { // this means the client has nothing sharded // so this allows direct connections to do whatever they want // which i think is the correct behavior return true; } - - if ( info->inForceMode() ){ + + if ( info->inForceVersionOkMode() ) { return true; } - ConfigVersion version; - if ( ! shardingState.hasVersion( ns , version ) ){ + // TODO + // all collections at some point, be sharded or not, will have a version (and a ShardChunkManager) + // for now, we remove the sharding state of dropped collection + // so delayed request may come in. This has to be fixed. + ConfigVersion clientVersion = info->getVersion(ns); + ConfigVersion version; + if ( ! 
shardingState.hasVersion( ns , version ) && clientVersion == 0 ) { return true; } - ConfigVersion clientVersion = info->getVersion(ns); - if ( version == 0 && clientVersion > 0 ){ + if ( version == 0 && clientVersion > 0 ) { stringstream ss; ss << "collection was dropped or this shard no longer valied version: " << version << " clientVersion: " << clientVersion; errmsg = ss.str(); return false; } - + if ( clientVersion >= version ) return true; - - if ( clientVersion == 0 ){ + + if ( clientVersion == 0 ) { stringstream ss; ss << "client in sharded mode, but doesn't have version set for this collection: " << ns << " myVersion: " << version; errmsg = ss.str(); return false; } - if ( isWriteOp && version.majorVersion() == clientVersion.majorVersion() ){ - // this means there was just a split + if ( version.majorVersion() == clientVersion.majorVersion() ) { + // this means there was just a split // since on a split w/o a migrate this server is ok - // going to accept write + // going to accept return true; } @@ -578,51 +697,4 @@ namespace mongo { return false; } - // --- ChunkMatcher --- - - ChunkMatcher::ChunkMatcher( ConfigVersion version ) - : _version( version ){ - - } - - void ChunkMatcher::gotRange( const BSONObj& min , const BSONObj& max ){ - if (_key.isEmpty()){ - BSONObjBuilder b; - - BSONForEach(e, min) { - b.append(e.fieldName(), 1); - } - - _key = b.obj(); - } - - //TODO debug mode only? - assert(min.nFields() == _key.nFields()); - assert(max.nFields() == _key.nFields()); - - _map[min] = make_pair(min,max); - } - - bool ChunkMatcher::belongsToMe( const BSONObj& key , const DiskLoc& loc ) const { - if ( _map.size() == 0 ) - return false; - - BSONObj x = loc.obj().extractFields(_key); - - MyMap::const_iterator a = _map.upper_bound( x ); - if ( a != _map.begin() ) - a--; - - bool good = x.woCompare( a->second.first ) >= 0 && x.woCompare( a->second.second ) < 0; -#if 0 - if ( ! good ){ - cout << "bad: " << x << "\t" << a->second.first << "\t" << x.woCompare( a->second.first ) << "\t" << x.woCompare( a->second.second ) << endl; - for ( MyMap::const_iterator i=_map.begin(); i!=_map.end(); ++i ){ - cout << "\t" << i->first << "\t" << i->second.first << "\t" << i->second.second << endl; - } - } -#endif - return good; - } - } diff --git a/s/d_util.cpp b/s/d_util.cpp deleted file mode 100644 index a750fbc..0000000 --- a/s/d_util.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// util.cpp - -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see . 
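The final acceptance rule in shardVersionOk leans on the structure of a chunk version: per the comment above, an equal major component means no chunk migrated away since the client last refreshed, only splits happened, so the request can still be served here. A hedged sketch of that comparison, assuming the usual description of these timestamp-style versions as a 64-bit value with the major part in the high 32 bits; the packing and helper names are assumptions for illustration:

    #include <cstdint>

    // Chunk-style version assumed packed as (major << 32) | minor.
    using ChunkVersion = std::uint64_t;

    constexpr std::uint32_t majorOf(ChunkVersion v) { return static_cast<std::uint32_t>(v >> 32); }

    // true if a request carrying 'clientVersion' may run against a shard at 'shardVersion'
    inline bool versionOk(ChunkVersion shardVersion, ChunkVersion clientVersion) {
        if (clientVersion >= shardVersion)      // client is at least as new as the shard
            return true;
        if (clientVersion == 0)                 // client never set a version for this collection
            return false;
        // same major version: only splits happened since the client last looked,
        // so every document it targets is still owned by this shard
        return majorOf(shardVersion) == majorOf(clientVersion);
    }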
-*/ - - -/** - these are commands that live in mongod - mostly around shard management and checking - */ - -#include "pch.h" -#include "util.h" - -using namespace std; - -namespace mongo { - - bool checkShardVersion( DBClientBase & conn , const string& ns , bool authoritative , int tryNumber ){ - // no-op in mongod - return false; - } - - void resetShardVersion( DBClientBase * conn ){ - // no-op in mongod - } - -} diff --git a/s/d_writeback.cpp b/s/d_writeback.cpp index a18e5d5..401e0aa 100644 --- a/s/d_writeback.cpp +++ b/s/d_writeback.cpp @@ -19,62 +19,105 @@ #include "pch.h" #include "../db/commands.h" -#include "../db/jsobj.h" -#include "../db/dbmessage.h" -#include "../db/query.h" - -#include "../client/connpool.h" - #include "../util/queue.h" -#include "shard.h" +#include "d_writeback.h" using namespace std; namespace mongo { - map< string , BlockingQueue* > writebackQueue; - mongo::mutex writebackQueueLock("sharding:writebackQueueLock"); + // ---------- WriteBackManager class ---------- + + // TODO init at mongod startup + WriteBackManager writeBackManager; + + WriteBackManager::WriteBackManager() : _writebackQueueLock("sharding:writebackQueueLock") { + } + + WriteBackManager::~WriteBackManager() { + } + + void WriteBackManager::queueWriteBack( const string& remote , const BSONObj& o ) { + getWritebackQueue( remote )->push( o ); + } - BlockingQueue* getWritebackQueue( const string& remote ){ - scoped_lock lk (writebackQueueLock ); - BlockingQueue*& q = writebackQueue[remote]; + BlockingQueue* WriteBackManager::getWritebackQueue( const string& remote ) { + scoped_lock lk ( _writebackQueueLock ); + BlockingQueue*& q = _writebackQueues[remote]; if ( ! q ) q = new BlockingQueue(); return q; } - - void queueWriteBack( const string& remote , const BSONObj& o ){ - getWritebackQueue( remote )->push( o ); + + bool WriteBackManager::queuesEmpty() const { + scoped_lock lk( _writebackQueueLock ); + for ( WriteBackQueuesMap::const_iterator it = _writebackQueues.begin(); it != _writebackQueues.end(); ++it ) { + const BlockingQueue* queue = it->second; + if (! 
queue->empty() ) { + return false; + } + } + return true; } + // ---------- admin commands ---------- + // Note, this command will block until there is something to WriteBack class WriteBackCommand : public Command { public: - virtual LockType locktype() const { return NONE; } + virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } virtual bool adminOnly() const { return true; } - - WriteBackCommand() : Command( "writebacklisten" ){} + + WriteBackCommand() : Command( "writebacklisten" ) {} void help(stringstream& h) const { h<<"internal"; } - bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { BSONElement e = cmdObj.firstElement(); - if ( e.type() != jstOID ){ + if ( e.type() != jstOID ) { errmsg = "need oid as first value"; return 0; } - + + // get the command issuer's (a mongos) serverID const OID id = e.__oid(); - BSONObj z = getWritebackQueue(id.str())->blockingPop(); - log(1) << "WriteBackCommand got : " << z << endl; - - result.append( "data" , z ); - + + // the command issuer is blocked awaiting a response + // we want to do return at least at every 5 minutes so sockets don't timeout + BSONObj z; + if ( writeBackManager.getWritebackQueue(id.str())->blockingPop( z, 5 * 60 /* 5 minutes */ ) ) { + log(1) << "WriteBackCommand got : " << z << endl; + result.append( "data" , z ); + } + else { + result.appendBool( "noop" , true ); + } + return true; } } writeBackCommand; -} + class WriteBacksQueuedCommand : public Command { + public: + virtual LockType locktype() const { return NONE; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + + WriteBacksQueuedCommand() : Command( "writeBacksQueued" ) {} + + void help(stringstream& help) const { + help << "Returns whether there are operations in the writeback queue at the time the command was called. " + << "This is an internal comand"; + } + + bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + result.appendBool( "hasOpsQueued" , ! writeBackManager.queuesEmpty() ); + return true; + } + + } writeBacksQueuedCommand; + +} // namespace mongo diff --git a/s/d_writeback.h b/s/d_writeback.h new file mode 100644 index 0000000..32f5b1c --- /dev/null +++ b/s/d_writeback.h @@ -0,0 +1,75 @@ +// @file d_writeback.h + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "../pch.h" + +#include "../util/queue.h" + +namespace mongo { + + /* + * The WriteBackManager keeps one queue of pending operations per mongos. The operations get here + * if they were directed to a chunk that is no longer in this mongod server. The operations are + * "written back" to the mongos server per its request (command 'writebacklisten'). + * + * The class is thread safe. 
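The writebacklisten change above turns the listener into a bounded long poll: the mongos still blocks on its per-server queue, but blockingPop now takes a five-minute timeout so the command returns a noop before idle sockets time out. A self-contained sketch of a timed blocking pop built on std::condition_variable; this BlockingQueue is a simplified stand-in for the util/queue.h class used here:

    #include <chrono>
    #include <condition_variable>
    #include <mutex>
    #include <queue>

    template <typename T>
    class BlockingQueue {
    public:
        void push(T v) {
            {
                std::lock_guard<std::mutex> lk(_m);
                _q.push(std::move(v));
            }
            _cv.notify_one();
        }

        // Wait up to 'maxSeconds' for an element. Returns false on timeout so the
        // caller can send a "noop" reply and let the client re-issue the listen.
        bool blockingPop(T& out, int maxSeconds) {
            std::unique_lock<std::mutex> lk(_m);
            if (!_cv.wait_for(lk, std::chrono::seconds(maxSeconds),
                              [this] { return !_q.empty(); }))
                return false;                 // timed out, nothing queued
            out = std::move(_q.front());
            _q.pop();
            return true;
        }

        bool empty() const {
            std::lock_guard<std::mutex> lk(_m);
            return _q.empty();
        }
    private:
        mutable std::mutex _m;
        std::condition_variable _cv;
        std::queue<T> _q;
    };

A caller that gets false back simply replies with a noop marker and lets the client issue the next writebacklisten.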
+ */ + class WriteBackManager { + public: + WriteBackManager(); + ~WriteBackManager(); + + /* + * @param remote server ID this operation came from + * @param op the operation itself + * + * Enqueues opeartion 'op' in server 'remote's queue. The operation will be written back to + * remote at a later stager. + */ + void queueWriteBack( const string& remote , const BSONObj& op ); + + /* + * @param remote server ID + * @return the queue for operations that came from 'remote' + * + * Gets access to server 'remote's queue, which is synchronized. + */ + BlockingQueue* getWritebackQueue( const string& remote ); + + /* + * @return true if there is no operation queued for write back + */ + bool queuesEmpty() const; + + private: + // a map from mongos's serverIDs to queues of "rejected" operations + // an operation is rejected if it targets data that does not live on this shard anymore + typedef map< string , BlockingQueue* > WriteBackQueuesMap; + + // '_writebackQueueLock' protects only the map itself, since each queue is syncrhonized. + mutable mongo::mutex _writebackQueueLock; + WriteBackQueuesMap _writebackQueues; + + }; + + // TODO collect global state in a central place and init during startup + extern WriteBackManager writeBackManager; + +} // namespace mongo diff --git a/s/dbgrid.vcproj b/s/dbgrid.vcproj deleted file mode 100644 index 745d84e..0000000 --- a/s/dbgrid.vcproj +++ /dev/null @@ -1,1048 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/s/dbgrid.vcxproj b/s/dbgrid.vcxproj index 83fbf68..61a8458 100644 --- a/s/dbgrid.vcxproj +++ b/s/dbgrid.vcxproj @@ -85,6 +85,10 @@ + ..;$(IncludePath) + ..;$(IncludePath) + ..;$(IncludePath) + ..;$(IncludePath) @@ -183,20 +187,31 @@ + + + + + + NotUsing + + + + + @@ -212,6 +227,7 @@ + @@ -512,9 +528,11 @@ + + diff --git a/s/dbgrid.vcxproj.filters b/s/dbgrid.vcxproj.filters index bce75b4..b87a1f2 100755 --- a/s/dbgrid.vcxproj.filters +++ b/s/dbgrid.vcxproj.filters @@ -287,6 +287,39 @@ Shared Source Files + + Source Files + + + Shared Source Files + + + Shared Source Files + + + Source Files + + + Source Files + + + Source Files + + + Shared Source Files + + + Shared Source Files + + + client + + + Source Files + + + Shared Source Files + @@ -355,6 +388,9 @@ Header Files + + Shared Source Files + diff --git a/s/grid.cpp b/s/grid.cpp index 443cd9a..0045754 100644 --- a/s/grid.cpp +++ b/s/grid.cpp @@ -19,46 +19,47 @@ #include "pch.h" #include - #include "../client/connpool.h" #include "../util/stringutils.h" +#include "../util/unittest.h" #include "grid.h" #include "shard.h" namespace mongo { - - DBConfigPtr Grid::getDBConfig( string database , bool create , const string& shardNameHint ){ + + DBConfigPtr Grid::getDBConfig( string database , 
bool create , const string& shardNameHint ) { { string::size_type i = database.find( "." ); if ( i != string::npos ) database = database.substr( 0 , i ); } - + if ( database == "config" ) return configServerPtr; scoped_lock l( _lock ); DBConfigPtr& cc = _databases[database]; - if ( !cc ){ + if ( !cc ) { cc.reset(new DBConfig( database )); - if ( ! cc->load() ){ - if ( create ){ + if ( ! cc->load() ) { + if ( create ) { // note here that cc->primary == 0. log() << "couldn't find database [" << database << "] in config db" << endl; - - { // lets check case + + { + // lets check case ScopedDbConnection conn( configServer.modelServer() ); BSONObjBuilder b; b.appendRegex( "_id" , (string)"^" + database + "$" , "i" ); BSONObj d = conn->findOne( ShardNS::database , b.obj() ); conn.done(); - if ( ! d.isEmpty() ){ + if ( ! d.isEmpty() ) { cc.reset(); stringstream ss; - ss << "can't have 2 databases that just differ on case " + ss << "can't have 2 databases that just differ on case " << " have: " << d["_id"].String() << " want to add: " << database; @@ -67,20 +68,22 @@ namespace mongo { } Shard primary; - if ( database == "admin" ){ + if ( database == "admin" ) { primary = configServer.getPrimary(); - } else if ( shardNameHint.empty() ){ + } + else if ( shardNameHint.empty() ) { primary = Shard::pick(); - } else { + } + else { // use the shard name if provided Shard shard; shard.reset( shardNameHint ); primary = shard; } - if ( primary.ok() ){ + if ( primary.ok() ) { cc->setPrimary( primary.getName() ); // saves 'cc' to configDB log() << "\t put [" << database << "] on: " << primary << endl; } @@ -94,53 +97,63 @@ namespace mongo { cc.reset(); } } - + } - + return cc; } - void Grid::removeDB( string database ){ + void Grid::removeDB( string database ) { uassert( 10186 , "removeDB expects db name" , database.find( '.' ) == string::npos ); scoped_lock l( _lock ); _databases.erase( database ); - + } bool Grid::allowLocalHost() const { return _allowLocalShard; } - void Grid::setAllowLocalHost( bool allow ){ + void Grid::setAllowLocalHost( bool allow ) { _allowLocalShard = allow; } - bool Grid::addShard( string* name , const ConnectionString& servers , long long maxSize , string& errMsg ){ + bool Grid::addShard( string* name , const ConnectionString& servers , long long maxSize , string& errMsg ) { // name can be NULL, so privide a dummy one here to avoid testing it elsewhere string nameInternal; if ( ! name ) { name = &nameInternal; } - // Check whether the host (or set) exists and run several sanity checks on this request. + // Check whether the host (or set) exists and run several sanity checks on this request. // There are two set of sanity checks: making sure adding this particular shard is consistent - // with the replica set state (if it exists) and making sure this shards databases can be + // with the replica set state (if it exists) and making sure this shards databases can be // brought into the grid without conflict. vector dbNames; try { ScopedDbConnection newShardConn( servers ); newShardConn->getLastError(); - - if ( newShardConn->type() == ConnectionString::SYNC ){ + + if ( newShardConn->type() == ConnectionString::SYNC ) { newShardConn.done(); errMsg = "can't use sync cluster as a shard. 
for replica set, have to use /,,..."; return false; } + BSONObj resIsMongos; + bool ok = newShardConn->runCommand( "admin" , BSON( "isdbgrid" << 1 ) , resIsMongos ); + + // should return ok=0, cmd not found if it's a normal mongod + if ( ok ) { + errMsg = "can't add a mongos process as a shard"; + newShardConn.done(); + return false; + } + BSONObj resIsMaster; - bool ok = newShardConn->runCommand( "admin" , BSON( "isMaster" << 1 ) , resIsMaster ); - if ( !ok ){ + ok = newShardConn->runCommand( "admin" , BSON( "isMaster" << 1 ) , resIsMaster ); + if ( !ok ) { ostringstream ss; ss << "failed running isMaster: " << resIsMaster; errMsg = ss.str(); @@ -151,7 +164,7 @@ namespace mongo { // if the shard has only one host, make sure it is not part of a replica set string setName = resIsMaster["setName"].str(); string commandSetName = servers.getSetName(); - if ( commandSetName.empty() && ! setName.empty() ){ + if ( commandSetName.empty() && ! setName.empty() ) { ostringstream ss; ss << "host is part of set: " << setName << " use replica set url format /,,...."; errMsg = ss.str(); @@ -160,7 +173,7 @@ namespace mongo { } // if the shard is part of replica set, make sure it is the right one - if ( ! commandSetName.empty() && ( commandSetName != setName ) ){ + if ( ! commandSetName.empty() && ( commandSetName != setName ) ) { ostringstream ss; ss << "host is part of a different set: " << setName; errMsg = ss.str(); @@ -168,30 +181,39 @@ namespace mongo { return false; } - // if the shard is part of a replica set, make sure all the hosts mentioned in 'servers' are part of + // if the shard is part of a replica set, make sure all the hosts mentioned in 'servers' are part of // the set. It is fine if not all members of the set are present in 'servers'. bool foundAll = true; string offendingHost; - if ( ! commandSetName.empty() ){ + if ( ! commandSetName.empty() ) { set hostSet; BSONObjIterator iter( resIsMaster["hosts"].Obj() ); - while ( iter.more() ){ + while ( iter.more() ) { hostSet.insert( iter.next().String() ); // host:port } + if ( resIsMaster["passives"].isABSONObj() ) { + BSONObjIterator piter( resIsMaster["passives"].Obj() ); + while ( piter.more() ) { + hostSet.insert( piter.next().String() ); // host:port + } + } vector hosts = servers.getServers(); - for ( size_t i = 0 ; i < hosts.size() ; i++ ){ + for ( size_t i = 0 ; i < hosts.size() ; i++ ) { + if (!hosts[i].hasPort()) { + hosts[i].setPort(CmdLine::DefaultDBPort); + } string host = hosts[i].toString(); // host:port - if ( hostSet.find( host ) == hostSet.end() ){ + if ( hostSet.find( host ) == hostSet.end() ) { offendingHost = host; foundAll = false; break; } } } - if ( ! foundAll ){ + if ( ! foundAll ) { ostringstream ss; - ss << "host " << offendingHost << " does not belong to replica set " << setName;; + ss << "host " << offendingHost << " does not belong to replica set as a non-passive member" << setName;; errMsg = ss.str(); newShardConn.done(); return false; @@ -199,15 +221,15 @@ namespace mongo { // shard name defaults to the name of the replica set if ( name->empty() && ! setName.empty() ) - *name = setName; + *name = setName; - // In order to be accepted as a new shard, that mongod must not have any database name that exists already - // in any other shards. If that test passes, the new shard's databases are going to be entered as + // In order to be accepted as a new shard, that mongod must not have any database name that exists already + // in any other shards. 
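Before a replica set is accepted as a shard, the checks above gather the set's reported members, the hosts array plus passives, default the port on any user-supplied address that lacks one, and require every address from the command line to appear in that member set. A simplified sketch of the membership test; withDefaultPort and allHostsInSet are illustrative helpers, not the real isMaster handling:

    #include <set>
    #include <string>
    #include <vector>

    // Append the default port if the user-supplied host string does not carry one.
    inline std::string withDefaultPort(const std::string& host, int defaultPort = 27017) {
        if (host.find(':') != std::string::npos)
            return host;
        return host + ":" + std::to_string(defaultPort);
    }

    // 'members' would come from the isMaster reply: the "hosts" array plus "passives".
    // Returns true if every requested host is a known member; otherwise reports the offender.
    inline bool allHostsInSet(const std::vector<std::string>& requested,
                              const std::set<std::string>& members,
                              std::string& offendingHost) {
        for (const std::string& h : requested) {
            std::string hp = withDefaultPort(h);
            if (members.find(hp) == members.end()) {
                offendingHost = hp;
                return false;
            }
        }
        return true;
    }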
If that test passes, the new shard's databases are going to be entered as // non-sharded db's whose primary is the newly added shard. BSONObj resListDB; ok = newShardConn->runCommand( "admin" , BSON( "listDatabases" << 1 ) , resListDB ); - if ( !ok ){ + if ( !ok ) { ostringstream ss; ss << "failed listing " << servers.toString() << "'s databases:" << resListDB; errMsg = ss.str(); @@ -216,20 +238,21 @@ namespace mongo { } BSONObjIterator i( resListDB["databases"].Obj() ); - while ( i.more() ){ + while ( i.more() ) { BSONObj dbEntry = i.next().Obj(); const string& dbName = dbEntry["name"].String(); - if ( _isSpecialLocalDB( dbName ) ){ + if ( _isSpecialLocalDB( dbName ) ) { // 'local', 'admin', and 'config' are system DBs and should be excluded here continue; - } else { + } + else { dbNames.push_back( dbName ); } } newShardConn.done(); } - catch ( DBException& e ){ + catch ( DBException& e ) { ostringstream ss; ss << "couldn't connect to new shard "; ss << e.what(); @@ -238,9 +261,9 @@ namespace mongo { } // check that none of the existing shard candidate's db's exist elsewhere - for ( vector::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ){ + for ( vector::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ) { DBConfigPtr config = getDBConfig( *it , false ); - if ( config.get() != NULL ){ + if ( config.get() != NULL ) { ostringstream ss; ss << "can't add shard " << servers.toString() << " because a local database '" << *it; ss << "' exists in another " << config->getPrimary().toString(); @@ -250,26 +273,26 @@ namespace mongo { } // if a name for a shard wasn't provided, pick one. - if ( name->empty() && ! _getNewShardName( name ) ){ + if ( name->empty() && ! _getNewShardName( name ) ) { errMsg = "error generating new shard name"; return false; } - + // build the ConfigDB shard document BSONObjBuilder b; b.append( "_id" , *name ); b.append( "host" , servers.toString() ); - if ( maxSize > 0 ){ + if ( maxSize > 0 ) { b.append( ShardFields::maxSize.name() , maxSize ); } BSONObj shardDoc = b.obj(); { ScopedDbConnection conn( configServer.getPrimary() ); - + // check whether the set of hosts (or single host) is not an already a known shard BSONObj old = conn->findOne( ShardNS::shard , BSON( "host" << servers.toString() ) ); - if ( ! old.isEmpty() ){ + if ( ! old.isEmpty() ) { errMsg = "host already used"; conn.done(); return false; @@ -279,7 +302,7 @@ namespace mongo { conn->insert( ShardNS::shard , shardDoc ); errMsg = conn->getLastError(); - if ( ! errMsg.empty() ){ + if ( ! errMsg.empty() ) { log() << "error adding shard: " << shardDoc << " err: " << errMsg << endl; conn.done(); return false; @@ -291,37 +314,37 @@ namespace mongo { Shard::reloadShardInfo(); // add all databases of the new shard - for ( vector::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ){ + for ( vector::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ) { DBConfigPtr config = getDBConfig( *it , true , *name ); - if ( ! config ){ - log() << "adding shard " << servers << " even though could not add database " << *it << endl; + if ( ! config ) { + log() << "adding shard " << servers << " even though could not add database " << *it << endl; } } return true; } - - bool Grid::knowAboutShard( const string& name ) const{ + + bool Grid::knowAboutShard( const string& name ) const { ShardConnection conn( configServer.getPrimary() , "" ); BSONObj shard = conn->findOne( ShardNS::shard , BSON( "host" << name ) ); conn.done(); return ! 
shard.isEmpty(); } - bool Grid::_getNewShardName( string* name ) const{ + bool Grid::_getNewShardName( string* name ) const { DEV assert( name ); bool ok = false; - int count = 0; + int count = 0; ShardConnection conn( configServer.getPrimary() , "" ); - BSONObj o = conn->findOne( ShardNS::shard , Query( fromjson ( "{_id: /^shard/}" ) ).sort( BSON( "_id" << -1 ) ) ); + BSONObj o = conn->findOne( ShardNS::shard , Query( fromjson ( "{_id: /^shard/}" ) ).sort( BSON( "_id" << -1 ) ) ); if ( ! o.isEmpty() ) { string last = o["_id"].String(); istringstream is( last.substr( 5 ) ); is >> count; count++; - } + } if (count < 9999) { stringstream ss; ss << "shard" << setfill('0') << setw(4) << count; @@ -337,14 +360,75 @@ namespace mongo { ShardConnection conn( configServer.getPrimary() , "" ); // look for the stop balancer marker - BSONObj stopMarker = conn->findOne( ShardNS::settings, BSON( "_id" << "balancer" << "stopped" << true ) ); + BSONObj balancerDoc = conn->findOne( ShardNS::settings, BSON( "_id" << "balancer" ) ); conn.done(); - return stopMarker.isEmpty(); + + boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); + if ( _balancerStopped( balancerDoc ) || ! _inBalancingWindow( balancerDoc , now ) ) { + return false; + } + + return true; + } + + bool Grid::_balancerStopped( const BSONObj& balancerDoc ) { + // check the 'stopped' marker maker + // if present, it is a simple bool + BSONElement stoppedElem = balancerDoc["stopped"]; + if ( ! stoppedElem.eoo() && stoppedElem.isBoolean() ) { + return stoppedElem.boolean(); + } + return false; + } + + bool Grid::_inBalancingWindow( const BSONObj& balancerDoc , const boost::posix_time::ptime& now ) { + // check the 'activeWindow' marker + // if present, it is an interval during the day when the balancer should be active + // { start: "08:00" , stop: "19:30" }, strftime format is %H:%M + BSONElement windowElem = balancerDoc["activeWindow"]; + if ( windowElem.eoo() ) { + return true; + } + + // check if both 'start' and 'stop' are present + if ( ! windowElem.isABSONObj() ) { + log(1) << "'activeWindow' format is { start: \"hh:mm\" , stop: ... }" << balancerDoc << endl; + return true; + } + BSONObj intervalDoc = windowElem.Obj(); + const string start = intervalDoc["start"].str(); + const string stop = intervalDoc["stop"].str(); + if ( start.empty() || stop.empty() ) { + log(1) << "must specify both start and end of balancing window: " << intervalDoc << endl; + return true; + } + + // check that both 'start' and 'stop' are valid time-of-day + boost::posix_time::ptime startTime, stopTime; + if ( ! toPointInTime( start , &startTime ) || ! 
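_getNewShardName above derives the next automatic shard name by taking the highest existing "shardNNNN" id, parsing the numeric suffix after the "shard" prefix, incrementing it and re-padding to four digits, refusing once the counter reaches 9999. A small standalone sketch of that step, with the config lookup replaced by a plain string argument:

    #include <iomanip>
    #include <sstream>
    #include <string>

    // 'last' is the highest existing id, e.g. "shard0007" (assumed to carry the
    // "shard" prefix), or empty if no shard has been auto-named yet.
    // Returns the next name, or an empty string once the 4-digit space runs out.
    inline std::string nextShardName(const std::string& last) {
        int count = 0;
        if (!last.empty()) {
            std::istringstream is(last.substr(5));   // skip the "shard" prefix
            is >> count;
            ++count;
        }
        if (count >= 9999)                           // mirrors the original count < 9999 guard
            return "";
        std::ostringstream ss;
        ss << "shard" << std::setfill('0') << std::setw(4) << count;
        return ss.str();                             // e.g. "shard0008"
    }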
toPointInTime( stop , &stopTime ) ) { + log(1) << "cannot parse active window (use hh:mm 24hs format): " << intervalDoc << endl; + return true; + } + + // allow balancing if during the activeWindow + // note that a window may be open during the night + if ( stopTime > startTime ) { + if ( ( now >= startTime ) && ( now <= stopTime ) ) { + return true; + } + } + else if ( startTime > stopTime ) { + if ( ( now >=startTime ) || ( now <= stopTime ) ) { + return true; + } + } + + return false; } unsigned long long Grid::getNextOpTime() const { ScopedDbConnection conn( configServer.getPrimary() ); - + BSONObj result; massert( 10421 , "getoptime failed" , conn->simpleCommand( "admin" , &result , "getoptime" ) ); conn.done(); @@ -352,10 +436,51 @@ namespace mongo { return result["optime"]._numberLong(); } - bool Grid::_isSpecialLocalDB( const string& dbName ){ + bool Grid::_isSpecialLocalDB( const string& dbName ) { return ( dbName == "local" ) || ( dbName == "admin" ) || ( dbName == "config" ); } Grid grid; -} + // unit tests + + class BalancingWindowUnitTest : public UnitTest { + public: + void run() { + // T0 < T1 < now < T2 < T3 and Error + const string T0 = "9:00"; + const string T1 = "11:00"; + boost::posix_time::ptime now( currentDate(), boost::posix_time::hours( 13 ) + boost::posix_time::minutes( 48 ) ); + const string T2 = "17:00"; + const string T3 = "21:30"; + const string E = "28:35"; + + BSONObj w1 = BSON( "activeWindow" << BSON( "start" << T0 << "stop" << T1 ) ); // closed in the past + BSONObj w2 = BSON( "activeWindow" << BSON( "start" << T2 << "stop" << T3 ) ); // not opened until the future + BSONObj w3 = BSON( "activeWindow" << BSON( "start" << T1 << "stop" << T2 ) ); // open now + BSONObj w4 = BSON( "activeWindow" << BSON( "start" << T3 << "stop" << T2 ) ); // open since last day + + assert( ! Grid::_inBalancingWindow( w1 , now ) ); + assert( ! Grid::_inBalancingWindow( w2 , now ) ); + assert( Grid::_inBalancingWindow( w3 , now ) ); + assert( Grid::_inBalancingWindow( w4 , now ) ); + + // bad input should not stop the balancer + + BSONObj w5; // empty window + BSONObj w6 = BSON( "activeWindow" << BSON( "start" << 1 ) ); // missing stop + BSONObj w7 = BSON( "activeWindow" << BSON( "stop" << 1 ) ); // missing start + BSONObj w8 = BSON( "wrongMarker" << 1 << "start" << 1 << "stop" << 1 ); // active window marker missing + BSONObj w9 = BSON( "activeWindow" << BSON( "start" << T3 << "stop" << E ) ); // garbage in window + + assert( Grid::_inBalancingWindow( w5 , now ) ); + assert( Grid::_inBalancingWindow( w6 , now ) ); + assert( Grid::_inBalancingWindow( w7 , now ) ); + assert( Grid::_inBalancingWindow( w8 , now ) ); + assert( Grid::_inBalancingWindow( w9 , now ) ); + + log(1) << "BalancingWidowObjTest passed" << endl; + } + } BalancingWindowObjTest; + +} diff --git a/s/grid.h b/s/grid.h index 4f3c2ac..5692a82 100644 --- a/s/grid.h +++ b/s/grid.h @@ -18,6 +18,9 @@ #pragma once +#include + +#include "../util/time_support.h" #include "../util/concurrency/mutex.h" #include "config.h" // DBConfigPtr @@ -37,7 +40,7 @@ namespace mongo { * will return an empty DBConfig if not in db already */ DBConfigPtr getDBConfig( string ns , bool create=true , const string& shardNameHint="" ); - + /** * removes db entry. * on next getDBConfig call will fetch from db @@ -57,14 +60,14 @@ namespace mongo { /** * * addShard will create a new shard in the grid. It expects a mongod process to be runing - * on the provided address. Adding a shard that is a replica set is supported. + * on the provided address. 
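The activeWindow check above has to cope with a window that wraps past midnight: when stop is later than start the balancer runs inside [start, stop], and when start is later than stop it runs overnight, outside [stop, start]; unparsable input deliberately leaves the balancer running. A minimal sketch of that comparison using minutes since midnight instead of boost ptime; parseHHMM is an illustrative helper, not the real toPointInTime:

    #include <cstdio>
    #include <string>

    // Parse "hh:mm" (24h clock) into minutes since midnight; returns -1 on bad input.
    inline int parseHHMM(const std::string& s) {
        int h = -1, m = -1;
        if (std::sscanf(s.c_str(), "%d:%d", &h, &m) != 2)
            return -1;
        if (h < 0 || h > 23 || m < 0 || m > 59)
            return -1;
        return h * 60 + m;
    }

    // true if 'nowMin' (minutes since midnight) falls inside the window [start, stop],
    // where the window may wrap around midnight (e.g. start 23:00, stop 06:00).
    inline bool inWindow(int nowMin, const std::string& start, const std::string& stop) {
        int s = parseHHMM(start), e = parseHHMM(stop);
        if (s < 0 || e < 0)
            return true;                          // unparsable window must not stop the balancer
        if (e > s)
            return nowMin >= s && nowMin <= e;    // plain daytime window
        if (s > e)
            return nowMin >= s || nowMin <= e;    // window open overnight
        return true;                              // start == stop: treat as always open
    }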
Adding a shard that is a replica set is supported. * * @param name is an optional string with the name of the shard. if ommited, grid will - * generate one and update the parameter. + * generate one and update the parameter. * @param servers is the connection string of the shard being added * @param maxSize is the optional space quota in bytes. Zeros means there's no limitation to - * space usage - * @param errMsg is the error description in case the operation failed. + * space usage + * @param errMsg is the error description in case the operation failed. * @return true if shard was successfully added. */ bool addShard( string* name , const ConnectionString& servers , long long maxSize , string& errMsg ); @@ -73,7 +76,7 @@ namespace mongo { * @return true if the config database knows about a host 'name' */ bool knowAboutShard( const string& name ) const; - + /** * @return true if the chunk balancing functionality is enabled */ @@ -81,6 +84,15 @@ namespace mongo { unsigned long long getNextOpTime() const; + // exposed methods below are for testing only + + /** + * @param balancerDoc bson that may contain a window of time for the balancer to work + * format { ... , activeWindow: { start: "8:30" , stop: "19:00" } , ... } + * @return true if there is no window of time specified for the balancer or it we're currently in it + */ + static bool _inBalancingWindow( const BSONObj& balancerDoc , const boost::posix_time::ptime& now ); + private: mongo::mutex _lock; // protects _databases; TODO: change to r/w lock ?? map _databases; // maps ns to DBConfig's @@ -89,7 +101,7 @@ namespace mongo { /** * @param name is the chose name for the shard. Parameter is mandatory. * @return true if it managed to generate a shard name. May return false if (currently) - * 10000 shard + * 10000 shard */ bool _getNewShardName( string* name ) const; @@ -98,6 +110,13 @@ namespace mongo { */ static bool _isSpecialLocalDB( const string& dbName ); + /** + * @param balancerDoc bson that may contain a marker to stop the balancer + * format { ... , stopped: [ "true" | "false" ] , ... } + * @return true if the marker is present and is set to true + */ + static bool _balancerStopped( const BSONObj& balancerDoc ); + }; extern Grid grid; diff --git a/s/request.cpp b/s/request.cpp index ec245d7..52f2e54 100644 --- a/s/request.cpp +++ b/s/request.cpp @@ -1,7 +1,4 @@ -/* dbgrid/request.cpp - - Top level handling of requests (operations such as query, insert, ...) -*/ +// s/request.cpp /** * Copyright (C) 2008 10gen Inc. @@ -34,53 +31,56 @@ #include "stats.h" #include "cursors.h" #include "grid.h" +#include "client.h" namespace mongo { - Request::Request( Message& m, AbstractMessagingPort* p ) : - _m(m) , _d( m ) , _p(p) , _didInit(false){ - + Request::Request( Message& m, AbstractMessagingPort* p ) : + _m(m) , _d( m ) , _p(p) , _didInit(false) { + assert( _d.getns() ); _id = _m.header()->id; - + _clientId = p ? p->getClientId() : 0; _clientInfo = ClientInfo::get( _clientId ); _clientInfo->newRequest( p ); - + } - - void Request::init(){ + + void Request::init() { if ( _didInit ) return; _didInit = true; reset(); } - - void Request::reset( bool reload ){ - if ( _m.operation() == dbKillCursors ){ + + void Request::reset( bool reload ) { + if ( _m.operation() == dbKillCursors ) { return; } - + + uassert( 13644 , "can't use 'local' database through mongos" , ! str::startsWith( getns() , "local." ) ); + _config = grid.getDBConfig( getns() ); if ( reload ) uassert( 10192 , "db config reload failed!" 
, _config->reload() ); - if ( _config->isSharded( getns() ) ){ + if ( _config->isSharded( getns() ) ) { _chunkManager = _config->getChunkManager( getns() , reload ); uassert( 10193 , (string)"no shard info for: " + getns() , _chunkManager ); } else { _chunkManager.reset(); - } + } _m.header()->id = _id; - + } - + Shard Request::primaryShard() const { assert( _didInit ); - - if ( _chunkManager ){ + + if ( _chunkManager ) { if ( _chunkManager->numChunks() > 1 ) throw UserException( 8060 , "can't call primaryShard on a sharded collection" ); return _chunkManager->findChunk( _chunkManager->getShardKey().globalMin() )->getShard(); @@ -89,26 +89,26 @@ namespace mongo { uassert( 10194 , "can't call primaryShard on a sharded collection!" , s.ok() ); return s; } - - void Request::process( int attempt ){ + + void Request::process( int attempt ) { init(); int op = _m.operation(); assert( op > dbMsg ); - - if ( op == dbKillCursors ){ + + if ( op == dbKillCursors ) { cursorCache.gotKillCursors( _m ); return; } - + log(3) << "Request::process ns: " << getns() << " msg id:" << (int)(_m.header()->id) << " attempt: " << attempt << endl; - + Strategy * s = SINGLE; _counter = &opsNonSharded; - + _d.markSet(); - - if ( _chunkManager ){ + + if ( _chunkManager ) { s = SHARDED; _counter = &opsSharded; } @@ -119,7 +119,7 @@ namespace mongo { try { s->queryOp( *this ); } - catch ( StaleConfigException& staleConfig ){ + catch ( StaleConfigException& staleConfig ) { log() << staleConfig.what() << " attempt: " << attempt << endl; uassert( 10195 , "too many attempts to update config, failing" , attempt < 5 ); ShardConnection::checkMyConnectionVersions( getns() ); @@ -141,115 +141,31 @@ namespace mongo { globalOpCounters.gotOp( op , iscmd ); _counter->gotOp( op , iscmd ); } - + bool Request::isCommand() const { int x = _d.getQueryNToReturn(); return ( x == 1 || x == -1 ) && strstr( getns() , ".$cmd" ); } - void Request::gotInsert(){ + void Request::gotInsert() { globalOpCounters.gotInsert(); _counter->gotInsert(); } - void Request::reply( Message & response , const string& fromServer ){ + void Request::reply( Message & response , const string& fromServer ) { assert( _didInit ); long long cursor =response.header()->getCursor(); - if ( cursor ){ - cursorCache.storeRef( fromServer , cursor ); - } - _p->reply( _m , response , _id ); - } - - ClientInfo::ClientInfo( int clientId ) : _id( clientId ){ - _cur = &_a; - _prev = &_b; - newRequest(); - } - - ClientInfo::~ClientInfo(){ - if ( _lastAccess ){ - scoped_lock lk( _clientsLock ); - ClientCache::iterator i = _clients.find( _id ); - if ( i != _clients.end() ){ - _clients.erase( i ); + if ( cursor ) { + if ( fromServer.size() ) { + cursorCache.storeRef( fromServer , cursor ); } - } - } - - void ClientInfo::addShard( const string& shard ){ - _cur->insert( shard ); - _sinceLastGetError.insert( shard ); - } - - void ClientInfo::newRequest( AbstractMessagingPort* p ){ - - if ( p ){ - string r = p->remote().toString(); - if ( _remote == "" ) - _remote = r; - else if ( _remote != r ){ - stringstream ss; - ss << "remotes don't match old [" << _remote << "] new [" << r << "]"; - throw UserException( 13134 , ss.str() ); + else { + // probably a getMore + // make sure we have a ref for this + assert( cursorCache.getRef( cursor ).size() ); } } - - _lastAccess = (int) time(0); - - set * temp = _cur; - _cur = _prev; - _prev = temp; - _cur->clear(); - } - - void ClientInfo::disconnect(){ - _lastAccess = 0; - } - - ClientInfo * ClientInfo::get( int clientId , bool create ){ - - if ( ! 
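Request::process above carries an attempt counter: a StaleConfigException coming back from a shard triggers a refresh of the connection's chunk-version view (ShardConnection::checkMyConnectionVersions) so the request can be re-run against current metadata, and the uassert gives up after five attempts. A generic sketch of that bounded retry-on-stale-metadata loop; StaleConfigError, refreshMetadata and runOnce are illustrative stand-ins:

    #include <stdexcept>

    struct StaleConfigError : std::runtime_error {
        using std::runtime_error::runtime_error;
    };

    // Run 'runOnce'; if the shard reports stale config metadata, refresh and try again,
    // but never more than 'maxAttempts' times in total.
    template <typename Op, typename Refresh>
    void runWithStaleRetry(Op runOnce, Refresh refreshMetadata, int maxAttempts = 5) {
        for (int attempt = 0; ; ++attempt) {
            try {
                runOnce();
                return;
            }
            catch (const StaleConfigError&) {
                if (attempt + 1 >= maxAttempts)
                    throw;                 // "too many attempts to update config, failing"
                refreshMetadata();         // analogous to checkMyConnectionVersions
            }
        }
    }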
clientId ) - clientId = getClientId(); - - if ( ! clientId ){ - ClientInfo * info = _tlInfo.get(); - if ( ! info ){ - info = new ClientInfo( 0 ); - _tlInfo.reset( info ); - } - info->newRequest(); - return info; - } - - scoped_lock lk( _clientsLock ); - ClientCache::iterator i = _clients.find( clientId ); - if ( i != _clients.end() ) - return i->second; - if ( ! create ) - return 0; - ClientInfo * info = new ClientInfo( clientId ); - _clients[clientId] = info; - return info; - } - - void ClientInfo::disconnect( int clientId ){ - if ( ! clientId ) - return; - - scoped_lock lk( _clientsLock ); - ClientCache::iterator i = _clients.find( clientId ); - if ( i == _clients.end() ) - return; - - ClientInfo* ci = i->second; - ci->disconnect(); - delete ci; - _clients.erase( i ); + _p->reply( _m , response , _id ); } - ClientCache& ClientInfo::_clients = *(new ClientCache()); - mongo::mutex ClientInfo::_clientsLock("_clientsLock"); - boost::thread_specific_ptr ClientInfo::_tlInfo; - } // namespace mongo diff --git a/s/request.h b/s/request.h index f063d0c..5b4c228 100644 --- a/s/request.h +++ b/s/request.h @@ -26,16 +26,16 @@ namespace mongo { - + class OpCounters; class ClientInfo; - + class Request : boost::noncopyable { public: Request( Message& m, AbstractMessagingPort* p ); // ---- message info ----- - + const char * getns() const { return _d.getns(); @@ -60,12 +60,12 @@ namespace mongo { assert( _didInit ); return _config->isShardingEnabled(); } - + ChunkManagerPtr getChunkManager() const { assert( _didInit ); return _chunkManager; } - + int getClientId() const { return _clientId; } @@ -74,14 +74,14 @@ namespace mongo { } // ---- remote location info ----- - - + + Shard primaryShard() const ; - + // ---- low level access ---- void reply( Message & response , const string& fromServer ); - + Message& m() { return _m; } DbMessage& d() { return _d; } AbstractMessagingPort* p() const { return _p; } @@ -93,16 +93,16 @@ namespace mongo { void init(); void reset( bool reload=false ); - + private: Message& _m; DbMessage _d; AbstractMessagingPort* _p; - + MSGID _id; DBConfigPtr _config; ChunkManagerPtr _chunkManager; - + int _clientId; ClientInfo * _clientInfo; @@ -111,45 +111,6 @@ namespace mongo { bool _didInit; }; - typedef map ClientCache; - - class ClientInfo { - public: - ClientInfo( int clientId ); - ~ClientInfo(); - - string getRemote() const { return _remote; } - - void addShard( const string& shard ); - set * getPrev() const { return _prev; }; - - void newRequest( AbstractMessagingPort* p = 0 ); - void disconnect(); - - static ClientInfo * get( int clientId = 0 , bool create = true ); - static void disconnect( int clientId ); - - const set& sinceLastGetError() const { return _sinceLastGetError; } - void clearSinceLastGetError(){ - _sinceLastGetError.clear(); - } - - private: - int _id; - string _remote; - - set _a; - set _b; - set * _cur; - set * _prev; - int _lastAccess; - - set _sinceLastGetError; - - static mongo::mutex _clientsLock; - static ClientCache& _clients; - static boost::thread_specific_ptr _tlInfo; - }; } #include "strategy.h" diff --git a/s/s_only.cpp b/s/s_only.cpp index 1f66e70..83bceac 100644 --- a/s/s_only.cpp +++ b/s/s_only.cpp @@ -16,6 +16,8 @@ */ #include "pch.h" +#include "request.h" +#include "client.h" #include "../client/dbclient.h" #include "../db/dbhelpers.h" #include "../db/matcher.h" @@ -27,53 +29,54 @@ */ namespace mongo { - auto_ptr Helpers::find( const char *ns , BSONObj query , bool requireIndex ){ - uassert( 10196 , "Helpers::find can't be used in mongos" , 0 
); - auto_ptr i; - return i; - } - boost::thread_specific_ptr currentClient; - Client::Client(const char *desc , MessagingPort *p) : - _context(0), - _shutdown(false), - _desc(desc), - _god(0), - _lastOp(0), - _mp(p) - { + Client::Client(const char *desc , MessagingPort *p) : + _context(0), + _shutdown(false), + _desc(desc), + _god(0), + _lastOp(0), + _mp(p) { } - Client::~Client(){} - bool Client::shutdown(){ return true; } + Client::~Client() {} + bool Client::shutdown() { return true; } - bool webHaveAdminUsers(){ - return false; + Client& Client::initThread(const char *desc, MessagingPort *mp) { + setThreadName(desc); + assert( currentClient.get() == 0 ); + Client *c = new Client(desc, mp); + currentClient.reset(c); + mongo::lastError.initThread(); + return *c; } - BSONObj webGetAdminUser( const string& username ){ - return BSONObj(); + string Client::clientAddress(bool includePort) const { + ClientInfo * ci = ClientInfo::get(); + if ( ci ) + return ci->getRemote(); + return ""; } - + bool execCommand( Command * c , - Client& client , int queryOptions , - const char *ns, BSONObj& cmdObj , - BSONObjBuilder& result, - bool fromRepl ){ + Client& client , int queryOptions , + const char *ns, BSONObj& cmdObj , + BSONObjBuilder& result, + bool fromRepl ) { assert(c); - + string dbname = nsToDatabase( ns ); - - if ( cmdObj["help"].trueValue() ){ + + if ( cmdObj["help"].trueValue() ) { stringstream ss; ss << "help for: " << c->name << " "; c->help( ss ); result.append( "help" , ss.str() ); result.append( "lockType" , c->locktype() ); return true; - } + } - if ( c->adminOnly() ){ + if ( c->adminOnly() ) { if ( dbname != "admin" ) { result.append( "errmsg" , "access denied- use admin db" ); log() << "command denied: " << cmdObj.toString() << endl; diff --git a/s/server.cpp b/s/server.cpp index c3dc24c..9bdeede 100644 --- a/s/server.cpp +++ b/s/server.cpp @@ -23,37 +23,41 @@ #include "../util/message_server.h" #include "../util/stringutils.h" #include "../util/version.h" +#include "../util/signal_handlers.h" +#include "../util/admin_access.h" #include "../db/dbwebserver.h" #include "server.h" #include "request.h" +#include "client.h" #include "config.h" #include "chunk.h" #include "balance.h" #include "grid.h" #include "cursors.h" +#include "shard_version.h" namespace mongo { - - CmdLine cmdLine; + + CmdLine cmdLine; Database *database = 0; string mongosCommand; bool dbexitCalled = false; - bool inShutdown(){ + bool inShutdown() { return dbexitCalled; } - + string getDbContext() { return "?"; } - bool haveLocalShardingInfo( const string& ns ){ + bool haveLocalShardingInfo( const string& ns ) { assert( 0 ); return false; } - - void usage( char * argv[] ){ + + void usage( char * argv[] ) { out() << argv[0] << " usage:\n\n"; out() << " -v+ verbose 1: general 2: more 3: per request 4: more\n"; out() << " --port \n"; @@ -64,23 +68,23 @@ namespace mongo { class ShardingConnectionHook : public DBConnectionHook { public: - virtual void onHandedOut( DBClientBase * conn ){ + virtual void onHandedOut( DBClientBase * conn ) { ClientInfo::get()->addShard( conn->getServerAddress() ); } } shardingConnectionHook; - + class ShardedMessageHandler : public MessageHandler { public: - virtual ~ShardedMessageHandler(){} + virtual ~ShardedMessageHandler() {} - virtual void process( Message& m , AbstractMessagingPort* p ){ + virtual void process( Message& m , AbstractMessagingPort* p ) { assert( p ); Request r( m , p ); LastError * le = lastError.startRequest( m , r.getClientId() ); assert( le ); - - if ( logLevel > 5 
){ + + if ( logLevel > 5 ) { log(5) << "client id: " << hex << r.getClientId() << "\t" << r.getns() << "\t" << dec << r.op() << endl; } try { @@ -88,43 +92,67 @@ namespace mongo { setClientId( r.getClientId() ); r.process(); } - catch ( DBException& e ){ + catch ( AssertionException & e ) { + log( e.isUserAssertion() ? 1 : 0 ) << "AssertionException in process: " << e.what() << endl; + + le->raiseError( e.getCode() , e.what() ); + + m.header()->id = r.id(); + + if ( r.expectResponse() ) { + BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() ); + replyToQuery( ResultFlag_ErrSet, p , m , err ); + } + } + catch ( DBException& e ) { log() << "DBException in process: " << e.what() << endl; - + le->raiseError( e.getCode() , e.what() ); - + m.header()->id = r.id(); - - if ( r.expectResponse() ){ + + if ( r.expectResponse() ) { BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() ); replyToQuery( ResultFlag_ErrSet, p , m , err ); } } } - virtual void disconnected( AbstractMessagingPort* p ){ + virtual void disconnected( AbstractMessagingPort* p ) { ClientInfo::disconnect( p->getClientId() ); lastError.disconnect( p->getClientId() ); } }; - void sighandler(int sig){ + void sighandler(int sig) { dbexit(EXIT_CLEAN, (string("received signal ") + BSONObjBuilder::numStr(sig)).c_str()); } - - void setupSignals(){ + + void setupSignals( bool inFork ) { signal(SIGTERM, sighandler); signal(SIGINT, sighandler); + +#if defined(SIGQUIT) + signal( SIGQUIT , printStackAndExit ); +#endif + signal( SIGSEGV , printStackAndExit ); + signal( SIGABRT , printStackAndExit ); + signal( SIGFPE , printStackAndExit ); +#if defined(SIGBUS) + signal( SIGBUS , printStackAndExit ); +#endif } - void init(){ + void init() { serverID.init(); setupSIGTRAPforGDB(); setupCoreSignals(); - setupSignals(); + setupSignals( false ); } - void start( const MessageServer::Options& opts ){ + void start( const MessageServer::Options& opts ) { + setThreadName( "mongosMain" ); + installChunkShardVersioning(); balancer.go(); cursorCache.startTimeoutThread(); @@ -137,12 +165,12 @@ namespace mongo { server->run(); } - DBClientBase *createDirectClient(){ + DBClientBase *createDirectClient() { uassert( 10197 , "createDirectClient not implemented for sharding yet" , 0 ); return 0; } - void printShardingVersionInfo(){ + void printShardingVersionInfo() { log() << mongosCommand << " " << mongodVersion() << " starting (--help for usage)" << endl; printGitVersion(); printSysInfo(); @@ -156,91 +184,108 @@ using namespace mongo; namespace po = boost::program_options; -int main(int argc, char* argv[], char *envp[] ) { +int _main(int argc, char* argv[]) { static StaticObserver staticObserver; mongosCommand = argv[0]; - po::options_description options("Sharding options"); + po::options_description options("General options"); + po::options_description sharding_options("Sharding options"); po::options_description hidden("Hidden options"); po::positional_options_description positional; - + CmdLine::addGlobalOptions( options , hidden ); - - options.add_options() - ( "configdb" , po::value() , "1 or 3 comma separated config servers" ) - ( "test" , "just run unit tests" ) - ( "upgrade" , "upgrade meta data version" ) - ( "chunkSize" , po::value(), "maximum amount of data per chunk" ) - ( "ipv6", "enable IPv6 support (disabled by default)" ) - ; - + sharding_options.add_options() + ( "configdb" , po::value() , "1 or 3 comma separated config servers" ) + ( "test" , "just run unit tests" ) + ( "upgrade" , "upgrade meta data version" ) + ( 
"chunkSize" , po::value(), "maximum amount of data per chunk" ) + ( "ipv6", "enable IPv6 support (disabled by default)" ) + ( "jsonp","allow JSONP access via http (has security implications)" ) + ; + + options.add(sharding_options); // parse options po::variables_map params; if ( ! CmdLine::store( argc , argv , options , hidden , positional , params ) ) return 0; - - if ( params.count( "help" ) ){ + + // The default value may vary depending on compile options, but for mongos + // we want durability to be disabled. + cmdLine.dur = false; + + if ( params.count( "help" ) ) { cout << options << endl; return 0; } - if ( params.count( "version" ) ){ + if ( params.count( "version" ) ) { printShardingVersionInfo(); return 0; } - if ( params.count( "chunkSize" ) ){ + if ( params.count( "chunkSize" ) ) { Chunk::MaxChunkSize = params["chunkSize"].as() * 1024 * 1024; } - if ( params.count( "ipv6" ) ){ + if ( params.count( "ipv6" ) ) { enableIPv6(); } - if ( params.count( "test" ) ){ + if ( params.count( "jsonp" ) ) { + cmdLine.jsonp = true; + } + + if ( params.count( "test" ) ) { logLevel = 5; UnitTest::runTests(); cout << "tests passed" << endl; return 0; } - - if ( ! params.count( "configdb" ) ){ + + if ( ! params.count( "configdb" ) ) { out() << "error: no args for --configdb" << endl; return 4; } vector configdbs; splitStringDelim( params["configdb"].as() , &configdbs , ',' ); - if ( configdbs.size() != 1 && configdbs.size() != 3 ){ + if ( configdbs.size() != 1 && configdbs.size() != 3 ) { out() << "need either 1 or 3 configdbs" << endl; return 5; } // we either have a seeting were all process are in localhost or none is - for ( vector::const_iterator it = configdbs.begin() ; it != configdbs.end() ; ++it ){ + for ( vector::const_iterator it = configdbs.begin() ; it != configdbs.end() ; ++it ) { try { HostAndPort configAddr( *it ); // will throw if address format is invalid - if ( it == configdbs.begin() ){ + if ( it == configdbs.begin() ) { grid.setAllowLocalHost( configAddr.isLocalHost() ); } - if ( configAddr.isLocalHost() != grid.allowLocalHost() ){ + if ( configAddr.isLocalHost() != grid.allowLocalHost() ) { out() << "cannot mix localhost and ip addresses in configdbs" << endl; return 10; } - } + } catch ( DBException& e) { out() << "configdb: " << e.what() << endl; return 9; } } + // set some global state + pool.addHook( &shardingConnectionHook ); + pool.setName( "mongos connectionpool" ); + + DBClientConnection::setLazyKillCursor( false ); + ReplicaSetMonitor::setConfigChangeHook( boost::bind( &ConfigServer::replicaSetChange , &configServer , _1 ) ); + if ( argc <= 1 ) { usage( argv ); return 3; @@ -252,22 +297,22 @@ int main(int argc, char* argv[], char *envp[] ) { usage( argv ); return 1; } - + printShardingVersionInfo(); - - if ( ! configServer.init( configdbs ) ){ + + if ( ! configServer.init( configdbs ) ) { cout << "couldn't resolve config db address" << endl; return 7; } - - if ( ! configServer.ok( true ) ){ + + if ( ! configServer.ok( true ) ) { cout << "configServer startup check failed" << endl; return 8; } - + int configError = configServer.checkConfigVersion( params.count( "upgrade" ) ); - if ( configError ){ - if ( configError > 0 ){ + if ( configError ) { + if ( configError > 0 ) { cout << "upgrade success!" 
<< endl; } else { @@ -279,8 +324,8 @@ int main(int argc, char* argv[], char *envp[] ) { init(); - boost::thread web( webServerThread ); - + boost::thread web( boost::bind(&webServerThread, new NoAdminAccess() /* takes ownership */) ); + MessageServer::Options opts; opts.port = cmdLine.port; opts.ipList = cmdLine.bind_ip; @@ -289,10 +334,30 @@ int main(int argc, char* argv[], char *envp[] ) { dbexit( EXIT_CLEAN ); return 0; } +int main(int argc, char* argv[]) { + try { + return _main(argc, argv); + } + catch(DBException& e) { + cout << "uncaught exception in mongos main:" << endl; + cout << e.toString() << endl; + } + catch(std::exception& e) { + cout << "uncaught exception in mongos main:" << endl; + cout << e.what() << endl; + } + catch(...) { + cout << "uncaught exception in mongos main" << endl; + } + return 20; +} #undef exit -void mongo::dbexit( ExitCode rc, const char *why) { +void mongo::dbexit( ExitCode rc, const char *why, bool tryToGetLock ) { dbexitCalled = true; - log() << "dbexit: " << why << " rc:" << rc << endl; + log() << "dbexit: " << why + << " rc:" << rc + << " " << ( why ? why : "" ) + << endl; ::exit(rc); } diff --git a/s/server.h b/s/server.h index c45d77d..1a5c9ea 100644 --- a/s/server.h +++ b/s/server.h @@ -21,9 +21,9 @@ #include "../db/jsobj.h" namespace mongo { - + extern OID serverID; - + // from request.cpp void processRequest(Message& m, MessagingPort& p); } diff --git a/s/shard.cpp b/s/shard.cpp index 4d73a66..dbfd8f9 100644 --- a/s/shard.cpp +++ b/s/shard.cpp @@ -1,147 +1,155 @@ // shard.cpp /** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see . -*/ + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ #include "pch.h" #include "shard.h" #include "config.h" #include "request.h" +#include "../db/commands.h" #include namespace mongo { - + class StaticShardInfo { public: StaticShardInfo() : _mutex("StaticShardInfo") { } - void reload(){ + void reload() { list all; { ScopedDbConnection conn( configServer.getPrimary() ); auto_ptr c = conn->query( ShardNS::shard , Query() ); - assert( c.get() ); - while ( c->more() ){ + massert( 13632 , "couldn't get updated shard list from config server" , c.get() ); + while ( c->more() ) { all.push_back( c->next().getOwned() ); } conn.done(); } - + scoped_lock lk( _mutex ); - + // We use the _lookup table for all shards and for the primary config DB. The config DB info, // however, does not come from the ShardNS::shard. 
So when cleaning the _lookup table we leave // the config state intact. The rationale is that this way we could drop shards that // were removed without reinitializing the config DB information. map::iterator i = _lookup.find( "config" ); - if ( i != _lookup.end() ){ + if ( i != _lookup.end() ) { Shard config = i->second; _lookup.clear(); _lookup[ "config" ] = config; - } else { + } + else { _lookup.clear(); } - for ( list::iterator i=all.begin(); i!=all.end(); ++i ){ + for ( list::iterator i=all.begin(); i!=all.end(); ++i ) { BSONObj o = *i; string name = o["_id"].String(); string host = o["host"].String(); long long maxSize = 0; BSONElement maxSizeElem = o[ ShardFields::maxSize.name() ]; - if ( ! maxSizeElem.eoo() ){ + if ( ! maxSizeElem.eoo() ) { maxSize = maxSizeElem.numberLong(); } bool isDraining = false; BSONElement isDrainingElem = o[ ShardFields::draining.name() ]; - if ( ! isDrainingElem.eoo() ){ + if ( ! isDrainingElem.eoo() ) { isDraining = isDrainingElem.Bool(); } Shard s( name , host , maxSize , isDraining ); _lookup[name] = s; - _lookup[host] = s; - - // add rs name to lookup (if it exists) - size_t pos; - if ((pos = host.find('/', 0)) != string::npos) { - _lookup[host.substr(0, pos)] = s; - } + _installHost( host , s ); } } - bool isMember( const string& addr ){ - scoped_lock lk( _mutex ); - map::iterator i = _lookup.find( addr ); - return i != _lookup.end(); - } + const Shard& find( const string& ident ) { + string mykey = ident; - const Shard& find( const string& ident ){ { - scoped_lock lk( _mutex ); - map::iterator i = _lookup.find( ident ); + // if its a replica set, just use set name + size_t pos = mykey.find( '/' ); + if ( pos != string::npos ) + mykey = mykey.substr(0,pos); + } - // if normal find didn't find anything, try to find by rs name - size_t pos; - if ( i == _lookup.end() && (pos = ident.find('/', 0)) != string::npos) { - i = _lookup.find( ident.substr(0, pos) ); - } + { + scoped_lock lk( _mutex ); + map::iterator i = _lookup.find( mykey ); if ( i != _lookup.end() ) return i->second; } - + // not in our maps, re-load all reload(); scoped_lock lk( _mutex ); - map::iterator i = _lookup.find( ident ); - uassert( 13129 , (string)"can't find shard for: " + ident , i != _lookup.end() ); - return i->second; + map::iterator i = _lookup.find( mykey ); + massert( 13129 , (string)"can't find shard for: " + mykey , i != _lookup.end() ); + return i->second; } - - void set( const string& name , const string& addr , bool setName = true , bool setAddr = true ){ - Shard s(name,addr); + + void set( const string& name , const Shard& s , bool setName = true , bool setAddr = true ) { scoped_lock lk( _mutex ); if ( setName ) _lookup[name] = s; if ( setAddr ) - _lookup[addr] = s; + _installHost( s.getConnString() , s ); + } + + void _installHost( const string& host , const Shard& s ) { + _lookup[host] = s; + + const ConnectionString& cs = s.getAddress(); + if ( cs.type() == ConnectionString::SET ) { + if ( cs.getSetName().size() ) + _lookup[ cs.getSetName() ] = s; + + vector servers = cs.getServers(); + for ( unsigned i=0; i::iterator i = _lookup.begin(); i!=_lookup.end(); ){ + for ( map::iterator i = _lookup.begin(); i!=_lookup.end(); ) { Shard s = i->second; - if ( s.getName() == name ){ + if ( s.getName() == name ) { _lookup.erase(i++); - } else { + } + else { ++i; } } } - - void getAllShards( vector& all ){ + + void getAllShards( vector& all ) const { scoped_lock lk( _mutex ); std::set seen; - for ( map::iterator i = _lookup.begin(); i!=_lookup.end(); ++i ){ - Shard s = 
i->second; + for ( map::const_iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) { + const Shard& s = i->second; if ( s.getName() == "config" ) continue; if ( seen.count( s.getName() ) ) @@ -150,49 +158,131 @@ namespace mongo { all.push_back( s ); } } + + bool isAShardNode( const string& addr ) const { + scoped_lock lk( _mutex ); + + // check direct nods or set names + map::const_iterator i = _lookup.find( addr ); + if ( i != _lookup.end() ) + return true; + + // check for set nodes + for ( map::const_iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) { + if ( i->first == "config" ) + continue; + + const Shard& s = i->second; + if ( s.containsNode( addr ) ) + return true; + } + + return false; + } + + bool getShardMap( BSONObjBuilder& result , string& errmsg ) const { + scoped_lock lk( _mutex ); + + BSONObjBuilder b( _lookup.size() + 50 ); + + for ( map::const_iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) { + b.append( i->first , i->second.getConnString() ); + } + + result.append( "map" , b.obj() ); + + return true; + } private: map _lookup; - mongo::mutex _mutex; + mutable mongo::mutex _mutex; } staticShardInfo; + - void Shard::setAddress( const string& addr , bool authoritative ){ - assert( _name.size() ); + class CmdGetShardMap : public Command { + public: + CmdGetShardMap() : Command( "getShardMap" ){} + virtual void help( stringstream &help ) const { help<<"internal"; } + virtual LockType locktype() const { return NONE; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + + virtual bool run(const string&, mongo::BSONObj&, std::string& errmsg , mongo::BSONObjBuilder& result, bool) { + return staticShardInfo.getShardMap( result , errmsg ); + } + } cmdGetShardMap; + + + void Shard::_setAddr( const string& addr ) { _addr = addr; - if ( authoritative ) - staticShardInfo.set( _name , _addr , true , false ); + if ( _addr.size() ) { + _cs = ConnectionString( addr , ConnectionString::SET ); + _rsInit(); + } } - - void Shard::reset( const string& ident ){ + + void Shard::_rsInit() { + if ( _cs.type() == ConnectionString::SET ) { + string x = _cs.getSetName(); + if ( x.size() == 0 ) { + warning() << "no set name for shard: " << _name << " " << _cs.toString() << endl; + } + assert( x.size() ); + _rs = ReplicaSetMonitor::get( x , _cs.getServers() ); + } + } + + void Shard::setAddress( const ConnectionString& cs) { + assert( _name.size() ); + _addr = cs.toString(); + _cs = cs; + _rsInit(); + staticShardInfo.set( _name , *this , true , false ); + } + + void Shard::reset( const string& ident ) { const Shard& s = staticShardInfo.find( ident ); - uassert( 13128 , (string)"can't find shard for: " + ident , s.ok() ); + massert( 13128 , (string)"can't find shard for: " + ident , s.ok() ); _name = s._name; _addr = s._addr; + _cs = s._cs; + _rsInit(); _maxSize = s._maxSize; _isDraining = s._isDraining; } - - void Shard::getAllShards( vector& all ){ + + bool Shard::containsNode( const string& node ) const { + if ( _addr == node ) + return true; + + if ( _rs && _rs->contains( node ) ) + return true; + + return false; + } + + void Shard::getAllShards( vector& all ) { staticShardInfo.getAllShards( all ); } - bool Shard::isAShard( const string& ident ){ - return staticShardInfo.isMember( ident ); + bool Shard::isAShardNode( const string& ident ) { + return staticShardInfo.isAShardNode( ident ); } - void Shard::printShardInfo( ostream& out ){ + void Shard::printShardInfo( ostream& out ) { vector all; getAllShards( all ); for ( unsigned i=0; 
irunCommand( db , cmd , res ); - if ( ! ok ){ + if ( ! ok ) { stringstream ss; ss << "runCommand (" << cmd << ") on shard (" << _name << ") failed : " << res; throw UserException( 13136 , ss.str() ); @@ -201,49 +291,50 @@ namespace mongo { conn.done(); return res; } - + ShardStatus Shard::getStatus() const { return ShardStatus( *this , runCommand( "admin" , BSON( "serverStatus" << 1 ) ) ); } - - void Shard::reloadShardInfo(){ + + void Shard::reloadShardInfo() { staticShardInfo.reload(); } - bool Shard::isMember( const string& addr ){ - return staticShardInfo.isMember( addr ); - } - - void Shard::removeShard( const string& name ){ + void Shard::removeShard( const string& name ) { staticShardInfo.remove( name ); } - Shard Shard::pick(){ + Shard Shard::pick( const Shard& current ) { vector all; staticShardInfo.getAllShards( all ); - if ( all.size() == 0 ){ + if ( all.size() == 0 ) { staticShardInfo.reload(); staticShardInfo.getAllShards( all ); if ( all.size() == 0 ) return EMPTY; } - + + // if current shard was provided, pick a different shard only if it is a better choice ShardStatus best = all[0].getStatus(); - - for ( size_t i=1; i_name ) , _addr( other->_addr ), _maxSize( other->_maxSize ) , _isDraining( other->_isDraining ){ + : _name( other->_name ) , _addr( other->_addr ), _cs( other->_cs ) , + _maxSize( other->_maxSize ) , _isDraining( other->_isDraining ) , _rs( other->_rs ) { } - - static Shard make( const string& ident ){ + + static Shard make( const string& ident ) { Shard s; s.reset( ident ); return s; } - static bool isAShard( const string& ident ); - /** * @param ident either name or address */ void reset( const string& ident ); + + void setAddress( const ConnectionString& cs ); - void setAddress( const string& addr , bool authoritative = false ); + ConnectionString getAddress() const { return _cs; } string getName() const { assert( _name.size() ); return _name; } - + string getConnString() const { assert( _addr.size() ); return _addr; @@ -92,7 +100,7 @@ namespace mongo { bool operator==( const Shard& s ) const { bool n = _name == s._name; bool a = _addr == s._addr; - + assert( n == a ); // names and address are 1 to 1 return n; } @@ -107,7 +115,7 @@ namespace mongo { bool operator==( const string& s ) const { return _name == s || _addr == s; } - + bool operator!=( const string& s ) const { return _name != s && _addr != s; } @@ -115,44 +123,58 @@ namespace mongo { bool operator<(const Shard& o) const { return _name < o._name; } - + bool ok() const { return _addr.size() > 0 && _addr.size() > 0; } - + BSONObj runCommand( const string& db , const string& simple ) const { return runCommand( db , BSON( simple << 1 ) ); } BSONObj runCommand( const string& db , const BSONObj& cmd ) const ; - + ShardStatus getStatus() const ; + /** + * mostly for replica set + * retursn true if node is the shard + * of if the replica set contains node + */ + bool containsNode( const string& node ) const; + static void getAllShards( vector& all ); static void printShardInfo( ostream& out ); - + /** - * picks a Shard for more load + * @parm current - shard where the chunk/database currently lives in + * @return the currently emptiest shard, if best then current, or EMPTY */ - static Shard pick(); - + static Shard pick( const Shard& current = EMPTY ); + static void reloadShardInfo(); static void removeShard( const string& name ); - static bool isMember( const string& addr ); + static bool isAShardNode( const string& ident ); static Shard EMPTY; - + private: + + void _rsInit(); + void _setAddr( const 
string& addr ); + string _name; string _addr; - long long _maxSize; // in MBytes, 0 is unlimited + ConnectionString _cs; + long long _maxSize; // in MBytes, 0 is unlimited bool _isDraining; // shard is currently being removed + ReplicaSetMonitorPtr _rs; }; class ShardStatus { public: - + ShardStatus( const Shard& shard , const BSONObj& obj ); friend ostream& operator << (ostream& out, const ShardStatus& s) { @@ -162,14 +184,14 @@ namespace mongo { string toString() const { stringstream ss; - ss << "shard: " << _shard << " mapped: " << _mapped << " writeLock: " << _writeLock; + ss << "shard: " << _shard << " mapped: " << _mapped << " writeLock: " << _writeLock; return ss.str(); } - bool operator<( const ShardStatus& other ) const{ + bool operator<( const ShardStatus& other ) const { return _mapped < other._mapped; } - + Shard shard() const { return _shard; } @@ -178,9 +200,14 @@ namespace mongo { return _mapped; } + bool hasOpsQueued() const { + return _hasOpsQueued; + } + private: Shard _shard; long long _mapped; + bool _hasOpsQueued; // true if 'writebacks' are pending double _writeLock; }; @@ -195,19 +222,19 @@ namespace mongo { void done(); void kill(); - DBClientBase& conn(){ + DBClientBase& conn() { _finishInit(); assert( _conn ); return *_conn; } - - DBClientBase* operator->(){ + + DBClientBase* operator->() { _finishInit(); assert( _conn ); return _conn; } - DBClientBase* get(){ + DBClientBase* get() { _finishInit(); assert( _conn ); return _conn; @@ -224,7 +251,7 @@ namespace mongo { static void sync(); - void donotCheckVersion(){ + void donotCheckVersion() { _setVersion = false; _finishedInit = true; } @@ -236,11 +263,11 @@ namespace mongo { /** checks all of my thread local connections for the version of this ns */ static void checkMyConnectionVersions( const string & ns ); - + private: void _init(); void _finishInit(); - + bool _finishedInit; string _addr; diff --git a/s/shard_version.cpp b/s/shard_version.cpp new file mode 100644 index 0000000..0f3e80f --- /dev/null +++ b/s/shard_version.cpp @@ -0,0 +1,151 @@ +// @file shard_version.cpp + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#include "pch.h" + +#include "chunk.h" +#include "config.h" +#include "grid.h" +#include "util.h" +#include "shard.h" +#include "writeback_listener.h" + +#include "shard_version.h" + +namespace mongo { + + // when running in sharded mode, use chunk shard version control + + static bool checkShardVersion( DBClientBase & conn , const string& ns , bool authoritative = false , int tryNumber = 1 ); + static void resetShardVersion( DBClientBase * conn ); + + void installChunkShardVersioning() { + // + // Overriding no-op behavior in shardconnection.cpp + // + // TODO: Better encapsulate this mechanism. 
+ // + checkShardVersionCB = checkShardVersion; + resetShardVersionCB = resetShardVersion; + } + + struct ConnectionShardStatus { + + typedef unsigned long long S; + + ConnectionShardStatus() + : _mutex( "ConnectionShardStatus" ) { + } + + S getSequence( DBClientBase * conn , const string& ns ) { + scoped_lock lk( _mutex ); + return _map[conn][ns]; + } + + void setSequence( DBClientBase * conn , const string& ns , const S& s ) { + scoped_lock lk( _mutex ); + _map[conn][ns] = s; + } + + void reset( DBClientBase * conn ) { + scoped_lock lk( _mutex ); + _map.erase( conn ); + } + + // protects _map + mongo::mutex _mutex; + + // a map from a connection into ChunkManager's sequence number for each namespace + map > _map; + + } connectionShardStatus; + + void resetShardVersion( DBClientBase * conn ) { + connectionShardStatus.reset( conn ); + } + + /** + * @return true if had to do something + */ + bool checkShardVersion( DBClientBase& conn , const string& ns , bool authoritative , int tryNumber ) { + // TODO: cache, optimize, etc... + + WriteBackListener::init( conn ); + + DBConfigPtr conf = grid.getDBConfig( ns ); + if ( ! conf ) + return false; + + unsigned long long officialSequenceNumber = 0; + + ChunkManagerPtr manager; + const bool isSharded = conf->isSharded( ns ); + if ( isSharded ) { + manager = conf->getChunkManager( ns , authoritative ); + officialSequenceNumber = manager->getSequenceNumber(); + } + + // has the ChunkManager been reloaded since the last time we updated the connection-level version? + // (ie, last time we issued the setShardVersions below) + unsigned long long sequenceNumber = connectionShardStatus.getSequence(&conn,ns); + if ( sequenceNumber == officialSequenceNumber ) { + return false; + } + + + ShardChunkVersion version = 0; + if ( isSharded ) { + version = manager->getVersion( Shard::make( conn.getServerAddress() ) ); + } + + log(2) << " have to set shard version for conn: " << &conn << " ns:" << ns + << " my last seq: " << sequenceNumber << " current: " << officialSequenceNumber + << " version: " << version << " manager: " << manager.get() + << endl; + + BSONObj result; + if ( setShardVersion( conn , ns , version , authoritative , result ) ) { + // success! + log(1) << " setShardVersion success!" << endl; + connectionShardStatus.setSequence( &conn , ns , officialSequenceNumber ); + return true; + } + + log(1) << " setShardVersion failed!\n" << result << endl; + + if ( result.getBoolField( "need_authoritative" ) ) + massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative ); + + if ( ! authoritative ) { + checkShardVersion( conn , ns , 1 , tryNumber + 1 ); + return true; + } + + if ( tryNumber < 4 ) { + log(1) << "going to retry checkShardVersion" << endl; + sleepmillis( 10 ); + checkShardVersion( conn , ns , 1 , tryNumber + 1 ); + return true; + } + + log() << " setShardVersion failed: " << result << endl; + massert( 10429 , (string)"setShardVersion failed! " + result.jsonString() , 0 ); + return true; + } + +} // namespace mongo diff --git a/s/shard_version.h b/s/shard_version.h new file mode 100644 index 0000000..023b7fc --- /dev/null +++ b/s/shard_version.h @@ -0,0 +1,31 @@ +// @file shard_version.h + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +namespace mongo { + + /* + * Install chunk shard vesion callbaks in shardconnection code. This activates + * the chunk shard version control that mongos needs. + * + * MUST be called before accepting any connections. + */ + void installChunkShardVersioning(); + +} // namespace mongo diff --git a/s/shardconnection.cpp b/s/shardconnection.cpp index 694693b..d05f5b1 100644 --- a/s/shardconnection.cpp +++ b/s/shardconnection.cpp @@ -23,7 +23,24 @@ #include namespace mongo { - + + // The code in shardconnection may run not only in mongos context. When elsewhere, chunk shard versioning + // is disabled. To enable chunk shard versioning, provide the check/resetShardVerionCB's below + // + // TODO: better encapsulate this mechanism. + + bool defaultCheckShardVersion( DBClientBase & conn , const string& ns , bool authoritative , int tryNumber ) { + // no-op in mongod + return false; + } + + void defaultResetShardVersion( DBClientBase * conn ) { + // no-op in mongod + } + + boost::function4 checkShardVersionCB = defaultCheckShardVersion; + boost::function1 resetShardVersionCB = defaultResetShardVersion; + /** * holds all the actual db connections for a client to various servers * 1 pre thread, so don't have to worry about thread safety @@ -31,39 +48,22 @@ namespace mongo { class ClientConnections : boost::noncopyable { public: struct Status : boost::noncopyable { - Status() : created(0), avail(0){} + Status() : created(0), avail(0) {} - long long created; + long long created; DBClientBase* avail; }; - Nullstream& debug( Status * s = 0 , const string& addr = "" ){ - static int ll = 9; + ClientConnections() {} - if ( logLevel < ll ) - return nullstream; - Nullstream& l = log(ll); - - l << "ClientConnections DEBUG " << this << " "; - if ( s ){ - l << "s: " << s << " addr: " << addr << " "; - } - return l; - } - - ClientConnections() : _mutex("ClientConnections") { - debug() << " NEW " << endl; - } - - ~ClientConnections(){ - debug() << " KILLING " << endl; - for ( map::iterator i=_hosts.begin(); i!=_hosts.end(); ++i ){ + ~ClientConnections() { + for ( HostMap::iterator i=_hosts.begin(); i!=_hosts.end(); ++i ) { string addr = i->first; Status* ss = i->second; assert( ss ); - if ( ss->avail ){ - /* if we're shutting down, don't want to initiate release mechanism as it is slow, + if ( ss->avail ) { + /* if we're shutting down, don't want to initiate release mechanism as it is slow, and isn't needed since all connections will be closed anyway */ if ( inShutdown() ) delete ss->avail; @@ -75,49 +75,41 @@ namespace mongo { } _hosts.clear(); } - - DBClientBase * get( const string& addr , const string& ns ){ + + DBClientBase * get( const string& addr , const string& ns ) { _check( ns ); - scoped_lock lk( _mutex ); + Status* &s = _hosts[addr]; if ( ! 
s ) s = new Status(); - - debug( s , addr ) << "WANT ONE pool avail: " << s->avail << endl; - - if ( s->avail ){ + + if ( s->avail ) { DBClientBase* c = s->avail; s->avail = 0; - debug( s , addr ) << "GOT " << c << endl; pool.onHandedOut( c ); return c; } - debug() << "CREATING NEW CONNECTION" << endl; s->created++; return pool.get( addr ); } - - void done( const string& addr , DBClientBase* conn ){ - scoped_lock lk( _mutex ); + + void done( const string& addr , DBClientBase* conn ) { Status* s = _hosts[addr]; assert( s ); - if ( s->avail ){ - debug( s , addr ) << "DONE WITH TEMP" << endl; + if ( s->avail ) { release( addr , conn ); return; } s->avail = conn; - debug( s , addr ) << "PUSHING: " << conn << endl; } - - void sync(){ - scoped_lock lk( _mutex ); - for ( map::iterator i=_hosts.begin(); i!=_hosts.end(); ++i ){ + + void sync() { + for ( HostMap::iterator i=_hosts.begin(); i!=_hosts.end(); ++i ) { string addr = i->first; Status* ss = i->second; - if ( ss->avail ){ + if ( ss->avail ) { ss->avail->getLastError(); release( addr , ss->avail ); ss->avail = 0; @@ -127,63 +119,67 @@ namespace mongo { _hosts.clear(); } - void checkVersions( const string& ns ){ + void checkVersions( const string& ns ) { vector all; Shard::getAllShards( all ); - scoped_lock lk( _mutex ); - for ( unsigned i=0; i::iterator i=_hosts.begin(); i!=_hosts.end(); ++i ){ - if ( ! Shard::isAShard( i->first ) ) + for ( HostMap::iterator i=_hosts.begin(); i!=_hosts.end(); ++i ) { + if ( ! Shard::isAShardNode( i->first ) ) continue; Status* ss = i->second; assert( ss ); if ( ! ss->avail ) ss->avail = pool.get( i->first ); - checkShardVersion( *ss->avail , ns ); + checkShardVersionCB( *ss->avail , ns , false , 1 ); } } - void release( const string& addr , DBClientBase * conn ){ - resetShardVersion( conn ); + void release( const string& addr , DBClientBase * conn ) { + resetShardVersionCB( conn ); BSONObj res; - + try { - if ( conn->simpleCommand( "admin" , &res , "unsetSharding" ) ){ + if ( conn->simpleCommand( "admin" , &res , "unsetSharding" ) ) { pool.release( addr , conn ); } else { - log(LL_ERROR) << " couldn't unset sharding :( " << res << endl; + error() << "unset sharding failed : " << res << endl; delete conn; } } - catch ( std::exception& e ){ - log(LL_ERROR) << "couldn't unsert sharding : " << e.what() << endl; + catch ( SocketException& e ) { + // server down or something + LOG(1) << "socket exception trying to unset sharding: " << e.toString() << endl; + delete conn; + } + catch ( std::exception& e ) { + error() << "couldn't unset sharding : " << e.what() << endl; delete conn; } } - - void _check( const string& ns ){ + + void _check( const string& ns ) { if ( ns.size() == 0 || _seenNS.count( ns ) ) return; _seenNS.insert( ns ); checkVersions( ns ); } - - map _hosts; - mongo::mutex _mutex; + + typedef map HostMap; + HostMap _hosts; set _seenNS; // ----- - + static thread_specific_ptr _perThread; - static ClientConnections* get(){ + static ClientConnections* threadInstance() { ClientConnections* cc = _perThread.get(); - if ( ! cc ){ + if ( ! 
cc ) { cc = new ClientConnections(); _perThread.reset( cc ); } @@ -202,57 +198,58 @@ namespace mongo { : _addr( s.getConnString() ) , _ns( ns ) { _init(); } - + ShardConnection::ShardConnection( const string& addr , const string& ns ) : _addr( addr ) , _ns( ns ) { _init(); } - - void ShardConnection::_init(){ + + void ShardConnection::_init() { assert( _addr.size() ); - _conn = ClientConnections::get()->get( _addr , _ns ); + _conn = ClientConnections::threadInstance()->get( _addr , _ns ); _finishedInit = false; } - void ShardConnection::_finishInit(){ + void ShardConnection::_finishInit() { if ( _finishedInit ) return; _finishedInit = true; - - if ( _ns.size() ){ - _setVersion = checkShardVersion( *_conn , _ns ); + + if ( _ns.size() ) { + _setVersion = checkShardVersionCB( *_conn , _ns , false , 1 ); } else { _setVersion = false; } - + } - void ShardConnection::done(){ - if ( _conn ){ - ClientConnections::get()->done( _addr , _conn ); + void ShardConnection::done() { + if ( _conn ) { + ClientConnections::threadInstance()->done( _addr , _conn ); _conn = 0; _finishedInit = true; } } - void ShardConnection::kill(){ - if ( _conn ){ + void ShardConnection::kill() { + if ( _conn ) { + resetShardVersionCB( _conn ); delete _conn; _conn = 0; _finishedInit = true; } } - void ShardConnection::sync(){ - ClientConnections::get()->sync(); + void ShardConnection::sync() { + ClientConnections::threadInstance()->sync(); } - bool ShardConnection::runCommand( const string& db , const BSONObj& cmd , BSONObj& res ){ + bool ShardConnection::runCommand( const string& db , const BSONObj& cmd , BSONObj& res ) { assert( _conn ); bool ok = _conn->runCommand( db , cmd , res ); - if ( ! ok ){ - if ( res["code"].numberInt() == StaleConfigInContextCode ){ + if ( ! ok ) { + if ( res["code"].numberInt() == StaleConfigInContextCode ) { string big = res["errmsg"].String(); string ns,raw; massert( 13409 , (string)"can't parse ns from: " + big , StaleConfigException::parse( big , ns , raw ) ); @@ -263,12 +260,12 @@ namespace mongo { return ok; } - void ShardConnection::checkMyConnectionVersions( const string & ns ){ - ClientConnections::get()->checkVersions( ns ); + void ShardConnection::checkMyConnectionVersions( const string & ns ) { + ClientConnections::threadInstance()->checkVersions( ns ); } ShardConnection::~ShardConnection() { - if ( _conn ){ + if ( _conn ) { if ( ! _conn->isFailed() ) { /* see done() comments above for why we log this line */ log() << "~ScopedDBConnection: _conn != null" << endl; diff --git a/s/shardkey.cpp b/s/shardkey.cpp index e4deeec..84cdb4b 100644 --- a/s/shardkey.cpp +++ b/s/shardkey.cpp @@ -20,6 +20,7 @@ #include "chunk.h" #include "../db/jsobj.h" #include "../util/unittest.h" +#include "../util/timer.h" namespace mongo { @@ -30,12 +31,12 @@ namespace mongo { BSONObjBuilder max; BSONObjIterator it(p); - while (it.more()){ + while (it.more()) { BSONElement e (it.next()); min.appendMinKey(e.fieldName()); max.appendMaxKey(e.fieldName()); } - + gMin = min.obj(); gMax = max.obj(); } @@ -49,11 +50,11 @@ namespace mongo { } bool ShardKeyPattern::hasShardKey( const BSONObj& obj ) const { - /* this is written s.t. if obj has lots of fields, if the shard key fields are early, + /* this is written s.t. if obj has lots of fields, if the shard key fields are early, it is fast. so a bit more work to try to be semi-fast. 
*/ - for(set::const_iterator it = patternfields.begin(); it != patternfields.end(); ++it){ + for(set::const_iterator it = patternfields.begin(); it != patternfields.end(); ++it) { if(obj.getFieldDotted(it->c_str()).eoo()) return false; } @@ -63,28 +64,90 @@ namespace mongo { bool ShardKeyPattern::isPrefixOf( const BSONObj& otherPattern ) const { BSONObjIterator a( pattern ); BSONObjIterator b( otherPattern ); - - while ( a.more() && b.more() ){ + + while ( a.more() && b.more() ) { BSONElement x = a.next(); BSONElement y = b.next(); if ( strcmp( x.fieldName() , y.fieldName() ) ) return false; } - + return ! a.more(); } - + string ShardKeyPattern::toString() const { return pattern.toString(); } - - /* things to test for compound : + + BSONObj ShardKeyPattern::moveToFront(const BSONObj& obj) const { + vector keysToMove; + keysToMove.push_back("_id"); + BSONForEach(e, pattern) { + if (strchr(e.fieldName(), '.') == NULL) + keysToMove.push_back(e.fieldName()); + } + + if (keysToMove.size() == 1) { + return obj; + + } + else { + BufBuilder buf (obj.objsize()); + buf.appendNum(obj.objsize()); + + vector > copies; + pair toCopy ((const char*)NULL, 0); // C++ NULL isn't a pointer type yet + + BSONForEach(e, obj) { + bool moveToFront = false; + for (vector::const_iterator it(keysToMove.begin()), end(keysToMove.end()); it!=end; ++it) { + if (strcmp(e.fieldName(), *it) == 0) { + moveToFront = true; + break; + } + } + + if (moveToFront) { + buf.appendBuf(e.fieldName()-1, e.size()); + if (toCopy.first) { + copies.push_back(toCopy); + toCopy.first = NULL; + } + } + else { + if (!toCopy.first) { + toCopy.first = e.fieldName()-1; + toCopy.second = e.size(); + } + else { + toCopy.second += e.size(); + } + } + } + + for (vector >::const_iterator it(copies.begin()), end(copies.end()); it!=end; ++it) { + buf.appendBuf(it->first, it->second); + } + + if (toCopy.first) { + buf.appendBuf(toCopy.first, toCopy.second); + } + + buf.appendChar('\0'); + + BSONObj out (buf.buf(), true); + buf.decouple(); + return out; + } + } + + /* things to test for compound : \ middle (deprecating?) */ class ShardKeyUnitTest : public UnitTest { public: - - void testIsPrefixOf(){ + + void testIsPrefixOf() { { ShardKeyPattern k( BSON( "x" << 1 ) ); assert( ! k.isPrefixOf( BSON( "a" << 1 ) ) ); @@ -92,7 +155,7 @@ namespace mongo { assert( k.isPrefixOf( BSON( "x" << 1 << "a" << 1 ) ) ); assert( ! k.isPrefixOf( BSON( "a" << 1 << "x" << 1 ) ) ); } - { + { ShardKeyPattern k( BSON( "x" << 1 << "y" << 1 ) ); assert( ! k.isPrefixOf( BSON( "x" << 1 ) ) ); assert( ! 
k.isPrefixOf( BSON( "x" << 1 << "z" << 1 ) ) ); @@ -100,8 +163,8 @@ namespace mongo { assert( k.isPrefixOf( BSON( "x" << 1 << "y" << 1 << "z" << 1 ) ) ); } } - - void hasshardkeytest() { + + void hasshardkeytest() { BSONObj x = fromjson("{ zid : \"abcdefg\", num: 1.0, name: \"eliot\" }"); ShardKeyPattern k( BSON( "num" << 1 ) ); assert( k.hasShardKey(x) ); @@ -117,31 +180,68 @@ namespace mongo { } - void extractkeytest() { + void extractkeytest() { ShardKeyPattern k( fromjson("{a:1,'sub.b':-1,'sub.c':1}") ); BSONObj x = fromjson("{a:1,'sub.b':2,'sub.c':3}"); assert( k.extractKey( fromjson("{a:1,sub:{b:2,c:3}}") ).woEqual(x) ); assert( k.extractKey( fromjson("{sub:{b:2,c:3},a:1}") ).woEqual(x) ); } - void run(){ + void moveToFrontTest() { + ShardKeyPattern sk (BSON("a" << 1 << "b" << 1)); + + BSONObj ret; + + ret = sk.moveToFront(BSON("z" << 1 << "_id" << 1 << "y" << 1 << "a" << 1 << "x" << 1 << "b" << 1 << "w" << 1)); + assert(ret.woEqual(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1))); + + ret = sk.moveToFront(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1)); + assert(ret.woEqual(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1))); + + ret = sk.moveToFront(BSON("z" << 1 << "y" << 1 << "a" << 1 << "b" << 1 << "Z" << 1 << "Y" << 1)); + assert(ret.woEqual(BSON("a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "Z" << 1 << "Y" << 1))); + + } + + void moveToFrontBenchmark(int numFields) { + BSONObjBuilder bb; + bb.append("_id", 1); + for (int i=0; i < numFields; i++) + bb.append(BSONObjBuilder::numStr(i), 1); + bb.append("key", 1); + BSONObj o = bb.obj(); + + ShardKeyPattern sk (BSON("key" << 1)); + + Timer t; + const int iterations = 100*1000; + for (int i=0; i< iterations; i++) { + sk.moveToFront(o); + } + + const double secs = t.micros() / 1000000.0; + const double ops_per_sec = iterations / secs; + + cout << "moveToFront (" << numFields << " fields) secs: " << secs << " ops_per_sec: " << ops_per_sec << endl; + } + void run() { extractkeytest(); ShardKeyPattern k( BSON( "key" << 1 ) ); - + BSONObj min = k.globalMin(); // cout << min.jsonString(TenGen) << endl; BSONObj max = k.globalMax(); - + BSONObj k1 = BSON( "key" << 5 ); assert( k.compare( min , max ) < 0 ); assert( k.compare( min , k1 ) < 0 ); assert( k.compare( max , min ) > 0 ); assert( k.compare( min , min ) == 0 ); - + hasshardkeytest(); assert( k.hasShardKey( k1 ) ); assert( ! k.hasShardKey( BSON( "key2" << 1 ) ) ); @@ -150,12 +250,20 @@ namespace mongo { BSONObj b = BSON( "key" << 999 ); assert( k.compare(a,b) < 0 ); - + testIsPrefixOf(); // add middle multitype tests + moveToFrontTest(); + + if (0) { // toggle to run benchmark + moveToFrontBenchmark(0); + moveToFrontBenchmark(10); + moveToFrontBenchmark(100); + } + log(1) << "shardKeyTest passed" << endl; } } shardKeyTest; - + } // namespace mongo diff --git a/s/shardkey.h b/s/shardkey.h index 976bbef..96301ff 100644 --- a/s/shardkey.h +++ b/s/shardkey.h @@ -21,7 +21,7 @@ #include "../client/dbclient.h" namespace mongo { - + class Chunk; /* A ShardKeyPattern is a pattern indicating what data to extract from the object to make the shard key from. @@ -30,10 +30,10 @@ namespace mongo { class ShardKeyPattern { public: ShardKeyPattern( BSONObj p = BSONObj() ); - + /** global min is the lowest possible value for this key - e.g. { num : MinKey } + e.g. 
{ num : MinKey } */ BSONObj globalMin() const { return gMin; } @@ -42,15 +42,15 @@ namespace mongo { */ BSONObj globalMax() const { return gMax; } - bool isGlobalMin( const BSONObj& k ) const{ + bool isGlobalMin( const BSONObj& k ) const { return k.woCompare( globalMin() ) == 0; } - bool isGlobalMax( const BSONObj& k ) const{ + bool isGlobalMax( const BSONObj& k ) const { return k.woCompare( globalMax() ) == 0; } - - bool isGlobal( const BSONObj& k ) const{ + + bool isGlobal( const BSONObj& k ) const { return isGlobalMin( k ) || isGlobalMax( k ); } @@ -60,22 +60,25 @@ namespace mongo { l > r positive */ int compare( const BSONObj& l , const BSONObj& r ) const; - + /** @return whether or not obj has all fields in this shard key pattern - e.g. - ShardKey({num:1}).hasShardKey({ name:"joe", num:3 }) is true + e.g. + ShardKey({num:1}).hasShardKey({ name:"joe", num:3 }) is true */ bool hasShardKey( const BSONObj& obj ) const; - + BSONObj key() const { return pattern; } string toString() const; BSONObj extractKey(const BSONObj& from) const; - + + bool partOfShardKey(const char* key ) const { + return pattern.hasField(key); + } bool partOfShardKey(const string& key ) const { - return patternfields.count( key ) > 0; + return pattern.hasField(key.c_str()); } /** @@ -83,7 +86,12 @@ namespace mongo { * true if 'this' is a prefix (not necessarily contained) of 'otherPattern'. */ bool isPrefixOf( const BSONObj& otherPattern ) const; - + + /** + * @return BSONObj with _id and shardkey at front. May return original object. + */ + BSONObj moveToFront(const BSONObj& obj) const; + private: BSONObj pattern; BSONObj gMin; @@ -93,10 +101,10 @@ namespace mongo { set patternfields; }; - inline BSONObj ShardKeyPattern::extractKey(const BSONObj& from) const { + inline BSONObj ShardKeyPattern::extractKey(const BSONObj& from) const { BSONObj k = from.extractFields(pattern); uassert(13334, "Shard Key must be less than 512 bytes", k.objsize() < 512); return k; } -} +} diff --git a/s/stats.cpp b/s/stats.cpp index bb7a975..460ada3 100644 --- a/s/stats.cpp +++ b/s/stats.cpp @@ -20,7 +20,7 @@ #include "stats.h" namespace mongo { - + OpCounters opsNonSharded; OpCounters opsSharded; diff --git a/s/stats.h b/s/stats.h index cbabf25..a7cc784 100644 --- a/s/stats.h +++ b/s/stats.h @@ -22,7 +22,7 @@ #include "../db/stats/counters.h" namespace mongo { - + extern OpCounters opsNonSharded; extern OpCounters opsSharded; diff --git a/s/strategy.cpp b/s/strategy.cpp index b3c8f5b..7c1fb0b 100644 --- a/s/strategy.cpp +++ b/s/strategy.cpp @@ -1,3 +1,5 @@ +// @file strategy.cpp + /* * Copyright (C) 2010 10gen Inc. * @@ -14,312 +16,64 @@ * along with this program. If not, see . */ -// stragegy.cpp - #include "pch.h" -#include "request.h" -#include "../util/background.h" + #include "../client/connpool.h" #include "../db/commands.h" -#include "server.h" #include "grid.h" +#include "request.h" +#include "server.h" +#include "writeback_listener.h" + +#include "strategy.h" namespace mongo { // ----- Strategy ------ - void Strategy::doWrite( int op , Request& r , const Shard& shard , bool checkVersion ){ + void Strategy::doWrite( int op , Request& r , const Shard& shard , bool checkVersion ) { ShardConnection conn( shard , r.getns() ); if ( ! 
checkVersion ) conn.donotCheckVersion(); - else if ( conn.setVersion() ){ + else if ( conn.setVersion() ) { conn.done(); throw StaleConfigException( r.getns() , "doWRite" , true ); } conn->say( r.m() ); conn.done(); } - - void Strategy::doQuery( Request& r , const Shard& shard ){ - try{ - ShardConnection dbcon( shard , r.getns() ); - DBClientBase &c = dbcon.conn(); - - Message response; - bool ok = c.call( r.m(), response); - { - QueryResult *qr = (QueryResult *) response.singleData(); - if ( qr->resultFlags() & ResultFlag_ShardConfigStale ){ - dbcon.done(); - throw StaleConfigException( r.getns() , "Strategy::doQuery" ); - } - } + void Strategy::doQuery( Request& r , const Shard& shard ) { - uassert( 10200 , "mongos: error calling db", ok); - r.reply( response , c.getServerAddress() ); - dbcon.done(); - } - catch ( AssertionException& e ) { - BSONObjBuilder err; - e.getInfo().append( err ); - BSONObj errObj = err.done(); - replyToQuery(ResultFlag_ErrSet, r.p() , r.m() , errObj); - } - } - - void Strategy::insert( const Shard& shard , const char * ns , const BSONObj& obj ){ - ShardConnection dbcon( shard , ns ); - if ( dbcon.setVersion() ){ - dbcon.done(); - throw StaleConfigException( ns , "for insert" ); - } - dbcon->insert( ns , obj ); - dbcon.done(); - } - - class WriteBackListener : public BackgroundJob { - protected: - string name() { return "WriteBackListener"; } - WriteBackListener( const string& addr ) : _addr( addr ){ - log() << "creating WriteBackListener for: " << addr << endl; - } - - void run(){ - OID lastID; - lastID.clear(); - int secsToSleep = 0; - while ( Shard::isMember( _addr ) ){ - - if ( lastID.isSet() ){ - scoped_lock lk( _seenWritebacksLock ); - _seenWritebacks.insert( lastID ); - lastID.clear(); - } - - try { - ScopedDbConnection conn( _addr ); - - BSONObj result; - - { - BSONObjBuilder cmd; - cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data - if ( ! conn->runCommand( "admin" , cmd.obj() , result ) ){ - log() << "writebacklisten command failed! 
" << result << endl; - conn.done(); - continue; - } - - } - - log(1) << "writebacklisten result: " << result << endl; - - BSONObj data = result.getObjectField( "data" ); - if ( data.getBoolField( "writeBack" ) ){ - string ns = data["ns"].valuestrsafe(); - { - BSONElement e = data["id"]; - if ( e.type() == jstOID ) - lastID = e.OID(); - } - int len; + ShardConnection dbcon( shard , r.getns() ); + DBClientBase &c = dbcon.conn(); - Message m( (void*)data["msg"].binData( len ) , false ); - massert( 10427 , "invalid writeback message" , m.header()->valid() ); + string actualServer; - DBConfigPtr db = grid.getDBConfig( ns ); - ShardChunkVersion needVersion( data["version"] ); - - log(1) << "writeback id: " << lastID << " needVersion : " << needVersion.toString() - << " mine : " << db->getChunkManager( ns )->getVersion().toString() << endl;// TODO change to log(3) - - if ( logLevel ) log(1) << debugString( m ) << endl; + Message response; + bool ok = c.call( r.m(), response, true , &actualServer ); + uassert( 10200 , "mongos: error calling db", ok ); - if ( needVersion.isSet() && needVersion <= db->getChunkManager( ns )->getVersion() ){ - // this means when the write went originally, the version was old - // if we're here, it means we've already updated the config, so don't need to do again - //db->getChunkManager( ns , true ); // SERVER-1349 - } - else { - db->getChunkManager( ns , true ); - } - - Request r( m , 0 ); - r.init(); - r.process(); - } - else { - log() << "unknown writeBack result: " << result << endl; - } - - conn.done(); - secsToSleep = 0; - continue; - } - catch ( std::exception e ){ - log() << "WriteBackListener exception : " << e.what() << endl; - - // It's possible this shard was removed - Shard::reloadShardInfo(); - } - catch ( ... ){ - log() << "WriteBackListener uncaught exception!" 
<< endl; - } - secsToSleep++; - sleepsecs(secsToSleep); - if ( secsToSleep > 10 ) - secsToSleep = 0; + { + QueryResult *qr = (QueryResult *) response.singleData(); + if ( qr->resultFlags() & ResultFlag_ShardConfigStale ) { + dbcon.done(); + throw StaleConfigException( r.getns() , "Strategy::doQuery" ); } - - log() << "WriteBackListener exiting : address no longer in cluster " << _addr; - } - - private: - string _addr; - static map _cache; - static mongo::mutex _cacheLock; - - static set _seenWritebacks; - static mongo::mutex _seenWritebacksLock; - - public: - static void init( DBClientBase& conn ){ - scoped_lock lk( _cacheLock ); - WriteBackListener*& l = _cache[conn.getServerAddress()]; - if ( l ) - return; - l = new WriteBackListener( conn.getServerAddress() ); - l->go(); - } - - - static void waitFor( const OID& oid ){ - Timer t; - for ( int i=0; i<5000; i++ ){ - { - scoped_lock lk( _seenWritebacksLock ); - if ( _seenWritebacks.count( oid ) ) - return; - } - sleepmillis( 10 ); - } - stringstream ss; - ss << "didn't get writeback for: " << oid << " after: " << t.millis() << " ms"; - uasserted( 13403 , ss.str() ); - } - }; - - void waitForWriteback( const OID& oid ){ - WriteBackListener::waitFor( oid ); - } - - map WriteBackListener::_cache; - mongo::mutex WriteBackListener::_cacheLock("WriteBackListener"); - - set WriteBackListener::_seenWritebacks; - mongo::mutex WriteBackListener::_seenWritebacksLock( "WriteBackListener::seen" ); - - struct ConnectionShardStatus { - - typedef unsigned long long S; - - ConnectionShardStatus() - : _mutex( "ConnectionShardStatus" ){ - } - - S getSequence( DBClientBase * conn , const string& ns ){ - scoped_lock lk( _mutex ); - return _map[conn][ns]; - } - - void setSequence( DBClientBase * conn , const string& ns , const S& s ){ - scoped_lock lk( _mutex ); - _map[conn][ns] = s; - } - - void reset( DBClientBase * conn ){ - scoped_lock lk( _mutex ); - _map.erase( conn ); - } - - map > _map; - mongo::mutex _mutex; - } connectionShardStatus; - - void resetShardVersion( DBClientBase * conn ){ - connectionShardStatus.reset( conn ); + r.reply( response , actualServer.size() ? actualServer : c.getServerAddress() ); + dbcon.done(); } - - /** - * @return true if had to do something - */ - bool checkShardVersion( DBClientBase& conn , const string& ns , bool authoritative , int tryNumber ){ - // TODO: cache, optimize, etc... - - WriteBackListener::init( conn ); - DBConfigPtr conf = grid.getDBConfig( ns ); - if ( ! conf ) - return false; - - unsigned long long officialSequenceNumber = 0; - - ChunkManagerPtr manager; - const bool isSharded = conf->isSharded( ns ); - if ( isSharded ){ - manager = conf->getChunkManager( ns , authoritative ); - officialSequenceNumber = manager->getSequenceNumber(); - } - - unsigned long long sequenceNumber = connectionShardStatus.getSequence(&conn,ns); - if ( sequenceNumber == officialSequenceNumber ){ - return false; - } - - - ShardChunkVersion version = 0; - if ( isSharded ){ - version = manager->getVersion( Shard::make( conn.getServerAddress() ) ); - } - - log(2) << " have to set shard version for conn: " << &conn << " ns:" << ns - << " my last seq: " << sequenceNumber << " current: " << officialSequenceNumber - << " version: " << version << " manager: " << manager.get() - << endl; - - BSONObj result; - if ( setShardVersion( conn , ns , version , authoritative , result ) ){ - // success! - log(1) << " setShardVersion success!" 
<< endl; - connectionShardStatus.setSequence( &conn , ns , officialSequenceNumber ); - return true; - } - - log(1) << " setShardVersion failed!\n" << result << endl; - - if ( result.getBoolField( "need_authoritative" ) ) - massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative ); - - if ( ! authoritative ){ - checkShardVersion( conn , ns , 1 , tryNumber + 1 ); - return true; - } - - if ( tryNumber < 4 ){ - log(1) << "going to retry checkShardVersion" << endl; - sleepmillis( 10 ); - checkShardVersion( conn , ns , 1 , tryNumber + 1 ); - return true; + void Strategy::insert( const Shard& shard , const char * ns , const BSONObj& obj ) { + ShardConnection dbcon( shard , ns ); + if ( dbcon.setVersion() ) { + dbcon.done(); + throw StaleConfigException( ns , "for insert" ); } - - log() << " setShardVersion failed: " << result << endl; - massert( 10429 , (string)"setShardVersion failed! " + result.jsonString() , 0 ); - return true; + dbcon->insert( ns , obj ); + dbcon.done(); } - - } diff --git a/s/strategy.h b/s/strategy.h index 2aa4434..10a5a3f 100644 --- a/s/strategy.h +++ b/s/strategy.h @@ -23,28 +23,25 @@ #include "request.h" namespace mongo { - + class Strategy { public: - Strategy(){} + Strategy() {} virtual ~Strategy() {} virtual void queryOp( Request& r ) = 0; virtual void getMore( Request& r ) = 0; virtual void writeOp( int op , Request& r ) = 0; - + protected: void doWrite( int op , Request& r , const Shard& shard , bool checkVersion = true ); void doQuery( Request& r , const Shard& shard ); - + void insert( const Shard& shard , const char * ns , const BSONObj& obj ); - + }; extern Strategy * SINGLE; extern Strategy * SHARDED; - bool setShardVersion( DBClientBase & conn , const string& ns , ShardChunkVersion version , bool authoritative , BSONObj& result ); - - void waitForWriteback( const OID& oid ); } diff --git a/s/strategy_shard.cpp b/s/strategy_shard.cpp index 144bf79..2eca0c6 100644 --- a/s/strategy_shard.cpp +++ b/s/strategy_shard.cpp @@ -21,6 +21,7 @@ #include "chunk.h" #include "cursors.h" #include "stats.h" +#include "client.h" #include "../client/connpool.h" #include "../db/commands.h" @@ -28,45 +29,45 @@ // error codes 8010-8040 namespace mongo { - + class ShardStrategy : public Strategy { - virtual void queryOp( Request& r ){ + virtual void queryOp( Request& r ) { QueryMessage q( r.d() ); log(3) << "shard query: " << q.ns << " " << q.query << endl; - + if ( q.ntoreturn == 1 && strstr(q.ns, ".$cmd") ) throw UserException( 8010 , "something is wrong, shouldn't see a command here" ); ChunkManagerPtr info = r.getChunkManager(); assert( info ); - + Query query( q.query ); set shards; info->getShardsForQuery( shards , query.getFilter() ); - + set servers; - for ( set::iterator i = shards.begin(); i != shards.end(); i++ ){ - servers.insert( ServerAndQuery( i->getConnString() , BSONObj() ) ); + for ( set::iterator i = shards.begin(); i != shards.end(); i++ ) { + servers.insert( ServerAndQuery( i->getConnString() , BSONObj() ) ); } - - if ( logLevel > 4 ){ + + if ( logLevel > 4 ) { StringBuilder ss; ss << " shard query servers: " << servers.size() << '\n'; - for ( set::iterator i = servers.begin(); i!=servers.end(); i++ ){ + for ( set::iterator i = servers.begin(); i!=servers.end(); i++ ) { const ServerAndQuery& s = *i; ss << " " << s.toString() << '\n'; } - log() << ss.str(); + log() << ss.str() << endl; } ClusteredCursor * cursor = 0; - + BSONObj sort = query.getSort(); - - if ( sort.isEmpty() ){ + + if ( sort.isEmpty() ) { cursor = new 
SerialServerClusteredCursor( servers , q ); } else { @@ -80,85 +81,90 @@ namespace mongo { log(5) << " cursor type: " << cursor->type() << endl; shardedCursorTypes.hit( cursor->type() ); - - if ( query.isExplain() ){ + + if ( query.isExplain() ) { BSONObj explain = cursor->explain(); replyToQuery( 0 , r.p() , r.m() , explain ); delete( cursor ); return; } - } catch(...) { + } + catch(...) { delete cursor; throw; } ShardedClientCursorPtr cc (new ShardedClientCursor( q , cursor )); - if ( ! cc->sendNextBatch( r ) ){ + if ( ! cc->sendNextBatch( r ) ) { return; } log(6) << "storing cursor : " << cc->getId() << endl; cursorCache.store( cc ); } - - virtual void getMore( Request& r ){ + + virtual void getMore( Request& r ) { int ntoreturn = r.d().pullInt(); long long id = r.d().pullInt64(); log(6) << "want cursor : " << id << endl; ShardedClientCursorPtr cursor = cursorCache.get( id ); - if ( ! cursor ){ + if ( ! cursor ) { log(6) << "\t invalid cursor :(" << endl; replyToQuery( ResultFlag_CursorNotFound , r.p() , r.m() , 0 , 0 , 0 ); return; } - - if ( cursor->sendNextBatch( r , ntoreturn ) ){ + + if ( cursor->sendNextBatch( r , ntoreturn ) ) { // still more data cursor->accessed(); return; } - + // we've exhausted the cursor cursorCache.remove( id ); } - - void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ){ - - while ( d.moreJSObjs() ){ + + void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) { + + while ( d.moreJSObjs() ) { BSONObj o = d.nextJsObj(); - if ( ! manager->hasShardKey( o ) ){ + if ( ! manager->hasShardKey( o ) ) { bool bad = true; - if ( manager->getShardKey().partOfShardKey( "_id" ) ){ + if ( manager->getShardKey().partOfShardKey( "_id" ) ) { BSONObjBuilder b; b.appendOID( "_id" , 0 , true ); b.appendElements( o ); o = b.obj(); bad = ! manager->hasShardKey( o ); } - - if ( bad ){ + + if ( bad ) { log() << "tried to insert object without shard key: " << r.getns() << " " << o << endl; throw UserException( 8011 , "tried to insert object without shard key" ); } - + } - + + // Many operations benefit from having the shard key early in the object + o = manager->getShardKey().moveToFront(o); + bool gotThrough = false; - for ( int i=0; i<10; i++ ){ + for ( int i=0; i<10; i++ ) { try { ChunkPtr c = manager->findChunk( o ); log(4) << " server:" << c->getShard().toString() << " " << o << endl; insert( c->getShard() , r.getns() , o ); - + r.gotInsert(); - c->splitIfShould( o.objsize() ); + if ( r.getClientInfo()->autoSplitOk() ) + c->splitIfShould( o.objsize() ); gotThrough = true; break; } - catch ( StaleConfigException& ){ + catch ( StaleConfigException& ) { log(1) << "retrying insert because of StaleConfigException: " << o << endl; r.reset(); manager = r.getChunkManager(); @@ -168,34 +174,38 @@ namespace mongo { assert( gotThrough ); - } + } } - void _update( Request& r , DbMessage& d, ChunkManagerPtr manager ){ + void _update( Request& r , DbMessage& d, ChunkManagerPtr manager ) { int flags = d.pullInt(); - + BSONObj query = d.nextJsObj(); + uassert( 13506 , "$atomic not supported sharded" , query["$atomic"].eoo() ); uassert( 10201 , "invalid update" , d.moreJSObjs() ); BSONObj toupdate = d.nextJsObj(); BSONObj chunkFinder = query; - + bool upsert = flags & UpdateOption_Upsert; bool multi = flags & UpdateOption_Multi; - uassert( 10202 , "can't mix multi and upsert and sharding" , ! 
( upsert && multi ) ); + if (upsert) { + uassert(8012, "can't upsert something without shard key", + (manager->hasShardKey(toupdate) || + (toupdate.firstElement().fieldName()[0] == '$' && manager->hasShardKey(query)))); - if ( upsert && !(manager->hasShardKey(toupdate) || - (toupdate.firstElement().fieldName()[0] == '$' && manager->hasShardKey(query)))) - { - throw UserException( 8012 , "can't upsert something without shard key" ); + BSONObj key = manager->getShardKey().extractKey(query); + BSONForEach(e, key) { + uassert(13465, "shard key in upsert query must be an exact match", getGtLtOp(e) == BSONObj::Equality); + } } bool save = false; - if ( ! manager->hasShardKey( query ) ){ - if ( multi ){ + if ( ! manager->hasShardKey( query ) ) { + if ( multi ) { } - else if ( strcmp( query.firstElement().fieldName() , "_id" ) || query.nFields() != 1 ){ + else if ( strcmp( query.firstElement().fieldName() , "_id" ) || query.nFields() != 1 ) { throw UserException( 8013 , "can't do non-multi update with query that doesn't have the shard key" ); } else { @@ -204,50 +214,59 @@ namespace mongo { } } - - if ( ! save ){ - if ( toupdate.firstElement().fieldName()[0] == '$' ){ + + if ( ! save ) { + if ( toupdate.firstElement().fieldName()[0] == '$' ) { BSONObjIterator ops(toupdate); - while(ops.more()){ + while(ops.more()) { BSONElement op(ops.next()); if (op.type() != Object) continue; BSONObjIterator fields(op.embeddedObject()); - while(fields.more()){ + while(fields.more()) { const string field = fields.next().fieldName(); - uassert(13123, "Can't modify shard key's value", ! manager->getShardKey().partOfShardKey(field)); + uassert(13123, + str::stream() << "Can't modify shard key's value field" << field + << " for collection: " << manager->getns(), + ! manager->getShardKey().partOfShardKey(field)); } } - } else if ( manager->hasShardKey( toupdate ) ){ - uassert( 8014, "change would move shards!", manager->getShardKey().compare( query , toupdate ) == 0 ); - } else { - uasserted(12376, "shard key must be in update object"); + } + else if ( manager->hasShardKey( toupdate ) ) { + uassert( 8014, + str::stream() << "cannot modify shard key for collection: " << manager->getns(), + manager->getShardKey().compare( query , toupdate ) == 0 ); + } + else { + uasserted(12376, + str::stream() << "shard key must be in update object for collection: " << manager->getns() ); } } - - if ( multi ){ + + if ( multi ) { set shards; manager->getShardsForQuery( shards , chunkFinder ); int * x = (int*)(r.d().afterNS()); x[0] |= UpdateOption_Broadcast; - for ( set::iterator i=shards.begin(); i!=shards.end(); i++){ + for ( set::iterator i=shards.begin(); i!=shards.end(); i++) { doWrite( dbUpdate , r , *i , false ); } } else { int left = 5; - while ( true ){ + while ( true ) { try { ChunkPtr c = manager->findChunk( chunkFinder ); doWrite( dbUpdate , r , c->getShard() ); - c->splitIfShould( d.msg().header()->dataLen() ); + if ( r.getClientInfo()->autoSplitOk() ) + c->splitIfShould( d.msg().header()->dataLen() ); break; } - catch ( StaleConfigException& e ){ + catch ( StaleConfigException& e ) { if ( left <= 0 ) throw e; left--; - log() << "update failed b/c of StaleConfigException, retrying " + log() << "update failed b/c of StaleConfigException, retrying " << " left:" << left << " ns: " << r.getns() << " query: " << query << endl; r.reset( false ); manager = r.getChunkManager(); @@ -256,74 +275,75 @@ namespace mongo { } } - - void _delete( Request& r , DbMessage& d, ChunkManagerPtr manager ){ + + void _delete( Request& r , 
DbMessage& d, ChunkManagerPtr manager ) { int flags = d.pullInt(); bool justOne = flags & 1; - + uassert( 10203 , "bad delete message" , d.moreJSObjs() ); BSONObj pattern = d.nextJsObj(); + uassert( 13505 , "$atomic not supported sharded" , pattern["$atomic"].eoo() ); set shards; int left = 5; - - while ( true ){ + + while ( true ) { try { manager->getShardsForQuery( shards , pattern ); log(2) << "delete : " << pattern << " \t " << shards.size() << " justOne: " << justOne << endl; - if ( shards.size() == 1 ){ + if ( shards.size() == 1 ) { doWrite( dbDelete , r , *shards.begin() ); return; } break; } - catch ( StaleConfigException& e ){ + catch ( StaleConfigException& e ) { if ( left <= 0 ) throw e; left--; - log() << "delete failed b/c of StaleConfigException, retrying " + log() << "delete failed b/c of StaleConfigException, retrying " << " left:" << left << " ns: " << r.getns() << " patt: " << pattern << endl; r.reset( false ); shards.clear(); manager = r.getChunkManager(); } } - + if ( justOne && ! pattern.hasField( "_id" ) ) throw UserException( 8015 , "can only delete with a non-shard key pattern if can delete as many as we find" ); - - for ( set::iterator i=shards.begin(); i!=shards.end(); i++){ + + for ( set::iterator i=shards.begin(); i!=shards.end(); i++) { int * x = (int*)(r.d().afterNS()); x[0] |= RemoveOption_Broadcast; doWrite( dbDelete , r , *i , false ); } } - - virtual void writeOp( int op , Request& r ){ + + virtual void writeOp( int op , Request& r ) { const char *ns = r.getns(); log(3) << "write: " << ns << endl; - + DbMessage& d = r.d(); ChunkManagerPtr info = r.getChunkManager(); assert( info ); - - if ( op == dbInsert ){ + + if ( op == dbInsert ) { _insert( r , d , info ); } - else if ( op == dbUpdate ){ - _update( r , d , info ); + else if ( op == dbUpdate ) { + _update( r , d , info ); } - else if ( op == dbDelete ){ + else if ( op == dbDelete ) { _delete( r , d , info ); } else { log() << "sharding can't do write op: " << op << endl; throw UserException( 8016 , "can't do this write op on sharded collection" ); } - + } }; - + Strategy * SHARDED = new ShardStrategy(); } diff --git a/s/strategy_single.cpp b/s/strategy_single.cpp index b840c9b..b3b5502 100644 --- a/s/strategy_single.cpp +++ b/s/strategy_single.cpp @@ -18,117 +18,102 @@ #include "pch.h" #include "request.h" +#include "cursors.h" #include "../client/connpool.h" #include "../db/commands.h" namespace mongo { class SingleStrategy : public Strategy { - + public: - SingleStrategy(){ + SingleStrategy() { _commandsSafeToPass.insert( "$eval" ); _commandsSafeToPass.insert( "create" ); } private: - virtual void queryOp( Request& r ){ + virtual void queryOp( Request& r ) { QueryMessage q( r.d() ); - - bool lateAssert = false; - + log(3) << "single query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << endl; - - try { - if ( r.isCommand() ){ - - if ( handleSpecialNamespaces( r , q ) ) - return; - - int loops = 5; - while ( true ){ - BSONObjBuilder builder; - try { - bool ok = Command::runAgainstRegistered(q.ns, q.query, builder); - if ( ok ) { - BSONObj x = builder.done(); - replyToQuery(0, r.p(), r.m(), x); - return; - } - break; - } - catch ( StaleConfigException& e ){ - if ( loops <= 0 ) - throw e; - - loops--; - log() << "retrying command: " << q.query << endl; - ShardConnection::checkMyConnectionVersions( e.getns() ); - } - catch ( AssertionException& e ){ - e.getInfo().append( builder , "assertion" , "assertionCode" ); - builder.append( "errmsg" , "db assertion failure" ); - builder.append( "ok" 
, 0 ); + + if ( r.isCommand() ) { + + if ( handleSpecialNamespaces( r , q ) ) + return; + + int loops = 5; + while ( true ) { + BSONObjBuilder builder; + try { + bool ok = Command::runAgainstRegistered(q.ns, q.query, builder); + if ( ok ) { BSONObj x = builder.done(); replyToQuery(0, r.p(), r.m(), x); return; } + break; } - - string commandName = q.query.firstElement().fieldName(); + catch ( StaleConfigException& e ) { + if ( loops <= 0 ) + throw e; - uassert(13390, "unrecognized command: " + commandName, _commandsSafeToPass.count(commandName) != 0); - } - - lateAssert = true; - doQuery( r , r.primaryShard() ); - } - catch ( AssertionException& e ) { - if ( lateAssert ){ - log() << "lateAssert: " << e.getInfo() << endl; - assert( !lateAssert ); + loops--; + log() << "retrying command: " << q.query << endl; + ShardConnection::checkMyConnectionVersions( e.getns() ); + } + catch ( AssertionException& e ) { + e.getInfo().append( builder , "assertion" , "assertionCode" ); + builder.append( "errmsg" , "db assertion failure" ); + builder.append( "ok" , 0 ); + BSONObj x = builder.done(); + replyToQuery(0, r.p(), r.m(), x); + return; + } } - BSONObjBuilder err; - e.getInfo().append( err ); - BSONObj errObj = err.done(); - replyToQuery(ResultFlag_ErrSet, r.p() , r.m() , errObj); - return; + string commandName = q.query.firstElement().fieldName(); + + uassert(13390, "unrecognized command: " + commandName, _commandsSafeToPass.count(commandName) != 0); } + doQuery( r , r.primaryShard() ); } - - virtual void getMore( Request& r ){ + + virtual void getMore( Request& r ) { const char *ns = r.getns(); - - log(3) << "single getmore: " << ns << endl; - ShardConnection conn( r.primaryShard() , ns ); + LOG(3) << "single getmore: " << ns << endl; + + long long id = r.d().getInt64( 4 ); + + ShardConnection conn( cursorCache.getRef( id ) , ns ); Message response; bool ok = conn->callRead( r.m() , response); uassert( 10204 , "dbgrid: getmore: error calling db", ok); - r.reply( response , conn->getServerAddress() ); - + r.reply( response , "" /*conn->getServerAddress() */ ); + conn.done(); } - - void handleIndexWrite( int op , Request& r ){ - + + void handleIndexWrite( int op , Request& r ) { + DbMessage& d = r.d(); - if ( op == dbInsert ){ - while( d.moreJSObjs() ){ + if ( op == dbInsert ) { + while( d.moreJSObjs() ) { BSONObj o = d.nextJsObj(); const char * ns = o["ns"].valuestr(); - if ( r.getConfig()->isSharded( ns ) ){ + if ( r.getConfig()->isSharded( ns ) ) { BSONObj newIndexKey = o["key"].embeddedObjectUserCheck(); - - uassert( 10205 , (string)"can't use unique indexes with sharding ns:" + ns + - " key: " + o["key"].embeddedObjectUserCheck().toString() , + + uassert( 10205 , (string)"can't use unique indexes with sharding ns:" + ns + + " key: " + o["key"].embeddedObjectUserCheck().toString() , IndexDetails::isIdIndexPattern( newIndexKey ) || - ! o["unique"].trueValue() || + ! 
o["unique"].trueValue() || r.getConfig()->getChunkManager( ns )->getShardKey().isPrefixOf( newIndexKey ) ); ChunkManagerPtr cm = r.getConfig()->getChunkManager( ns ); @@ -145,10 +130,10 @@ namespace mongo { r.gotInsert(); } } - else if ( op == dbUpdate ){ + else if ( op == dbUpdate ) { throw UserException( 8050 , "can't update system.indexes" ); } - else if ( op == dbDelete ){ + else if ( op == dbDelete ) { // TODO throw UserException( 8051 , "can't delete indexes on sharded collection yet" ); } @@ -156,26 +141,26 @@ namespace mongo { log() << "handleIndexWrite invalid write op: " << op << endl; throw UserException( 8052 , "handleIndexWrite invalid write op" ); } - + } - virtual void writeOp( int op , Request& r ){ + virtual void writeOp( int op , Request& r ) { const char *ns = r.getns(); - - if ( r.isShardingEnabled() && - strstr( ns , ".system.indexes" ) == strchr( ns , '.' ) && - strchr( ns , '.' ) ) { + + if ( r.isShardingEnabled() && + strstr( ns , ".system.indexes" ) == strchr( ns , '.' ) && + strchr( ns , '.' ) ) { log(1) << " .system.indexes write for: " << ns << endl; handleIndexWrite( op , r ); return; } - + log(3) << "single write: " << ns << endl; doWrite( op , r , r.primaryShard() ); r.gotInsert(); // Won't handle mulit-insert correctly. Not worth parsing the request. } - bool handleSpecialNamespaces( Request& r , QueryMessage& q ){ + bool handleSpecialNamespaces( Request& r , QueryMessage& q ) { const char * ns = r.getns(); ns = strstr( r.getns() , ".$cmd.sys." ); if ( ! ns ) @@ -184,29 +169,32 @@ namespace mongo { BSONObjBuilder b; vector shards; - - if ( strcmp( ns , "inprog" ) == 0 ){ + + if ( strcmp( ns , "inprog" ) == 0 ) { Shard::getAllShards( shards ); - + BSONArrayBuilder arr( b.subarrayStart( "inprog" ) ); - for ( unsigned i=0; ifindOne( r.getns() , BSONObj() ); - if ( temp["inprog"].isABSONObj() ){ + if ( temp["inprog"].isABSONObj() ) { BSONObjIterator i( temp["inprog"].Obj() ); - while ( i.more() ){ + while ( i.more() ) { BSONObjBuilder x; - + BSONObjIterator j( i.next().Obj() ); - while( j.more() ){ + while( j.more() ) { BSONElement e = j.next(); - if ( strcmp( e.fieldName() , "opid" ) == 0 ){ + if ( str::equals( e.fieldName() , "opid" ) ) { stringstream ss; ss << shard.getName() << ':' << e.numberInt(); x.append( "opid" , ss.str() ); } + else if ( str::equals( e.fieldName() , "client" ) ) { + x.appendAs( e , "client_s" ); + } else { x.append( e ); } @@ -216,15 +204,15 @@ namespace mongo { } conn.done(); } - + arr.done(); } - else if ( strcmp( ns , "killop" ) == 0 ){ + else if ( strcmp( ns , "killop" ) == 0 ) { BSONElement e = q.query["op"]; - if ( strstr( r.getns() , "admin." ) != 0 ){ + if ( strstr( r.getns() , "admin." 
) != 0 ) { b.append( "err" , "unauthorized" ); } - else if ( e.type() != String ){ + else if ( e.type() != String ) { b.append( "err" , "bad op" ); b.append( e ); } @@ -232,7 +220,7 @@ namespace mongo { b.append( e ); string s = e.String(); string::size_type i = s.find( ':' ); - if ( i == string::npos ){ + if ( i == string::npos ) { b.append( "err" , "bad opid" ); } else { @@ -243,14 +231,14 @@ namespace mongo { log() << "want to kill op: " << e << endl; Shard s(shard); - + ScopedDbConnection conn( s ); conn->findOne( r.getns() , BSON( "op" << opid ) ); conn.done(); } } } - else if ( strcmp( ns , "unlock" ) == 0 ){ + else if ( strcmp( ns , "unlock" ) == 0 ) { b.append( "err" , "can't do unlock through mongos" ); } else { @@ -265,6 +253,6 @@ namespace mongo { set _commandsSafeToPass; }; - + Strategy * SINGLE = new SingleStrategy(); } diff --git a/s/util.h b/s/util.h index 7695eda..b3f63d8 100644 --- a/s/util.h +++ b/s/util.h @@ -36,29 +36,30 @@ namespace mongo { }; unsigned long long _combined; }; - + ShardChunkVersion( int major=0, int minor=0 ) - : _minor(minor),_major(major){ + : _minor(minor),_major(major) { } - + ShardChunkVersion( unsigned long long ll ) - : _combined( ll ){ + : _combined( ll ) { } - - ShardChunkVersion( const BSONElement& e ){ - if ( e.type() == Date || e.type() == Timestamp ){ + + ShardChunkVersion( const BSONElement& e ) { + if ( e.type() == Date || e.type() == Timestamp ) { _combined = e._numberLong(); } - else if ( e.eoo() ){ + else if ( e.eoo() ) { _combined = 0; } else { + _combined = 0; log() << "ShardChunkVersion can't handle type (" << (int)(e.type()) << ") " << e << endl; assert(0); } } - void inc( bool major ){ + void inc( bool major ) { if ( major ) incMajor(); else @@ -69,7 +70,7 @@ namespace mongo { _major++; _minor = 0; } - + void incMinor() { _minor++; } @@ -82,19 +83,19 @@ namespace mongo { return _combined > 0; } - string toString() const { - stringstream ss; - ss << _major << "|" << _minor; - return ss.str(); + string toString() const { + stringstream ss; + ss << _major << "|" << _minor; + return ss.str(); } int majorVersion() const { return _major; } int minorVersion() const { return _minor; } - + operator unsigned long long() const { return _combined; } - - ShardChunkVersion& operator=( const BSONElement& elem ){ - switch ( elem.type() ){ + + ShardChunkVersion& operator=( const BSONElement& elem ) { + switch ( elem.type() ) { case Timestamp: case NumberLong: case Date: @@ -109,39 +110,39 @@ namespace mongo { return *this; } }; - - inline ostream& operator<<( ostream &s , const ShardChunkVersion& v){ + + inline ostream& operator<<( ostream &s , const ShardChunkVersion& v) { s << v._major << "|" << v._minor; return s; } - /** - * your config info for a given shard/chunk is out of date + /** + * your config info for a given shard/chunk is out of date */ class StaleConfigException : public AssertionException { public: StaleConfigException( const string& ns , const string& raw , bool justConnection = false ) - : AssertionException( (string)"ns: " + ns + " " + raw , 9996 ) , + : AssertionException( (string)"ns: " + ns + " " + raw , 9996 ) , _justConnection(justConnection) , - _ns(ns){ + _ns(ns) { } - - virtual ~StaleConfigException() throw(){} - + + virtual ~StaleConfigException() throw() {} + virtual void appendPrefix( stringstream& ss ) const { ss << "StaleConfigException: "; } - + bool justConnection() const { return _justConnection; } - + string getns() const { return _ns; } - static bool parse( const string& big , string& ns , string& raw ){ + 
static bool parse( const string& big , string& ns , string& raw ) { string::size_type start = big.find( '[' ); if ( start == string::npos ) return false; string::size_type end = big.find( ']' ,start ); if ( end == string::npos ) return false; - + ns = big.substr( start + 1 , ( end - start ) - 1 ); raw = big.substr( end + 1 ); return true; @@ -151,6 +152,7 @@ namespace mongo { string _ns; }; - bool checkShardVersion( DBClientBase & conn , const string& ns , bool authoritative = false , int tryNumber = 1 ); - void resetShardVersion( DBClientBase * conn ); + extern boost::function4 checkShardVersionCB; + extern boost::function1 resetShardVersionCB; + } diff --git a/s/writeback_listener.cpp b/s/writeback_listener.cpp new file mode 100644 index 0000000..21d59d0 --- /dev/null +++ b/s/writeback_listener.cpp @@ -0,0 +1,254 @@ +// @file writeback_listener.cpp + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#include "pch.h" + +#include "../util/timer.h" + +#include "config.h" +#include "grid.h" +#include "request.h" +#include "server.h" +#include "shard.h" +#include "util.h" +#include "client.h" + +#include "writeback_listener.h" + +namespace mongo { + + map WriteBackListener::_cache; + set WriteBackListener::_seenSets; + mongo::mutex WriteBackListener::_cacheLock("WriteBackListener"); + + map WriteBackListener::_seenWritebacks; + mongo::mutex WriteBackListener::_seenWritebacksLock("WriteBackListener::seen"); + + WriteBackListener::WriteBackListener( const string& addr ) : _addr( addr ) { + log() << "creating WriteBackListener for: " << addr << endl; + } + + /* static */ + void WriteBackListener::init( DBClientBase& conn ) { + + if ( conn.type() == ConnectionString::SYNC ) { + // don't want write back listeners for config servers + return; + } + + if ( conn.type() != ConnectionString::SET ) { + init( conn.getServerAddress() ); + return; + } + + + { + scoped_lock lk( _cacheLock ); + if ( _seenSets.count( conn.getServerAddress() ) ) + return; + } + + // we want to do writebacks on all rs nodes + string errmsg; + ConnectionString cs = ConnectionString::parse( conn.getServerAddress() , errmsg ); + uassert( 13641 , str::stream() << "can't parse host [" << conn.getServerAddress() << "]" , cs.isValid() ); + + vector hosts = cs.getServers(); + + for ( unsigned i=0; igo(); + } + + /* static */ + BSONObj WriteBackListener::waitFor( ConnectionId connectionId, const OID& oid ) { + Timer t; + for ( int i=0; i<5000; i++ ) { + { + scoped_lock lk( _seenWritebacksLock ); + WBStatus s = _seenWritebacks[connectionId]; + if ( oid < s.id ) { + // this means we're waiting for a GLE that already passed. 
+ // it should be impossible becauseonce we call GLE, no other + // writebacks should happen with that connection id + msgasserted( 13633 , str::stream() << "got writeback waitfor for older id " << + " oid: " << oid << " s.id: " << s.id << " connectionId: " << connectionId ); + } + else if ( oid == s.id ) { + return s.gle; + } + + } + sleepmillis( 10 ); + } + uasserted( 13403 , str::stream() << "didn't get writeback for: " << oid << " after: " << t.millis() << " ms" ); + throw 1; // never gets here + } + + void WriteBackListener::run() { + int secsToSleep = 0; + while ( ! inShutdown() ) { + + if ( ! Shard::isAShardNode( _addr ) ) { + log(1) << _addr << " is not a shard node" << endl; + sleepsecs( 60 ); + continue; + } + + try { + ScopedDbConnection conn( _addr ); + + BSONObj result; + + { + BSONObjBuilder cmd; + cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data + if ( ! conn->runCommand( "admin" , cmd.obj() , result ) ) { + log() << "writebacklisten command failed! " << result << endl; + conn.done(); + continue; + } + + } + + log(1) << "writebacklisten result: " << result << endl; + + BSONObj data = result.getObjectField( "data" ); + if ( data.getBoolField( "writeBack" ) ) { + string ns = data["ns"].valuestrsafe(); + + ConnectionId cid = 0; + OID wid; + if ( data["connectionId"].isNumber() && data["id"].type() == jstOID ) { + cid = data["connectionId"].numberLong(); + wid = data["id"].OID(); + } + else { + warning() << "mongos/mongod version mismatch (1.7.5 is the split)" << endl; + } + + int len; // not used, but needed for next call + Message m( (void*)data["msg"].binData( len ) , false ); + massert( 10427 , "invalid writeback message" , m.header()->valid() ); + + DBConfigPtr db = grid.getDBConfig( ns ); + ShardChunkVersion needVersion( data["version"] ); + + log(1) << "connectionId: " << cid << " writebackId: " << wid << " needVersion : " << needVersion.toString() + << " mine : " << db->getChunkManager( ns )->getVersion().toString() << endl;// TODO change to log(3) + + if ( logLevel ) log(1) << debugString( m ) << endl; + + if ( needVersion.isSet() && needVersion <= db->getChunkManager( ns )->getVersion() ) { + // this means when the write went originally, the version was old + // if we're here, it means we've already updated the config, so don't need to do again + //db->getChunkManager( ns , true ); // SERVER-1349 + } + else { + // we received a writeback object that was sent to a previous version of a shard + // the actual shard may not have the object the writeback operation is for + // we need to reload the chunk manager and get the new shard versions + db->getChunkManager( ns , true ); + } + + // do request and then call getLastError + // we have to call getLastError so we can return the right fields to the user if they decide to call getLastError + + BSONObj gle; + try { + + Request r( m , 0 ); + r.init(); + + ClientInfo * ci = r.getClientInfo(); + ci->noAutoSplit(); + + r.process(); + + ci->newRequest(); // this so we flip prev and cur shards + + BSONObjBuilder b; + if ( ! 
ci->getLastError( BSON( "getLastError" << 1 ) , b , true ) ) { + b.appendBool( "commandFailed" , true ); + } + gle = b.obj(); + + ci->clearSinceLastGetError(); + } + catch ( DBException& e ) { + error() << "error processing writeback: " << e << endl; + BSONObjBuilder b; + b.append( "err" , e.toString() ); + e.getInfo().append( b ); + gle = b.obj(); + } + + { + scoped_lock lk( _seenWritebacksLock ); + WBStatus& s = _seenWritebacks[cid]; + s.id = wid; + s.gle = gle; + } + } + else if ( result["noop"].trueValue() ) { + // no-op + } + else { + log() << "unknown writeBack result: " << result << endl; + } + + conn.done(); + secsToSleep = 0; + continue; + } + catch ( std::exception e ) { + + if ( inShutdown() ) { + // we're shutting down, so just clean up + return; + } + + log() << "WriteBackListener exception : " << e.what() << endl; + + // It's possible this shard was removed + Shard::reloadShardInfo(); + } + catch ( ... ) { + log() << "WriteBackListener uncaught exception!" << endl; + } + secsToSleep++; + sleepsecs(secsToSleep); + if ( secsToSleep > 10 ) + secsToSleep = 0; + } + + log() << "WriteBackListener exiting : address no longer in cluster " << _addr; + + } + +} // namespace mongo diff --git a/s/writeback_listener.h b/s/writeback_listener.h new file mode 100644 index 0000000..7335999 --- /dev/null +++ b/s/writeback_listener.h @@ -0,0 +1,67 @@ +// @file writeback_listener.h + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see . +*/ + +#pragma once + +#include "../pch.h" + +#include "../client/connpool.h" +#include "../util/background.h" +#include "../db/client.h" + +namespace mongo { + + /* + * The writeback listener takes back write attempts that were made against a wrong shard. + * (Wrong here in the sense that the target chunk moved before this mongos had a chance to + * learn so.) It is responsible for reapplying these writes to the correct shard. + * + * Currently, there is one listener per shard. 
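[Editorial aside, not part of the upstream patch] The doc comment above summarizes the listener's contract; the calling side is easiest to see as a small sketch. The snippet below is hypothetical glue code, assuming only the init()/waitFor() signatures declared in this header and the per-connection WBStatus bookkeeping shown in writeback_listener.cpp earlier in this diff; the connection object, connection id, and writeback id names are illustrative.

    // Hypothetical caller-side sketch (editor's illustration, not upstream code).
    // A mongos request handler could ensure a listener exists for the shard it
    // just wrote to, then block until the replayed write's getLastError result
    // is available for the client's connection:
    void exampleAwaitWriteback( DBClientBase& shardConn,
                                ConnectionId clientConnectionId,
                                const OID& writebackId ) {
        WriteBackListener::init( shardConn );   // no-op if a listener already exists for this address
        // waitFor() polls the listener's per-connection status map and either
        // returns the stored GLE document or asserts once its timeout elapses.
        BSONObj gle = WriteBackListener::waitFor( clientConnectionId, writebackId );
        log() << "writeback applied, gle: " << gle << endl;
    }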
+ */ + class WriteBackListener : public BackgroundJob { + public: + static void init( DBClientBase& conn ); + static void init( const string& host ); + + static BSONObj waitFor( ConnectionId connectionId, const OID& oid ); + + protected: + WriteBackListener( const string& addr ); + + string name() const { return "WriteBackListener"; } + void run(); + + private: + string _addr; + + static mongo::mutex _cacheLock; // protects _cache + static map _cache; // server to listener + static set _seenSets; // cache of set urls we've seen - note this is ever expanding for order, case, changes + + struct WBStatus { + OID id; + BSONObj gle; + }; + + static mongo::mutex _seenWritebacksLock; // protects _seenWritbacks + static map _seenWritebacks; // connectionId -> last write back GLE + }; + + void waitForWriteback( const OID& oid ); + +} // namespace mongo diff --git a/scripting/bench.cpp b/scripting/bench.cpp new file mode 100644 index 0000000..2723985 --- /dev/null +++ b/scripting/bench.cpp @@ -0,0 +1,173 @@ +/** @file bench.cpp */ + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "pch.h" +#include "engine.h" +#include "../util/md5.hpp" +#include "../util/version.h" +#include "../client/dbclient.h" +#include "../client/connpool.h" +// --------------------------------- +// ---- benchmarking system -------- +// --------------------------------- + + +namespace mongo { + + + /** + * benchQuery( "foo" , { _id : 1 } ) + */ + BSONObj benchQuery( const BSONObj& args ) { + return BSONObj(); + } + + struct BenchRunConfig { + BenchRunConfig() { + host = "localhost"; + db = "test"; + + parallel = 1; + seconds = 1; + + active = true; + threadsReady = 0; + error = false; + } + + string host; + string db; + + unsigned parallel; + int seconds; + + BSONObj ops; + + bool active; // true at starts, gets set to false when should stop + AtomicUInt threadsReady; + + bool error; + }; + + static void benchThread( BenchRunConfig * config ) { + ScopedDbConnection conn( config->host ); + config->threadsReady++; + + while ( config->active ) { + BSONObjIterator i( config->ops ); + while ( i.more() ) { + BSONElement e = i.next(); + string ns = e["ns"].String(); + string op = e["op"].String(); + + if ( op == "findOne" ) { + conn->findOne( ns , e["query"].Obj() ); + } + else { + log() << "don't understand op: " << op << endl; + config->error = true; + return; + } + + } + } + + conn.done(); + } + + /** + * benchRun( { ops : [] , host : XXX , db : XXXX , parallel : 5 , seconds : 5 } + */ + BSONObj benchRun( const BSONObj& argsFake ) { + assert( argsFake.firstElement().isABSONObj() ); + BSONObj args = argsFake.firstElement().Obj(); + + // setup + + BenchRunConfig config; + + if ( args["host"].type() == String ) + config.host = args["host"].String(); + if ( args["db"].type() == String ) + config.db = args["db"].String(); + + if ( args["parallel"].isNumber() ) + config.parallel = args["parallel"].numberInt(); + if ( args["seconds"].isNumber() ) + 
config.seconds = args["seconds"].numberInt(); + + + config.ops = args["ops"].Obj(); + + // execute + + ScopedDbConnection conn( config.host ); + + // start threads + vector all; + for ( unsigned i=0; isimpleCommand( "admin" , &before , "serverStatus" ); + + sleepsecs( config.seconds ); + + BSONObj after; + conn->simpleCommand( "admin" , &after , "serverStatus" ); + + conn.done(); + + config.active = false; + + for ( unsigned i=0; ijoin(); + + if ( config.error ) + return BSON( "err" << 1 ); + + // compute actual ops/sec + + before = before["opcounters"].Obj(); + after = after["opcounters"].Obj(); + + BSONObjBuilder buf; + buf.append( "note" , "values per second" ); + + { + BSONObjIterator i( after ); + while ( i.more() ) { + BSONElement e = i.next(); + double x = e.number(); + x = x - before[e.fieldName()].number(); + buf.append( e.fieldName() , x / config.seconds ); + } + } + BSONObj zoo = buf.obj(); + return BSON( "" << zoo ); + } + + void installBenchmarkSystem( Scope& scope ) { + scope.injectNative( "benchRun" , benchRun ); + } + +} diff --git a/scripting/engine.cpp b/scripting/engine.cpp index da108c6..60e56ae 100644 --- a/scripting/engine.cpp +++ b/scripting/engine.cpp @@ -23,27 +23,27 @@ namespace mongo { long long Scope::_lastVersion = 1; - + int Scope::_numScopes = 0; - Scope::Scope() : _localDBName("") , _loadedVersion(0){ + Scope::Scope() : _localDBName("") , _loadedVersion(0) { _numScopes++; } - Scope::~Scope(){ + Scope::~Scope() { _numScopes--; } ScriptEngine::ScriptEngine() : _scopeInitCallback() { } - ScriptEngine::~ScriptEngine(){ + ScriptEngine::~ScriptEngine() { } - void Scope::append( BSONObjBuilder & builder , const char * fieldName , const char * scopeName ){ + void Scope::append( BSONObjBuilder & builder , const char * fieldName , const char * scopeName ) { int t = type( scopeName ); - - switch ( t ){ + + switch ( t ) { case Object: builder.append( fieldName , getObject( scopeName ) ); break; @@ -74,7 +74,7 @@ namespace mongo { builder.appendDate( fieldName , Date_t((unsigned long long)getNumber( scopeName )) ); break; case Code: - builder.appendCode( fieldName , getString( scopeName ).c_str() ); + builder.appendCode( fieldName , getString( scopeName ) ); break; default: stringstream temp; @@ -82,20 +82,20 @@ namespace mongo { temp << t; uassert( 10206 , temp.str() , 0 ); } - + } - int Scope::invoke( const char* code , const BSONObj& args, int timeoutMs ){ + int Scope::invoke( const char* code , const BSONObj& args, int timeoutMs ) { ScriptingFunction func = createFunction( code ); uassert( 10207 , "compile failed" , func ); return invoke( func , args, timeoutMs ); } - - bool Scope::execFile( const string& filename , bool printResult , bool reportError , bool assertOnError, int timeoutMs ){ - + + bool Scope::execFile( const string& filename , bool printResult , bool reportError , bool assertOnError, int timeoutMs ) { + path p( filename ); - if ( ! exists( p ) ){ + if ( ! 
exists( p ) ) { log() << "file [" << filename << "] doesn't exist" << endl; if ( assertOnError ) assert( 0 ); @@ -103,10 +103,10 @@ namespace mongo { } // iterate directories and recurse using all *.js files in the directory - if ( is_directory( p ) ){ + if ( is_directory( p ) ) { directory_iterator end; bool empty = true; - for (directory_iterator it (p); it != end; it++){ + for (directory_iterator it (p); it != end; it++) { empty = false; path sub (*it); if (!endsWith(sub.string().c_str(), ".js")) @@ -115,7 +115,7 @@ namespace mongo { return false; } - if (empty){ + if (empty) { log() << "directory [" << filename << "] doesn't have any *.js files" << endl; if ( assertOnError ) assert( 0 ); @@ -124,83 +124,97 @@ namespace mongo { return true; } - + File f; f.open( filename.c_str() , true ); - fileofs L = f.len(); - assert( L <= 0x7ffffffe ); - char * data = (char*)malloc( (size_t) L+1 ); + unsigned L; + { + fileofs fo = f.len(); + assert( fo <= 0x7ffffffe ); + L = (unsigned) fo; + } + boost::scoped_array data (new char[L+1]); data[L] = 0; - f.read( 0 , data , (size_t) L ); - - return exec( data , filename , printResult , reportError , assertOnError, timeoutMs ); + f.read( 0 , data.get() , L ); + + int offset = 0; + if (data[0] == '#' && data[1] == '!') { + const char* newline = strchr(data.get(), '\n'); + if (! newline) + return true; // file of just shebang treated same as empty file + offset = newline - data.get(); + } + + StringData code (data.get() + offset, L - offset); + + return exec( code , filename , printResult , reportError , assertOnError, timeoutMs ); } - void Scope::storedFuncMod(){ + void Scope::storedFuncMod() { _lastVersion++; } - + void Scope::validateObjectIdString( const string &str ) { massert( 10448 , "invalid object id: length", str.size() == 24 ); - for ( string::size_type i=0; i= '0' && c <= '9' ) || - ( c >= 'a' && c <= 'f' ) || - ( c >= 'A' && c <= 'F' ) ){ + ( c >= 'a' && c <= 'f' ) || + ( c >= 'A' && c <= 'F' ) ) { continue; } massert( 10430 , "invalid object id: not hex", false ); - } + } } - void Scope::loadStored( bool ignoreNotConnected ){ - if ( _localDBName.size() == 0 ){ + void Scope::loadStored( bool ignoreNotConnected ) { + if ( _localDBName.size() == 0 ) { if ( ignoreNotConnected ) return; uassert( 10208 , "need to have locallyConnected already" , _localDBName.size() ); } if ( _loadedVersion == _lastVersion ) return; - + _loadedVersion = _lastVersion; string coll = _localDBName + ".system.js"; - + static DBClientBase * db = createDirectClient(); - auto_ptr c = db->query( coll , Query() ); + auto_ptr c = db->query( coll , Query(), 0, 0, NULL, QueryOption_SlaveOk, 0 ); assert( c.get() ); - + set thisTime; - - while ( c->more() ){ + + while ( c->more() ) { BSONObj o = c->next(); BSONElement n = o["_id"]; BSONElement v = o["value"]; - + uassert( 10209 , "name has to be a string" , n.type() == String ); uassert( 10210 , "value has to be set" , v.type() != EOO ); - + setElement( n.valuestr() , v ); thisTime.insert( n.valuestr() ); _storedNames.insert( n.valuestr() ); - + } // --- remove things from scope that were removed list toremove; - for ( set::iterator i=_storedNames.begin(); i!=_storedNames.end(); i++ ){ + for ( set::iterator i=_storedNames.begin(); i!=_storedNames.end(); i++ ) { string n = *i; if ( thisTime.count( n ) == 0 ) toremove.push_back( n ); } - - for ( list::iterator i=toremove.begin(); i!=toremove.end(); i++ ){ + + for ( list::iterator i=toremove.begin(); i!=toremove.end(); i++ ) { string n = *i; _storedNames.erase( n ); execSetup( 
(string)"delete " + n , "clean up scope" ); @@ -208,11 +222,11 @@ namespace mongo { } - ScriptingFunction Scope::createFunction( const char * code ){ - if ( code[0] == '/' && code [1] == '*' ){ + ScriptingFunction Scope::createFunction( const char * code ) { + if ( code[0] == '/' && code [1] == '*' ) { code += 2; - while ( code[0] && code[1] ){ - if ( code[0] == '*' && code[1] == '/' ){ + while ( code[0] && code[1] ) { + if ( code[0] == '*' && code[1] == '/' ) { code += 2; break; } @@ -226,7 +240,7 @@ namespace mongo { _cachedFunctions[code] = f; return f; } - + typedef map< string , list > PoolToScopes; class ScopeCache { @@ -235,21 +249,21 @@ namespace mongo { ScopeCache() : _mutex("ScopeCache") { _magic = 17; } - - ~ScopeCache(){ + + ~ScopeCache() { assert( _magic == 17 ); _magic = 1; if ( inShutdown() ) return; - + clear(); } - void done( const string& pool , Scope * s ){ + void done( const string& pool , Scope * s ) { scoped_lock lk( _mutex ); list & l = _pools[pool]; - if ( l.size() > 10 ){ + if ( l.size() > 10 ) { delete s; } else { @@ -257,31 +271,31 @@ namespace mongo { s->reset(); } } - - Scope * get( const string& pool ){ + + Scope * get( const string& pool ) { scoped_lock lk( _mutex ); list & l = _pools[pool]; if ( l.size() == 0 ) return 0; - + Scope * s = l.back(); l.pop_back(); s->reset(); return s; } - - void clear(){ + + void clear() { set seen; - - for ( PoolToScopes::iterator i=_pools.begin() ; i != _pools.end(); i++ ){ - for ( list::iterator j=i->second.begin(); j != i->second.end(); j++ ){ + + for ( PoolToScopes::iterator i=_pools.begin() ; i != _pools.end(); i++ ) { + for ( list::iterator j=i->second.begin(); j != i->second.end(); j++ ) { Scope * s = *j; assert( ! seen.count( s ) ); delete s; seen.insert( s ); } } - + _pools.clear(); } @@ -295,12 +309,12 @@ namespace mongo { class PooledScope : public Scope { public: - PooledScope( const string pool , Scope * real ) : _pool( pool ) , _real( real ){ + PooledScope( const string pool , Scope * real ) : _pool( pool ) , _real( real ) { _real->loadStored( true ); }; - virtual ~PooledScope(){ + virtual ~PooledScope() { ScopeCache * sc = scopeCache.get(); - if ( sc ){ + if ( sc ) { sc->done( _pool , _real ); _real = 0; } @@ -312,88 +326,92 @@ namespace mongo { _real = 0; } } - - void reset(){ + + void reset() { _real->reset(); } - void init( BSONObj * data ){ + void init( const BSONObj * data ) { _real->init( data ); } - - void localConnect( const char * dbName ){ + + void localConnect( const char * dbName ) { _real->localConnect( dbName ); } - void externalSetup(){ + void externalSetup() { _real->externalSetup(); } - - double getNumber( const char *field ){ + + double getNumber( const char *field ) { return _real->getNumber( field ); } - string getString( const char *field ){ + string getString( const char *field ) { return _real->getString( field ); } - bool getBoolean( const char *field ){ + bool getBoolean( const char *field ) { return _real->getBoolean( field ); } - BSONObj getObject( const char *field ){ + BSONObj getObject( const char *field ) { return _real->getObject( field ); } - int type( const char *field ){ + int type( const char *field ) { return _real->type( field ); } - void setElement( const char *field , const BSONElement& val ){ + void setElement( const char *field , const BSONElement& val ) { _real->setElement( field , val ); } - void setNumber( const char *field , double val ){ + void setNumber( const char *field , double val ) { _real->setNumber( field , val ); } - void setString( const char *field , 
const char * val ){ + void setString( const char *field , const char * val ) { _real->setString( field , val ); } - void setObject( const char *field , const BSONObj& obj , bool readOnly=true ){ + void setObject( const char *field , const BSONObj& obj , bool readOnly=true ) { _real->setObject( field , obj , readOnly ); } - void setBoolean( const char *field , bool val ){ + void setBoolean( const char *field , bool val ) { _real->setBoolean( field , val ); } - void setThis( const BSONObj * obj ){ + void setThis( const BSONObj * obj ) { _real->setThis( obj ); } - - ScriptingFunction createFunction( const char * code ){ + + ScriptingFunction createFunction( const char * code ) { return _real->createFunction( code ); } - ScriptingFunction _createFunction( const char * code ){ + ScriptingFunction _createFunction( const char * code ) { return _real->createFunction( code ); } + void rename( const char * from , const char * to ) { + _real->rename( from , to ); + } + /** * @return 0 on success */ - int invoke( ScriptingFunction func , const BSONObj& args, int timeoutMs , bool ignoreReturn ){ + int invoke( ScriptingFunction func , const BSONObj& args, int timeoutMs , bool ignoreReturn ) { return _real->invoke( func , args , timeoutMs , ignoreReturn ); } - string getError(){ + string getError() { return _real->getError(); } - - bool exec( const string& code , const string& name , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ){ + + bool exec( const StringData& code , const string& name , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ) { return _real->exec( code , name , printResult , reportError , assertOnError , timeoutMs ); } - bool execFile( const string& filename , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ){ + bool execFile( const string& filename , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ) { return _real->execFile( filename , printResult , reportError , assertOnError , timeoutMs ); } - - void injectNative( const char *field, NativeFunction func ){ + + void injectNative( const char *field, NativeFunction func ) { _real->injectNative( field , func ); } - - void gc(){ + + void gc() { _real->gc(); } @@ -402,40 +420,57 @@ namespace mongo { Scope * _real; }; - auto_ptr ScriptEngine::getPooledScope( const string& pool ){ - if ( ! scopeCache.get() ){ + auto_ptr ScriptEngine::getPooledScope( const string& pool ) { + if ( ! scopeCache.get() ) { scopeCache.reset( new ScopeCache() ); } Scope * s = scopeCache->get( pool ); - if ( ! s ){ + if ( ! s ) { s = newScope(); } - + auto_ptr p; p.reset( new PooledScope( pool , s ) ); return p; } - - void ScriptEngine::threadDone(){ + + void ScriptEngine::threadDone() { ScopeCache * sc = scopeCache.get(); - if ( sc ){ + if ( sc ) { sc->clear(); } } - + void ( *ScriptEngine::_connectCallback )( DBClientWithCommands & ) = 0; - - ScriptEngine * globalScriptEngine; + const char * ( *ScriptEngine::_checkInterruptCallback )() = 0; + unsigned ( *ScriptEngine::_getInterruptSpecCallback )() = 0; + + ScriptEngine * globalScriptEngine = 0; - bool hasJSReturn( const string& code ){ + bool hasJSReturn( const string& code ) { size_t x = code.find( "return" ); if ( x == string::npos ) return false; - return + return ( x == 0 || ! isalpha( code[x-1] ) ) && ! 
isalpha( code[x+6] ); } + + const char * jsSkipWhiteSpace( const char * raw ) { + while ( raw[0] ) { + while (isspace(*raw)) { + raw++; + } + + if ( raw[0] != '/' || raw[1] != '/' ) + break; + + while ( raw[0] && raw[0] != '\n' ) + raw++; + } + return raw; + } } - + diff --git a/scripting/engine.h b/scripting/engine.h index e097401..62afd77 100644 --- a/scripting/engine.h +++ b/scripting/engine.h @@ -20,10 +20,23 @@ #include "../pch.h" #include "../db/jsobj.h" -extern const char * jsconcatcode; // TODO: change name to mongoJSCode - namespace mongo { + struct JSFile { + const char* name; + const StringData& source; + }; + + namespace JSFiles { + extern const JSFile collection; + extern const JSFile db; + extern const JSFile mongo; + extern const JSFile mr; + extern const JSFile query; + extern const JSFile servers; + extern const JSFile utils; + } + typedef unsigned long long ScriptingFunction; typedef BSONObj (*NativeFunction) ( const BSONObj &args ); @@ -31,20 +44,35 @@ namespace mongo { public: Scope(); virtual ~Scope(); - + virtual void reset() = 0; - virtual void init( BSONObj * data ) = 0; - void init( const char * data ){ + virtual void init( const BSONObj * data ) = 0; + void init( const char * data ) { BSONObj o( data , 0 ); init( &o ); } - + virtual void localConnect( const char * dbName ) = 0; virtual void externalSetup() = 0; - + + class NoDBAccess { + Scope * _s; + public: + NoDBAccess( Scope * s ) { + _s = s; + } + ~NoDBAccess() { + _s->rename( "____db____" , "db" ); + } + }; + NoDBAccess disableDBAccess( const char * why ) { + rename( "db" , "____db____" ); + return NoDBAccess( this ); + } + virtual double getNumber( const char *field ) = 0; - virtual int getNumberInt( const char *field ){ return (int)getNumber( field ); } - virtual long long getNumberLongLong( const char *field ){ return (long long)getNumber( field ); } + virtual int getNumberInt( const char *field ) { return (int)getNumber( field ); } + virtual long long getNumberLongLong( const char *field ) { return (long long)getNumber( field ); } virtual string getString( const char *field ) = 0; virtual bool getBoolean( const char *field ) = 0; virtual BSONObj getObject( const char *field ) = 0; @@ -59,52 +87,68 @@ namespace mongo { virtual void setObject( const char *field , const BSONObj& obj , bool readOnly=true ) = 0; virtual void setBoolean( const char *field , bool val ) = 0; virtual void setThis( const BSONObj * obj ) = 0; - + virtual ScriptingFunction createFunction( const char * code ); - + + virtual void rename( const char * from , const char * to ) = 0; /** * @return 0 on success */ virtual int invoke( ScriptingFunction func , const BSONObj& args, int timeoutMs = 0 , bool ignoreReturn = false ) = 0; - void invokeSafe( ScriptingFunction func , const BSONObj& args, int timeoutMs = 0 ){ + void invokeSafe( ScriptingFunction func , const BSONObj& args, int timeoutMs = 0 ) { int res = invoke( func , args , timeoutMs ); if ( res == 0 ) return; throw UserException( 9004 , (string)"invoke failed: " + getError() ); } virtual string getError() = 0; - + int invoke( const char* code , const BSONObj& args, int timeoutMs = 0 ); - void invokeSafe( const char* code , const BSONObj& args, int timeoutMs = 0 ){ + void invokeSafe( const char* code , const BSONObj& args, int timeoutMs = 0 ) { if ( invoke( code , args , timeoutMs ) == 0 ) return; throw UserException( 9005 , (string)"invoke failed: " + getError() ); } - virtual bool exec( const string& code , const string& name , bool printResult , bool reportError , bool 
assertOnError, int timeoutMs = 0 ) = 0; - virtual void execSetup( const string& code , const string& name = "setup" ){ + virtual bool exec( const StringData& code , const string& name , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ) = 0; + virtual void execSetup( const StringData& code , const string& name = "setup" ) { exec( code , name , false , true , true , 0 ); } + + void execSetup( const JSFile& file) { + execSetup(file.source, file.name); + } + + void execCoreFiles() { + // keeping same order as in SConstruct + execSetup(JSFiles::utils); + execSetup(JSFiles::db); + execSetup(JSFiles::mongo); + execSetup(JSFiles::mr); + execSetup(JSFiles::query); + execSetup(JSFiles::collection); + } + virtual bool execFile( const string& filename , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ); - + virtual void injectNative( const char *field, NativeFunction func ) = 0; virtual void gc() = 0; void loadStored( bool ignoreNotConnected = false ); - + /** if any changes are made to .system.js, call this right now its just global - slightly inefficient, but a lot simpler */ static void storedFuncMod(); - - static int getNumScopes(){ + + static int getNumScopes() { return _numScopes; } - + static void validateObjectIdString( const string &str ); - + protected: virtual ScriptingFunction _createFunction( const char * code ) = 0; @@ -117,16 +161,16 @@ namespace mongo { static int _numScopes; }; - + void installGlobalUtils( Scope& scope ); class DBClientWithCommands; - + class ScriptEngine : boost::noncopyable { public: ScriptEngine(); virtual ~ScriptEngine(); - + virtual Scope * newScope() { Scope *s = createScope(); if ( s && _scopeInitCallback ) @@ -134,35 +178,63 @@ namespace mongo { installGlobalUtils( *s ); return s; } - + virtual void runTest() = 0; - + virtual bool utf8Ok() const = 0; static void setup(); auto_ptr getPooledScope( const string& pool ); void threadDone(); - + struct Unlocker { virtual ~Unlocker() {} }; virtual auto_ptr newThreadUnlocker() { return auto_ptr< Unlocker >( new Unlocker ); } - + void setScopeInitCallback( void ( *func )( Scope & ) ) { _scopeInitCallback = func; } static void setConnectCallback( void ( *func )( DBClientWithCommands& ) ) { _connectCallback = func; } static void runConnectCallback( DBClientWithCommands &c ) { if ( _connectCallback ) _connectCallback( c ); } - + + // engine implementation may either respond to interrupt events or + // poll for interrupts + + // the interrupt functions must not wait indefinitely on a lock + virtual void interrupt( unsigned opSpec ) {} + virtual void interruptAll() {} + + static void setGetInterruptSpecCallback( unsigned ( *func )() ) { _getInterruptSpecCallback = func; } + static bool haveGetInterruptSpecCallback() { return _getInterruptSpecCallback; } + static unsigned getInterruptSpec() { + massert( 13474, "no _getInterruptSpecCallback", _getInterruptSpecCallback ); + return _getInterruptSpecCallback(); + } + + static void setCheckInterruptCallback( const char * ( *func )() ) { _checkInterruptCallback = func; } + static bool haveCheckInterruptCallback() { return _checkInterruptCallback; } + static const char * checkInterrupt() { + return _checkInterruptCallback ? 
_checkInterruptCallback() : ""; + } + static bool interrupted() { + const char *r = checkInterrupt(); + return r && r[ 0 ]; + } + protected: virtual Scope * createScope() = 0; - + private: void ( *_scopeInitCallback )( Scope & ); static void ( *_connectCallback )( DBClientWithCommands & ); + static const char * ( *_checkInterruptCallback )(); + static unsigned ( *_getInterruptSpecCallback )(); }; bool hasJSReturn( const string& s ); + const char * jsSkipWhiteSpace( const char * raw ); + extern ScriptEngine * globalScriptEngine; } diff --git a/scripting/engine_java.cpp b/scripting/engine_java.cpp index dacf532..fc8945f 100644 --- a/scripting/engine_java.cpp +++ b/scripting/engine_java.cpp @@ -55,19 +55,19 @@ namespace mongo { no tss cleanup on windows for boost lib? we don't care for now esp on windows only - the boost source says: - - This function's sole purpose is to cause a link error in cases where - automatic tss cleanup is not implemented by Boost.Threads as a - reminder that user code is responsible for calling the necessary - functions at the appropriate times (and for implementing an a - tss_cleanup_implemented() function to eliminate the linker's - missing symbol error). - - If Boost.Threads later implements automatic tss cleanup in cases - where it currently doesn't (which is the plan), the duplicate - symbol error will warn the user that their custom solution is no - longer needed and can be removed. + the boost source says: + + This function's sole purpose is to cause a link error in cases where + automatic tss cleanup is not implemented by Boost.Threads as a + reminder that user code is responsible for calling the necessary + functions at the appropriate times (and for implementing an a + tss_cleanup_implemented() function to eliminate the linker's + missing symbol error). + + If Boost.Threads later implements automatic tss cleanup in cases + where it currently doesn't (which is the plan), the duplicate + symbol error will warn the user that their custom solution is no + longer needed and can be removed. */ extern "C" void tss_cleanup_implemented(void) { //out() << "tss_cleanup_implemented called" << endl; @@ -185,10 +185,10 @@ namespace mongo { if ( res ) { log() << "using classpath: " << q << endl; log() - << " res : " << (unsigned) res << " " - << "_jvm : " << _jvm << " " - << "_env : " << _mainEnv << " " - << endl; + << " res : " << (unsigned) res << " " + << "_jvm : " << _jvm << " " + << "_env : " << _mainEnv << " " + << endl; problem() << "Couldn't create JVM res:" << (int) res << " terminating" << endl; log() << "(try --nojni if you do not require that functionality)" << endl; exit(22); @@ -397,12 +397,11 @@ namespace mongo { return retStr; } - BSONObj JavaJSImpl::scopeGetObject( jlong id , const char * field ) - { + BSONObj JavaJSImpl::scopeGetObject( jlong id , const char * field ) { jstring s1 = _getEnv()->NewStringUTF( field ); int guess = _getEnv()->CallStaticIntMethod( _dbhook , _scopeGuessObjectSize , id , _getEnv()->NewStringUTF( field ) ); _getEnv()->DeleteLocalRef( s1 ); - + if ( guess == 0 ) return BSONObj(); @@ -471,12 +470,12 @@ namespace mongo { return env; } - Scope * JavaJSImpl::createScope(){ + Scope * JavaJSImpl::createScope() { return new JavaScope(); } - void ScriptEngine::setup(){ - if ( ! JavaJS ){ + void ScriptEngine::setup() { + if ( ! JavaJS ) { JavaJS = new JavaJSImpl(); globalScriptEngine = JavaJS; } @@ -564,40 +563,40 @@ namespace mongo { if ( ! 
possible.size() ) { possible.push_back( "./" ); possible.push_back( "../" ); - + log(2) << "dbExecCommand: " << dbExecCommand << endl; - + string dbDir = dbExecCommand; #ifdef WIN32 - if ( dbDir.find( "\\" ) != string::npos ){ + if ( dbDir.find( "\\" ) != string::npos ) { dbDir = dbDir.substr( 0 , dbDir.find_last_of( "\\" ) ); } else { dbDir = "."; } #else - if ( dbDir.find( "/" ) != string::npos ){ + if ( dbDir.find( "/" ) != string::npos ) { dbDir = dbDir.substr( 0 , dbDir.find_last_of( "/" ) ); } else { bool found = false; - - if ( getenv( "PATH" ) ){ + + if ( getenv( "PATH" ) ) { string s = getenv( "PATH" ); s += ":"; pcrecpp::StringPiece input( s ); string dir; pcrecpp::RE re("(.*?):"); - while ( re.Consume( &input, &dir ) ){ + while ( re.Consume( &input, &dir ) ) { string test = dir + "/" + dbExecCommand; - if ( boost::filesystem::exists( test ) ){ - while ( boost::filesystem::symbolic_link_exists( test ) ){ + if ( boost::filesystem::exists( test ) ) { + while ( boost::filesystem::symbolic_link_exists( test ) ) { char tmp[2048]; int len = readlink( test.c_str() , tmp , 2048 ); tmp[len] = 0; log(5) << " symlink " << test << " -->> " << tmp << endl; test = tmp; - + dir = test.substr( 0 , test.rfind( "/" ) ); } dbDir = dir; @@ -606,12 +605,12 @@ namespace mongo { } } } - + if ( ! found ) dbDir = "."; } #endif - + log(2) << "dbDir [" << dbDir << "]" << endl; possible.push_back( ( dbDir + "/../lib/mongo/" )); possible.push_back( ( dbDir + "/../lib64/mongo/" )); @@ -624,7 +623,7 @@ namespace mongo { for ( list::iterator i = possible.begin() ; i != possible.end(); i++ ) { const string temp = *i; const string jarDir = ((string)temp) + "jars/"; - + log(5) << "possible jarDir [" << jarDir << "]" << endl; path p(jarDir ); @@ -641,7 +640,7 @@ namespace mongo { }; - + // --- JNIEXPORT void JNICALL java_native_say(JNIEnv * env , jclass, jobject outBuffer ) { @@ -692,7 +691,7 @@ namespace mongo { jlong func1 = JavaJS.functionCreate( "foo = 5.6; bar = \"eliot\"; abc = { foo : 517 }; " ); - jassert( ! JavaJS.invoke( scope , func1 ) ); + jassert( ! 
JavaJS.invoke( scope , func1 ) ); if ( debug ) out() << "func3 start" << endl; @@ -757,7 +756,7 @@ namespace mongo { assert( 12 == JavaJS.scopeGetNumber( scope , "return" ) ); } - + #endif } // namespace mongo diff --git a/scripting/engine_java.h b/scripting/engine_java.h index 5c6bc3b..b8245ba 100644 --- a/scripting/engine_java.h +++ b/scripting/engine_java.h @@ -163,10 +163,10 @@ namespace mongo { JavaJS->scopeInit( s , o ); } - void localConnect( const char * dbName ){ + void localConnect( const char * dbName ) { setString("$client", dbName ); } - + double getNumber(const char *field) { return JavaJS->scopeGetNumber(s,field); } @@ -183,7 +183,7 @@ namespace mongo { return JavaJS->scopeGetType(s,field); } - void setThis( const BSONObj * obj ){ + void setThis( const BSONObj * obj ) { JavaJS->scopeSetThis( s , obj ); } @@ -200,17 +200,17 @@ namespace mongo { void setBoolean(const char *field, bool val ) { JavaJS->scopeSetBoolean(s,field,val); } - - ScriptingFunction createFunction( const char * code ){ + + ScriptingFunction createFunction( const char * code ) { return JavaJS->functionCreate( code ); } - int invoke( ScriptingFunction function , const BSONObj& args ){ + int invoke( ScriptingFunction function , const BSONObj& args ) { setObject( "args" , args , true ); return JavaJS->invoke(s,function); } - - string getError(){ + + string getError() { return getString( "error" ); } diff --git a/scripting/engine_none.cpp b/scripting/engine_none.cpp index 2320d0e..d13dbec 100644 --- a/scripting/engine_none.cpp +++ b/scripting/engine_none.cpp @@ -18,7 +18,7 @@ #include "engine.h" namespace mongo { - void ScriptEngine::setup(){ + void ScriptEngine::setup() { // noop } } diff --git a/scripting/engine_spidermonkey.cpp b/scripting/engine_spidermonkey.cpp index c8f2eca..73ebfaa 100644 --- a/scripting/engine_spidermonkey.cpp +++ b/scripting/engine_spidermonkey.cpp @@ -26,31 +26,34 @@ #endif #define smuassert( cx , msg , val ) \ - if ( ! ( val ) ){ \ - JS_ReportError( cx , msg ); \ - return JS_FALSE; \ - } + if ( ! ( val ) ){ \ + JS_ReportError( cx , msg ); \ + return JS_FALSE; \ + } #define CHECKNEWOBJECT(xx,ctx,w) \ if ( ! 
xx ){ \ massert(13072,(string)"JS_NewObject failed: " + w ,xx); \ } +#define CHECKJSALLOC( newthing ) \ + massert( 13615 , "JS allocation failed, either memory leak or using too much memory" , newthing ) + namespace mongo { - + class InvalidUTF8Exception : public UserException { public: - InvalidUTF8Exception() : UserException( 9006 , "invalid utf8" ){ + InvalidUTF8Exception() : UserException( 9006 , "invalid utf8" ) { } }; - string trim( string s ){ + string trim( string s ) { while ( s.size() && isspace( s[0] ) ) s = s.substr( 1 ); - + while ( s.size() && isspace( s[s.size()-1] ) ) s = s.substr( 0 , s.size() - 1 ); - + return s; } @@ -65,18 +68,18 @@ namespace mongo { class BSONHolder { public: - BSONHolder( BSONObj obj ){ + BSONHolder( BSONObj obj ) { _obj = obj.getOwned(); _inResolve = false; _modified = false; _magic = 17; } - - ~BSONHolder(){ + + ~BSONHolder() { _magic = 18; } - void check(){ + void check() { uassert( 10212 , "holder magic value is wrong" , _magic == 17 && _obj.isValid() ); } @@ -89,24 +92,24 @@ namespace mongo { set _removed; bool _modified; }; - + class BSONFieldIterator { public: - BSONFieldIterator( BSONHolder * holder ){ + BSONFieldIterator( BSONHolder * holder ) { set added; BSONObjIterator it( holder->_obj ); - while ( it.more() ){ + while ( it.more() ) { BSONElement e = it.next(); if ( holder->_removed.count( e.fieldName() ) ) continue; _names.push_back( e.fieldName() ); added.insert( e.fieldName() ); } - - for ( list::iterator i = holder->_extra.begin(); i != holder->_extra.end(); i++ ){ + + for ( list::iterator i = holder->_extra.begin(); i != holder->_extra.end(); i++ ) { if ( ! added.count( *i ) ) _names.push_back( *i ); } @@ -114,11 +117,11 @@ namespace mongo { _it = _names.begin(); } - bool more(){ + bool more() { return _it != _names.end(); } - string next(){ + string next() { string s = *_it; _it++; return s; @@ -129,24 +132,24 @@ namespace mongo { list::iterator _it; }; - BSONFieldIterator * BSONHolder::it(){ + BSONFieldIterator * BSONHolder::it() { return new BSONFieldIterator( this ); } class TraverseStack { public: - TraverseStack(){ + TraverseStack() { _o = 0; _parent = 0; } - TraverseStack( JSObject * o , const TraverseStack * parent ){ + TraverseStack( JSObject * o , const TraverseStack * parent ) { _o = o; _parent = parent; } TraverseStack dive( JSObject * o ) const { - if ( o ){ + if ( o ) { uassert( 13076 , (string)"recursive toObject" , ! has( o ) ); } return TraverseStack( o , this ); @@ -155,7 +158,7 @@ namespace mongo { int depth() const { int d = 0; const TraverseStack * s = _parent; - while ( s ){ + while ( s ) { s = s->_parent; d++; } @@ -165,12 +168,12 @@ namespace mongo { bool isTop() const { return _parent == 0; } - + bool has( JSObject * o ) const { if ( ! o ) return false; const TraverseStack * s = this; - while ( s ){ + while ( s ) { if ( s->_o == o ) return true; s = s->_parent; @@ -184,11 +187,11 @@ namespace mongo { class Convertor : boost::noncopyable { public: - Convertor( JSContext * cx ){ + Convertor( JSContext * cx ) { _context = cx; } - string toString( JSString * so ){ + string toString( JSString * so ) { jschar * s = JS_GetStringChars( so ); size_t srclen = JS_GetStringLength( so ); if( srclen == 0 ) @@ -202,7 +205,16 @@ namespace mongo { // units, but experiments suggest 8bit units expected. We allocate // enough memory that either will work. 
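            // Descriptive note on the error path that follows: when JS_EncodeCharacters
            // fails, the raw 16-bit code units are collected into a StringBuilder and
            // reported via uassert 13498 ("Not proper UTF-16"), so a bad string surfaces
            // as a catchable error carrying the offending values instead of a hard assert.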
- assert( JS_EncodeCharacters( _context , s , srclen , dst , &len) ); + if ( !JS_EncodeCharacters( _context , s , srclen , dst , &len) ) { + StringBuilder temp; + temp << "Not proper UTF-16: "; + for ( size_t i=0; i 0 ) + temp << ","; + temp << s[i]; + } + uasserted( 13498 , temp.str() ); + } string ss( dst , len ); free( dst ); @@ -212,7 +224,7 @@ namespace mongo { return ss; } - string toString( jsval v ){ + string toString( jsval v ) { return toString( JS_ValueToString( _context , v ) ); } @@ -221,27 +233,28 @@ namespace mongo { boost::uint64_t val; if ( hasProperty( o, "top" ) ) { val = - ( (boost::uint64_t)(boost::uint32_t)getNumber( o , "top" ) << 32 ) + - ( boost::uint32_t)( getNumber( o , "bottom" ) ); - } else { + ( (boost::uint64_t)(boost::uint32_t)getNumber( o , "top" ) << 32 ) + + ( boost::uint32_t)( getNumber( o , "bottom" ) ); + } + else { val = (boost::uint64_t)(boost::int64_t) getNumber( o, "floatApprox" ); } return val; } - - double toNumber( jsval v ){ + + double toNumber( jsval v ) { double d; uassert( 10214 , "not a number" , JS_ValueToNumber( _context , v , &d ) ); return d; } - bool toBoolean( jsval v ){ + bool toBoolean( jsval v ) { JSBool b; assert( JS_ValueToBoolean( _context, v , &b ) ); return b; } - OID toOID( jsval v ){ + OID toOID( jsval v ) { JSContext * cx = _context; assert( JSVAL_IS_OID( v ) ); @@ -251,21 +264,21 @@ namespace mongo { return oid; } - BSONObj toObject( JSObject * o , const TraverseStack& stack=TraverseStack() ){ + BSONObj toObject( JSObject * o , const TraverseStack& stack=TraverseStack() ) { if ( ! o ) return BSONObj(); - if ( JS_InstanceOf( _context , o , &bson_ro_class , 0 ) ){ + if ( JS_InstanceOf( _context , o , &bson_ro_class , 0 ) ) { BSONHolder * holder = GETHOLDER( _context , o ); assert( holder ); return holder->_obj.getOwned(); } BSONObj orig; - if ( JS_InstanceOf( _context , o , &bson_class , 0 ) ){ + if ( JS_InstanceOf( _context , o , &bson_class , 0 ) ) { BSONHolder * holder = GETHOLDER(_context,o); assert( holder ); - if ( ! holder->_modified ){ + if ( ! holder->_modified ) { return holder->_obj; } orig = holder->_obj; @@ -273,26 +286,26 @@ namespace mongo { BSONObjBuilder b; - if ( ! appendSpecialDBObject( this , b , "value" , OBJECT_TO_JSVAL( o ) , o ) ){ + if ( ! appendSpecialDBObject( this , b , "value" , OBJECT_TO_JSVAL( o ) , o ) ) { - if ( stack.isTop() ){ + if ( stack.isTop() ) { jsval theid = getProperty( o , "_id" ); - if ( ! JSVAL_IS_VOID( theid ) ){ + if ( ! 
JSVAL_IS_VOID( theid ) ) { append( b , "_id" , theid , EOO , stack.dive( o ) ); } } - + JSIdArray * properties = JS_Enumerate( _context , o ); assert( properties ); - - for ( jsint i=0; ilength; i++ ){ + + for ( jsint i=0; ilength; i++ ) { jsid id = properties->vector[i]; jsval nameval; assert( JS_IdToValue( _context ,id , &nameval ) ); string name = toString( nameval ); if ( stack.isTop() && name == "_id" ) continue; - + append( b , name , getProperty( o , name.c_str() ) , orig[name].type() , stack.dive( o ) ); } @@ -302,34 +315,34 @@ namespace mongo { return b.obj(); } - BSONObj toObject( jsval v ){ + BSONObj toObject( jsval v ) { if ( JSVAL_IS_NULL( v ) || - JSVAL_IS_VOID( v ) ) + JSVAL_IS_VOID( v ) ) return BSONObj(); uassert( 10215 , "not an object" , JSVAL_IS_OBJECT( v ) ); return toObject( JSVAL_TO_OBJECT( v ) ); } - string getFunctionCode( JSFunction * func ){ + string getFunctionCode( JSFunction * func ) { return toString( JS_DecompileFunction( _context , func , 0 ) ); } - string getFunctionCode( jsval v ){ + string getFunctionCode( jsval v ) { uassert( 10216 , "not a function" , JS_TypeOfValue( _context , v ) == JSTYPE_FUNCTION ); return getFunctionCode( JS_ValueToFunction( _context , v ) ); } - - void appendRegex( BSONObjBuilder& b , const string& name , string s ){ + + void appendRegex( BSONObjBuilder& b , const string& name , string s ) { assert( s[0] == '/' ); s = s.substr(1); string::size_type end = s.rfind( '/' ); - b.appendRegex( name , s.substr( 0 , end ).c_str() , s.substr( end + 1 ).c_str() ); + b.appendRegex( name , s.substr( 0 , end ) , s.substr( end + 1 ) ); } - void append( BSONObjBuilder& b , string name , jsval val , BSONType oldType = EOO , const TraverseStack& stack=TraverseStack() ){ + void append( BSONObjBuilder& b , string name , jsval val , BSONType oldType = EOO , const TraverseStack& stack=TraverseStack() ) { //cout << "name: " << name << "\t" << typeString( val ) << " oldType: " << oldType << endl; - switch ( JS_TypeOfValue( _context , val ) ){ + switch ( JS_TypeOfValue( _context , val ) ) { case JSTYPE_VOID: b.appendUndefined( name ); break; case JSTYPE_NULL: b.appendNull( name ); break; @@ -347,12 +360,12 @@ namespace mongo { case JSTYPE_OBJECT: { JSObject * o = JSVAL_TO_OBJECT( val ); - if ( ! o || o == JSVAL_NULL ){ + if ( ! o || o == JSVAL_NULL ) { b.appendNull( name ); } - else if ( ! appendSpecialDBObject( this , b , name , val , o ) ){ + else if ( ! 
appendSpecialDBObject( this , b , name , val , o ) ) { BSONObj sub = toObject( o , stack ); - if ( JS_IsArrayObject( _context , o ) ){ + if ( JS_IsArrayObject( _context , o ) ) { b.appendArray( name , sub ); } else { @@ -364,11 +377,11 @@ namespace mongo { case JSTYPE_FUNCTION: { string s = toString(val); - if ( s[0] == '/' ){ + if ( s[0] == '/' ) { appendRegex( b , name , s ); } else { - b.appendCode( name , getFunctionCode( val ).c_str() ); + b.appendCode( name , getFunctionCode( val ) ); } break; } @@ -379,25 +392,28 @@ namespace mongo { // ---------- to spider monkey --------- - bool hasFunctionIdentifier( const string& code ){ + bool hasFunctionIdentifier( const string& code ) { if ( code.size() < 9 || code.find( "function" ) != 0 ) return false; return code[8] == ' ' || code[8] == '('; } - bool isSimpleStatement( const string& code ){ + bool isSimpleStatement( const string& code ) { if ( hasJSReturn( code ) ) return false; - if ( code.find( ";" ) != string::npos && - code.find( ";" ) != code.rfind( ";" ) ) + if ( code.find( ';' ) != string::npos && + code.find( ';' ) != code.rfind( ';' ) ) + return false; + + if ( code.find( '\n') != string::npos ) return false; if ( code.find( "for(" ) != string::npos || - code.find( "for (" ) != string::npos || - code.find( "while (" ) != string::npos || - code.find( "while(" ) != string::npos ) + code.find( "for (" ) != string::npos || + code.find( "while (" ) != string::npos || + code.find( "while(" ) != string::npos ) return false; return true; @@ -405,20 +421,20 @@ namespace mongo { void addRoot( JSFunction * f , const char * name ); - JSFunction * compileFunction( const char * code, JSObject * assoc = 0 ){ + JSFunction * compileFunction( const char * code, JSObject * assoc = 0 ) { const char * gcName = "unknown"; JSFunction * f = _compileFunction( code , assoc , gcName ); //addRoot( f , gcName ); return f; } - JSFunction * _compileFunction( const char * raw , JSObject * assoc , const char *& gcName ){ + JSFunction * _compileFunction( const char * raw , JSObject * assoc , const char *& gcName ) { if ( ! assoc ) assoc = JS_GetGlobalObject( _context ); - while (isspace(*raw)) { - raw++; - } + raw = jsSkipWhiteSpace( raw ); + + //cout << "RAW\n---\n" << raw << "\n---" << endl; stringstream fname; fname << "cf_"; @@ -426,34 +442,34 @@ namespace mongo { fname << "_" << fnum++ << "_"; - if ( ! hasFunctionIdentifier( raw ) ){ + if ( ! 
hasFunctionIdentifier( raw ) ) { string s = raw; - if ( isSimpleStatement( s ) ){ + if ( isSimpleStatement( s ) ) { s = "return " + s; } gcName = "cf anon"; fname << "anon"; - return JS_CompileFunction( _context , assoc , fname.str().c_str() , 0 , 0 , s.c_str() , strlen( s.c_str() ) , "nofile_a" , 0 ); + return JS_CompileFunction( _context , assoc , fname.str().c_str() , 0 , 0 , s.c_str() , s.size() , "nofile_a" , 0 ); } string code = raw; - + size_t start = code.find( '(' ); assert( start != string::npos ); - + fname << "_f_" << trim( code.substr( 9 , start - 9 ) ); code = code.substr( start + 1 ); size_t end = code.find( ')' ); assert( end != string::npos ); - + string paramString = trim( code.substr( 0 , end ) ); code = code.substr( end + 1 ); - + vector params; - while ( paramString.size() ){ + while ( paramString.size() ) { size_t c = paramString.find( ',' ); - if ( c == string::npos ){ + if ( c == string::npos ) { params.push_back( paramString ); break; } @@ -461,14 +477,14 @@ namespace mongo { paramString = trim( paramString.substr( c + 1 ) ); paramString = trim( paramString ); } - + boost::scoped_array paramArray (new const char*[params.size()]); for ( size_t i=0; ifirstElement().fieldName() ){ + if ( ref == obj->firstElement().fieldName() ) { JSObject * o = JS_NewObject( _context , &dbref_class , NULL, NULL); CHECKNEWOBJECT(o,_context,"toJSObject1"); assert( JS_SetPrivate( _context , o , (void*)(new BSONHolder( obj->getOwned() ) ) ) ); @@ -527,7 +543,7 @@ namespace mongo { return o; } - jsval toval( const BSONObj* obj , bool readOnly=false ){ + jsval toval( const BSONObj* obj , bool readOnly=false ) { JSObject * o = toJSObject( obj , readOnly ); return OBJECT_TO_JSVAL( o ); } @@ -535,7 +551,7 @@ namespace mongo { void makeLongObj( long long n, JSObject * o ) { boost::uint64_t val = (boost::uint64_t)n; CHECKNEWOBJECT(o,_context,"NumberLong1"); - setProperty( o , "floatApprox" , toval( (double)(boost::int64_t)( val ) ) ); + setProperty( o , "floatApprox" , toval( (double)(boost::int64_t)( val ) ) ); if ( (boost::int64_t)val != (boost::int64_t)(double)(boost::int64_t)( val ) ) { // using 2 doubles here instead of a single double because certain double // bit patterns represent undefined values and sm might trash them @@ -543,16 +559,16 @@ namespace mongo { setProperty( o , "bottom" , toval( (double)(boost::uint32_t)( val & 0x00000000ffffffff ) ) ); } } - + jsval toval( long long n ) { JSObject * o = JS_NewObject( _context , &numberlong_class , 0 , 0 ); makeLongObj( n, o ); return OBJECT_TO_JSVAL( o ); } - - jsval toval( const BSONElement& e ){ - switch( e.type() ){ + jsval toval( const BSONElement& e ) { + + switch( e.type() ) { case EOO: case jstNULL: case Undefined: @@ -565,50 +581,50 @@ namespace mongo { return toval( e.valuestr() ); case Bool: return e.boolean() ? 
JSVAL_TRUE : JSVAL_FALSE; - case Object:{ + case Object: { BSONObj embed = e.embeddedObject().getOwned(); return toval( &embed ); } - case Array:{ + case Array: { BSONObj embed = e.embeddedObject().getOwned(); - if ( embed.isEmpty() ){ + if ( embed.isEmpty() ) { return OBJECT_TO_JSVAL( JS_NewArrayObject( _context , 0 , 0 ) ); } - - int n = embed.nFields(); - - JSObject * array = JS_NewArrayObject( _context , n , 0 ); - assert( array ); + + JSObject * array = JS_NewArrayObject( _context , 1 , 0 ); + CHECKJSALLOC( array ); jsval myarray = OBJECT_TO_JSVAL( array ); - for ( int i=0; iit(); *statep = PRIVATE_TO_JSVAL( it ); } else { - *statep = 0; + *statep = 0; } if ( idp ) *idp = JSVAL_ZERO; @@ -774,13 +790,13 @@ namespace mongo { } BSONFieldIterator * it = (BSONFieldIterator*)JSVAL_TO_PRIVATE( *statep ); - if ( ! it ){ + if ( ! it ) { *statep = 0; return JS_TRUE; } - if ( enum_op == JSENUMERATE_NEXT ){ - if ( it->more() ){ + if ( enum_op == JSENUMERATE_NEXT ) { + if ( it->more() ) { string name = it->next(); Convertor c(cx); assert( JS_ValueToId( cx , c.toval( name.c_str() ) , idp ) ); @@ -792,7 +808,7 @@ namespace mongo { return JS_TRUE; } - if ( enum_op == JSENUMERATE_DESTROY ){ + if ( enum_op == JSENUMERATE_DESTROY ) { if ( it ) delete it; return JS_TRUE; @@ -802,9 +818,9 @@ namespace mongo { return JS_FALSE; } - JSBool noaccess( JSContext *cx, JSObject *obj, jsval idval, jsval *vp){ + JSBool noaccess( JSContext *cx, JSObject *obj, jsval idval, jsval *vp) { BSONHolder * holder = GETHOLDER( cx , obj ); - if ( ! holder ){ + if ( ! holder ) { // in init code still return JS_TRUE; } @@ -821,7 +837,7 @@ namespace mongo { JSCLASS_NO_OPTIONAL_MEMBERS }; - JSBool bson_cons( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ){ + JSBool bson_cons( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) { cerr << "bson_cons : shouldn't be here!" << endl; JS_ReportError( cx , "can't construct bson object" ); return JS_FALSE; @@ -830,26 +846,26 @@ namespace mongo { JSFunctionSpec bson_functions[] = { { 0 } }; - - JSBool bson_add_prop( JSContext *cx, JSObject *obj, jsval idval, jsval *vp){ + + JSBool bson_add_prop( JSContext *cx, JSObject *obj, jsval idval, jsval *vp) { BSONHolder * holder = GETHOLDER( cx , obj ); - if ( ! holder ){ + if ( ! holder ) { // static init return JS_TRUE; } - if ( ! holder->_inResolve ){ + if ( ! 
holder->_inResolve ) { Convertor c(cx); string name = c.toString( idval ); - if ( holder->_obj[name].eoo() ){ + if ( holder->_obj[name].eoo() ) { holder->_extra.push_back( name ); } holder->_modified = true; } return JS_TRUE; } - - JSBool mark_modified( JSContext *cx, JSObject *obj, jsval idval, jsval *vp){ + + JSBool mark_modified( JSContext *cx, JSObject *obj, jsval idval, jsval *vp) { Convertor c(cx); BSONHolder * holder = GETHOLDER( cx , obj ); if ( !holder ) // needed when we're messing with DBRef.prototype @@ -860,8 +876,8 @@ namespace mongo { holder->_removed.erase( c.toString( idval ) ); return JS_TRUE; } - - JSBool mark_modified_remove( JSContext *cx, JSObject *obj, jsval idval, jsval *vp){ + + JSBool mark_modified_remove( JSContext *cx, JSObject *obj, jsval idval, jsval *vp) { Convertor c(cx); BSONHolder * holder = GETHOLDER( cx , obj ); if ( holder->_inResolve ) @@ -887,10 +903,10 @@ namespace mongo { // --- global helpers --- - JSBool native_print( JSContext * cx , JSObject * obj , uintN argc, jsval *argv, jsval *rval ){ + JSBool native_print( JSContext * cx , JSObject * obj , uintN argc, jsval *argv, jsval *rval ) { stringstream ss; Convertor c( cx ); - for ( uintN i=0; i 0 ) ss << " "; ss << c.toString( argv[i] ); @@ -900,32 +916,32 @@ namespace mongo { return JS_TRUE; } - JSBool native_helper( JSContext *cx , JSObject *obj , uintN argc, jsval *argv , jsval *rval ){ + JSBool native_helper( JSContext *cx , JSObject *obj , uintN argc, jsval *argv , jsval *rval ) { Convertor c(cx); - + NativeFunction func = (NativeFunction)((long long)c.getNumber( obj , "x" ) ); assert( func ); - + BSONObj a; - if ( argc > 0 ){ + if ( argc > 0 ) { BSONObjBuilder args; - for ( uintN i=0; i_obj.objsize(); } } @@ -976,36 +998,36 @@ namespace mongo { BSONObj temp = c.toObject( o ); size = temp.objsize(); } - + *rval = c.toval( size ); - return JS_TRUE; + return JS_TRUE; } - + JSFunctionSpec objectHelpers[] = { - { "bsonsize" , &bson_get_size , 1 , 0 , 0 } , - { 0 , 0 , 0 , 0 , 0 } + { "bsonsize" , &bson_get_size , 1 , 0 , 0 } , + { 0 , 0 , 0 , 0 , 0 } }; - + // end Object helpers - JSBool resolveBSONField( JSContext *cx, JSObject *obj, jsval id, uintN flags, JSObject **objp ){ + JSBool resolveBSONField( JSContext *cx, JSObject *obj, jsval id, uintN flags, JSObject **objp ) { assert( JS_EnterLocalRootScope( cx ) ); Convertor c( cx ); BSONHolder * holder = GETHOLDER( cx , obj ); - if ( ! holder ){ + if ( ! 
holder ) { // static init *objp = 0; JS_LeaveLocalRootScope( cx ); return JS_TRUE; } holder->check(); - + string s = c.toString( id ); BSONElement e = holder->_obj[ s.c_str() ]; - - if ( e.type() == EOO || holder->_removed.count( s ) ){ + + if ( e.type() == EOO || holder->_removed.count( s ) ) { *objp = 0; JS_LeaveLocalRootScope( cx ); return JS_TRUE; @@ -1025,12 +1047,12 @@ namespace mongo { holder->_inResolve = true; assert( JS_SetProperty( cx , obj , s.c_str() , &val ) ); holder->_inResolve = false; - - if ( val != JSVAL_NULL && val != JSVAL_VOID && JSVAL_IS_OBJECT( val ) ){ + + if ( val != JSVAL_NULL && val != JSVAL_VOID && JSVAL_IS_OBJECT( val ) ) { // TODO: this is a hack to get around sub objects being modified JSObject * oo = JSVAL_TO_OBJECT( val ); - if ( JS_InstanceOf( cx , oo , &bson_class , 0 ) || - JS_IsArrayObject( cx , oo ) ){ + if ( JS_InstanceOf( cx , oo , &bson_class , 0 ) || + JS_IsArrayObject( cx , oo ) ) { holder->_modified = true; } } @@ -1046,15 +1068,15 @@ namespace mongo { class SMEngine : public ScriptEngine { public: - SMEngine(){ + SMEngine() { #ifdef SM18 JS_SetCStringsAreUTF8(); #endif _runtime = JS_NewRuntime(8L * 1024L * 1024L); uassert( 10221 , "JS_NewRuntime failed" , _runtime ); - - if ( ! utf8Ok() ){ + + if ( ! utf8Ok() ) { log() << "*** warning: spider monkey build without utf8 support. consider rebuilding with utf8 support" << endl; } @@ -1063,7 +1085,7 @@ namespace mongo { uassert( 10222 , "assert not being executed" , x == 1 ); } - ~SMEngine(){ + ~SMEngine() { JS_DestroyRuntime( _runtime ); JS_ShutDown(); } @@ -1088,7 +1110,7 @@ namespace mongo { SMEngine * globalSMEngine; - void ScriptEngine::setup(){ + void ScriptEngine::setup() { globalSMEngine = new SMEngine(); globalScriptEngine = globalSMEngine; } @@ -1097,11 +1119,11 @@ namespace mongo { // ------ scope ------ - JSBool no_gc(JSContext *cx, JSGCStatus status){ + JSBool no_gc(JSContext *cx, JSGCStatus status) { return JS_FALSE; } - JSBool yes_gc(JSContext *cx, JSGCStatus status){ + JSBool yes_gc(JSContext *cx, JSGCStatus status) { return JS_TRUE; } @@ -1125,64 +1147,65 @@ namespace mongo { JS_SetOptions( _context , JS_GetOptions( _context ) | JSOPTION_VAROBJFIX ); JS_DefineFunctions( _context , _global , globalHelpers ); - + JS_DefineFunctions( _context , _convertor->getGlobalObject( "Object" ), objectHelpers ); //JS_SetGCCallback( _context , no_gc ); // this is useful for seeing if something is a gc problem _postCreateHacks(); } - - ~SMScope(){ + + ~SMScope() { smlock; uassert( 10223 , "deleted SMScope twice?" , _convertor ); - for ( list::iterator i=_roots.begin(); i != _roots.end(); i++ ){ + for ( list::iterator i=_roots.begin(); i != _roots.end(); i++ ) { JS_RemoveRoot( _context , *i ); } _roots.clear(); - - if ( _this ){ + + if ( _this ) { JS_RemoveRoot( _context , &_this ); _this = 0; } - if ( _convertor ){ + if ( _convertor ) { delete _convertor; _convertor = 0; } - - if ( _context ){ + + if ( _context ) { + // This is expected to reclaim _global as well. JS_DestroyContext( _context ); _context = 0; } } - - void reset(){ + + void reset() { smlock; assert( _convertor ); return; - if ( _this ){ + if ( _this ) { JS_RemoveRoot( _context , &_this ); _this = 0; } currentScope.reset( this ); _error = ""; } - - void addRoot( void * root , const char * name ){ + + void addRoot( void * root , const char * name ) { JS_AddNamedRoot( _context , root , name ); _roots.push_back( root ); } - void init( BSONObj * data ){ + void init( const BSONObj * data ) { smlock; if ( ! 
data ) return; BSONObjIterator i( *data ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); _convertor->setProperty( _global , e.fieldName() , _convertor->toval( e ) ); _initFieldNames.insert( e.fieldName() ); @@ -1190,7 +1213,7 @@ namespace mongo { } - void externalSetup(){ + void externalSetup() { smlock; uassert( 10224 , "already local connected" , ! _localConnect ); if ( _externalSetup ) @@ -1199,20 +1222,20 @@ namespace mongo { _externalSetup = true; } - void localConnect( const char * dbName ){ + void localConnect( const char * dbName ) { { smlock; uassert( 10225 , "already setup for external db" , ! _externalSetup ); - if ( _localConnect ){ + if ( _localConnect ) { uassert( 10226 , "connected to different db" , _localDBName == dbName ); return; } - + initMongoJS( this , _context , _global , true ); - + exec( "_mongo = new Mongo();" ); exec( ((string)"db = _mongo.getDB( \"" + dbName + "\" ); ").c_str() ); - + _localConnect = true; _localDBName = dbName; } @@ -1220,14 +1243,14 @@ namespace mongo { } // ----- getters ------ - double getNumber( const char *field ){ + double getNumber( const char *field ) { smlock; jsval val; assert( JS_GetProperty( _context , _global , field , &val ) ); return _convertor->toNumber( val ); } - string getString( const char *field ){ + string getString( const char *field ) { smlock; jsval val; assert( JS_GetProperty( _context , _global , field , &val ) ); @@ -1235,27 +1258,27 @@ namespace mongo { return _convertor->toString( s ); } - bool getBoolean( const char *field ){ + bool getBoolean( const char *field ) { smlock; return _convertor->getBoolean( _global , field ); } - BSONObj getObject( const char *field ){ + BSONObj getObject( const char *field ) { smlock; return _convertor->toObject( _convertor->getProperty( _global , field ) ); } - JSObject * getJSObject( const char * field ){ + JSObject * getJSObject( const char * field ) { smlock; return _convertor->getJSObject( _global , field ); } - int type( const char *field ){ + int type( const char *field ) { smlock; jsval val; assert( JS_GetProperty( _context , _global , field , &val ) ); - switch ( JS_TypeOfValue( _context , val ) ){ + switch ( JS_TypeOfValue( _context , val ) ) { case JSTYPE_VOID: return Undefined; case JSTYPE_NULL: return jstNULL; case JSTYPE_OBJECT: { @@ -1280,52 +1303,61 @@ namespace mongo { // ----- setters ------ - void setElement( const char *field , const BSONElement& val ){ + void setElement( const char *field , const BSONElement& val ) { smlock; jsval v = _convertor->toval( val ); assert( JS_SetProperty( _context , _global , field , &v ) ); } - void setNumber( const char *field , double val ){ + void setNumber( const char *field , double val ) { smlock; jsval v = _convertor->toval( val ); assert( JS_SetProperty( _context , _global , field , &v ) ); } - void setString( const char *field , const char * val ){ + void setString( const char *field , const char * val ) { smlock; jsval v = _convertor->toval( val ); assert( JS_SetProperty( _context , _global , field , &v ) ); } - void setObject( const char *field , const BSONObj& obj , bool readOnly ){ + void setObject( const char *field , const BSONObj& obj , bool readOnly ) { smlock; jsval v = _convertor->toval( &obj , readOnly ); JS_SetProperty( _context , _global , field , &v ); } - void setBoolean( const char *field , bool val ){ + void setBoolean( const char *field , bool val ) { smlock; jsval v = BOOLEAN_TO_JSVAL( val ); assert( JS_SetProperty( _context , _global , field , &v ) ); } - void setThis( const 
BSONObj * obj ){ + void setThis( const BSONObj * obj ) { smlock; - if ( _this ){ + if ( _this ) { JS_RemoveRoot( _context , &_this ); _this = 0; } - - if ( obj ){ + + if ( obj ) { _this = _convertor->toJSObject( obj ); JS_AddNamedRoot( _context , &_this , "scope this" ); } } + void rename( const char * from , const char * to ) { + smlock; + jsval v; + assert( JS_GetProperty( _context , _global , from , &v ) ); + assert( JS_SetProperty( _context , _global , to , &v ) ); + v = JSVAL_VOID; + assert( JS_SetProperty( _context , _global , from , &v ) ); + } + // ---- functions ----- - ScriptingFunction _createFunction( const char * code ){ + ScriptingFunction _createFunction( const char * code ) { smlock; precall(); return (ScriptingFunction)_convertor->compileFunction( code ); @@ -1337,40 +1369,49 @@ namespace mongo { int count; }; - static JSBool _checkTimeout( JSContext *cx ){ + // should not generate exceptions, as those can be caught in + // javascript code; returning false without an exception exits + // immediately + static JSBool _interrupt( JSContext *cx ) { TimeoutSpec &spec = *(TimeoutSpec *)( JS_GetContextPrivate( cx ) ); if ( ++spec.count % 1000 != 0 ) return JS_TRUE; + const char * interrupt = ScriptEngine::checkInterrupt(); + if ( interrupt && interrupt[ 0 ] ) { + return JS_FALSE; + } + if ( spec.timeout.ticks() == 0 ) { + return JS_TRUE; + } boost::posix_time::time_duration elapsed = ( boost::posix_time::microsec_clock::local_time() - spec.start ); if ( elapsed < spec.timeout ) { return JS_TRUE; } - JS_ReportError( cx, "Timeout exceeded" ); return JS_FALSE; } - static JSBool checkTimeout( JSContext *cx, JSScript *script ){ - return _checkTimeout( cx ); - } + static JSBool interrupt( JSContext *cx, JSScript *script ) { + return _interrupt( cx ); + } - void installCheckTimeout( int timeoutMs ) { - if ( timeoutMs > 0 ) { + void installInterrupt( int timeoutMs ) { + if ( timeoutMs != 0 || ScriptEngine::haveCheckInterruptCallback() ) { TimeoutSpec *spec = new TimeoutSpec; spec->timeout = boost::posix_time::millisec( timeoutMs ); spec->start = boost::posix_time::microsec_clock::local_time(); spec->count = 0; JS_SetContextPrivate( _context, (void*)spec ); #if defined(SM181) && !defined(XULRUNNER190) - JS_SetOperationCallback( _context, _checkTimeout ); + JS_SetOperationCallback( _context, _interrupt ); #else - JS_SetBranchCallback( _context, checkTimeout ); + JS_SetBranchCallback( _context, interrupt ); #endif } } - void uninstallCheckTimeout( int timeoutMs ) { - if ( timeoutMs > 0 ) { + void uninstallInterrupt( int timeoutMs ) { + if ( timeoutMs != 0 || ScriptEngine::haveCheckInterruptCallback() ) { #if defined(SM181) && !defined(XULRUNNER190) JS_SetOperationCallback( _context , 0 ); #else @@ -1381,34 +1422,33 @@ namespace mongo { } } - void precall(){ + void precall() { _error = ""; currentScope.reset( this ); } - bool exec( const string& code , const string& name = "(anon)" , bool printResult = false , bool reportError = true , bool assertOnError = true, int timeoutMs = 0 ){ + bool exec( const StringData& code , const string& name = "(anon)" , bool printResult = false , bool reportError = true , bool assertOnError = true, int timeoutMs = 0 ) { smlock; precall(); jsval ret = JSVAL_VOID; - installCheckTimeout( timeoutMs ); - JSBool worked = JS_EvaluateScript( _context , _global , code.c_str() , strlen( code.c_str() ) , name.c_str() , 0 , &ret ); - uninstallCheckTimeout( timeoutMs ); + installInterrupt( timeoutMs ); + JSBool worked = JS_EvaluateScript( _context , _global , code.data() , 
code.size() , name.c_str() , 1 , &ret ); + uninstallInterrupt( timeoutMs ); - if ( ! worked && _error.size() == 0 ){ + if ( ! worked && _error.size() == 0 ) { jsval v; - if ( JS_GetPendingException( _context , &v ) ){ + if ( JS_GetPendingException( _context , &v ) ) { _error = _convertor->toString( v ); if ( reportError ) cout << _error << endl; } } - if ( assertOnError ) - uassert( 10228 , name + " exec failed" , worked ); + uassert( 10228 , str::stream() << name + " exec failed: " << _error , worked || ! assertOnError ); - if ( reportError && ! _error.empty() ){ + if ( reportError && ! _error.empty() ) { // cout << "exec error: " << _error << endl; // already printed in reportError, so... TODO } @@ -1421,23 +1461,23 @@ namespace mongo { return worked; } - - int invoke( JSFunction * func , const BSONObj& args, int timeoutMs , bool ignoreReturn ){ + + int invoke( JSFunction * func , const BSONObj& args, int timeoutMs , bool ignoreReturn ) { smlock; precall(); assert( JS_EnterLocalRootScope( _context ) ); - + int nargs = args.nFields(); scoped_array smargsPtr( new jsval[nargs] ); - if ( nargs ){ + if ( nargs ) { BSONObjIterator it( args ); - for ( int i=0; itoval( it.next() ); } } - if ( args.isEmpty() ){ + if ( args.isEmpty() ) { _convertor->setProperty( _global , "args" , JSVAL_NULL ); } else { @@ -1446,35 +1486,35 @@ namespace mongo { JS_LeaveLocalRootScope( _context ); - installCheckTimeout( timeoutMs ); + installInterrupt( timeoutMs ); jsval rval; JSBool ret = JS_CallFunction( _context , _this ? _this : _global , func , nargs , smargsPtr.get() , &rval ); - uninstallCheckTimeout( timeoutMs ); + uninstallInterrupt( timeoutMs ); if ( !ret ) { return -3; } - - if ( ! ignoreReturn ){ + + if ( ! ignoreReturn ) { assert( JS_SetProperty( _context , _global , "return" , &rval ) ); } return 0; } - int invoke( ScriptingFunction funcAddr , const BSONObj& args, int timeoutMs = 0 , bool ignoreReturn = 0 ){ + int invoke( ScriptingFunction funcAddr , const BSONObj& args, int timeoutMs = 0 , bool ignoreReturn = 0 ) { return invoke( (JSFunction*)funcAddr , args , timeoutMs , ignoreReturn ); } - void gotError( string s ){ + void gotError( string s ) { _error = s; } - string getError(){ + string getError() { return _error; } - void injectNative( const char *field, NativeFunction func ){ + void injectNative( const char *field, NativeFunction func ) { smlock; string name = field; _convertor->setProperty( _global , (name + "_").c_str() , _convertor->toval( (double)(long long)func ) ); @@ -1482,19 +1522,19 @@ namespace mongo { stringstream code; code << field << "_" << " = { x : " << field << "_ }; "; code << field << " = function(){ return nativeHelper.apply( " << field << "_ , arguments ); }"; - exec( code.str().c_str() ); + exec( code.str() ); } - virtual void gc(){ + virtual void gc() { smlock; JS_GC( _context ); } JSContext *SavedContext() const { return _context; } - + private: - void _postCreateHacks(){ + void _postCreateHacks() { #ifdef XULRUNNER exec( "__x__ = new Date(1);" ); globalSMEngine->_dateClass = _convertor->getClass( _global , "__x__" ); @@ -1502,7 +1542,7 @@ namespace mongo { globalSMEngine->_regexClass = _convertor->getClass( _global , "__x__" ); #endif } - + JSContext * _context; Convertor * _convertor; @@ -1514,41 +1554,41 @@ namespace mongo { bool _externalSetup; bool _localConnect; - + set _initFieldNames; - + }; /* used to make the logging not overly chatty in the mongo shell. 
*/ extern bool isShell; - void errorReporter( JSContext *cx, const char *message, JSErrorReport *report ){ + void errorReporter( JSContext *cx, const char *message, JSErrorReport *report ) { stringstream ss; - if( !isShell ) + if( !isShell ) ss << "JS Error: "; ss << message; - if ( report && report->filename ){ + if ( report && report->filename ) { ss << " " << report->filename << ":" << report->lineno; } tlog() << ss.str() << endl; - if ( currentScope.get() ){ + if ( currentScope.get() ) { currentScope->gotError( ss.str() ); } } - JSBool native_load( JSContext *cx , JSObject *obj , uintN argc, jsval *argv , jsval *rval ){ + JSBool native_load( JSContext *cx , JSObject *obj , uintN argc, jsval *argv , jsval *rval ) { Convertor c(cx); Scope * s = currentScope.get(); - for ( uintN i=0; iexecFile( filename , false , true , false ) ){ + if ( ! s->execFile( filename , false , true , false ) ) { JS_ReportError( cx , ((string)"error loading js file: " + filename ).c_str() ); return JS_FALSE; } @@ -1559,7 +1599,7 @@ namespace mongo { - void SMEngine::runTest(){ + void SMEngine::runTest() { SMScope s; s.localConnect( "foo" ); @@ -1589,17 +1629,17 @@ namespace mongo { } - Scope * SMEngine::createScope(){ + Scope * SMEngine::createScope() { return new SMScope(); } - void Convertor::addRoot( JSFunction * f , const char * name ){ + void Convertor::addRoot( JSFunction * f , const char * name ) { if ( ! f ) return; SMScope * scope = currentScope.get(); uassert( 10229 , "need a scope" , scope ); - + JSObject * o = JS_GetFunctionObject( f ); assert( o ); scope->addRoot( &o , name ); diff --git a/scripting/engine_spidermonkey.h b/scripting/engine_spidermonkey.h index 4617b5d..3ee7495 100644 --- a/scripting/engine_spidermonkey.h +++ b/scripting/engine_spidermonkey.h @@ -37,7 +37,7 @@ #include "jstypes.h" #undef JS_PUBLIC_API #undef JS_PUBLIC_DATA -#define JS_PUBLIC_API(t) t __cdecl +#define JS_PUBLIC_API(t) t __cdecl #define JS_PUBLIC_DATA(t) t #endif @@ -64,7 +64,7 @@ #define JSCLASS_GLOBAL_FLAGS 0 -JSBool JS_CStringsAreUTF8(){ +JSBool JS_CStringsAreUTF8() { return false; } @@ -85,7 +85,7 @@ namespace mongo { class SMScope; class Convertor; - + extern JSClass bson_class; extern JSClass bson_ro_class; @@ -99,10 +99,10 @@ namespace mongo { extern JSClass maxkey_class; // internal things - void dontDeleteScope( SMScope * s ){} + void dontDeleteScope( SMScope * s ) {} void errorReporter( JSContext *cx, const char *message, JSErrorReport *report ); extern boost::thread_specific_ptr currentScope; - + // bson JSBool resolveBSONField( JSContext *cx, JSObject *obj, jsval id, uintN flags, JSObject **objp ); @@ -112,14 +112,14 @@ namespace mongo { bool appendSpecialDBObject( Convertor * c , BSONObjBuilder& b , const string& name , jsval val , JSObject * o ); #define JSVAL_IS_OID(v) ( JSVAL_IS_OBJECT( v ) && JS_InstanceOf( cx , JSVAL_TO_OBJECT( v ) , &object_id_class , 0 ) ) - + bool isDate( JSContext * cx , JSObject * o ); // JS private data must be 2byte aligned, so we use a holder to refer to an unaligned pointer. 
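    // When copyLen is supplied, the holder below takes its own malloc'd copy of the
    // bytes and records (via iFree_) that it owns them; with the default of -1 it only
    // aliases the caller's buffer. A minimal usage sketch, assuming a BSONElement `e`
    // holding BinData (the surrounding conversion code is not shown here):
    //
    //     int len;
    //     const char * data = e.binData( len );
    //     BinDataHolder * holder = new BinDataHolder( data, len ); // owning copy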
struct BinDataHolder { BinDataHolder( const char *c, int copyLen = -1 ) : - c_( const_cast< char * >( c ) ), - iFree_( copyLen != -1 ) { + c_( const_cast< char * >( c ) ), + iFree_( copyLen != -1 ) { if ( copyLen != -1 ) { c_ = (char*)malloc( copyLen ); memcpy( c_, c, copyLen ); diff --git a/scripting/engine_v8.cpp b/scripting/engine_v8.cpp index 08826b1..cd186b4 100644 --- a/scripting/engine_v8.cpp +++ b/scripting/engine_v8.cpp @@ -1,4 +1,4 @@ -//engine_v8.cpp +//engine_v8.cpp /* Copyright 2009 10gen Inc. * @@ -21,54 +21,74 @@ #include "v8_utils.h" #include "v8_db.h" -#define V8_SIMPLE_HEADER Locker l; HandleScope handle_scope; Context::Scope context_scope( _context ); +#define V8_SIMPLE_HEADER V8Lock l; HandleScope handle_scope; Context::Scope context_scope( _context ); namespace mongo { + // guarded by v8 mutex + map< unsigned, int > __interruptSpecToThreadId; + // --- engine --- V8ScriptEngine::V8ScriptEngine() {} - - V8ScriptEngine::~V8ScriptEngine(){ + + V8ScriptEngine::~V8ScriptEngine() { } - void ScriptEngine::setup(){ - if ( !globalScriptEngine ){ + void ScriptEngine::setup() { + if ( !globalScriptEngine ) { globalScriptEngine = new V8ScriptEngine(); } } + void V8ScriptEngine::interrupt( unsigned opSpec ) { + v8::Locker l; + if ( __interruptSpecToThreadId.count( opSpec ) ) { + V8::TerminateExecution( __interruptSpecToThreadId[ opSpec ] ); + } + } + void V8ScriptEngine::interruptAll() { + v8::Locker l; + vector< int > toKill; // v8 mutex could potentially be yielded during the termination call + for( map< unsigned, int >::const_iterator i = __interruptSpecToThreadId.begin(); i != __interruptSpecToThreadId.end(); ++i ) { + toKill.push_back( i->second ); + } + for( vector< int >::const_iterator i = toKill.begin(); i != toKill.end(); ++i ) { + V8::TerminateExecution( *i ); + } + } + // --- scope --- - - V8Scope::V8Scope( V8ScriptEngine * engine ) - : _engine( engine ) , - _connectState( NOT ){ - Locker l; - HandleScope handleScope; + V8Scope::V8Scope( V8ScriptEngine * engine ) + : _engine( engine ) , + _connectState( NOT ) { + + V8Lock l; + HandleScope handleScope; _context = Context::New(); Context::Scope context_scope( _context ); _global = Persistent< v8::Object >::New( _context->Global() ); _this = Persistent< v8::Object >::New( v8::Object::New() ); - _global->Set(v8::String::New("print"), v8::FunctionTemplate::New(Print)->GetFunction() ); - _global->Set(v8::String::New("version"), v8::FunctionTemplate::New(Version)->GetFunction() ); + _global->Set(v8::String::New("print"), newV8Function< Print >()->GetFunction() ); + _global->Set(v8::String::New("version"), newV8Function< Version >()->GetFunction() ); _global->Set(v8::String::New("load"), - v8::FunctionTemplate::New(loadCallback, v8::External::New(this))->GetFunction() ); - + v8::FunctionTemplate::New( v8Callback< loadCallback >, v8::External::New(this))->GetFunction() ); + _wrapper = Persistent< v8::Function >::New( getObjectWrapperTemplate()->GetFunction() ); - - _global->Set(v8::String::New("gc"), v8::FunctionTemplate::New(GCV8)->GetFunction() ); + + _global->Set(v8::String::New("gc"), newV8Function< GCV8 >()->GetFunction() ); installDBTypes( _global ); } - V8Scope::~V8Scope(){ - Locker l; - Context::Scope context_scope( _context ); + V8Scope::~V8Scope() { + V8Lock l; + Context::Scope context_scope( _context ); _wrapper.Dispose(); _this.Dispose(); for( unsigned i = 0; i < _funcs.size(); ++i ) @@ -79,7 +99,7 @@ namespace mongo { } Handle< Value > V8Scope::nativeCallback( const Arguments &args ) { - Locker l; + V8Lock l; 
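        // The code below retrieves the NativeFunction pointer stored on the callee
        // (property "_native_function"), marshals the JS arguments into a BSONObj,
        // invokes the native function, and converts any C++ exception it throws into
        // a JS exception via v8::ThrowException.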
HandleScope handle_scope; Local< External > f = External::Cast( *args.Callee()->Get( v8::String::New( "_native_function" ) ) ); NativeFunction function = (NativeFunction)(f->Value()); @@ -93,16 +113,18 @@ namespace mongo { BSONObj ret; try { ret = function( nativeArgs ); - } catch( const std::exception &e ) { + } + catch( const std::exception &e ) { return v8::ThrowException(v8::String::New(e.what())); - } catch( ... ) { - return v8::ThrowException(v8::String::New("unknown exception")); + } + catch( ... ) { + return v8::ThrowException(v8::String::New("unknown exception")); } return handle_scope.Close( mongoToV8Element( ret.firstElement() ) ); } Handle< Value > V8Scope::loadCallback( const Arguments &args ) { - Locker l; + V8Lock l; HandleScope handle_scope; Handle field = Handle::Cast(args.Data()); void* ptr = field->Value(); @@ -120,46 +142,46 @@ namespace mongo { // ---- global stuff ---- - void V8Scope::init( BSONObj * data ){ - Locker l; + void V8Scope::init( const BSONObj * data ) { + V8Lock l; if ( ! data ) return; - + BSONObjIterator i( *data ); - while ( i.more() ){ + while ( i.more() ) { BSONElement e = i.next(); setElement( e.fieldName() , e ); } } - - void V8Scope::setNumber( const char * field , double val ){ + + void V8Scope::setNumber( const char * field , double val ) { V8_SIMPLE_HEADER _global->Set( v8::String::New( field ) , v8::Number::New( val ) ); } - void V8Scope::setString( const char * field , const char * val ){ + void V8Scope::setString( const char * field , const char * val ) { V8_SIMPLE_HEADER _global->Set( v8::String::New( field ) , v8::String::New( val ) ); } - void V8Scope::setBoolean( const char * field , bool val ){ + void V8Scope::setBoolean( const char * field , bool val ) { V8_SIMPLE_HEADER _global->Set( v8::String::New( field ) , v8::Boolean::New( val ) ); } - void V8Scope::setElement( const char *field , const BSONElement& e ){ + void V8Scope::setElement( const char *field , const BSONElement& e ) { V8_SIMPLE_HEADER _global->Set( v8::String::New( field ) , mongoToV8Element( e ) ); } - void V8Scope::setObject( const char *field , const BSONObj& obj , bool readOnly){ + void V8Scope::setObject( const char *field , const BSONObj& obj , bool readOnly) { V8_SIMPLE_HEADER // Set() accepts a ReadOnly parameter, but this just prevents the field itself // from being overwritten and doesn't protect the object stored in 'field'. 
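        // Consequently readOnly is forwarded to mongoToV8() below, which is presumably
        // where the converted object's own properties receive their read-only protection.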
_global->Set( v8::String::New( field ) , mongoToV8( obj, false, readOnly) ); } - int V8Scope::type( const char *field ){ + int V8Scope::type( const char *field ) { V8_SIMPLE_HEADER Handle v = get( field ); if ( v->IsNull() ) @@ -178,7 +200,7 @@ namespace mongo { return NumberInt; if ( v->IsNumber() ) return NumberDouble; - if ( v->IsExternal() ){ + if ( v->IsExternal() ) { uassert( 10230 , "can't handle external yet" , 0 ); return -1; } @@ -190,36 +212,36 @@ namespace mongo { throw UserException( 12509, (string)"don't know what this is: " + field ); } - v8::Handle V8Scope::get( const char * field ){ + v8::Handle V8Scope::get( const char * field ) { return _global->Get( v8::String::New( field ) ); } - double V8Scope::getNumber( const char *field ){ + double V8Scope::getNumber( const char *field ) { V8_SIMPLE_HEADER return get( field )->ToNumber()->Value(); } - int V8Scope::getNumberInt( const char *field ){ + int V8Scope::getNumberInt( const char *field ) { V8_SIMPLE_HEADER return get( field )->ToInt32()->Value(); } - long long V8Scope::getNumberLongLong( const char *field ){ + long long V8Scope::getNumberLongLong( const char *field ) { V8_SIMPLE_HEADER return get( field )->ToInteger()->Value(); } - string V8Scope::getString( const char *field ){ + string V8Scope::getString( const char *field ) { V8_SIMPLE_HEADER return toSTLString( get( field ) ); } - bool V8Scope::getBoolean( const char *field ){ + bool V8Scope::getBoolean( const char *field ) { V8_SIMPLE_HEADER return get( field )->ToBoolean()->Value(); } - - BSONObj V8Scope::getObject( const char * field ){ + + BSONObj V8Scope::getObject( const char * field ) { V8_SIMPLE_HEADER Handle v = get( field ); if ( v->IsNull() || v->IsUndefined() ) @@ -227,21 +249,28 @@ namespace mongo { uassert( 10231 , "not an object" , v->IsObject() ); return v8ToMongo( v->ToObject() ); } - + // --- functions ----- - Local< v8::Function > V8Scope::__createFunction( const char * raw ){ - for(; isspace( *raw ); ++raw ); // skip whitespace + bool hasFunctionIdentifier( const string& code ) { + if ( code.size() < 9 || code.find( "function" ) != 0 ) + return false; + + return code[8] == ' ' || code[8] == '('; + } + + Local< v8::Function > V8Scope::__createFunction( const char * raw ) { + raw = jsSkipWhiteSpace( raw ); string code = raw; - if ( code.find( "function" ) == string::npos ){ - if ( code.find( "\n" ) == string::npos && - ! hasJSReturn( code ) && - ( code.find( ";" ) == string::npos || code.find( ";" ) == code.size() - 1 ) ){ + if ( !hasFunctionIdentifier( code ) ) { + if ( code.find( "\n" ) == string::npos && + ! hasJSReturn( code ) && + ( code.find( ";" ) == string::npos || code.find( ";" ) == code.size() - 1 ) ) { code = "return " + code; } code = "function(){ " + code + "}"; } - + int num = _funcs.size() + 1; string fn; @@ -250,29 +279,30 @@ namespace mongo { ss << "_funcs" << num; fn = ss.str(); } - + code = fn + " = " + code; TryCatch try_catch; - Handle