Skip to content

Instantly share code, notes, and snippets.

@comerford
Created February 27, 2014 13:27
Show Gist options
  • Save comerford/9249951 to your computer and use it in GitHub Desktop.
Save comerford/9249951 to your computer and use it in GitHub Desktop.
Creating an odd chunk distribution in MongoDB - mistaken pre-split
// NOTE: this is a step-by-step walkthrough, not a single runnable script: lines that
// begin with "./mongo" are OS command-line invocations, the rest is typed into the mongo shell.
// Start a mongo shell from the command line without connecting to a database
./mongo --nodb
// In that shell, start a new 2-shard test cluster with a chunk size of 1 (MB)
cluster = new ShardingTest({shards: 2, chunksize: 1});
// Open another shell on port 30999 (presumably the test cluster's mongos - confirm);
// the previous shell will be full of logging and is not actually connected to anything
./mongo --port 30999
// Stop the balancer, then query its state to confirm it is off
sh.stopBalancer()
sh.getBalancerState()
// Select the test DB and enable sharding on it
use chunktest;
sh.enableSharding("chunktest");
// Need a GUID-like function: returns 32 lowercase hex characters (8 segments of 4).
// Bit hacky - this is NOT an RFC 4122 UUID and Math.random() is not
// cryptographically secure - but it is good enough for random test _id values.
function GUID () {
    // One random 16-bit segment rendered as exactly four hex digits.
    // toString(16) drops leading zeros, so without the left padding any value
    // below 0x1000 would come out as 1-3 digits: ids would vary in length and
    // the first character would almost never be "0".
    var S4 = function () {
        var hex = Math.floor(
            Math.random() * 0x10000 /* 65536 */
        ).toString(16);
        return ("0000" + hex).slice(-4);
    };
    var id = "";
    for (var segment = 0; segment < 8; segment++) {
        id += S4();
    }
    return id;
}
// Load 10,000,000 documents *before* trying the pre-split.
// NOTE: inserting first is normally a mistake; it is done on purpose here.
for (var i = 0; i < 10000000; i++) {
    db.prefixsplit.insert({
        "_id" : GUID(),
        "date" : new Date(),
        "otherID" : new ObjectId()
    });
}
// Shard the collection on _id (ascending). Per the original note, doing this on a
// collection that already holds data causes initial splits, lots of them.
sh.shardCollection("chunktest.prefixsplit", {"_id" : 1});
// Now attempt the "pre"-split, after the initial split has already happened.
// A split is requested at every 3-hex-digit prefix whose last digit is 0, 4, 8 or c
// (16 * 16 * 4 = 1024 requests, in ascending order), each followed by zero padding.
var zeroPadding = "00000000000000000000000000000";
for ( var n = 0; n < 0x1000; n += 4 ) {
    // left-pad so that e.g. 4 becomes "004", matching one hex digit per position
    var hexPrefix = ("00" + n.toString(16)).slice(-3);
    db.adminCommand( { split : "chunktest.prefixsplit" , middle : { _id : '' + hexPrefix + zeroPadding } } );
}
// Now check out the chunk info - the distribution will look pretty weird.
// Define the helper below, then run: AllChunkInfo("chunktest.prefixsplit")
/**
 * Print size statistics for every chunk of a sharded collection, then a summary.
 *
 * Output is a CSV header ("ChunkID,ChunkSize,ObjectsInChunk"), one CSV line per
 * chunk (ordered by the chunk's min bound), and a summary with totals and averages.
 * Sizes come from the dataSize command run with estimate:true.
 *
 * Relies on the shell globals `db` and `print`; `db` must be able to read the
 * "config" database (e.g. a connection through mongos).
 *
 * @param {string} ns - full namespace of the collection, "<database>.<collection>"
 */
function AllChunkInfo(ns) {
    var configDB = db.getSiblingDB("config");

    // The shard key pattern and the data database are the same for every chunk
    // of ns, so resolve them once instead of once per chunk.
    var collectionInfo = configDB.collections.findOne({_id : ns});
    if (!collectionInfo || !collectionInfo.key) {
        print("No sharded collection metadata found for namespace: " + ns);
        return;
    }
    var key = collectionInfo.key; // needed for the dataSize call
    var dataDB = db.getSiblingDB(ns.split(".")[0]);

    // all chunks for the ns, ordered by min
    var chunks = configDB.chunks.find({"ns" : ns}).sort({min : 1});

    // counters for the overall stats at the end
    var totalChunks = 0;
    var totalSize = 0;
    var totalEmpty = 0;
    var totalFailed = 0;

    // Average that stays readable when there is nothing to average over
    var average = function (total, count) {
        return count > 0 ? (total / count) : "n/a";
    };

    print("ChunkID,ChunkSize,ObjectsInChunk");
    // iterate over all the chunks, print out info for each
    chunks.forEach(function printChunkInfo(chunk) {
        totalChunks++;
        var dataSizeResult = dataDB.runCommand({
            dataSize : ns,
            keyPattern : key,
            min : chunk.min,
            max : chunk.max,
            estimate : true
        });
        // A failed command carries no size; adding it would turn the totals into NaN
        if (!dataSizeResult || !dataSizeResult.ok) {
            totalFailed++;
            print(chunk._id + ",error,error");
            return;
        }
        // coerce in case the shell hands back a numeric wrapper rather than a plain number
        var size = Number(dataSizeResult.size);
        print(chunk._id + "," + size + "," + dataSizeResult.numObjects);
        totalSize += size;
        if (size === 0) { // count empty chunks for summary
            totalEmpty++;
        }
    });

    var measuredChunks = totalChunks - totalFailed;
    print("***********Summary Chunk Information***********");
    print("Total Chunks: " + totalChunks);
    print("Average Chunk Size (bytes): " + average(totalSize, measuredChunks));
    print("Empty Chunks: " + totalEmpty);
    print("Average Chunk Size (non-empty): " + average(totalSize, measuredChunks - totalEmpty));
    if (totalFailed > 0) {
        print("Chunks with failed dataSize: " + totalFailed);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment