Skip to content

Instantly share code, notes, and snippets.

@9b
Created September 22, 2011 17:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 9b/1235452 to your computer and use it in GitHub Desktop.
Save 9b/1235452 to your computer and use it in GitHub Desktop.
Generate a batch of simple MapReduce jobs, one per leaf field, and write each job's results to its own output collection
import simplejson as json
from pymongo import Connection
from bson.code import Code
#cheap connection
def connect_to_mongo(host, port, database, collection):
    """Return the handle for ``collection`` in ``database`` on a MongoDB server.

    Opens a pymongo ``Connection`` to ``host``/``port`` and indexes it first
    by the database name and then by the collection name.
    """
    return Connection(host, port)[database][collection]
# Handle to the "targeted" collection of the "pdfs" database on
# localhost:27017.
con = connect_to_mongo(
    host="localhost",
    port=27017,
    database="pdfs",
    collection="targeted"
)
#define the leaf nodes (this should be automated)
def _leaf_set(set_name, leaf_names):
    """Build one entry of ``content_objects_set["sets"]``.

    Each entry pairs a ``set_name`` with a ``set_values`` list holding one
    ``{"name": leaf}`` dict per leaf name, in the order given.
    """
    return {
        "set_name": set_name,
        "set_values": [{"name": leaf} for leaf in leaf_names],
    }


content_objects_set = {
    "sets": [
        _leaf_set(
            "contents.objects",
            (
                "encrypted",
                "id",
                "offset",
                "raw",
                "raw_hash",
                "size",
                "stream.decoded_hash",
                "stream.decoded_stream",
                "stream.encoded_hash",
                "stream.encoded_stream",
                "stream.filter",
                "stream.size",
            ),
        ),
        _leaf_set(
            "versions",
            (
                "author",
                "creation_date",
                "creator",
                "modification_date",
                "producer",
            ),
        ),
    ]
}
# Reduce step shared by every job, so it is built once instead of once per
# field. MongoDB may pass the result of one reduce call back into a later
# reduce call for the same key, so the value returned here has the same shape
# as the values emitted by the map step: {hashes: [...], count: N}. Hashes
# are merged without duplicates and count is the number of distinct hashes.
_REDUCE_FUNCTION = Code(
    "function (key, values) {"
    "var hashes = [];"
    "for (var i = 0; i < values.length; i++) {"
    "var incoming = values[i].hashes;"
    "for (var j = 0; j < incoming.length; j++) {"
    "if (hashes.indexOf(incoming[j]) < 0) {"
    "hashes.push(incoming[j]);"
    "}"
    "}"
    "}"
    "return { hashes: hashes, count: hashes.length };"
    "}"
)


def _build_map_function(set_name, value_name):
    """Return the JavaScript map step for one ``set_name``/``value_name`` pair.

    The generated function reads the document's ``hash_data.file.md5``,
    iterates ``this.<set_name>`` with ``forEach`` and, for each element,
    emits the element's ``<value_name>`` as the key with
    ``{hashes: [md5], count: 1}`` as the value.

    NOTE(review): both names are spliced into the JavaScript source as-is;
    dotted names such as ``stream.filter`` assume the intermediate object
    exists on every element -- confirm against the stored documents.
    """
    return Code(
        "function () {"
        "var hash = this.hash_data.file.md5;"
        "this." + set_name + ".forEach("
        "function (z) {"
        "emit(z." + value_name + ", {hashes: [hash], count: 1});"
        "}"
        ");"
        "}"
    )


# Run one map/reduce job per leaf field of every configured set. The output
# collection of each job is named "<set_name>_<value_name>".
for leaf_set in content_objects_set["sets"]:
    set_name = leaf_set["set_name"]
    for leaf in leaf_set["set_values"]:
        value_name = leaf["name"]
        written = set_name + "_" + value_name
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print("[*] Executing job: " + written)
        con.map_reduce(
            _build_map_function(set_name, value_name),
            _REDUCE_FUNCTION,
            written,
        )
        print("[*] " + set_name + ":" + value_name + " job ran")  # sanity print
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment