Skip to content

Instantly share code, notes, and snippets.

@johnlpage
Last active December 8, 2015 13:19
Show Gist options
  • Save johnlpage/e0bb9971f4f1c4ed3a09 to your computer and use it in GitHub Desktop.
Save johnlpage/e0bb9971f4f1c4ed3a09 to your computer and use it in GitHub Desktop.
Data Loading and Aggregation demo in MongoDB

#Start a machine. Use any method you like here; I'm doing it from the command line: 40 cores, 120GB RAM, SSD with 8,500 IOPS (SAN) - Amazon Linux (CentOS)

ec2-run-instances ami-a10897d6 -t m4.10xlarge -g jlp -k john_page_demos -b "/dev/xvdb=:725:true:io1:8500"

#Log on

ssh -L 27017:localhost:27017 -i <your public key>.pem ec2-user@<your ip address>

#Set up disk

# Open a root shell; the commands below run as root until the later `exit`.
sudo -s
# Format /dev/xvdb as ext4 with lazy inode-table and lazy journal
# initialisation both switched off (=0).
mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/xvdb
# Mount the new filesystem at /data and hand ownership to ec2-user.
mkdir /data
mount /dev/xvdb /data
chown ec2-user /data

#Download MongoDB

vi /etc/yum.repos.d/mongodb-org-3.0.repo
[mongodb-org-3.0]
name=MongoDB Repository
baseurl=https://repo.mongodb.org/yum/amazon/2013.03/mongodb-org/3.0/x86_64/
gpgcheck=0
enabled=1
yum install -y mongodb-org
service mongod stop

#Setup Server Kernel etc.

# Write "never" to both transparent-hugepage controls (enabled and defrag).
echo never > /sys/kernel/mm/transparent_hugepage/enabled
echo never > /sys/kernel/mm/transparent_hugepage/defrag
# Set the read-ahead value for /dev/xvdb to 32.
blockdev --setra 32 /dev/xvdb
# Append soft and hard "nofile" limits of 20000 under the "*" entry of limits.conf.
echo "* soft nofile 20000" >> /etc/security/limits.conf
echo "* hard nofile 20000" >> /etc/security/limits.conf
# Leave the root shell that was opened with `sudo -s`.
exit

#Start Mongo Instances

# Launch 40 mongod processes, one per shard directory, on ports 27101-27140.
for s in $(seq 1 40)
do
 shard_dir=/data/shard$s
 port=$((s + 27100))
 mkdir $shard_dir
 numactl --interleave=all mongod \
  --storageEngine=wiredTiger \
  --wiredTigerCacheSizeGB=3 \
  --nojournal \
  --port=$port \
  --dbpath=$shard_dir \
  --logpath=/data/log$s \
  --fork
done

#Start Config Server and Router

# Create the data directory used by the config server.
mkdir /data/config
# Start a config server (--configsvr) on port 27019, forked into the background.
numactl --interleave=all  mongod --configsvr --port 27019 --dbpath=/data/config --logpath=/data/config.log --fork
# Start the mongos router against that config server; no --port is passed here.
mongos --configdb=localhost:27019 --logpath=/data/mongos.log --fork

#Start mongo shell and add shards

mongo
// Register each of the 40 local mongod instances (ports 27101-27140) as a shard.
for (var s = 1; s <= 40; s++) {
 var port = s + 27100
 sh.addShard("localhost:" + port)
}
sh.status()
exit

#Download raw data

# Fetch the raw yearly result files into /data/raw.
# -p keeps a re-run from failing when the directory already exists.
mkdir -p /data/raw
cd /data/raw
# The URL is quoted so the shell never treats [2005-2013] as a filename
# pattern; curl expands the numeric range itself and -O saves each file
# under its remote name. The download runs in the background (&); the
# `wait` issued before unzipping blocks until it has finished.
curl -s -O "http://data.dft.gov.uk/anonymised-mot-test/12-03/test_result_[2005-2013].txt.gz" &

#Install Dev tools

cd
sudo yum groupinstall -y "development tools"

#Install MongoDB C Driver

cd
git clone https://github.com/mongodb/mongo-c-driver.git
cd mongo-c-driver
./autogen.sh
make
sudo make install

#Install Fast Loader

cd
git clone http://github.com/johnlpage/FastLoad
cd FastLoad
make

#Unzip Files once downloaded

cd /data/raw
wait

# Decompress every .gz archive in the current directory in parallel: one
# background gunzip per file, then block until all of them have exited.
# "$f" is quoted so a file name containing spaces or glob characters is
# passed to gunzip as a single argument.
for f in *.gz
do
 gunzip "$f" &
done
wait

#Load them in

# Let the loader find shared libraries in /usr/local/lib
# (presumably where `sudo make install` placed the MongoDB C driver - confirm).
export LD_LIBRARY_PATH=/usr/local/lib
# Time the whole load; every file in the current directory is loaded, one
# after another. "$f" is quoted so each file name reaches fastload as a
# single argument.
time for f in *
do
 ~/FastLoad/fastload "$f"
done

#Test connection over ssh tunnel

mongo
sh.status()
use vosa
db.mot_results.findOne()
db.mot_results.count()
exit
mongostat

#Install matplotlib

pip install matplotlib

#Install pymongo

pip install pymongo

#Start python

python
from pymongo import *
from pprint import pprint
from matplotlib import pyplot as pyplot
import time
# Connect with MongoClient's default settings and select the "vosa" database.
client = MongoClient()
db = client["vosa"]
# Sanity checks: total document count, then pretty-print one sample document.
db["mot_results"].count()
doc = db["mot_results"].find_one()
pprint(doc)
# Aggregation expressions. The divisor is the number of milliseconds in a
# 365-day year, so the quotient is the vehicle's age in (fractional) years.
# NOTE(review): assumes TestDate and FirstUseDate are stored as dates so the
# subtraction yields milliseconds - confirm against the loaded data.
MS_PER_YEAR = 1000 * 3600 * 24 * 365
elapsed = {"$subtract": ["$TestDate", "$FirstUseDate"]}
elapsed_years = {"$divide": [elapsed, MS_PER_YEAR]}
# Whole years: subtract the fractional part (x - x mod 1).
whole_years = {"$subtract": [elapsed_years, {"$mod": [elapsed_years, 1]}]}
# 1 when Result is "P", otherwise 0, so passes can be summed later.
passed = {"$cond": [{"$eq": ["$Result", "P"]}, 1, 0]}
project = {
    "$project": {
        "_id": 0,
        "Make": 1,
        "Result": 1,
        "TestDate": 1,
        "Mileage": 1,
        "FirstUseDate": 1,
        "Age": whole_years,
        "pass": passed,
    }
}
# Try the projection on five documents only and show the result.
sample_pipeline = [project, {"$limit": 5}]
results = db["mot_results"].aggregate(sample_pipeline)
pprint(list(results))

# Stages for the full summary run.
# Keep only documents whose TestClass equals 4 (the author's "cars only").
carsonly = {"$match": {"TestClass": {"$eq": 4}}}
# Keep only documents that actually have a FirstUseDate field.
knownage = {"$match": {"FirstUseDate": {"$exists": True}}}
# One output document per (make, age): test count, mean mileage, pass total.
group = {
    "$group": {
        "_id": {"make": "$Make", "age": "$Age"},
        "count": {"$sum": 1},
        "miles": {"$avg": "$Mileage"},
        "passes": {"$sum": "$pass"},
    }
}
# Write the grouped result to the "summary" collection.
out = {"$out": "summary"}

t0 = time.time()
results = db.mot_results.aggregate([carsonly, knownage, project, group, out])
# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print(time.time() - t0)

# Parallel lists: entry i of each list describes the same scatter point.
age = []
reliability = []
labels = []
colours = []

# One point per summary document with more than 2000 tests.
for r in db.summary.find():
    count = r['count']
    if count > 2000:
        # "key" rather than "id" so the builtin id() is not shadowed.
        key = r['_id']
        age.append(key['age'])
        passes = r['passes']
        # float() forces true division on Python 2 as well.
        reliability.append(passes / float(count))
        make = key['make']
        labels.append(make)
        # Derive a numeric colour value from the make name.
        colours.append(hash(make) % 65535)

figure = pyplot.figure()
axis = figure.add_subplot(111)
axis.scatter(age, reliability, c=colours, picker=5, s=80, alpha=0.3)

def onpick(event):
    """Print the label of the first data point hit by a pick event.

    ``event.ind[0]`` is used as an index into the module-level ``labels``
    list, which is filled in the same order as the plotted points.
    """
    # print() with one argument works on both Python 2 and Python 3.
    print(labels[event.ind[0]])

	
figure.canvas.mpl_connect('pick_event',onpick)
pyplot.show()

# Keep only summary documents with at least 2000 tests. Named "min_count"
# rather than "filter" so the builtin filter() is not shadowed.
# NOTE(review): the scatter plots in this file keep count > 2000, while this
# stage uses >= 2000 - confirm whether the difference is intended.
min_count = {"$match": {"count": {"$gte": 2000}}}
# Sort by _id before grouping so each make's "years" list is pushed in _id order.
sort = {"$sort": {"_id": 1}}
groupmake = {
    "$group": {
        "_id": "$_id.make",
        "years": {"$push": {"age": "$_id.age", "miles": "$miles"}},
    }
}
results = db.summary.aggregate([min_count, sort, groupmake])

figure = pyplot.figure()
axis = figure.add_subplot(111)

# Maps each plotted line artist to its make so onpick can look it up.
makes = {}
for r in results:
    make = r['_id']
    yeardata = r['years']
    age = [y['age'] for y in yeardata]
    miles = [y['miles'] for y in yeardata]
    # axis.plot returns a list; element 0 is used as the key for this make.
    tp = axis.plot(age, miles, picker=5)
    makes[tp[0]] = make

def onpick(event):
    """Print the make whose plotted line was picked.

    ``event.artist`` is looked up in the module-level ``makes`` dict, which
    maps each plotted line to its make.
    """
    # print() with one argument works on both Python 2 and Python 3.
    print(makes[event.artist])

figure.canvas.mpl_connect('pick_event',onpick)
pyplot.show()

# Parallel lists: entry i of each list describes the same scatter point.
miles = []
reliability = []
labels = []
colours = []

# One point per summary document with more than 2000 tests.
for r in db.summary.find():
    count = r['count']
    if count > 2000:
        # "key" rather than "id" so the builtin id() is not shadowed.
        key = r['_id']
        miles.append(r['miles'])
        passes = r['passes']
        # float() forces true division on Python 2 as well.
        reliability.append(passes / float(count))
        make = key['make']
        labels.append(make)
        # Derive a numeric colour value from the make name.
        colours.append(hash(make) % 65535)

figure = pyplot.figure()
axis = figure.add_subplot(111)
axis.scatter(miles, reliability, c=colours, picker=5, s=80, alpha=0.3)

def onpick(event):
    """Print the label of the first data point hit by a pick event.

    ``event.ind[0]`` is used as an index into the module-level ``labels``
    list, which is filled in the same order as the plotted points.
    """
    # print() with one argument works on both Python 2 and Python 3.
    print(labels[event.ind[0]])

	
figure.canvas.mpl_connect('pick_event',onpick)
pyplot.show()
@shimirel
Copy link

Before running
pip install matplotlib
you need to run
sudo yum -y install freetype freetype-devel libpng-devel
otherwise you get the error "Command python setup.py egg_info failed with error code 1".

If you use an out-of-the-box install of the Amazon command line tools, you need to add the region to
ec2-run-instances ami-a10897d6 -t m4.10xlarge -g jlp -k john_page_demos -b "/dev/xvdb=:725:true:io1:8500"
e.g.
ec2-run-instances ami-a10897d6 -t m4.10xlarge -g default -k devenv-key -b "/dev/xvdb=:725:true:io1:8500" -region eu-west-1
otherwise it will complain about being unable to find the template "ami-a10897d6".

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment