Skip to content

Instantly share code, notes, and snippets.

@gerigk
Created June 3, 2012 14:00
Show Gist options
  • Save gerigk/2863607 to your computer and use it in GitHub Desktop.
Bootstrap file to load binaries for pandas and dependencies
#!/bin/bash
#
# EMR bootstrap script: install pandas and its dependencies on a node from
# pre-built binary tarballs stored in S3, then run the pandas test suite and
# upload the results back to S3 (so when starting hundreds of instances you
# can verify each one produced a working pandas build).
#
# Required configuration: set `bucketname` below to your S3 bucket.
# Assumes: hadoop CLI on PATH, passwordless sudo, tarballs pre-built for this
# machine image (each archive only needs `make install` / `setup.py install`).

# Fail fast: without this, a failed download or install silently continues
# and later steps break in confusing ways.
set -euo pipefail

###################
# configuration here
###################
bucketname="your_bucket_name"
readonly bucketname
readonly s3_base="s3://${bucketname}/emr_resources"
##########################

cd /home/hadoop

# First we set two vars... I had errors without this: the freshly installed
# libraries land in /usr/local/lib, which the loader doesn't search by default.
# ${VAR:-} keeps `set -u` happy when the var was previously unset.
export LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH:-}"
export LD_RUN_PATH="/usr/local/lib:${LD_RUN_PATH:-}"

#######################################
# Download a tarball from S3 and unpack it in the current directory.
# Arguments: $1 - S3 path relative to ${s3_base}; $2 - local archive name
#######################################
fetch_and_unpack() {
  local s3_path=$1 archive=$2
  hadoop fs -get "${s3_base}/${s3_path}" "${archive}"
  tar -vxf "${archive}"
}

#######################################
# Fetch a pre-built native library and `make install` it.
# Arguments: $1 - S3 subpath; $2 - archive name; $3 - directory to install from
#######################################
install_native() {
  local s3_path=$1 archive=$2 dir=$3
  fetch_and_unpack "${s3_path}" "${archive}"
  # Subshell: cd does not leak back into the caller.
  ( cd "${dir}" && sudo make install )
}

#######################################
# Fetch a Python package tarball and install it with setup.py.
# Arguments: $1 - S3 subpath; $2 - archive name; $3 - unpacked directory name
#######################################
install_python_pkg() {
  local s3_path=$1 archive=$2 dir=$3
  fetch_and_unpack "${s3_path}" "${archive}"
  ( cd "${dir}" && sudo python setup.py install )
}

# sqlite3 is needed for python but somehow isn't detected
install_native "python_binaries/sqlite.tar.gz" sqlite.tar.gz sqlite-autoconf-3070603

# Now Python 2.7 itself; installing it builds the necessary libraries.
fetch_and_unpack "python_binaries/Python-2.7.3.tar.gz" Python-2.7.3.tar.gz
( cd Python-2.7.3 && sudo make install )
# Point /usr/bin/python at the new interpreter. -f so a missing link
# doesn't abort the script under `set -e`.
sudo rm -f /usr/bin/python
sudo ln -s /usr/bin/python2.7 /usr/bin/python

# Install setuptools so setup.py-based installs below work.
wget http://peak.telecommunity.com/dist/ez_setup.py
sudo python ez_setup.py

# ATLAS (BLAS/LAPACK), needed by numpy/scipy.
install_native "libraries/ATLAS.tar.gz" ATLAS.tar.gz ATLAS/build

# HDF5 stack: szip first, then hdf5 itself.
install_native "libraries/szip-2.1.tar.gz" szip-2.1.tar.gz szip-2.1
install_native "libraries/hdf5-1.8.9.tar.gz" hdf5-1.8.9.tar.gz hdf5-1.8.9/build

# mrjob is needed of course. simplejson, boto and pyyaml are installed on the
# way; those are fine because they don't take long. Feel free to add binaries
# for pyyaml with libyaml if you need the speed. mrjob is python code only,
# and installing it now means you can run a job already in this session.
install_python_pkg "python_packages/mrjob.tar.gz" mrjob.tar.gz mrjob

# cython is needed to build pandas.
install_python_pkg "python_packages/Cython-0.16.tar.gz" Cython-0.16.tar.gz Cython-0.16

# dateutil and pytz, also needed for pandas (and its tests); both are
# pure-python, no binaries to worry about.
install_python_pkg "python_packages/python-dateutil-1.5.tar.gz" python-dateutil-1.5.tar.gz python-dateutil-1.5
install_python_pkg "python_packages/pytz-2012c.tar.gz" pytz-2012c.tar.gz pytz-2012c

################## we're close
# numpy (needed for the pandas build), scipy, numexpr.
install_python_pkg "python_packages/numpy.tar.gz" numpy.tar.gz numpy
install_python_pkg "python_packages/scipy-0.10.1.tar.gz" scipy-0.10.1.tar.gz scipy-0.10.1
install_python_pkg "python_packages/numexpr-2.0.1.tar.gz" numexpr-2.0.1.tar.gz numexpr-2.0.1

# pytables: lzo compression library first, then tables itself.
install_native "libraries/lzo-2.06.tar.gz" lzo-2.06.tar.gz lzo-2.06
install_python_pkg "python_packages/tables-2.3.1.tar.gz" tables-2.3.1.tar.gz tables-2.3.1

# nose, so we can run the test suite and see whether everything went right.
wget http://pypi.python.org/packages/source/n/nose/nose-1.1.2.tar.gz
tar -vxf nose-1.1.2.tar.gz
( cd nose-1.1.2 && sudo python setup.py install )

# And pandas itself. Installed inline (not via the helper) because the
# original flow runs the tests from inside the unpacked pandas directory.
fetch_and_unpack "python_packages/pandas.tar.gz" pandas.tar.gz
cd pandas
sudo python setup.py install

# Create a unique per-node filename: the eth0 MAC address, colons stripped.
# NOTE(review): assumes the primary interface is named eth0 — verify on the
# target AMI.
unique=$(ip addr show dev eth0 | grep ether | tr -s ' ' | cut -d' ' -f 3 | tr -d ':')

# Run the tests; `|| true` is deliberate — failing tests must not abort the
# script before the results are uploaded, since the upload is how we learn
# whether this node's build works.
nosetests pandas > "${unique}.txt" 2>&1 || true

# Upload test results to S3. This is nice if you start hundreds of instances
# and you still want to know that this setup produces a working pandas build.
hadoop fs -put "${unique}.txt" "${s3_base}/python_packages/pandas_tests/${unique}.txt"
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment