@gerigk
Created June 3, 2012 13:48
Build binaries to run Pandas with EMR
#!/bin/bash
####################
# configuration here
####################
bucketname="my_bucket_name"
##########################
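# (optional sanity check, not part of the original gist) fail fast if the bucket name is wrong,
# so we don't discover a typo only after an hour of compiling
hadoop fs -ls s3://${bucketname}/ >/dev/null || { echo "cannot access s3://${bucketname}" >&2; exit 1; }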
cd /home/hadoop
# first we set two vars; I had errors without this
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_RUN_PATH=/usr/local/lib:$LD_RUN_PATH
# sqlite3 is needed for Python but somehow isn't detected
wget http://www.sqlite.org/sqlite-autoconf-3070603.tar.gz
tar -vxf sqlite-autoconf-3070603.tar.gz
cd sqlite-autoconf-3070603
./configure
make
cd ..
tar -czf sqlite.tar.gz sqlite-autoconf-3070603
hadoop fs -put sqlite.tar.gz s3://${bucketname}/emr_resources/python_binaries/sqlite.tar.gz
cd sqlite-autoconf-3070603
sudo make install
cd ..
#now python itself
wget http://www.python.org/ftp/python/2.7.3/Python-2.7.3.tar.bz2
tar -xjf Python-2.7.3.tar.bz2
cd Python-2.7.3
./configure
## I had a weird pandas build error when enabling --with-pydebug
make -s
cd ..
tar -czf Python-2.7.3.tar.gz Python-2.7.3
hadoop fs -put Python-2.7.3.tar.gz s3://${bucketname}/emr_resources/python_binaries/Python-2.7.3.tar.gz
######## this uploads the binaries for the next time we run the normal bootstrap script
cd Python-2.7.3
#now we install python to build the necessary libraries
sudo make install
# make our version the default one
sudo rm /usr/bin/python
sudo ln -s /usr/bin/python2.7 /usr/bin/python
# if you build python using --enable-shared then you have to set these links
#sudo ln -s /usr/local/lib/libpython2.7.so.1.0 /usr/lib/
#sudo ln -s /usr/local/lib/libpython2.7.so /usr/
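# (optional check, not in the original script) confirm the symlink now resolves to a 2.7 interpreter
python -V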
cd ..
# install setup tools
wget http://peak.telecommunity.com/dist/ez_setup.py
sudo python ez_setup.py
# use the following only if you have a lot of time. Also, because of CPU throttling I wouldn't recommend building ATLAS on EC2
# unless you find a way to turn it off. I used a machine that runs Debian stable and the same version of gcc;
# binaries built with later gccs won't run on EC2.
#wget http://www.netlib.org/lapack/lapack-3.4.1.tgz
#wget http://downloads.sourceforge.net/project/math-atlas/Developer%20%28unstable%29/3.9.76/atlas3.9.76.tar.bz2
#tar -vxf atlas3.9.76.tar.bz2
#cd ATLAS
#mkdir build
#cd build
################################## -t 2 means 2 threads; depending on the EC2 instance you can choose more threads
### -V 448 means SSE1/2/3 support and -A 14 selects the target architecture. Check the ATLAS documentation for more information.
#../configure -b 64 -V 448 -A 14 -t 2 --with-netlib-lapack-tarfile=/home/$USER/lapack-3.4.1.tgz --shared
#make
#make check
#make time
#cd lib
#make shared
#make ptshared
#cd ..
#cd ..
#cd ..
#tar -czf ATLAS.tar.gz ATLAS
#hadoop fs -put ATLAS.tar.gz s3://${bucketname}/emr_resources/libraries/ATLAS.tar.gz
# start here if you downloaded my binaries or built them yourself.
hadoop fs -get s3://${bucketname}/emr_resources/libraries/ATLAS.tar.gz ATLAS.tar.gz
tar -vxf ATLAS.tar.gz
cd ATLAS/build
sudo make install
cd ..
cd ..
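# (optional check, not in the original) the numpy site.cfg further down assumes ATLAS was installed
# under /usr/local/atlas, so make sure the libraries actually ended up there
ls /usr/local/atlas/lib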
#hdf5
# first szlib
wget http://www.hdfgroup.org/ftp/lib-external/szip/2.1/src/szip-2.1.tar.gz
tar -vxf szip-2.1.tar.gz
cd szip-2.1
./configure
make
cd ..
tar -czf szip-2.1.tar.gz szip-2.1
hadoop fs -put szip-2.1.tar.gz s3://${bucketname}/emr_resources/libraries/szip-2.1.tar.gz
cd szip-2.1
sudo make install
cd ..
##### now hdf5
wget http://www.hdfgroup.org/ftp/HDF5/current/src/hdf5-1.8.9.tar.gz
tar -vxf hdf5-1.8.9.tar.gz
cd hdf5-1.8.9
mkdir build
cd build
../configure --prefix=/usr/local --enable-fortran --enable-cxx --with-szlib=/home/hadoop/szip-2.1/szip/lib
make
cd ..
cd ..
tar -czf hdf5-1.8.9.tar.gz hdf5-1.8.9
hadoop fs -put hdf5-1.8.9.tar.gz s3://${bucketname}/emr_resources/libraries/hdf5-1.8.9.tar.gz
cd hdf5-1.8.9/build
sudo make install
cd ..
cd ..
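# (optional check, not in the original) confirm the HDF5 tools landed on the PATH before PyTables is built against it later
which h5dump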
# mrjob is needed of course. simplejson, boto and pyyaml are installed along the way;
# those are fine because they don't take long to install. Feel free to add binaries
# for pyyaml with libyaml if you need the speed.
git clone https://github.com/Yelp/mrjob.git
tar -czf mrjob.tar.gz mrjob
hadoop fs -put mrjob.tar.gz s3://${bucketname}/emr_resources/python_packages/mrjob.tar.gz
#### this prepares mrjob to be installed next time; mrjob is pure Python code
#### now we install it so you can run a job already in this session
cd mrjob
sudo python setup.py install
cd ..
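# (optional check, not part of the original script) make sure mrjob and the dependencies pulled in by
# its installer (simplejson, boto, pyyaml) import cleanly under the new interpreter
python -c "import mrjob, simplejson, boto, yaml"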
# cython is needed in order to build pandas
wget http://cython.org/release/Cython-0.16.tar.gz
tar -vxf Cython-0.16.tar.gz
cd Cython-0.16
python setup.py build
cd ..
tar -czf Cython-0.16.tar.gz Cython-0.16
hadoop fs -put Cython-0.16.tar.gz s3://${bucketname}/emr_resources/python_packages/Cython-0.16.tar.gz
# this prepared the binaries
# now we install Cython in order to build pandas
cd Cython-0.16
sudo python setup.py install
cd ..
# dateutil also needed for pandas
wget http://labix.org/download/python-dateutil/python-dateutil-1.5.tar.gz
hadoop fs -put python-dateutil-1.5.tar.gz s3://${bucketname}/emr_resources/python_packages/python-dateutil-1.5.tar.gz
#dateutil doesn't contain any non-python code
#now we install it for pandas
tar -vxf python-dateutil-1.5.tar.gz
cd python-dateutil-1.5
sudo python setup.py install
cd ..
# the same with pytz
wget http://pypi.python.org/packages/source/p/pytz/pytz-2012c.tar.gz
hadoop fs -put pytz-2012c.tar.gz s3://${bucketname}/emr_resources/python_packages/pytz-2012c.tar.gz
#install for the pandas tests
tar -vxf pytz-2012c.tar.gz
cd pytz-2012c
sudo python setup.py install
cd ..
################## we're close
################### time for numpy
### if you don't trust numpy 1.7 yet, use the commented numpy-1.6.2 lines instead
#wget http://sourceforge.net/projects/numpy/files/NumPy/1.6.2/numpy-1.6.2.tar.gz
#tar -vxf numpy-1.6.2.tar.gz
#cd numpy-1.6.2
git clone https://github.com/numpy/numpy.git
cd numpy
# create the site.cfg so numpy builds with atlas
cat >site.cfg <<HEREDOC
[DEFAULT]
library_dirs = /usr/local/atlas/lib
include_dirs = /usr/local/atlas/include
[blas_opt]
libraries = ptf77blas, ptcblas, atlas
[lapack_opt]
libraries = lapack, ptf77blas, ptcblas, atlas
HEREDOC
#and finally build numpy
python setup.py build
cd ..
tar -czf numpy.tar.gz numpy
hadoop fs -put numpy.tar.gz s3://${bucketname}/emr_resources/python_packages/numpy.tar.gz
# now we install numpy for the pandas build
cd numpy
sudo python setup.py install
cd ..
# again, if you want to use the stable numpy 1.6.2, use these lines instead
#tar -czf numpy-1.6.2.tar.gz numpy-1.6.2
#hadoop fs -put numpy-1.6.2.tar.gz s3://${bucketname}/emr_resources/python_packages/numpy-1.6.2.tar.gz
# now we install numpy for the pandas build
#cd numpy-1.6.2
#sudo python setup.py install
#cd ..
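# (optional check, not in the original) verify that numpy really picked up the ATLAS libraries from site.cfg
python -c "import numpy; numpy.show_config()"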
#scipy
wget http://sourceforge.net/projects/scipy/files/scipy/0.10.1/scipy-0.10.1.tar.gz
tar -vxf scipy-0.10.1.tar.gz
cd scipy-0.10.1
python setup.py build
cd ..
tar -czf scipy-0.10.1.tar.gz scipy-0.10.1
hadoop fs -put scipy-0.10.1.tar.gz s3://${bucketname}/emr_resources/python_packages/scipy-0.10.1.tar.gz
cd scipy-0.10.1
sudo python setup.py install
cd ..
#numexpr
wget http://numexpr.googlecode.com/files/numexpr-2.0.1.tar.gz
tar -vxf numexpr-2.0.1.tar.gz
cd numexpr-2.0.1
python setup.py build
cd ..
tar -czf numexpr-2.0.1.tar.gz numexpr-2.0.1
hadoop fs -put numexpr-2.0.1.tar.gz s3://${bucketname}/emr_resources/python_packages/numexpr-2.0.1.tar.gz
cd numexpr-2.0.1
sudo python setup.py install
cd ..
#pytables
#lzo compression
wget http://www.oberhumer.com/opensource/lzo/download/lzo-2.06.tar.gz
tar -vxf lzo-2.06.tar.gz
cd lzo-2.06
./configure --enable-shared
make
cd ..
tar -czf lzo-2.06.tar.gz lzo-2.06
hadoop fs -put lzo-2.06.tar.gz s3://${bucketname}/emr_resources/libraries/lzo-2.06.tar.gz
cd lzo-2.06
sudo make install
cd ..
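# (optional, not in the original script) refresh the dynamic linker cache so the freshly installed
# shared liblzo in /usr/local/lib is found by the PyTables build below
sudo ldconfig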
### now pytables
wget http://downloads.sourceforge.net/project/pytables/pytables/2.3.1/tables-2.3.1.tar.gz
tar -vxf tables-2.3.1.tar.gz
cd tables-2.3.1
python setup.py build
cd ..
tar -czf tables-2.3.1.tar.gz tables-2.3.1
hadoop fs -put tables-2.3.1.tar.gz s3://${bucketname}/emr_resources/python_packages/tables-2.3.1.tar.gz
cd tables-2.3.1
sudo python setup.py install
cd ..
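# (optional check, not in the original) confirm PyTables imports and sees the HDF5/compression libraries
python -c "import tables; tables.print_versions()"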
# nosetests to see whether everything went right
wget http://pypi.python.org/packages/source/n/nose/nose-1.1.2.tar.gz
tar -vxf nose-1.1.2.tar.gz
cd nose-1.1.2
sudo python setup.py install
cd ..
# and pandas
git clone https://github.com/pydata/pandas.git
cd pandas
python setup.py build
python setup.py build_ext --inplace
cd ..
tar -czf pandas.tar.gz pandas
hadoop fs -put pandas.tar.gz s3://${bucketname}/emr_resources/python_packages/pandas.tar.gz
cd pandas
sudo python setup.py install
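# (optional check, not part of the original script) a quick import before running the full test suite
python -c "import pandas; print pandas.__version__"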
# create a unique filename
unique=`ip addr show dev eth0 | grep ether | tr -s ' ' | cut -d' ' -f 3 | tr -d ':'`
nosetests pandas >${unique}.txt 2>&1
#upload the result to s3
hadoop fs -put ${unique}.txt s3://${bucketname}/emr_resources/python_packages/pandas_tests/${unique}.txt
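# (hypothetical sketch, not part of this gist) the "normal bootstrap script" mentioned above would
# mirror this one but pull the pre-built tarballs from S3 instead of compiling, roughly like:
#
#   cd /home/hadoop
#   hadoop fs -get s3://${bucketname}/emr_resources/python_binaries/Python-2.7.3.tar.gz .
#   tar -xzf Python-2.7.3.tar.gz && cd Python-2.7.3 && sudo make install && cd ..
#   hadoop fs -get s3://${bucketname}/emr_resources/python_packages/pandas.tar.gz .
#   tar -xzf pandas.tar.gz && cd pandas && sudo python setup.py install && cd ..
#
# (paths and order are assumptions; repeat for the other libraries in dependency order)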
exit