Skip to content

Instantly share code, notes, and snippets.

@aniquetahir
Created November 21, 2017 15:44
Show Gist options
  • Save aniquetahir/e9464437d9257675a2580e8f13f2b306 to your computer and use it in GitHub Desktop.
Save aniquetahir/e9464437d9257675a2580e8f13f2b306 to your computer and use it in GitHub Desktop.
# Set up project: create ~/projects, clone the yellowtaxi and
# datapolygamyutils repositories, install the build tools, and build the
# yellowtaxi jar from its hdfs branch.
# Every cd is checked so a failed step cannot make the following commands
# run in the wrong directory.
cd ~/ || exit 1
# -p keeps a re-run from failing when ~/projects already exists.
mkdir -p projects
cd projects || exit 1
git clone https://github.com/aniquetahir/yellowtaxi.git
git clone https://github.com/aniquetahir/datapolygamyutils.git
# Absolute path of the datapolygamyutils checkout, exported for later steps.
export POLYUTILS="$PWD/datapolygamyutils"
# apt-get is the script-stable front end; -y answers the confirmation prompt
# so the install does not block when run unattended.
sudo apt-get install -y maven axel pypy
cd yellowtaxi || exit 1
git checkout hdfs
mvn install
# Initialize hadoop: create the HDFS directories under /user/root, then upload
# the pickup dataset, the built jar, and heirarchy.ser.
# -p creates missing parent directories and does not fail when the
# directories already exist, so the step can be re-run.
hadoop fs -mkdir -p /user/root/data /user/root/jars
# Download and unpack in /tmp; pushd remembers the current (yellowtaxi)
# directory so the relative paths used after popd still resolve.
pushd /tmp || exit 1
axel -a https://s3-us-west-2.amazonaws.com/anique/yellowdata_pickup.tar.gz
tar zxvf yellowdata_pickup.tar.gz
hadoop fs -put yellowdata_pickup.csv /user/root/data/
popd || exit 1
# The jar is stored in HDFS under a shorter name than the Maven artifact.
hadoop fs -put target/yellowtaxi-1.0-SNAPSHOT-jar-with-dependencies.jar /user/root/jars/yellowtaxi.jar
hadoop fs -put heirarchy.ser /user/root
# Aggravation: run and time one spark-submit per aggravation class, each with
# the same jar and the same two trailing arguments.
for yt_job in Aggravation ZoneAggravation; do
  time spark-submit \
    --class "edu.asu.yellowtaxi.${yt_job}" \
    target/yellowtaxi-1.0-SNAPSHOT-jar-with-dependencies.jar \
    yellowdata_pickup.csv 0
done
# Do not leave the loop variable behind in the shell environment.
unset yt_job
# Intervention: run and time the three intervention jobs.
# The first job previously combined "--master yarn-client" with
# "--deploy-mode cluster"; spark-submit rejects that pair because the
# yarn-client master already implies client deploy mode. The master is now
# plain "yarn" and the deploy mode is stated once.
# NOTE(review): cluster mode was kept because it was the explicit flag; switch
# to "--deploy-mode client" if the driver is meant to run locally.
time spark-submit --master yarn --deploy-mode cluster --class edu.asu.yellowtaxi.Intervention target/yellowtaxi-1.0-SNAPSHOT-jar-with-dependencies.jar yellowdata_pickup.csv 0
time spark-submit --class edu.asu.yellowtaxi.ZoneIntervention target/yellowtaxi-1.0-SNAPSHOT-jar-with-dependencies.jar yellowdata_pickup.csv 0
time spark-submit --class edu.asu.yellowtaxi.HeirarchichalIntervention target/yellowtaxi-1.0-SNAPSHOT-jar-with-dependencies.jar yellowdata_pickup.csv 0
# Fetch data-polygamy, run its sigmod16 preparation script, register the
# yellowdata dataset, and upload the block files to HDFS.
# Each cd is checked: the steps below use relative paths (including a ">"
# redirect), so they must not run from the wrong directory.
cd ~/projects || exit 1
git clone https://github.com/ViDA-NYU/data-polygamy.git
cd data-polygamy/sigmod16/ || exit 1
./prepareSoftware.sh
cd setup || exit 1
# Writes one tab-separated line, replacing any existing datasets.txt.
# printf is used instead of "echo -e", whose escape handling varies by shell.
# NOTE(review): the meaning of 309 is not visible here — confirm against the
# data-polygamy documentation.
printf 'yellowdata\t309\n' > ../../data/datasets.txt
./hdfs_dir
hdfs dfs -put ../../data/block.txt block
hdfs dfs -put ../../data/block-graph.txt block-graph
# Download the January 2016 yellow-taxi trip file, split it into data, header
# and defaults files, reduce each row to eight columns, and upload the three
# files to the HDFS data/ directory.
# Abort early if POLYUTILS is unset; otherwise pypy would be pointed at
# "/datechanger.py".
: "${POLYUTILS:?POLYUTILS must point at the datapolygamyutils checkout}"
pushd /tmp || exit 1
axel -a https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv
# Line 1 is the header; the data file starts at line 3.
# NOTE(review): this also drops line 2 — confirm that line is not a data row.
tail -n +3 yellow_tripdata_2016-01.csv > yellowdata
head -n 1 yellow_tripdata_2016-01.csv > yellowdata.header
# One default value per retained column (eight in total).
echo 'NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE' > yellowdata.defaults
# NOTE(review): datechanger.py presumably rewrites the date fields and appends
# the 20th column selected below — verify against that script.
pypy "$POLYUTILS/datechanger.py" yellowdata newyellowdata
# mv replaces the destination itself, so no separate rm is needed.
mv -f newyellowdata yellowdata
awk -F ',' '{print $2","$4","$5","$6","$7","$16","$19","$20}' yellowdata > yellowdatastripped
# Strip a trailing carriage return explicitly rather than relying on a
# multi-character RS, which POSIX awk does not define; this also handles an
# LF-only header. The last header field is the literal name tip_percentage.
awk -F ',' '{sub(/\r$/, ""); print $2","$4","$5","$6","$7","$16","$19",tip_percentage"}' yellowdata.header > yellowdatastripped.header
mv -f yellowdatastripped yellowdata
mv -f yellowdatastripped.header yellowdata.header
hadoop fs -put yellowdata data/
hadoop fs -put yellowdata.defaults data/
hadoop fs -put yellowdata.header data/
popd || exit 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment