Skip to content

Instantly share code, notes, and snippets.

View weizhou2273's full-sized avatar
🎯
Focusing

William Zhou weizhou2273

🎯
Focusing
  • NYC Data Science Academy
View GitHub Profile
# Download Spark 2.1.1 (pre-built for Hadoop 2.7).
# The version downloaded here must match the archive and folder names used
# below; fetching 2.2.0 made every later step fail with "No such file".
wget https://d3kbcqa49mib13.cloudfront.net/spark-2.1.1-bin-hadoop2.7.tgz
# Unpack Spark into ./spark-2.1.1-bin-hadoop2.7/
tar -xvzf spark-2.1.1-bin-hadoop2.7.tgz
# Remove the downloaded archive
rm spark-2.1.1-bin-hadoop2.7.tgz
# Download the MySQL JDBC driver (Connector/J 5.1.42)
wget https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.42.tar.gz
# Unpack the driver INSIDE the Spark folder: the spark-submit command in this
# file looks for the jar under
# ~/spark-2.1.1-bin-hadoop2.7/mysql-connector-java-5.1.42/
tar -xvzf mysql-connector-java-5.1.42.tar.gz -C spark-2.1.1-bin-hadoop2.7/
# Remove the downloaded archive
rm mysql-connector-java-5.1.42.tar.gz
# Move into the Spark folder
cd spark-2.1.1-bin-hadoop2.7/
# Start the interactive PySpark console
./bin/pyspark
from __future__ import print_function
import sys
import re
from operator import add
import pandas as pd
from pyspark.sql.types import StructField, StructType, StringType
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
import json
# Install cron
sudo apt-get install cron
# Open the crontab for editing (run through sudo, so this is root's crontab)
sudo crontab -e
# Add the entry below to run the task every 2 minutes.
# cron executes commands with /bin/sh, which on Debian/Ubuntu is dash and has
# no `source` builtin, so the script is invoked with bash explicitly.
# NOTE(review): the relative path is resolved from the crontab owner's home
# directory - confirm that capstone/ lives there.
*/2 * * * * /bin/bash ./capstone/run.sh
# Save the crontab file and exit the editor
# Check that the task is registered
sudo crontab -l
# Submit ~/capstone/spark_analysis.py to the local Spark 2.1.1 install.
#   --packages           pulls org.apache.hadoop:hadoop-aws:2.7.1
#   --driver-class-path  puts the MySQL Connector/J jar on the driver classpath
#   --jars               passes the same MySQL Connector/J jar to Spark
# (presumably this is the content of capstone/run.sh invoked from cron - confirm)
~/spark-2.1.1-bin-hadoop2.7/bin/spark-submit \
  --packages org.apache.hadoop:hadoop-aws:2.7.1 \
  --driver-class-path ~/spark-2.1.1-bin-hadoop2.7/mysql-connector-java-5.1.42/mysql-connector-java-5.1.42-bin.jar \
  --jars ~/spark-2.1.1-bin-hadoop2.7/mysql-connector-java-5.1.42/mysql-connector-java-5.1.42-bin.jar \
  ~/capstone/spark_analysis.py