Skip to content

Instantly share code, notes, and snippets.

Avatar
🎯
Focusing

William Zhou weizhou2273

🎯
Focusing
  • NYC Data Science Academy
View GitHub Profile
View run.sh
# Submit the capstone analysis job (spark_analysis.py) to Spark 2.1.1.
# The hadoop-aws package is pulled in via --packages, and the MySQL
# Connector/J jar is passed both on the driver class path and in --jars.
~/spark-2.1.1-bin-hadoop2.7/bin/spark-submit \
  --packages org.apache.hadoop:hadoop-aws:2.7.1 \
  --driver-class-path ~/spark-2.1.1-bin-hadoop2.7/mysql-connector-java-5.1.42/mysql-connector-java-5.1.42-bin.jar \
  --jars ~/spark-2.1.1-bin-hadoop2.7/mysql-connector-java-5.1.42/mysql-connector-java-5.1.42-bin.jar \
  ~/capstone/spark_analysis.py
View crontab_spark.sh
# Install cron
sudo apt-get install cron
# Open root's crontab in an editor to schedule the task
sudo crontab -e
# Add the entry below in the editor to run the job every 2 minutes.
# cron executes entries with /bin/sh; on Debian/Ubuntu that is dash, which
# has no `source` builtin, so the script is launched with bash explicitly.
*/2 * * * * bash ./capstone/run.sh
# Save the cron file, then list the installed entries to check the task
sudo crontab -l
View spark_analysis.py
from __future__ import print_function
import sys
import re
from operator import add
import pandas as pd
from pyspark.sql.types import StructField, StructType, StringType
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
import json
View run_pyspark.py
# Navigate to the Spark folder and start the interactive PySpark console.
# The two steps are chained with && so that pyspark is not attempted from
# the wrong directory when the cd fails (e.g. Spark was not unpacked here).
cd spark-2.1.1-bin-hadoop2.7/ &&
  ./bin/pyspark
View download_unzip_mysql_driver.sh
# Fetch MySQL Connector/J 5.1.42 from dev.mysql.com, unpack it into the
# current directory, then delete the downloaded archive.
driver_archive=mysql-connector-java-5.1.42.tar.gz

# Download the driver tarball
wget "https://dev.mysql.com/get/Downloads/Connector-J/${driver_archive}"
# Extract it (verbose, gzip-compressed)
tar -x -v -z -f "${driver_archive}"
# Remove the tarball now that it has been extracted
rm "${driver_archive}"
View download_unzip_spark.sh
# Download, unpack and clean up the Spark 2.1.1 (Hadoop 2.7) distribution.
# The archive name is kept in one variable so that the download, extract and
# remove steps always refer to the same file; previously the download fetched
# 2.2.0 while tar/rm looked for 2.1.1, so both of those steps failed.
spark_archive=spark-2.1.1-bin-hadoop2.7.tgz

# Download spark from https://d3kbcqa49mib13.cloudfront.net/spark-2.1.1-bin-hadoop2.7.tgz
wget "https://d3kbcqa49mib13.cloudfront.net/${spark_archive}"
# Unzip spark
tar -xvzf "${spark_archive}"
# remove tgz file
rm "${spark_archive}"
View delete_kinesis.py
import boto3
from moto import mock_kinesis
import credentials
import sys
from botocore.exceptions import ClientError
# Pull the AWS key id and key out of the `aws` mapping exposed by the
# project-local `credentials` module.
aws_key_id, aws_key = credentials.aws['key_id'], credentials.aws['key']
View install_python_package.sh
# Install pip. The Debian/Ubuntu package is named python-pip (python3-pip
# for Python 3); there is no apt package called plain "pip".
sudo apt-get install python-pip
# Third-party packages used by the project
sudo pip install boto3
sudo pip install tweepy
# time, json, os and uuid are part of the Python standard library and must
# not be installed with pip, so no install commands are needed for them.
View setup_EC2_environment.sh
# install java (Oracle Java 8 from the webupd8team PPA)
sudo add-apt-repository ppa:webupd8team/java
sudo apt-get update
sudo apt-get install oracle-java8-installer
# Pick the active java alternative
sudo update-alternatives --config java
# Resolve the real path of the java binary to find the JVM directory
readlink -f "$(which java)"
# /usr/lib/jvm/java-8-oracle
sudo vi /etc/profile
# {vi environment} -- presumably the line below is what gets added to
# /etc/profile. `source` expects a file name, so `source JAVA_HOME=...` fails;
# the variable has to be set and exported instead.
export JAVA_HOME="/usr/lib/jvm/java-8-oracle/jre"