Skip to content

Instantly share code, notes, and snippets.

@jovianlin
jovianlin / pyspark_threads.py
Created April 17, 2017 06:27
How to run multiple jobs in one sparkcontext from separate threads in pyspark?
# Source: http://stackoverflow.com/questions/30214474/how-to-run-multiple-jobs-in-one-sparkcontext-from-separate-threads-in-pyspark
# Prereqs:
# set the following two properties
#   spark.dynamicAllocation.enabled true
#   spark.shuffle.service.enabled true
# in spark-defaults.conf
import threading
from pyspark import SparkContext, SparkConf
@jovianlin
jovianlin / pyspark_quick_codes.py
Created April 12, 2017 05:31
PySpark Quick Codes
# Write DataFrame to Disk
spark_df.coalesce(1).write.csv( '<saved_output/YOUR_FOLDER_NAME>', header=True, mode='overwrite' )
# Read from Disk to DataFrame
new_spark_df = sqlContext.read.csv(s3_path, header=True, inferSchema=False) # For S3
new_spark_df = sqlContext.read.csv('<LOCATION>', header=True, inferSchema=False) # mode='FAILFAST'
# SORTING
from pyspark.sql.functions import col
col_name = 'restaurant_id'
@jovianlin
jovianlin / upload_to_s3.py
Created April 4, 2017 08:37
Upload stuff to Amazon/AWS S3
#############
# VARIABLES #
#############
access_key_id = '<ACCESS KEY ID>'
secret_access_key = '<SOME SECRET SHIT>'
bucket_name = 'my-awesome-bucket'
folder_name = 'upload_folder'
file_name = 'uploaded_doge_shit.jpg'
@jovianlin
jovianlin / find_ipynb.sh
Created April 4, 2017 08:33
Find the darn ".ipynb_checkpoints"
find . -name ".ipynb_checkpoints" | grep ipynb
@jovianlin
jovianlin / seaborn_color_scheme.py
Created April 2, 2017 15:51
seaborn nice color scheme
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
sns.set(font_scale=1.5)
@jovianlin
jovianlin / custom_concat_columns.py
Created March 30, 2017 09:17
Custom Concat Columns for PySpark
from pyspark.sql.functions import col, concat, lit
custom_concat = [col('appName'), lit('|'), col('platform'), lit('|'),
col('carrier'), lit('|'), col('connectionType'), lit('|'),
col('country'), lit('|'), col('city'), lit('|'),
col('userAgent')]
# Add a new column entitled "custom_col"
union_df = union_df.withColumn('custom_col', concat(*custom_concat))
@jovianlin
jovianlin / test_graphframes.py
Created March 27, 2017 15:14
test_graphframes.py
# Create a Vertex DataFrame with unique ID column "id"
v = sqlContext.createDataFrame([
("a", "Alice", 34),
("b", "Bob", 36),
("c", "Charlie", 30),
], ["id", "name", "age"])
# Create an Edge DataFrame with "src" and "dst" columns
e = sqlContext.createDataFrame([
("a", "b", "friend"),
("b", "c", "follow"),
@jovianlin
jovianlin / add_subl.sh
Last active March 25, 2017 16:53
Opening Sublime Text on command line as "subl" on Mac OS
# Check out:
# https://gist.github.com/adrianorsouza/df4759b0583dcd112da4
# http://olivierlacan.com/posts/launch-sublime-text-3-from-the-command-line/
# To /usr/bin
sudo ln -s /Applications/Sublime\ Text.app/Contents/SharedSupport/bin/subl /usr/bin/subl
# To /usr/***LOCAL***/bin
ln -s "/Applications/Sublime Text.app/Contents/SharedSupport/bin/subl" /usr/local/bin/subl
@jovianlin
jovianlin / jupyter_cosmetics.py
Created March 9, 2017 01:54
ipython-jupyter notebook cosmetics
# Widen width of notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
# Set pandas display options (max columns shown and max column width)
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 200)
@jovianlin
jovianlin / map_clsloc.txt
Created December 26, 2016 05:59 — forked from aaronpolhamus/map_clsloc.txt
ImageNet classes + labels
n02119789 1 kit_fox
n02100735 2 English_setter
n02110185 3 Siberian_husky
n02096294 4 Australian_terrier
n02102040 5 English_springer
n02066245 6 grey_whale
n02509815 7 lesser_panda
n02124075 8 Egyptian_cat
n02417914 9 ibex
n02123394 10 Persian_cat