Chris Rawles (crawles)
crawles / .profile
Last active December 15, 2016 14:44
My .profile
function mvim () {
    local f
    for f; do
        test -e "$f" || touch "$f"
    done
    open -a macvim "$@"
}
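The create-if-missing guard inside the function can be exercised on its own; a minimal sketch in a temp directory (the `open -a macvim` call is macOS-specific, so it is left out here):

```shell
# Demonstrate the create-if-missing logic from mvim.
tmp=$(mktemp -d)
f="$tmp/new_note.txt"
test -e "$f" || touch "$f"   # same guard the function applies per argument
ls "$tmp"                    # the file now exists, empty
```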
export CLICOLOR=1
export LSCOLORS=GxFxCxDxBxegedabagaced
crawles / set_interval.py
Last active December 14, 2016 21:29
Async set_interval function
import threading

def set_interval(func, sec):
    def func_wrapper():
        set_interval(func, sec)  # schedule the next call before running func
        func()
    t = threading.Timer(sec, func_wrapper)
    t.start()
    return t
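One caveat of the Timer-based approach: the returned `Timer` only controls the first tick, since each reschedule creates a fresh `Timer`, so cancelling `t` after the first firing does not stop the loop. A stoppable variant using `threading.Event` (a sketch, not part of the original gist) could look like:

```python
import threading
import time

def set_interval_stoppable(func, sec):
    """Call func every sec seconds; returns an Event that stops the loop."""
    stop = threading.Event()
    def loop():
        while not stop.wait(sec):  # wait() returns True once stop is set
            func()
    threading.Thread(target=loop, daemon=True).start()
    return stop

calls = []
stop = set_interval_stoppable(lambda: calls.append(1), 0.05)
time.sleep(0.2)   # let it tick a few times
stop.set()        # cleanly stops the loop
```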
crawles / Spark Dataframe Cheat Sheet.py
Last active April 26, 2022 03:09 — forked from evenv/Spark Dataframe Cheat Sheet.py
Cheat sheet for Spark Dataframes (using Python)
# A simple cheat sheet of Spark Dataframe syntax
# Current for Spark 1.6.1
# import statements
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
# creating DataFrames
df = sqlContext.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"]) # from manual data
crawles / spark-2.1.0-setup.txt
Created January 9, 2017 19:25
Install Spark 2.1.0 on Mac OS X Yosemite
step 1)
Download Spark. Choose package type: "Source Code"
https://spark.apache.org/downloads.html
step 2)
$ brew install scala
$ export JAVA_HOME=$(/usr/libexec/java_home)
$ tar -xvzf spark-2.1.0.tgz
$ mv spark-2.1.0 /usr/local
$ cd /usr/local/spark-2.1.0/
crawles / install_spark_mac.md
Last active January 13, 2017 22:08
Install Spark on Mac OS X Yosemite
  1. Download Spark from http://spark.apache.org/downloads.html

  2. Extract file contents to a directory. E.g., /usr/local/

  3. Modify bash profile:

export SPARK_PATH=/usr/local/spark-X-X-X-bin-hadoop2.7
alias pyspark="$SPARK_PATH/bin/pyspark --master local[4]"
alias snotebook="PYSPARK_DRIVER_PYTHON='jupyter' PYSPARK_DRIVER_PYTHON_OPTS='notebook' pyspark"
crawles / plot_3_attempts.py
Last active January 24, 2017 19:00
For Spark blog tutorial
from matplotlib import pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

_df = fga_py.toPandas()
plt.plot(_df.yr, _df.fg3a_p36m, color='#00a79c')
plt.xlabel('Year')
_ = plt.title('Player average 3-point attempts (per 36 minutes)')
plt.annotate('3 pointer introduced', xy=(1980, .5), xytext=(1981, 1.1), fontsize=12,
             arrowprops=dict(facecolor='grey', shrink=0.05, linewidth=2))
df = spark.read.option('header', 'true')\
    .option('inferSchema', 'true')\
    .csv('data/season_totals.csv')
df.cache()  # cache the result as we will refer back to this DataFrame
# 3-point attempts per 36 minutes
from pyspark.sql.functions import col

fga_py = df.groupBy('yr')\
    .agg({'mp': 'sum', 'fg3a': 'sum'})\
    .select(col('yr'), (36 * col('sum(fg3a)') / col('sum(mp)')).alias('fg3a_p36m'))\
    .orderBy('yr')
# same per-36 aggregation in SQL
sqlContext.registerDataFrameAsTable(df, 'df')
fga_py = sqlContext.sql('''SELECT yr,
                                  sum(fg3a) / sum(mp) * 36 fg3a_p36m
                           FROM df GROUP BY yr
                           ORDER BY yr''')
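The per-36 arithmetic both snippets compute — 36 × Σfg3a / Σmp per year — can be sanity-checked in plain Python on toy numbers (illustrative values, not the real season totals):

```python
from collections import defaultdict

# Toy (yr, fg3a, mp) rows -- illustrative, not the real data/season_totals.csv
rows = [
    (1980, 10, 720), (1980, 20, 1080),
    (1981, 30, 900), (1981, 30, 900),
]
sums = defaultdict(lambda: [0, 0])
for yr, fg3a, mp in rows:
    sums[yr][0] += fg3a
    sums[yr][1] += mp
fg3a_p36m = {yr: 36 * a / m for yr, (a, m) in sums.items()}
# 1980: 36 * 30 / 1800 = 0.6; 1981: 36 * 60 / 1800 = 1.2
```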
# assemble features for model training
from pyspark.ml.feature import VectorAssembler

t = VectorAssembler(inputCols=['yr'], outputCol='features')
training = t.transform(fga_py)\
    .withColumn('yr', fga_py.yr)\
    .withColumn('label', fga_py.fg3a_p36m)
training.toPandas().head()
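The `features`/`label` columns assembled above feed a linear model in the tutorial; the fit itself can be illustrated without Spark using closed-form simple least squares on toy (yr, fg3a_p36m) pairs (hypothetical numbers, plain Python rather than `pyspark.ml`):

```python
# Toy (yr, fg3a_p36m) pairs -- hypothetical, standing in for fga_py
pts = [(1980, 0.5), (1990, 1.5), (2000, 2.6), (2010, 4.0)]

n = len(pts)
mx = sum(x for x, _ in pts) / n
my = sum(y for _, y in pts) / n
# Closed-form simple linear regression: slope = cov(x, y) / var(x)
slope = sum((x - mx) * (y - my) for x, y in pts) / sum((x - mx) ** 2 for x, _ in pts)
intercept = my - slope * mx
predict_2015 = slope * 2015 + intercept  # extrapolate the upward trend
```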