# ya7ya/statarb1.py

## statarb1.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# We use Yahoo Finance data; pandas-datareader downloads the prices we need.
from pandas_datareader.data import DataReader

def get_adj_prices(symbols, start_date, end_date=None, data_source="yahoo"):
	"""Download adjusted closing prices for several tickers into one frame.

	Args:
		symbols: Iterable of ticker strings. Each ticker becomes one column,
			in iteration order.
		start_date: First date to request; handed to ``DataReader`` unchanged.
		end_date: Optional last date to request. ``None`` (the default) lets
			``DataReader`` pick its own end date.
		data_source: Name of the pandas-datareader backend to query.
			Defaults to ``"yahoo"``, the source this script was written for.
			Any other source must also return an ``"Adj Close"`` column.

	Returns:
		A DataFrame with one column of ``"Adj Close"`` values per symbol.
		It is empty when ``symbols`` is empty.
	"""
	df = pd.DataFrame()
	for symbol in symbols:
		quotes = DataReader(symbol, data_source, start_date, end=end_date)
		# Column assignment aligns on the index already in the frame, so the
		# rows follow the dates returned for the first symbol.
		df[symbol] = quotes["Adj Close"]

	return df

def find_corr(df, lagged, lag=1, window=5):
	"""Plot how the first column correlates with a lagged copy of another.

	A rolling correlation is computed between the first column of ``df`` and
	the ``lagged`` column shifted by ``lag`` rows. Two figures are shown: a
	histogram of those correlations with the most populated bin marked, and
	the price series (everything except the un-shifted ``lagged`` column)
	normalised to their first row.

	Args:
		df: DataFrame of price series, one column per symbol. It is not
			modified.
		lagged: Name of the column to shift.
		lag: Number of rows to shift ``lagged`` by.
		window: Number of rows in each rolling-correlation window.

	Returns:
		The left edge of the histogram bin (numpy's default binning) that
		holds the most correlation values. The same value is printed.

	Raises:
		KeyError: If ``lagged`` is not a column of ``df``.
		ValueError: If too few complete rows remain to produce a single
			rolling correlation.
	"""
	lag_column = "{}_lag".format(lagged)

	# Work on a copy so the helper column never leaks into the caller's frame.
	dflag = df.copy()
	dflag[lag_column] = dflag[lagged].shift(lag)
	# shift() leaves NaN in the leading rows; drop them before correlating.
	dflag = dflag.dropna()

	# Keep the series to plot now, before the correlation column is added.
	# Every row here is complete, so normalising by the first row is safe.
	prices = dflag[[s for s in dflag.columns if s != lagged]]

	# .iloc replaces the positional use of .ix, which pandas 1.0 removed.
	rolling_corr = dflag.iloc[:, 0].rolling(window=window).corr(dflag[lag_column])
	dflag = dflag.assign(correlation=rolling_corr).dropna()
	if dflag.empty:
		raise ValueError(
			"not enough complete rows to compute a rolling correlation "
			"(window=%d, lag=%d)" % (window, lag)
		)

	# np.histogram returns the per-bin counts and the bin edges.
	count, division = np.histogram(dflag["correlation"])
	# argmax picks the fullest bin; indexing the edges with it yields that
	# bin's left edge.
	most_occuring_value = division[count.argmax()]

	# Histogram of the correlations with the modal bin marked by a dashed line.
	dflag.hist(column="correlation")
	plt.title("Correlation Histogram")
	plt.axvline(most_occuring_value, color="r", linestyle="dashed", linewidth=2)
	plt.show()
	# The parenthesised form is valid on both Python 2 and Python 3.
	print("Most re-occuring Corr value = %f" % most_occuring_value)

	# Scale every series by its first value so they can be compared directly.
	df_normalized = prices / prices.iloc[0]
	df_normalized.plot()
	plt.title("Normalized Adj Closing Prices")
	plt.show()

	return most_occuring_value


# to test this
# df = get_adj_prices(["PEP","KO"], "2011-01-01")
# find_corr(df,"KO",1)
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt

	# We use Yahoo Finance data; pandas-datareader downloads the prices we need.
	from pandas_datareader.data import DataReader

	def get_adj_prices(symbols, start_date, end_date=None, data_source="yahoo"):
		"""Download adjusted closing prices for several tickers into one frame.

		Args:
			symbols: Iterable of ticker strings. Each ticker becomes one
				column, in iteration order.
			start_date: First date to request; handed to ``DataReader``
				unchanged.
			end_date: Optional last date to request. ``None`` (the default)
				lets ``DataReader`` pick its own end date.
			data_source: Name of the pandas-datareader backend to query.
				Defaults to ``"yahoo"``. Any other source must also return
				an ``"Adj Close"`` column.

		Returns:
			A DataFrame with one column of ``"Adj Close"`` values per symbol.
			It is empty when ``symbols`` is empty.
		"""
		df = pd.DataFrame()
		for symbol in symbols:
			quotes = DataReader(symbol, data_source, start_date, end=end_date)
			# Column assignment aligns on the index already in the frame, so
			# the rows follow the dates returned for the first symbol.
			df[symbol] = quotes["Adj Close"]

		return df

	def find_corr(df, lagged, lag=1, window=5):
		"""Plot how the first column correlates with a lagged copy of another.

		A rolling correlation is computed between the first column of ``df``
		and the ``lagged`` column shifted by ``lag`` rows. Two figures are
		shown: a histogram of those correlations with the most populated bin
		marked, and the price series (everything except the un-shifted
		``lagged`` column) normalised to their first row.

		Args:
			df: DataFrame of price series, one column per symbol. It is not
				modified.
			lagged: Name of the column to shift.
			lag: Number of rows to shift ``lagged`` by.
			window: Number of rows in each rolling-correlation window.

		Returns:
			The left edge of the histogram bin (numpy's default binning) that
			holds the most correlation values. The same value is printed.

		Raises:
			KeyError: If ``lagged`` is not a column of ``df``.
			ValueError: If too few complete rows remain to produce a single
				rolling correlation.
		"""
		lag_column = "{}_lag".format(lagged)

		# Work on a copy so the helper column never leaks into the caller's
		# frame.
		dflag = df.copy()
		dflag[lag_column] = dflag[lagged].shift(lag)
		# shift() leaves NaN in the leading rows; drop them before correlating.
		dflag = dflag.dropna()

		# Keep the series to plot now, before the correlation column is added.
		# Every row here is complete, so normalising by the first row is safe.
		prices = dflag[[s for s in dflag.columns if s != lagged]]

		# .iloc replaces the positional use of .ix, which pandas 1.0 removed.
		rolling_corr = dflag.iloc[:, 0].rolling(window=window).corr(dflag[lag_column])
		dflag = dflag.assign(correlation=rolling_corr).dropna()
		if dflag.empty:
			raise ValueError(
				"not enough complete rows to compute a rolling correlation "
				"(window=%d, lag=%d)" % (window, lag)
			)

		# np.histogram returns the per-bin counts and the bin edges.
		count, division = np.histogram(dflag["correlation"])
		# argmax picks the fullest bin; indexing the edges with it yields
		# that bin's left edge.
		most_occuring_value = division[count.argmax()]

		# Histogram of the correlations with the modal bin marked by a dashed
		# line.
		dflag.hist(column="correlation")
		plt.title("Correlation Histogram")
		plt.axvline(most_occuring_value, color="r", linestyle="dashed", linewidth=2)
		plt.show()
		# The parenthesised form is valid on both Python 2 and Python 3.
		print("Most re-occuring Corr value = %f" % most_occuring_value)

		# Scale every series by its first value so they can be compared
		# directly.
		df_normalized = prices / prices.iloc[0]
		df_normalized.plot()
		plt.title("Normalized Adj Closing Prices")
		plt.show()

		return most_occuring_value


	# to test this
	# df = get_adj_prices(["PEP","KO"], "2011-01-01")
	# find_corr(df,"KO",1)