# Identifying Highly Correlated Cryptocurrencies using PCA
# N-CryptoAsset Portfolios: Identifying Highly Correlated
# Cryptocurrencies using PCA
# (c) 2017, by Pawel Lachowicz
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
from datetime import datetime
import json
from bs4 import BeautifulSoup
import requests
# custom colours: a mid-grey RGB triple used for the helper lines in plots
grey = (0.6, 0.6, 0.6)
def timestamp2date(timestamp):
    """Convert a Unix timestamp into a Gregorian 'YYYY-MM-DD' date string.

    Args:
        timestamp: Seconds since the Unix epoch; any value accepted by
            ``int()`` (an int, a float, or a string of digits).

    Returns:
        The calendar date formatted with ``'%Y-%m-%d'``.  The conversion
        goes through ``datetime.fromtimestamp`` without a timezone argument,
        so the date is expressed in the machine's local timezone.
    """
    return datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d')
def date2timestamp(date):
    """Convert a Gregorian 'YYYY-MM-DD' date string into a Unix timestamp.

    Args:
        date: Date string in the ``'%Y-%m-%d'`` format.

    Returns:
        Seconds since the Unix epoch (float) of midnight on that date.  The
        parsed datetime carries no timezone, so ``timestamp()`` interprets
        it in the machine's local timezone.

    Raises:
        ValueError: if ``date`` does not match ``'%Y-%m-%d'``.
    """
    # parse the argument itself (not an unrelated global variable)
    return datetime.strptime(date, '%Y-%m-%d').timestamp()
def fetchCryptoClose(fsym, tsym):
    """Fetch the daily close-price time-series of one coin.

    Two consecutive requests of up to 2000 daily bars each are issued; the
    second request ends at the first date kept from the first one, so the
    combined series spans two request windows.

    Bars whose open, high, low and close prices sum to zero are skipped
    (this may drop coins such as USDT that are quoted at near-zero prices).

    Args:
        fsym: Ticker of the coin to fetch, e.g. ``'BTC'``; also the name of
            the returned price column.
        tsym: Ticker of the quote currency, e.g. ``'USD'``.

    Returns:
        pandas.DataFrame indexed by date (``DatetimeIndex``) with a single
        column named ``fsym`` that holds the close prices.  The frame is
        empty when no usable bar was returned.

    Raises:
        requests.RequestException: on a network failure, a timeout or an
            HTTP error status.
        ValueError: if the response body is not valid JSON.
        KeyError: if the response has no ``'Data'`` entry or a bar lacks one
            of the expected fields.
    """
    # NOTE(review): the endpoint was blank in this file.  The query
    # parameters (fsym/tsym/toTs/limit) and the 'Data' payload look like the
    # CryptoCompare "histoday" API - confirm before relying on this URL.
    base_url = "https://min-api.cryptocompare.com/data/histoday?fsym="
    cols = ['date', 'timestamp', fsym]
    price_fields = ['open', 'high', 'low', 'close']

    curr_timestamp = datetime.today().timestamp()
    frames = []  # kept oldest window first
    for _ in range(2):
        url = base_url + fsym + \
            "&tsym=" + tsym + "&toTs=" + str(int(curr_timestamp)) + "&limit=2000"
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        bars = response.json()['Data']

        # The first record of every response is skipped and the rest is
        # taken as-is, so a short response no longer raises IndexError.
        # NOTE(review): the reason for skipping record 0 is not documented.
        rows = [
            (timestamp2date(bar['time']), bar['time'], bar['close'])
            for bar in bars[1:]
            if sum(bar[field] for field in price_fields) > 0  # remove for USDT
        ]
        if not rows:
            break  # nothing (more) to fetch for this coin

        df = pd.DataFrame(rows, columns=cols)
        # ensure a correct date format
        df.index = pd.to_datetime(df['date'], format="%Y-%m-%d")
        df.drop('date', axis=1, inplace=True)

        # The next request ends at the first timestamp of this window.
        # NOTE(review): presumes bars arrive oldest first - confirm.
        curr_timestamp = int(df['timestamp'].iloc[0])
        frames.insert(0, df)

    if not frames:
        return pd.DataFrame(columns=[fsym],
                            index=pd.DatetimeIndex([], name='date'))

    data = pd.concat(frames, axis=0)
    # the date on which both windows meet may be present in each of them
    data = data[~data.index.duplicated(keep='last')]
    data.drop("timestamp", axis=1, inplace=True)
    return data
# N-Cryptocurrency Portfolio (tickers)
fsym = ['BTC', 'ETH', 'DASH', 'XMR', 'XRP', 'LTC', 'ETC', 'XEM', 'REP',
        'MAID', 'ZEC', 'STEEM', 'GNT', 'FCT', 'ICN', 'DGD',
        'WAVES', 'DCR', 'LSK', 'DOGE', 'PIVX']
# vs.
tsym = 'USD'

# Download every coin and align all close-price series on the date index.
# The first coin starts the table; every later coin is joined to it as a new
# column (joining the first coin with itself would clash on the column name).
for i, ticker in enumerate(fsym):
    print(i, ticker)
    series = fetchCryptoClose(ticker, tsym)
    if i == 0:
        data = series
    else:
        data = data.join(series)

data = data.astype(float)  # ensure values to be floats
# save portfolio to a file (HDF5 file format); the context manager closes
# the store, so the data is written out before the file is read back
with pd.HDFStore('portfolio.h5') as store:
    store['data'] = data

# read in your portfolio from a file
df = pd.read_hdf('portfolio.h5', 'data')

# drop duplicates: discard dates without any price and repeated dates.
# Coins with missing prices are removed column-wise further below, so rows
# holding only some NaN values are deliberately kept here.
df = df.dropna(how='all')
df = df[~df.index.duplicated(keep='first')]
# portfolio pre-processing: keep March 2017 only, then discard every coin
# that has at least one missing price inside that window
in_window = (df.index >= "2017-03-01") & (df.index <= "2017-03-31")
dfP = df[in_window]
dfP = dfP.dropna(axis=1, how='any')

# normalised time-series as an input for PCA
# (per-coin mean removed, scaled by the sample standard deviation)
m = dfP.mean(axis=0)
s = dfP.std(ddof=1, axis=0)
dfPort = (dfP - m)/s

# np.cov / np.corrcoef expect one variable per row, hence the transposes
normalised_by_coin = dfPort.values.T
prices_by_coin = dfP.values.T
c = np.cov(normalised_by_coin)  # covariance matrix
co = np.corrcoef(prices_by_coin)  # correlation matrix

tickers = list(dfP.columns)
tick_positions = np.arange(len(tickers))

# heat-map of the correlation matrix with the coin tickers on both axes
plt.imshow(co, cmap="RdGy", interpolation="nearest")
cb = plt.colorbar()
cb.set_label("Correlation Matrix Coefficients")
plt.title("Correlation Matrix", fontsize=14)
plt.xticks(tick_positions, tickers, rotation=90)
plt.yticks(tick_positions, tickers)
# perform PCA: eigen-decomposition of the covariance matrix
# (w holds the eigenvalues, the columns of v the matching eigenvectors)
# NOTE(review): np.linalg.eig does not guarantee any ordering of the
# eigenvalues, so "the last column of v" is not necessarily the component
# with the smallest (or largest) variance - confirm the intended PCs.
w, v = np.linalg.eig(c)

# heat-map of the eigenvector matrix: one row per coin, one column per PC
ax = plt.figure(figsize=(8,8)).gca()
plt.imshow(v, cmap="bwr", interpolation="nearest")
cb = plt.colorbar()
plt.yticks(np.arange(len(tickers)), tickers)
plt.xlabel("PC Number")
plt.title("PCA", fontsize=14)
# force x-tickers to be displayed as integers (not floats)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
# choose PC-k numbers
k1 = -1  # the last PC column in 'v' PCA matrix
k2 = -2  # the second last PC column

# begin constructing bi-plot for PC(k1) and PC(k2)
# (drawn on its own figure so it does not paint over an earlier plot)
plt.figure(figsize=(8,8))

# loadings: coordinates of every coin on the two selected PCs
xs = v[:, k1]
ys = v[:, k2]

# compute the distance from (0,0) point for every coin
dist = np.sqrt(xs**2 + ys**2)

for x, y in zip(xs, ys):
    plt.plot(x, y, '.k')
    plt.plot([0,x], [0,y], '-', color=grey)

# a coin stands out when it lies further from the origin than the mean
# distance plus one sample standard deviation (computed once, not per coin)
threshold = np.mean(dist) + np.std(dist, ddof=1)

# check and save membership of a coin to
# a quarter number 1, 2, 3 or 4 on the plane
quar = []
for i, (x, y, d) in enumerate(zip(xs, ys, dist)):
    if(d > threshold):
        plt.plot(x, y, '.r', markersize=10)
        plt.plot([0,x], [0,y], '-', color=grey)
        # a point lying exactly on an axis belongs to no quarter
        if((x > 0) and (y > 0)):
            quar.append((i, 1))
        elif((x < 0) and (y > 0)):
            quar.append((i, 2))
        elif((x < 0) and (y < 0)):
            quar.append((i, 3))
        elif((x > 0) and (y < 0)):
            quar.append((i, 4))
        plt.text(x, y, tickers[i], color='k')

# negative k1/k2 count from the end; the labels show 1-based PC numbers
plt.xlabel("PC-" + str(len(tickers)+k1+1))
plt.ylabel("PC-" + str(len(tickers)+k2+1))
def _analyse_pair(name1, name2):
    """Print correlation metrics for two coins and plot their linear fit.

    Args:
        name1: Ticker (column of ``dfP``) used as the x variable.
        name2: Ticker (column of ``dfP``) used as the y variable.
    """
    print(name1, name2)
    ts1 = dfP[name1]  # time-series
    ts2 = dfP[name2]

    # correlation metrics and their p_values; the third value returned by
    # linregress is the correlation coefficient r (not r squared)
    slope, intercept, r, pvalue, _ = stats.linregress(ts1, ts2)
    ktau, kpvalue = stats.kendalltau(ts1, ts2)
    print(r, pvalue)
    print(ktau, kpvalue)

    # one figure per pair: the price scales differ from pair to pair
    plt.figure()
    plt.plot(ts1, ts2, '.k')
    xline = np.linspace(np.min(ts1), np.max(ts1), 100)
    yline = slope*xline + intercept
    plt.plot(xline, yline,'--', color='b')  # linear model fit
    plt.xlabel(name1)
    plt.ylabel(name2)


# Pair every selected coin of quarter 1 with every one of quarter 3, and
# every coin of quarter 2 with every one of quarter 4 (Q1 vs Q3, Q2 vs Q4):
# highly correlated coins according to the PC analysis
opposite_quarter = {1: 3, 2: 4}
for i, quarter_i in quar:
    target = opposite_quarter.get(quarter_i)
    if target is None:
        continue  # quarters 3 and 4 are only used as the second coin
    for j, quarter_j in quar:
        if quarter_j == target:
            _analyse_pair(tickers[i], tickers[j])
