@ikovsky
Last active August 20, 2016 15:43
The Code for the Web Scraping Project
import pandas as pd
import numpy as np
import string
# MM stands for Migration Probability Matrix.
# This utility takes the raw migration DataFrames as an ordered dict keyed by
# 'Region:IN|OUT:yy', then normalizes by the non-migrant tax population to get the
# migration probability matrices (scaled to basis points).
def ConstructMM(dataDict, outputByYear=False):
    postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv'
    postAbb = pd.read_csv(postAbbFile, sep='\t')
    numStates = postAbb.shape[0]
    stateNames = postAbb.iloc[:, 0]
    years = [str(yyyy) for yyyy in range(1991, 2012)]
    non_migrants = {}
    yearlyMM = {}
    for year in years:
        yearlyMM[year] = pd.DataFrame(np.zeros((numStates, numStates)))
        yearlyMM[year].columns = postAbb.iloc[:, 0]
        yearlyMM[year].index = postAbb.iloc[:, 0]
        non_migrants[year] = pd.DataFrame(np.zeros((numStates, 1)), index=postAbb.iloc[:, 0])
    # First pass: record the non-migrant (stayed-in-state) population per state and year.
    for key, DF in dataDict.items():
        region, inOrout, yy = key.split(':')
        if inOrout == 'OUT': continue
        if sum(stateNames.isin([region])) < 1: continue
        year = '19' + yy if yy[0] == '9' else '20' + yy
        DF = DF.set_index(['Name'])
        hasTotal = DF.index.str.contains('Non-Migrant', case=False)
        rowIdxes = np.arange(DF.shape[0])[hasTotal]
        if len(rowIdxes) < 1: raise ValueError("no non-migrant row")
        non_migrants[year].ix[region, 0] = DF.iloc[rowIdxes[0], 0]
    # Second pass: fill the outbound counts, normalized by the mean non-migrant population
    # of the two states and scaled to basis points.
    for key, DF in dataDict.items():
        region, inOrout, yy = key.split(':')
        if inOrout == 'IN': continue
        if sum(stateNames.isin([region])) < 1: continue
        year = '19' + yy if yy[0] == '9' else '20' + yy
        DF = DF.set_index(['Name'])
        for state in stateNames:
            if state == region: continue
            stateLoc = DF.index.str.contains(state, case=False)
            if sum(stateLoc) < 1: continue
            value = DF[stateLoc].ix[0, 0]
            if isinstance(value, str): continue
            meanPopulation = 0.5 * (non_migrants[year].ix[region, 0] + non_migrants[year].ix[state, 0])
            yearlyMM[year].ix[region, state] = value * 1.0 / meanPopulation * 10000
    return yearlyMM
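# A hedged usage sketch (not part of the original gist): dataDict is keyed as
# 'Region:IN|OUT:yy', which is the key format LoadMigration (defined below) produces,
# and each value is one IRS spreadsheet as a DataFrame whose 'Name' column includes
# a 'Non-Migrant' row.
#
#   data = LoadMigration()          # e.g. data['California:OUT:95'] is one spreadsheet
#   MM = ConstructMM(data)          # MM['1995'] is a numStates x numStates matrix in bps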
import pandas as pd
import numpy as np
import os, re, sys
# The state-to-state migration data consists of 2000+ small files covering 20+ years of
# yearly state-to-state migration (tracked via the tax returns filed with the IRS).
# The data is quite dirty: the format of each file is not always the same, and the state
# abbreviations in the file names are close to arbitrary.
# We put soft links to these 2000+ files in a single directory, load them, map the
# abbreviations back to the state names, and store each table in a pandas DataFrame.
def LoadMigration():
    postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv'
    postAbb = pd.read_csv(postAbbFile, sep='\t')
    country = ['United States', 'ForeignOverseas']
    dataDir = '/Users/ikovsky/python_data/IRS/TAX-STATS/soi-tax-stats-migration-data/LinkedData'
    translate = {'^oeg': 'oregon', '^az': 'arizona', '^DiCo': 'Distr', '^dico': 'Distr',
                 '^vrg': 'Virginia', '^Miso': 'Misso', '^miso': 'Misso', '^wsc': 'Wiscon',
                 '^wiso': 'Wiscon', '^RhIs': 'Rhode', '^aka': 'arkan', '^arkas': 'arkan',
                 '^Misi': 'missi', '^nhio': 'ohio', '^nrbt': 'nebra'}
    fileNameList = os.listdir(dataDir)
    p = re.compile(r'\d+')
    q = re.compile(r'^[0-9]{4}')
    key = None
    stateDict = StateDict(postAbb)
    data = {}
    oddFormat = False
    for name in fileNameList:
        if not re.search(r'\.xls$', name): continue
        if q.search(name): oddFormat = True
        else: oddFormat = False
        nameTokens = p.split(name)
        token = "^" + nameTokens[0]
        failed = True
        key = None
        for i, state in enumerate(postAbb.iloc[:, 0]):
            if oddFormat:
                break
            result = re.search(token, state, flags=re.I)
            if result is not None: key = state
            if result is None:
                state2 = re.sub(' ', '', state)
                result = re.search(token, state2, flags=re.I)
                key = state
            if result is None:
                state2 = re.sub('West ', 'W', state)
                result = re.search(token, state2, flags=re.I)
                key = state
            if result is None:
                state2 = re.sub('West ', 'We', state)
                result = re.search(token, state2, flags=re.I)
                key = state
            if result is None:
                state2 = re.sub('South ', 'S', state)
                result = re.search(token, state2, flags=re.I)
                key = state
            if result is None:
                state2 = re.sub('South ', 'So', state)
                result = re.search(token, state2, flags=re.I)
                key = state
            if result is None:
                state2 = re.sub('North ', 'no', state)
                result = re.search(token, state2, flags=re.I)
                key = state
            if result is None:
                state2 = re.sub('North ', 'n', state)
                result = re.search(token, state2, flags=re.I)
                key = state
            if result is None:
                state2 = re.sub('New ', 'ne', state)
                result = re.search(token, state2, flags=re.I)
                key = state
            if result is None and translate.get(token) is not None:
                result = re.search(translate[token], state, flags=re.I)
                key = state
            if result is not None: break
        if not oddFormat and result is None:
            result = re.search(token, country[0], flags=re.I)
            key = country[0]
        if not oddFormat and result is None:
            result = re.search(token, country[1], flags=re.I)
            key = country[1]
        if not oddFormat and result:
            failed = False
        if not oddFormat and failed:
            print(nameTokens + [name])
            print(len(nameTokens[0]))
        name1 = re.sub("^" + nameTokens[0], '', name)
        inOrout = '' if oddFormat else InOrOut(name)
        year = re.sub(nameTokens[-1] + "$", '', name1)
        year = name[2:4] if oddFormat else MapYear(year)
        if oddFormat: key = HandleOddFormatName(name, stateDict)
        data[key + inOrout + ":" + year] = LoadFileToDF(dataDir, name)
    return(data)
def LoadFileToDF(dirName, fileName):
    DF = pd.read_excel(dirName + "/" + fileName)
    if DF.shape[1] not in [6, 7, 8]: raise ValueError("bad column size")
    DF[DF == ''] = np.NaN
    naCols = DF.isnull().all(axis=0)
    DF = DF.loc[:, ~naCols]
    redCol = [t for t in DF.columns if not DF[t].str.contains('PERCENT', case=False).any()]
    DF = DF[redCol]
    columns = DF.columns
    gross = [idx for idx, t in enumerate(DF.columns) if
             (DF[t].str.contains('INCOME', case=False)[:8].any() and DF[t].str.contains('AGGREGATE', case=False)[:8].any())
             or (DF[t].str.contains('INCOME', case=False)[:8].any() and DF[t].str.contains('MONEY', case=False)[:8].any())]
    if len(gross) > 0:
        idx = gross[-1]
        if idx == 0: raise ValueError("")
        DF = DF.iloc[:, idx-3:idx+1]
        DF.columns = ['Name', 'Num Returns', 'Num Exempt', 'AAGI']
    else:
        DF = DF.iloc[:, -3:]
        DF.columns = ['Name', 'Num Returns', 'Num Exempt']
    missingRow = DF.isnull().any(axis=1)
    DF = DF.loc[~missingRow, :]
    return(DF.apply(pd.to_numeric, errors='ignore'))

def InOrOut(name):
    token = name[-7:-4].lower()
    token = re.sub('[0-9]+', '', token)
    if re.search('i|n', token): return(':IN')
    elif re.search('o', token): return(':OUT')
    raise ValueError(name + ' makes me confused')

def HandleOddFormatName(name, stateDict):
    name1 = name[-9:-4]
    stateAbb = name1[-2:].upper()
    inOrout = name1[:3].upper()
    if (inOrout == 'GIN'): inOrout = 'IN'
    stateName = stateDict[stateAbb]
    return(stateName + ":" + inOrout)

def StateDict(postAbb):
    stateNames = postAbb.iloc[:, 0]
    stateAbbs = postAbb.iloc[:, 1]
    ans = dict()
    for idx, abb in enumerate(stateAbbs):
        ans[abb] = stateNames[idx]
    return(ans)
def MapYear(year):
    if len(year) == 2:
        pass
    elif len(year) == 4:
        year = year[2:4]
    if len(year) not in [2, 4]:
        # A few files use odd year tokens: '956' means the 95-96 filing year, '8' means 98.
        if year == '8': year = '98'
        elif year == '956': year = '96'
        else: print(year)
    return(year)
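# The year tokens embedded in the file names are irregular; given the special cases above,
# MapYear behaves like this (illustrative calls, not from the gist):
#   MapYear('95')   -> '95'
#   MapYear('1995') -> '95'
#   MapYear('8')    -> '98'
#   MapYear('956')  -> '96'    # the 95-96 filing year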
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.random as random

# Plot the SVD singular values in descending order.
def PlotSingularValues(singularValues, key='avg', logOdds=True):
    token = 'Log Odds' if logOdds else 'Original'
    numStates = singularValues[key].shape[0]
    plt.plot(np.arange(numStates), singularValues[key])
    plt.xlabel("Descending Sorted Order")
    plt.ylabel("Singular Values of %s Migration Prob. SVD" % (token))

# Plot the descending sorted singular values of a random matrix, as a noise baseline.
def PlotRandomSVD(nrow=51, ncol=51, std=1, dist='normal'):
    if dist == 'normal':
        Q = random.normal(0, std, nrow * ncol).reshape(nrow, ncol)
    elif dist == 'chisq':
        Q = random.chisquare(std, nrow * ncol).reshape(nrow, ncol)
    else: raise ValueError(dist + " Unknown dist choice")
    U, S, V = np.linalg.svd(np.matrix(Q), full_matrices=True)
    plt.plot(np.arange(S.shape[0]), S)
    plt.xlabel('Descending Sorted Order')
    plt.ylabel('%s Random Singular Values' % (dist.upper()))

# Plot the yearly mean of the L1 norm of the off-diagonal residual entries.
def PlotMeanResiduals(residuals):
    N = residuals['avg'].shape[0]
    x = np.arange(N).reshape(1, -1)
    y = np.arange(N).reshape(-1, 1)
    years = list(residuals.keys())
    years = [t for t in years if t != 'avg']
    years.sort()
    mL1 = np.zeros(len(years))
    l1 = 0.0
    for yIdx, key in enumerate(years):
        if key == 'avg': continue
        value = residuals[key].as_matrix()
        mL1[yIdx] = np.mean(np.abs(value[x != y]))
        l1 += np.abs(value[x != y])
    l1 /= len(years)
    years = np.array([int(t) for t in years])
    plt.plot(years, mL1)
    return(l1.mean())

# Plot the collection of correlation values between the K(K-1) residuals of different years.
def PlotTSCorrDist(X, ResFct, Time=21):
    if X.shape[0] != 0.5 * Time * (Time - 1): raise ValueError("X size not compatible with Time variable")
    sns.kdeplot(X)
    plt.xlabel('Cosine values of the residuals from different years')
    plt.ylabel('Distribution Density')
    plt.title('Correlation Value KDE Plot of %d degrees of freedom' % (ResFct * (ResFct - 1)))
    Q = random.normal(0, 1, ResFct * (ResFct - 1) * Time).reshape(Time, -1)
    A = np.matrix(Q) * np.matrix(Q).transpose()
    A = A / np.sqrt(np.diag(A).reshape(1, -1) * np.diag(A).reshape(-1, 1))
    x = np.arange(Time).reshape(-1, 1)
    y = np.arange(Time).reshape(1, -1)
    sns.kdeplot(np.array(A)[x > y].flatten())

# Plot the ratio of cumulative lambda^2 (squared singular values) captured by the truncated SVD.
def PlotRSquare(singularValues, key='avg', logOdds=True):
    eigen = singularValues.get(key)
    if eigen is None: raise ValueError("Invalid key " + key + "!")
    X = eigen
    z = np.cumsum(X**2) / np.sum(X**2)
    plt.plot(np.arange(z.shape[0]), z)
    token = ' Log Odds ' if logOdds else ''
    plt.title(key + " Migration Prob. " + token + " Matrix T-SVD R^2 Plot")
    plt.grid(True)
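# A hedged usage sketch (not part of the original gist): singularValues is the second value
# returned by SVD_MM further down, and PlotRandomSVD gives a pure-noise spectrum to compare
# the migration spectrum against.
#
#   ans, singularValues, residuals, residuals2 = SVD_MM(MM, logOdds=True, numFct=4)
#   PlotSingularValues(singularValues, key='avg', logOdds=True)
#   PlotRandomSVD(nrow=51, ncol=51, std=1, dist='normal')
#   PlotRSquare(singularValues, key='avg', logOdds=True)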
library(reshape2)
library(gridExtra)
library(ggplot2)
library(dplyr)   # arrange() and desc() are used below
# Each line reads avg.csv from a different output directory; only the last read is kept,
# so comment out the ones you do not want to plot.
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_original/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_1/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_2/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_3/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_4/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_7/avg.csv')
avgME = read.csv('/Users/ikovsky/r_wd/data/MigrationME/MM_ME.csv',header=T)
#rownames(avgME)<-avgME$X
#avgME$X<-NULL
avgME_long<-melt(avgME,id='X',variable.name='Truncation',value.name='ML1')
avg_long<-melt(avg,id=c('US.State.'),variable.name='US.State.2',value.name='probability')
ggplot(avg_long, aes(x=factor(US.State.), y=factor(US.State.2), fill=probability)) +
  geom_tile() +
  scale_fill_gradientn(colors=c('black', 'darkblue', 'blue', 'red', 'white')) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab('Out of State') + ylab('Into State') + labs(fill='prob. in bps')
ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation), fill=ML1)) +
  geom_tile() +
  scale_fill_gradientn(colors=c('black', 'darkblue', 'blue', 'red', 'white')) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab('Years') + ylab('SVD Truncation') + labs(fill='mean L1 error')
ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation), fill=ML1)) +
  geom_tile() +
  scale_fill_gradientn(colors=c('black', 'darkblue', 'blue', 'red', 'white')) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab('Years') + ylab('SVD Truncation') + labs(fill='root mean squared error')
topTable = head(arrange(avg_long, desc(probability)), 20)
names(topTable) = c('Out Of', 'Into', 'Probability in bps')
ss <- tableGrob(topTable, theme=ttheme_default(base_size=9))
grid.arrange(ss)
unique(topTable$'Into')
unique(topTable$'Out Of')
[DEFAULT]
rootDir = /Users/ikovsky/python_data/IRS
search =
findAllTokens =
excludeFile = pdf
includeFile = xls, zip
ignore1 = notice, work, response, appeal, phish, advocate, help, rights, treasury, chinese, korean, contact, us
ignore2 = identity, fear, freedom, privacy,Accessibility,filing,payments, refunds, vietnamese,spanish,FollowLinks
ignore3 = laws,shares, pros, wire, navigation, share, form, news
ignore = %(ignore1)s, %(ignore2)s, %(ignore3)s
followNumNodes = 1
prePend = 0
nodes_nickName =
logLevel = 2
[ScrapeURL]
url = https://www.irs.gov/uac/tax-stats
[TAX-STATS]
search = migration, wealth, individual, personal
loglevel = 2
findAllTokens = table, tr, td
excludeFile = pdf
followNumNodes = 5
[Personal Wealth]
search = Wealthholders
loglevel = 2
findAllTokens =
excludeFile = pdf
followNumNodes = 5
[Individual Income Tax]
search = wealth, state, zip, income, migration, estate
loglevel = 2
findAllTokens =
excludeFile = pdf, report
followNumNodes = 5
[soi-tax-stats-migration-data]
findAllTokens =
followNumNodes = 50
search = ^[12][0-9]{3}
prePend = 1
nodes_nickName = YYYY to YYYY+1
logLevel = 1
[YYYY to YYYY+1]
search = state, county
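# A short sketch (not part of the original gist) of how the scraper below consumes this
# config, assuming the file above is saved as ./ScrapeURL.cfg (the default used by
# ScrapeURL()). Options under [DEFAULT] are visible from every section, and configparser's
# basic interpolation expands the %(ignore1)s-style references when 'ignore' is read.
import configparser

cfg = configparser.ConfigParser()
cfg.read('./ScrapeURL.cfg')
print(cfg.get('ScrapeURL', 'url'))               # root page to start scraping from
print(cfg.get('TAX-STATS', 'followNumNodes'))    # '5': the section value overrides [DEFAULT]
print(cfg.get('TAX-STATS', 'ignore'))            # ignore1/ignore2/ignore3 joined via interpolation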
from bs4 import BeautifulSoup
from collections import OrderedDict
import urllib.request
import urllib.parse
import requests
import re, configparser
import argparse
import os
# The main entry point of this module
def ScrapeURL(cfgFile='./ScrapeURL.cfg'):
    cfg = configparser.ConfigParser()
    cfg.read(cfgFile)
    url = cfg.get('ScrapeURL', 'url')
    linkObj = LinkObj(cfg, name='TAX-STATS', url=url, ancesters=[])
    linkObj.ScrapeLinks()
    linkObj.FollowLinks()
    return(linkObj)
# The node object abstracting a link; it stores the information needed for scraping.
# The program follows the user's instructions and walks the links, scraping/downloading
# the files at the terminal nodes.
# To avoid scraping too much garbage, the user decides (via the config file) which
# keywords are of interest to follow and which to avoid.
class LinkObj(object):
    def __init__(self, cfg, name, url, ancesters=[], nodes_nickName=''):
        self.cfg = cfg
        self.nameSpace = ''
        x = str.split(name, '/')
        if len(x) > 1:
            self.name = x[-1]
            self.nameSpace = x[0]
        else: self.name = name
        if nodes_nickName is not None and nodes_nickName != '':
            self.nickName = nodes_nickName
        else: self.nickName = self.name
        if not self.cfg.has_section(self.nickName): self.cfg.add_section(self.nickName)
        self.url = url
        self.prePend = bool(int(self.cfg.get(self.nickName, 'prePend')))
        self.ancesters = ancesters
        self.linkDict = OrderedDict()
        x = str.split(re.sub(' ', '', self.cfg.get(self.nickName, 'search')), ',')
        self.findAllTokens = str.split(re.sub(' ', '', self.cfg.get(self.nickName, 'findAllTokens')), ',')
        self.findAllTokens = [t for t in self.findAllTokens if t != '']
        self.searchToken = '|'.join(x)
        ignore = str.split(re.sub(' ', '', self.cfg.get(self.nickName, 'ignore')), ',')
        self.ignore = '|'.join(ignore)
        self.nodes = []
        self.nodes_nickName = self.cfg.get(self.nickName, 'nodes_nickName')
        self.excludeFile = str.split(re.sub(' ', '', self.cfg.get(self.nickName, 'excludeFile')), ',')
        self.excludeFile = [t for t in self.excludeFile if t != '']
        self.excludeFile = "|".join([t + "$" for t in self.excludeFile])
        self.followNumNodes = int(self.cfg.get(self.nickName, 'followNumNodes'))
        self.includeFile = str.split(re.sub(' ', '', self.cfg.get(self.nickName, 'includeFile')), ',')
        self.includeFile = [t for t in self.includeFile if t != '']
        self.includeFile = "|".join([t + "$" for t in self.includeFile])
        self.rootDir = self.cfg.get(self.nickName, 'rootDir')
        self.terminal = bool(re.search(self.includeFile, self.url))
        self.logLevel = int(self.cfg.get(self.nickName, 'loglevel'))

    def ScrapeLinks(self):
        if self.terminal:
            self.DownLoadLink()
            return
        if self.logLevel == 1: print(self.name + " ScrapeLinks")
        r = urllib.request.urlopen(self.url)
        soup = BeautifulSoup(r, 'lxml')
        A = soup.find_all('a', href=True)
        self.PrintResultSet(A)
        self.IterFindAll(soup, level=0)

    def FollowLinks(self):
        if self.terminal: return
        followNumNodes = self.followNumNodes if self.followNumNodes >= 0 else len(self.nodes)
        print(self.name + " FollowLinks")
        count = 0
        for token, url in self.linkDict.items():
            if count >= followNumNodes: break
            self.nodes.append(LinkObj(self.cfg, name=token, url=url,
                                      ancesters=self.ancesters + [self.name],
                                      nodes_nickName=self.nodes_nickName))
            self.nodes[-1].ScrapeLinks()
            self.nodes[-1].FollowLinks()
            count += 1

    def IterFindAll(self, X, level):
        if self.logLevel == 1: print(self.name + " IterFindAll")
        depth = len(self.findAllTokens)
        if (depth >= level + 1):
            Y = X.find_all(self.findAllTokens[level])
            for y in Y:
                self.IterFindAll(y, level + 1)
        else:
            Y = X.find_all(['a', 'b'])
            strong = ''
            for a in Y:
                text = a.string
                if a.name == 'b':
                    if a.string is not None and re.search(r'^[A-Za-z]+[\w_\- ]*[\w]', a.string):
                        strong = a.string
                    continue
                elif a.get('href') is not None: link = a['href']
                else: continue
                if text is None: text = str.split(link, '/')[-1]
                if self.searchToken == '':
                    if self.logLevel == 1: print(self.name + " no search token")
                    continue
                elif (len(self.excludeFile) > 0 and re.search(self.excludeFile, link, flags=re.I | re.X)): continue
                elif re.search(self.searchToken, text, flags=re.I | re.X):
                    if strong != '' and self.prePend: text = strong + "/" + text
                    self.linkDict[text] = urllib.parse.urljoin(self.url, link)

    def PrintResultSet(self, A):
        if self.logLevel > 1: return
        for item in A:
            link = item['href']
            text = str.split(link, '/')[-1] if item.string is None else item.string
            if re.search(self.ignore, text, flags=re.I | re.X): continue
            if re.search(self.ignore, link, flags=re.I | re.X): continue
            print(self.name + "<<" + text + ":" + link)

    def DownLoadLink(self):
        path = self.rootDir + "/" + "/".join(self.ancesters) + "/" + self.nameSpace + "/" + self.name + "/"
        fileName = self.url.split('/')[-1]
        if not re.search(self.searchToken, fileName): return
        print(path + fileName)
        if not os.path.exists(path):
            os.makedirs(path)
        elif os.path.exists(path + fileName):
            print(path + fileName + " exists!")
            return
        urllib.request.urlretrieve(self.url, path + fileName)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='use ScrapeURL to scrape and download a tree of links')
    parser.add_argument('-cfg', type=str, help='config file name that controls ScrapeURL')
    args = parser.parse_args()
    if args.cfg is not None:
        ScrapeURL(cfgFile=args.cfg)
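# Example invocation (the module file name is an assumption, not from the gist): save this
# file as, say, ScrapeIRS.py next to the ScrapeURL.cfg shown above and run
#
#   python ScrapeIRS.py -cfg ./ScrapeURL.cfg
#
# Without -cfg the script exits without doing anything; ScrapeURL() can also be called
# directly from an interactive session with its default cfgFile='./ScrapeURL.cfg'.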
import pandas as pd
import numpy as np
import os
# The main routine: it processes a dictionary of migration probability matrices and returns
# the truncated SVD reconstruction, the singular values, the log-odds-domain residuals, and
# the residuals in the sigmoid-transformed (original) domain.
# numFct controls the truncation cut-off.
def SVD_MM(MM, logOdds=True, numFct=4):
    MM2, states = ReturnAvg(MM)
    if logOdds: MM_logOdds = GetLogOdds(MM2)
    ans, singularValues, residuals, residuals2 = \
        GetTSVD(MM2 if not logOdds else MM_logOdds, logOdds=logOdds, truncated=numFct)
    ans = DressUpToDF(ans, states)
    residuals = DressUpToDF(residuals, states)
    return(ans, singularValues, residuals, residuals2)
# Save the end result, pandas data frames, of the TSVD to csv files
def SaveDF2CSV(ans, dirName='/Users/ikovsky/r_wd/data/MigrationData/'):
    if not os.path.isdir(dirName): raise IOError(dirName + " not a valid directory name")
    for key, value in ans.items():
        fileName = dirName + "/" + key + ".csv"
        value.to_csv(fileName)

# Compute the average MM of the whole 1991-2011 period
def ReturnAvg(MM):
    MM2 = {}
    avg = 0
    colNames = []
    for key, value in MM.items():
        avg += value.as_matrix()
        MM2[key] = value.as_matrix()
        if len(colNames) == 0: colNames = value.columns
    MM2['avg'] = avg / len(MM.keys())
    return(MM2, colNames)
# Perform the log-odds transformation. Note that the original MMs are scaled up by
# 1e4 = 10000 because the fractions of people migrating are very small.
def GetLogOdds(MM2):
    ans = {}
    x = np.arange(MM2['avg'].shape[0]).reshape(1, -1)
    y = x.reshape((-1, 1))
    dia = x == y
    for key, value in MM2.items():
        # The matrices were scaled up by 1e4 for ease of visualization, so scale back here.
        T = np.log(1e-4 * value / (1 - 1e-4 * value))
        T[dia] = 0.0
        T[np.isinf(T)] = np.log(1e-5)
        ans[key] = T
    return(ans)
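# A quick numeric sanity check of the transform pair (25 bps is just an illustrative figure):
# GetLogOdds maps a rate in basis points to log-odds, and GetTSVD maps it back with the
# sigmoid and the 1e4 rescaling.
_p_bps = 25.0
_logit = np.log(1e-4 * _p_bps / (1 - 1e-4 * _p_bps))   # roughly -5.99
_back = 1e4 / (1 + np.exp(-_logit))                    # roughly 25.0 bps again
# Exact zeros would map to -inf; GetLogOdds clamps those entries to log(1e-5), about -11.5.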
# Get the truncated SVD
def GetTSVD(MM, truncated=None, logOdds=True):
    ans = {}
    residuals = {}
    residuals2 = {}
    singularValues = {}
    x = np.arange(MM['avg'].shape[0]).reshape(1, -1)
    y = x.reshape((-1, 1))
    dia = x == y
    for key, value in MM.items():
        U, S, V = np.linalg.svd(value, full_matrices=True)
        singularValues[key] = S.copy()
        S1 = S.copy()
        S2 = S.copy()
        S1[truncated:] = 0
        S2[:truncated] = 0
        Lambda1 = np.diag(S1)
        A = np.matrix(U) * np.matrix(Lambda1) * np.matrix(V)
        Lambda2 = np.diag(S2)
        B = np.matrix(U) * np.matrix(Lambda2) * np.matrix(V)
        B[dia] = 0
        residuals[key] = B
        if logOdds:
            A = 1e4 / (1 + np.exp(-A))
        A[dia] = 0
        ans[key] = A
        rA = 1e4 / (1 + np.exp(-value)) - A
        rA[dia] = 0
        residuals2[key] = pd.DataFrame(rA)
    return(ans, singularValues, residuals, residuals2)
# In the SVD computation we convert the DataFrames into numpy matrices; for display or for
# storage into csv we need to convert them back to DataFrames. That is what this function does.
def DressUpToDF(MMDict, states):
    ans = {}
    for key, value in MMDict.items():
        ans[key] = pd.DataFrame(value, index=states, columns=states)
    return(ans)
# Given a dictionary of the numpy matrices of the same size, extract the
# off diagonals and collect them into a 1D numpy array
def PickUpOffDiag(MMDict):
    K = MMDict['avg'].shape[0]
    x = np.arange(K).reshape(-1, 1)
    y = np.arange(K).reshape(1, -1)
    offDiag = x != y
    collection = []
    for key, value in MMDict.items():
        if key == 'avg': continue
        collection.extend(value.as_matrix()[offDiag].flatten())
    return(np.array(collection))
# Check if there is evidence against the IID assumption
def CheckIID(MMDict, numFct=4, Time=False, cov=False):
    years = list(MMDict.keys())
    years.sort()
    K = MMDict['avg'].shape[0] - numFct
    U, S, V = np.linalg.svd(MMDict['avg'].as_matrix(), full_matrices=True)
    x = np.arange(K).reshape(-1, 1)
    y = np.arange(K).reshape(1, -1)
    offDiag = x != y
    u = np.arange(K*(K-1)).reshape(-1, 1)
    v = np.arange(K*(K-1)).reshape(1, -1)
    upperDiag1 = u > v
    T = len(years) - 1
    a = np.arange(T).reshape(-1, 1)
    b = np.arange(T).reshape(1, -1)
    upperDiag2 = a > b
    resMatrix = np.zeros((np.sum(offDiag), T))
    for idx, key in enumerate(years):
        if key == 'avg': continue
        value = MMDict[key].as_matrix()
        R = np.matrix(U).transpose() * value * np.matrix(V).transpose()
        R = R[numFct:, numFct:]
        resMatrix[:, idx] = np.array(R[offDiag]).flatten()
    if Time is False:
        X = (np.matrix(resMatrix) * np.matrix(resMatrix).transpose())
        if not cov:
            X = np.array(X) / np.sqrt(np.diag(X).reshape(1, -1) * np.diag(X).reshape(-1, 1))
        else: X = np.array(X) / T
        X = X[upperDiag1]
    else:
        #Z = np.matrix(resMatrix-np.mean(resMatrix,axis=0).reshape(1,-1))
        Z = np.matrix(resMatrix)
        X = np.array(Z.transpose() * Z)
        if not cov:
            X = np.array(X) / np.sqrt(np.diag(X).reshape(1, -1) * np.diag(X).reshape(-1, 1))
        else:
            X = np.array(X) / Z.shape[0]
        X = X[upperDiag2]
    return(np.array(X).flatten())
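# A minimal end-to-end sketch tying the pieces of this gist together (the output directory
# is an assumption; the hard-coded input paths inside LoadMigration and ConstructMM must
# exist for this to run):
#
#   data = LoadMigration()                          # parse the 2000+ IRS .xls files
#   MM = ConstructMM(data)                          # yearly migration prob. matrices, in bps
#   ans, sv, res, res2 = SVD_MM(MM, logOdds=True, numFct=4)
#   SaveDF2CSV(ans, dirName='/some/output/dir')     # writes avg.csv etc. for the R plots above
#   PlotRSquare(sv, key='avg')                      # share of variance kept by the truncation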