-
-
Save ikovsky/49e4ac3cf75e3f4450499c303255868e to your computer and use it in GitHub Desktop.
The Codes for The Web Scrapping Project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import string | |
#MM means Migration Probability Matrix | |
#In this utility function we input the raw migration pandas dataframes as an ordered dict | |
#Then we normalize by the non-immigrant tax population to get the migration probability matrices | |
def ConstructMM(dataDict, outputByYear=False): | |
postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv' | |
postAbb = pd.read_csv(postAbbFile,sep='\t') | |
numStates = postAbb.shape[0] | |
stateNames = postAbb.iloc[:,0] | |
years = [str(yyyy) for yyyy in range(1991,2012)] | |
non_migrants = {} | |
yearlyMM = {} | |
for year in years: | |
yearlyMM[year] = pd.DataFrame(np.zeros((numStates,numStates))) | |
yearlyMM[year].columns = postAbb.iloc[:,0] | |
yearlyMM[year].index = postAbb.iloc[:,0] | |
non_migrants[year] = pd.DataFrame(np.zeros((numStates,1)),index=postAbb.iloc[:,0]) | |
for key, DF in dataDict.items(): | |
region, inOrout, yy = key.split(':') | |
if inOrout=='OUT': continue | |
if sum(stateNames.isin([region]))<1: continue | |
if yy[0]=='9': year = '19'+yy | |
else: year = '20'+yy | |
DF = DF.set_index(['Name']) | |
hasTotal = DF.index.str.contains('Non-Migrant',case=False) | |
rowIdxes = np.arange(DF.shape[0])[hasTotal] | |
if len(rowIdxes)<1:raise ValueError("no non-imigrant row") | |
non_migrants[year].ix[region,0] = DF.iloc[rowIdxes[0],0] | |
for key, DF in dataDict.items(): | |
region, inOrout, yy = key.split(':') | |
if inOrout=='IN': continue | |
if sum(stateNames.isin([region]))<1: | |
continue | |
if yy[0]=='9': year = '19'+yy | |
else: year = '20'+yy | |
DF = DF.set_index(['Name']) | |
#hasTotal = DF.index.str.contains('Non-Migrant',case=False) | |
#rowIdxes = np.arange(DF.shape[0])[hasTotal] | |
#if len(rowIdxes)<1:raise ValueError("no non-imigrant row") | |
for state in stateNames: | |
if state==region:continue | |
stateLoc = DF.index.str.contains(state,case=False) | |
if sum(stateLoc)<1: continue | |
value = DF[stateLoc].ix[0,0] | |
if isinstance(value,str): continue | |
meanPopulation = 0.5 * (non_migrants[year].ix[region,0] + non_migrants[year].ix[state,0]) | |
yearlyMM[year].ix[region, state] = value*1.0/meanPopulation*10000 | |
return yearlyMM |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import os, re, sys | |
# The state2state migration data consists of 2000+ small files of 20+ years of yearly | |
# state 2 state migration data (tracked by the tax returns filed to IRS). | |
# The data is very dirty in that the format of each file is not always the same, the state | |
#abbreviations are named almost semi-randomly. | |
#We put the soft links of these 2000+ files into a single directory. | |
#Then we load them, mapping to the state abbreviations back to the state names and store into | |
#pandas data frames. | |
def LoadMigration(): | |
postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv' | |
postAbb = pd.read_csv(postAbbFile,sep='\t') | |
country =['United States','ForeignOverseas'] | |
dataDir = '/Users/ikovsky/python_data/IRS/TAX-STATS/soi-tax-stats-migration-data/LinkedData' | |
translate = {'^oeg':'oregon','^az':'arizona','^DiCo':'Distr','^dico':'Distr','^vrg':'Virginia','^Miso':'Misso',\ | |
'^miso':'Misso','^wsc':'Wiscon', '^wiso':'Wiscon','^RhIs':'Rhode','^aka':'arkan','^arkas':'arkan'\ | |
,'^Misi':'missi','^nhio':'ohio','^nrbt':'nebra'} | |
fileNameList = os.listdir(dataDir) | |
p = re.compile('\d+') | |
q = re.compile('^[0-9]{4}') | |
key = None | |
stateDict = StateDict(postAbb) | |
data = {} | |
oddFormat = False | |
for name in fileNameList: | |
if not re.search('.xls$',name): continue | |
if q.search(name): oddFormat = True | |
else: oddFormat = False | |
nameTokens = p.split(name) | |
token = "^"+nameTokens[0] | |
failed = True | |
key = None | |
for i, state in enumerate(postAbb.iloc[:,0]): | |
if oddFormat: | |
break | |
result = re.search(token,state,flags=re.I) | |
if result is not None: key = state | |
if result is None: | |
state2 = re.sub(' ','',state) | |
result = re.search(token,state2,flags=re.I) | |
key = state | |
if result is None: | |
state2 = re.sub('West ','W',state) | |
result = re.search(token,state2,flags=re.I) | |
key = state | |
if result is None: | |
state2 = re.sub('West ','We',state) | |
result = re.search(token,state2,flags=re.I) | |
key = state | |
if result is None: | |
state2 = re.sub('South ','S',state) | |
result = re.search(token,state2,flags=re.I) | |
key = state | |
if result is None: | |
state2 = re.sub('South ','So',state) | |
result = re.search(token,state2,flags=re.I) | |
key = state | |
if result is None: | |
state2 = re.sub('North ','no',state) | |
result = re.search(token,state2,flags=re.I) | |
key = state | |
if result is None: | |
state2 = re.sub('North ','n',state) | |
result = re.search(token,state2,flags=re.I) | |
key = state | |
if result is None: | |
state2 = re.sub('New ','ne',state) | |
result = re.search(token,state2,flags=re.I) | |
key = state | |
if result is None and translate.get(token) is not None: | |
result = re.search(translate[token],state,flags=re.I) | |
key = state | |
if result is not None: break | |
if not oddFormat and result is None: | |
result = re.search(token,country[0],flags=re.I) | |
key = country[0] | |
if not oddFormat and result is None: | |
result = re.search(token,country[1],flags=re.I) | |
key = country[1] | |
if not oddFormat and result: | |
failed = False | |
if not oddFormat and failed: | |
print(nameTokens+[name]) | |
print(len(nameTokens[0])) | |
name1 = re.sub("^"+nameTokens[0],'',name) | |
inOrout = '' if oddFormat else InOrOut(name) | |
year = re.sub(nameTokens[-1]+"$",'',name1) | |
year = name[2:4] if oddFormat else MapYear(year) | |
if oddFormat: key = HandleOddFormatName(name,stateDict) | |
data[key+inOrout+":"+year] = LoadFileToDF(dataDir, name) | |
return(data) | |
def LoadFileToDF(dirName, fileName): | |
DF = pd.read_excel(dirName+"/"+fileName) | |
if DF.shape[1] not in [6,7,8]: raise ValueError("bad column size") | |
DF[DF==''] = np.NaN | |
naCols = DF.isnull().all(axis=0) | |
DF = DF.loc[:,~naCols] | |
redCol = [t for t in DF.columns if not DF[t].str.contains('PERCENT',case=False).any()] | |
DF = DF[redCol] | |
columns = DF.columns | |
gross = [idx for idx, t in enumerate(DF.columns) if \ | |
(DF[t].str.contains('INCOME',case=False)[:8].any() and DF[t].str.contains('AGGREGATE',case=False)[:8].any())\ | |
or (DF[t].str.contains('INCOME',case=False)[:8].any() and DF[t].str.contains('MONEY',case=False)[:8].any())] | |
if len(gross)>0: | |
idx=gross[-1] | |
if idx==0: raise ValueError("") | |
DF = DF.iloc[:,idx-3:idx+1] | |
DF.columns = ['Name','Num Returns','Num Exempt','AAGI'] | |
else: | |
DF = DF.iloc[:,-3:] | |
DF.columns = ['Name','Num Returns','Num Exempt'] | |
missingRow = DF.isnull().any(axis=1) | |
DF = DF.loc[~missingRow,:] | |
return(DF.apply(pd.to_numeric,errors='ignore')) | |
def InOrOut(name): | |
token = name[-7:-4].lower() | |
token = re.sub('[0-9]+','',token) | |
if re.search('i|n',token): return(':IN') | |
elif re.search('o',token): return(':OUT') | |
raise ValueError(name+' makes me confused') | |
def HandleOddFormatName(name, stateDict): | |
name1 = name[-9:-4] | |
stateAbb = name1[-2:].upper() | |
inOrout = name1[:3].upper() | |
if (inOrout == 'GIN'): inOrout = 'IN' | |
stateName = stateDict[stateAbb] | |
return(stateName+":"+inOrout) | |
def StateDict(postAbb): | |
stateNames = postAbb.iloc[:,0] | |
stateAbbs = postAbb.iloc[:,1] | |
ans = dict() | |
for idx, abb in enumerate(stateAbbs): | |
ans[abb] = stateNames[idx] | |
return(ans) | |
def MapYear(year): | |
if len(year) == 2: | |
pass | |
elif len(year) == 4: | |
year = year[2:4] | |
if len(year) not in [2,4]: | |
#print(year) | |
#print(name) | |
#print(nameTokens) | |
#956 -> 95-96 | |
#8 -> 98 | |
if year == '8': year = '98' | |
elif year == '956': year = '96' | |
else: print(year) | |
return(year) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import numpy.random as random | |
# Plot the descending sorted SVD singular values | |
def PlotSingularValues(singularValues,key='avg',logOdds=True): | |
token = ' Log Odd ' if logOdds else 'Original' | |
numStates = singularValues[key].shape[0] | |
plt.plot(np.arange(numStates),singularValues[key]) | |
plt.xlabel("Descending Sorted Order") | |
plt.ylabel("Singular Values of %s Migration Prob. SVD" %(token)) | |
# Plot the SVD descending sorted singular values from a random matrix | |
def PlotRandomSVD(nrow=51,ncol=51,std=1,dist='normal'): | |
if dist=='normal': | |
Q = random.normal(0,std,nrow * ncol).reshape(nrow,ncol) | |
elif dist=='chisq': | |
Q = random.chisquare(std,nrow * ncol).reshape(nrow,ncol) | |
else: raise ValueError(dist + " Unknown dist choice") | |
U,S,V = np.linalg.svd(np.matrix(Q),full_matrices=True) | |
plt.plot(np.arange(S.shape[0]),S) | |
plt.xlabel('Descending Sorted Order') | |
plt.ylabel('%s Random Eigen Values'%(dist.upper())) | |
# Plot the average of L1 norm of the matrix entries | |
def PlotMeanResiduals(residuals): | |
N = residuals['avg'].shape[0] | |
x = np.arange(N).reshape(1,-1) | |
y = np.arange(N).reshape(-1,1) | |
years = list(residuals.keys()) | |
years = [y for y in years if y != 'avg'] | |
years.sort() | |
mL1 = np.zeros(len(years)) | |
l1 = 0.0 | |
for yIdx, key in enumerate(years): | |
if key == 'avg': continue | |
value = residuals[key].as_matrix() | |
mL1[yIdx] = np.mean(np.abs(value[x!=y])) | |
l1 += np.abs(value[x!=y]) | |
l1/=len(years) | |
years = np.array([int(y) for y in years]) | |
plt.plot(years, mL1) | |
return(l1.mean()) | |
#Plot the collection of the correlations values between the K(K-1) residuals of different years | |
def PlotTSCorrDist(X,ResFct,Time=21): | |
if X.shape[0] != 0.5*Time*(Time-1): raise ValueError("X size not compatible with Time variable") | |
sns.kdeplot(X) | |
plt.xlabel('Cosine values of the residuals from different years') | |
plt.ylabel('Distribution Density') | |
plt.title('Correlation Value KDE Plot of %d degree of freedom' %(ResFct*(ResFct-1))) | |
Q = random.normal(0,1,ResFct*(ResFct-1)*Time).reshape(Time,-1) | |
A = np.matrix(Q) * np.matrix(Q).transpose() | |
A = A/np.sqrt(np.diag(A).reshape(1,-1)*np.diag(A).reshape(-1,1)) | |
x = np.arange(Time).reshape(-1,1) | |
y = np.arange(Time).reshape(1,-1) | |
sns.kdeplot(np.array(A)[x>y].flatten()) | |
#Plot the ratio of cumulative lambda*2 covered by the truncated SVD | |
def PlotRSquare(singularValues,key='avg', logOdds=True): | |
eigen = singularValues.get(key) | |
if eigen is None: raise ValueError("Invalid key "+key+"!") | |
X = eigen | |
z = np.cumsum(X**2)/np.sum(X**2) | |
plt.plot(np.arange(z.shape[0]),z) | |
token = ' Log Odds ' if logOdds else '' | |
plt.title(key+" Migration Prob. "+token+" Matrix T-SVD R^2 Plot") | |
plt.grid(True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(reshape2) | |
library(gridExtra) | |
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_original/avg.csv') | |
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_1/avg.csv') | |
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_2/avg.csv') | |
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_3/avg.csv') | |
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_4/avg.csv') | |
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_7/avg.csv') | |
avgME = read.csv('/Users/ikovsky/r_wd/data/MigrationME/MM_ME.csv',header=T) | |
#rownames(avgME)<-avgME$X | |
#avgME$X<-NULL | |
avgME_long<-melt(avgME,id='X',variable.name='Truncation',value.name='ML1') | |
avg_long<-melt(avg,id=c('US.State.'),variable.name='US.State.2',value.name='probability') | |
ggplot(avg_long, aes(x=factor(US.State.), y=factor(US.State.2), | |
fill=probability)) + geom_tile() + scale_fill_gradientn( | |
colors=c('black','dark blue', 'blue','red','white' | |
)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Out of State') + ylab( | |
'Into State')+labs(fill='prob. in bps') | |
ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation), | |
fill=ML1)) + geom_tile() + scale_fill_gradientn( | |
colors=c('black','dark blue', 'blue','red','white' | |
)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Years') + ylab( | |
'SVD Truncation')+labs(fill='mean L1 error') | |
ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation), | |
fill=ML1)) + geom_tile() + scale_fill_gradientn( | |
colors=c('black','dark blue', 'blue','red','white' | |
)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Years') + ylab( | |
'SVD Truncation')+labs(fill='root mean squared error') | |
topTable=head(arrange(avg_long,desc(probability)),20) | |
names(topTable) = c('Out Of','Into','Probability in bps') | |
ss<-tableGrob(topTable, theme=ttheme_default(base_size=9)) | |
grid.arrange(ss) | |
unique(topTable$'Into') | |
unique(topTable$'Out of') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[DEFAULT] | |
rootDir = /Users/ikovsky/python_data/IRS | |
search = | |
findAllTokens = | |
excludeFile = pdf | |
includeFile = xls, zip | |
ignore1 = notice, work, response, appeal, phish, advocate, help, rights, treasury, chinese, korean, contact, us | |
ignore2 = identity, fear, freedom, privacy,Accessibility,filing,payments, refunds, vietnamese,spanish,FollowLinks | |
ignore3 = laws,shares, pros, wire, navigation, share, form, news | |
ignore = %(ignore1)s, %(ignore2)s, %(ignore3)s | |
followNumNodes = 1 | |
prePend = 0 | |
nodes_nickName = | |
logLevel = 2 | |
[ScrapeURL] | |
url = 'https://www.irs.gov/uac/tax-stats' | |
[TAX-STATS] | |
search = migration, wealth, individual, personal | |
loglevel = 2 | |
findAllTokens = table, tr, td | |
excludeFile = pdf | |
followNumNodes = 5 | |
[Personal Wealth] | |
search = Wealthholders | |
loglevel = 2 | |
findAllTokens = | |
excludeFile = pdf | |
followNumNodes = 5 | |
[Individual Income Tax] | |
search = wealth, state, zip, income, migration, estate | |
loglevel = 2 | |
findAllTokens = | |
excludeFile = pdf, report | |
followNumNodes = 5 | |
[soi-tax-stats-migration-data] | |
findAllTokens = | |
followNumNodes = 50 | |
search = ^[12][0-9]{3} | |
prePend = 1 | |
nodes_nickName = YYYY to YYYY+1 | |
logLevel = 1 | |
[YYYY to YYYY+1] | |
search = state, county |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from collections import OrderedDict | |
import urllib | |
import requests | |
import re,configparser | |
import argparse | |
# The main entry point of this module | |
def ScrapeURL(cfgFile='./ScrapeURL.cfg'): | |
cfg = configparser.ConfigParser() | |
cfg.read(cfgFile) | |
url = cfg.get('ScrapeURL','url') | |
linkObj = LinkObj(cfg, name='TAX-STATS',url=url,ancesters=[]) | |
linkObj.ScrapeLinks() | |
linkObj.FollowLinks() | |
return(linkObj) | |
# The Node Object abstracting a link, storing the related information related to scraping | |
# The program will follow the instruction from the user and follow the link to scrape/download | |
# the files at the terminal nodes. | |
# To avoid scraping too much garbage, we need to design letting the user decide what key words | |
# of interest to follow, what to avoid | |
class LinkObj(object): | |
def __init__(self,cfg,name,url,ancesters=[],nodes_nickName=''): | |
self.cfg = cfg | |
self.nameSpace = '' | |
x = str.split(name,'/') | |
if len(x)>1: | |
self.name = x[-1] | |
self.nameSpace = x[0] | |
else: self.name = name | |
if nodes_nickName is not None and nodes_nickName != '': | |
self.nickName = nodes_nickName | |
else: self.nickName = self.name | |
if not self.cfg.has_section(self.nickName):self.cfg.add_section(self.nickName) | |
self.url = url | |
self.prePend = bool(int(self.cfg.get(self.nickName,'prePend'))) | |
self.ancesters = ancesters | |
self.linkDict = OrderedDict() | |
x=str.split(re.sub(' ','',self.cfg.get(self.nickName, 'search')),',') | |
self.findAllTokens = str.split(re.sub(' ','',self.cfg.get(self.nickName,'findAllTokens')),',') | |
self.findAllTokens = [x for x in self.findAllTokens if x!=''] | |
self.searchToken = '|'.join(x) | |
ignore = str.split(re.sub(' ','',self.cfg.get(self.nickName,'ignore')),',') | |
self.ignore = '|'.join(ignore) | |
self.nodes = [] | |
self.nodes_nickName = self.cfg.get(self.nickName,'nodes_nickName') | |
self.excludeFile = str.split(re.sub(' ','',self.cfg.get(self.nickName, 'excludeFile')),',') | |
self.excludeFile = [x for x in self.excludeFile if x!=''] | |
self.excludeFile = "|".join([t+"$" for t in self.excludeFile]) | |
self.followNumNodes = int(self.cfg.get(self.nickName,'followNumNodes')) | |
self.includeFile = str.split(re.sub(' ','',self.cfg.get(self.nickName, 'includeFile')),',') | |
self.includeFile = [x for x in self.includeFile if x!=''] | |
self.includeFile = "|".join([t+"$" for t in self.includeFile]) | |
self.rootDir = self.cfg.get(self.nickName,'rootDir') | |
self.terminal = bool(re.search(self.includeFile,self.url)) | |
self.logLevel = int(self.cfg.get(self.nickName,'loglevel')) | |
def ScrapeLinks(self): | |
if self.terminal: | |
self.DownLoadLink() | |
return | |
if self.logLevel == 1: print(self.name+" ScrapeLinks") | |
r = urllib.request.urlopen(self.url) | |
soup = BeautifulSoup(r,'lxml') | |
A = soup.find_all('a',href=True) | |
self.PrintResultSet(A) | |
self.IterFindAll(soup,level=0) | |
def FollowLinks(self): | |
if self.terminal: return | |
followNumNodes = self.followNumNodes if self.followNumNodes>=0 else len(self.nodes) | |
print(self.name + " FollowLinks") | |
count = 0 | |
for token, url in self.linkDict.items(): | |
if count >= self.followNumNodes: break | |
self.nodes.append(LinkObj(self.cfg,name=token,url=url,ancesters=self.ancesters+[self.name],nodes_nickName=self.nodes_nickName)) | |
self.nodes[-1].ScrapeLinks() | |
self.nodes[-1].FollowLinks() | |
count+=1 | |
def IterFindAll(self, X, level): | |
if self.logLevel==1: print(self.name + " IterFindAll") | |
depth = len(self.findAllTokens) | |
if (depth>=level+1): | |
Y = X.find_all(self.findAllTokens[level]) | |
for y in Y: | |
self.IterFindAll(y,level+1) | |
else: | |
Y = X.find_all(['a','b']) | |
#Y = X.find_all('a',href=True) | |
strong = '' | |
for a in Y: | |
text = a.string | |
if a.name=='b': | |
if a.string is not None and re.search('^[A-Za-z]+[\w_\- ]*[\w]',a.string): | |
strong = a.string | |
#print(strong) | |
continue | |
elif a.get('href') is not None: link = a['href'] | |
else: continue | |
if text is None: text = str.split(link,'/')[-1] | |
if self.searchToken=='': | |
if self.logLevel == 1: print(self.name + "no search Token") | |
continue | |
elif (len(self.excludeFile)>0 and re.search(self.excludeFile,link,flags=re.I|re.X)): continue | |
elif re.search(self.searchToken,text,flags=re.I|re.X): | |
if strong != '' and self.prePend: text = strong + "/" + text | |
self.linkDict[text] = urllib.parse.urljoin(self.url,link) | |
def PrintResultSet(self,A): | |
if self.logLevel > 1: return | |
for item in A: | |
link = item['href'] | |
text = str.split(link,'/')[-1] if item.string is None else item.string | |
if re.search(self.ignore,text,flags=re.I|re.X): continue | |
if re.search(self.ignore,link,flags=re.I|re.X): continue | |
print(self.name+"<<"+text+":"+link) | |
def DownLoadLink(self): | |
path=self.rootDir + "/" + "/".join(self.ancesters) + "/" + self.nameSpace+"/"+self.name+"/" | |
fileName = self.url.split('/')[-1] | |
if not re.search(self.searchToken,fileName): return | |
print(path+fileName) | |
import os | |
if not os.path.exists(path): | |
os.makedirs(path) | |
elif os.path.exists(path+fileName): | |
print(path+fileName+" exists!") | |
return | |
urllib.request.urlretrieve(self.url,path+fileName) | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser('use ScrapeURL to scrape and download a tree of links') | |
parser.add_argument('-cfg',type=str,help='command line option to control the ScrapeURL by passing a config fileName') | |
args = parser.parse_args() | |
if args.cfg is not None: | |
ScrapeURL(cfgFile=args.cfg) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import os | |
#The main function to process a dictionary of Migration probability Matrices | |
#and return the truncated SVD, the singular values, log-odd domain residuals, the residuals | |
#in the sigmoid transformed (original) domain. | |
# numFct controls the cut off. | |
def SVD_MM(MM,logOdds=True,numFct=4): | |
MM2,states = ReturnAvg(MM) | |
if logOdds: MM_logOdds = GetLogOdds(MM2) | |
ans,singularValues,residuals,residuals2 = \ | |
GetTSVD(MM2 if not logOdds else MM_logOdds,logOdds=logOdds,truncated=numFct) | |
ans = DressUpToDF(ans,states) | |
residuals = DressUpToDF(residuals,states) | |
return(ans,singularValues,residuals,residuals2) | |
#Save the end result, pandas data frames, of the TSVD to csv files | |
def SaveDF2CSV(ans,dirName='/Users/ikovsky/r_wd/data/MigrationData/'): | |
if not os.path.isdir(dirName): raise IOError(dirName+" not a valid directory name") | |
for key, value in ans.items(): | |
fileName = dirName+"/"+key + ".csv" | |
value.to_csv(fileName) | |
# Compute the average MM of the whole 1991-2011 period | |
def ReturnAvg(MM): | |
MM2 = {} | |
avg = 0 | |
colNames = [] | |
for key, value in MM.items(): | |
avg += value.as_matrix() | |
MM2[key] = value.as_matrix() | |
if len(colNames)==0: colNames = value.columns | |
MM2['avg'] = avg/len(MM.keys()) | |
return(MM2,colNames) | |
# Perform the log odd transformation, Notice that | |
# our original MMs are scaled up by 1e4=10000 because the percentages of | |
# people migrating are very tiny. | |
def GetLogOdds(MM2): | |
ans = {} | |
x = np.arange(MM2['avg'].shape[0]).reshape(1,-1) | |
y = x.reshape((-1,1)) | |
dia = x==y | |
for key, value in MM2.items(): | |
T = np.log(1e-4*value/(1-1e-4*value)) # we had scale up the transfer matrix by 1e4 for the ease of | |
# visualization, we need to scale it back. | |
T[dia] = 0.0 | |
T[np.isinf(T)] = np.log(1e-5) | |
ans[key] = T | |
return(ans) | |
# Get the truncated SVD | |
def GetTSVD(MM,truncated=None,logOdds=True): | |
ans = {} | |
residuals = {} | |
residuals2 = {} | |
singularValues = {} | |
x = np.arange(MM['avg'].shape[0]).reshape(1,-1) | |
y = x.reshape((-1,1)) | |
dia = x==y | |
for key, value in MM.items(): | |
U,S,V=np.linalg.svd(value,full_matrices=True) | |
singularValues[key] = S.copy() | |
S1 = S.copy() | |
S2 = S.copy() | |
S1[truncated:] = 0 | |
S2[:truncated] = 0 | |
Lambda1 = np.diag(S1) | |
A = np.matrix(U)*np.matrix(Lambda1)*np.matrix(V) | |
Lambda2 = np.diag(S2) | |
B = np.matrix(U)*np.matrix(Lambda2)*np.matrix(V) | |
B[dia] = 0 | |
residuals[key] = B | |
if logOdds: | |
A = 1e4/(1+np.exp(-A)) | |
A[dia] = 0 | |
ans[key] = A | |
rA = 1e4/(1+np.exp(-value))-A | |
rA[dia] = 0 | |
residuals2[key] = pd.DataFrame(rA) | |
return(ans,singularValues,residuals,residuals2) | |
# In the SVD computation, we need to convert the Data Frames into numpy matrices | |
# For display or for storage into csv, we need to convert them back to DataFrame | |
# This is the funtionality of this function | |
def DressUpToDF(MMDict,states): | |
ans = {} | |
for key, value in MMDict.items(): | |
ans[key] = pd.DataFrame(value,index=states,columns=states) | |
return(ans) | |
# Given a dictionary of the numpy matrices of the same size, extract the | |
# off diagonals and collect them into a 1D numpy array | |
def PickUpOffDiag(MMDict): | |
K = MMDict['avg'].shape[0] | |
x = np.arange(K).reshape(-1,1) | |
y = np.arange(K).reshape(1,-1) | |
offDiag = x != y | |
collection = [] | |
for key, value in MMDict.items(): | |
if key == 'avg': continue | |
collection.extend(value.as_matrix()[offDiag].flatten()) | |
return(np.array(collection)) | |
# Check if there is evidence against the IID assumption | |
def CheckIID(MMDict,numFct=4,Time=False,cov=False): | |
years = list(MMDict.keys()) | |
years.sort() | |
K = MMDict['avg'].shape[0]-numFct | |
U,S,V=np.linalg.svd(MMDict['avg'].as_matrix(),full_matrices=True) | |
x = np.arange(K).reshape(-1,1) | |
y = np.arange(K).reshape(1,-1) | |
offDiag = x != y | |
u = np.arange(K*(K-1)).reshape(-1,1) | |
v = np.arange(K*(K-1)).reshape(1,-1) | |
upperDiag1 = u > v | |
T = len(years)-1 | |
a = np.arange(T).reshape(-1,1) | |
b = np.arange(T).reshape(1,-1) | |
upperDiag2 = a > b | |
resMatrix = np.zeros((np.sum(offDiag),T)) | |
for idx, key in enumerate(years): | |
if key == 'avg': continue | |
value = MMDict[key].as_matrix() | |
R = np.matrix(U).transpose()*value*np.matrix(V).transpose() | |
R = R[numFct:,numFct:] | |
resMatrix[:,idx] = np.array(R[offDiag]).flatten() | |
if Time is False: | |
X = (np.matrix(resMatrix) * np.matrix(resMatrix).transpose()) | |
if not cov: | |
X = np.array(X)/np.sqrt(np.diag(X).reshape(1,-1)*np.diag(X).reshape(-1,1)) | |
else: X = np.array(X)/T | |
X = X[upperDiag1] | |
else: | |
#Z = np.matrix(resMatrix-np.mean(resMatrix,axis=0).reshape(1,-1)) | |
Z = np.matrix(resMatrix) | |
X = np.array(Z.transpose() * Z) | |
if not cov: | |
X = np.array(X)/np.sqrt(np.diag(X).reshape(1,-1)*np.diag(X).reshape(-1,1)) | |
else: | |
X = np.array(X)/Z.shape[0] | |
X = X[upperDiag2] | |
return(np.array(X).flatten()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment