Skip to content

Instantly share code, notes, and snippets.

Last active August 20, 2016 15:43
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save ikovsky/49e4ac3cf75e3f4450499c303255868e to your computer and use it in GitHub Desktop.
The Codes for The Web Scrapping Project
import pandas as pd
import numpy as np
import string
#MM means Migration Probability Matrix
#In this utility function we input the raw migration pandas dataframes as an ordered dict
#Then we normalize by the non-immigrant tax population to get the migration probability matrices
def ConstructMM(dataDict, outputByYear=False):
postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv'
postAbb = pd.read_csv(postAbbFile,sep='\t')
numStates = postAbb.shape[0]
stateNames = postAbb.iloc[:,0]
years = [str(yyyy) for yyyy in range(1991,2012)]
non_migrants = {}
yearlyMM = {}
for year in years:
yearlyMM[year] = pd.DataFrame(np.zeros((numStates,numStates)))
yearlyMM[year].columns = postAbb.iloc[:,0]
yearlyMM[year].index = postAbb.iloc[:,0]
non_migrants[year] = pd.DataFrame(np.zeros((numStates,1)),index=postAbb.iloc[:,0])
for key, DF in dataDict.items():
region, inOrout, yy = key.split(':')
if inOrout=='OUT': continue
if sum(stateNames.isin([region]))<1: continue
if yy[0]=='9': year = '19'+yy
else: year = '20'+yy
DF = DF.set_index(['Name'])
hasTotal = DF.index.str.contains('Non-Migrant',case=False)
rowIdxes = np.arange(DF.shape[0])[hasTotal]
if len(rowIdxes)<1:raise ValueError("no non-imigrant row")
non_migrants[year].ix[region,0] = DF.iloc[rowIdxes[0],0]
for key, DF in dataDict.items():
region, inOrout, yy = key.split(':')
if inOrout=='IN': continue
if sum(stateNames.isin([region]))<1:
if yy[0]=='9': year = '19'+yy
else: year = '20'+yy
DF = DF.set_index(['Name'])
#hasTotal = DF.index.str.contains('Non-Migrant',case=False)
#rowIdxes = np.arange(DF.shape[0])[hasTotal]
#if len(rowIdxes)<1:raise ValueError("no non-imigrant row")
for state in stateNames:
if state==region:continue
stateLoc = DF.index.str.contains(state,case=False)
if sum(stateLoc)<1: continue
value = DF[stateLoc].ix[0,0]
if isinstance(value,str): continue
meanPopulation = 0.5 * (non_migrants[year].ix[region,0] + non_migrants[year].ix[state,0])
yearlyMM[year].ix[region, state] = value*1.0/meanPopulation*10000
return yearlyMM
import pandas as pd
import numpy as np
import os, re, sys
# The state2state migration data consists of 2000+ small files of 20+ years of yearly
# state 2 state migration data (tracked by the tax returns filed to IRS).
# The data is very dirty in that the format of each file is not always the same, the state
#abbreviations are named almost semi-randomly.
#We put the soft links of these 2000+ files into a single directory.
#Then we load them, mapping to the state abbreviations back to the state names and store into
#pandas data frames.
def LoadMigration():
postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv'
postAbb = pd.read_csv(postAbbFile,sep='\t')
country =['United States','ForeignOverseas']
dataDir = '/Users/ikovsky/python_data/IRS/TAX-STATS/soi-tax-stats-migration-data/LinkedData'
translate = {'^oeg':'oregon','^az':'arizona','^DiCo':'Distr','^dico':'Distr','^vrg':'Virginia','^Miso':'Misso',\
'^miso':'Misso','^wsc':'Wiscon', '^wiso':'Wiscon','^RhIs':'Rhode','^aka':'arkan','^arkas':'arkan'\
fileNameList = os.listdir(dataDir)
p = re.compile('\d+')
q = re.compile('^[0-9]{4}')
key = None
stateDict = StateDict(postAbb)
data = {}
oddFormat = False
for name in fileNameList:
if not'.xls$',name): continue
if oddFormat = True
else: oddFormat = False
nameTokens = p.split(name)
token = "^"+nameTokens[0]
failed = True
key = None
for i, state in enumerate(postAbb.iloc[:,0]):
if oddFormat:
result =,state,flags=re.I)
if result is not None: key = state
if result is None:
state2 = re.sub(' ','',state)
result =,state2,flags=re.I)
key = state
if result is None:
state2 = re.sub('West ','W',state)
result =,state2,flags=re.I)
key = state
if result is None:
state2 = re.sub('West ','We',state)
result =,state2,flags=re.I)
key = state
if result is None:
state2 = re.sub('South ','S',state)
result =,state2,flags=re.I)
key = state
if result is None:
state2 = re.sub('South ','So',state)
result =,state2,flags=re.I)
key = state
if result is None:
state2 = re.sub('North ','no',state)
result =,state2,flags=re.I)
key = state
if result is None:
state2 = re.sub('North ','n',state)
result =,state2,flags=re.I)
key = state
if result is None:
state2 = re.sub('New ','ne',state)
result =,state2,flags=re.I)
key = state
if result is None and translate.get(token) is not None:
result =[token],state,flags=re.I)
key = state
if result is not None: break
if not oddFormat and result is None:
result =,country[0],flags=re.I)
key = country[0]
if not oddFormat and result is None:
result =,country[1],flags=re.I)
key = country[1]
if not oddFormat and result:
failed = False
if not oddFormat and failed:
name1 = re.sub("^"+nameTokens[0],'',name)
inOrout = '' if oddFormat else InOrOut(name)
year = re.sub(nameTokens[-1]+"$",'',name1)
year = name[2:4] if oddFormat else MapYear(year)
if oddFormat: key = HandleOddFormatName(name,stateDict)
data[key+inOrout+":"+year] = LoadFileToDF(dataDir, name)
def LoadFileToDF(dirName, fileName):
DF = pd.read_excel(dirName+"/"+fileName)
if DF.shape[1] not in [6,7,8]: raise ValueError("bad column size")
DF[DF==''] = np.NaN
naCols = DF.isnull().all(axis=0)
DF = DF.loc[:,~naCols]
redCol = [t for t in DF.columns if not DF[t].str.contains('PERCENT',case=False).any()]
DF = DF[redCol]
columns = DF.columns
gross = [idx for idx, t in enumerate(DF.columns) if \
(DF[t].str.contains('INCOME',case=False)[:8].any() and DF[t].str.contains('AGGREGATE',case=False)[:8].any())\
or (DF[t].str.contains('INCOME',case=False)[:8].any() and DF[t].str.contains('MONEY',case=False)[:8].any())]
if len(gross)>0:
if idx==0: raise ValueError("")
DF = DF.iloc[:,idx-3:idx+1]
DF.columns = ['Name','Num Returns','Num Exempt','AAGI']
DF = DF.iloc[:,-3:]
DF.columns = ['Name','Num Returns','Num Exempt']
missingRow = DF.isnull().any(axis=1)
DF = DF.loc[~missingRow,:]
def InOrOut(name):
token = name[-7:-4].lower()
token = re.sub('[0-9]+','',token)
if'i|n',token): return(':IN')
elif'o',token): return(':OUT')
raise ValueError(name+' makes me confused')
def HandleOddFormatName(name, stateDict):
name1 = name[-9:-4]
stateAbb = name1[-2:].upper()
inOrout = name1[:3].upper()
if (inOrout == 'GIN'): inOrout = 'IN'
stateName = stateDict[stateAbb]
def StateDict(postAbb):
stateNames = postAbb.iloc[:,0]
stateAbbs = postAbb.iloc[:,1]
ans = dict()
for idx, abb in enumerate(stateAbbs):
ans[abb] = stateNames[idx]
def MapYear(year):
if len(year) == 2:
elif len(year) == 4:
year = year[2:4]
if len(year) not in [2,4]:
#956 -> 95-96
#8 -> 98
if year == '8': year = '98'
elif year == '956': year = '96'
else: print(year)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.random as random
# Plot the descending sorted SVD singular values
def PlotSingularValues(singularValues,key='avg',logOdds=True):
token = ' Log Odd ' if logOdds else 'Original'
numStates = singularValues[key].shape[0]
plt.xlabel("Descending Sorted Order")
plt.ylabel("Singular Values of %s Migration Prob. SVD" %(token))
# Plot the SVD descending sorted singular values from a random matrix
def PlotRandomSVD(nrow=51,ncol=51,std=1,dist='normal'):
if dist=='normal':
Q = random.normal(0,std,nrow * ncol).reshape(nrow,ncol)
elif dist=='chisq':
Q = random.chisquare(std,nrow * ncol).reshape(nrow,ncol)
else: raise ValueError(dist + " Unknown dist choice")
U,S,V = np.linalg.svd(np.matrix(Q),full_matrices=True)
plt.xlabel('Descending Sorted Order')
plt.ylabel('%s Random Eigen Values'%(dist.upper()))
# Plot the average of L1 norm of the matrix entries
def PlotMeanResiduals(residuals):
N = residuals['avg'].shape[0]
x = np.arange(N).reshape(1,-1)
y = np.arange(N).reshape(-1,1)
years = list(residuals.keys())
years = [y for y in years if y != 'avg']
mL1 = np.zeros(len(years))
l1 = 0.0
for yIdx, key in enumerate(years):
if key == 'avg': continue
value = residuals[key].as_matrix()
mL1[yIdx] = np.mean(np.abs(value[x!=y]))
l1 += np.abs(value[x!=y])
years = np.array([int(y) for y in years])
plt.plot(years, mL1)
#Plot the collection of the correlations values between the K(K-1) residuals of different years
def PlotTSCorrDist(X,ResFct,Time=21):
if X.shape[0] != 0.5*Time*(Time-1): raise ValueError("X size not compatible with Time variable")
plt.xlabel('Cosine values of the residuals from different years')
plt.ylabel('Distribution Density')
plt.title('Correlation Value KDE Plot of %d degree of freedom' %(ResFct*(ResFct-1)))
Q = random.normal(0,1,ResFct*(ResFct-1)*Time).reshape(Time,-1)
A = np.matrix(Q) * np.matrix(Q).transpose()
A = A/np.sqrt(np.diag(A).reshape(1,-1)*np.diag(A).reshape(-1,1))
x = np.arange(Time).reshape(-1,1)
y = np.arange(Time).reshape(1,-1)
#Plot the ratio of cumulative lambda*2 covered by the truncated SVD
def PlotRSquare(singularValues,key='avg', logOdds=True):
eigen = singularValues.get(key)
if eigen is None: raise ValueError("Invalid key "+key+"!")
X = eigen
z = np.cumsum(X**2)/np.sum(X**2)
token = ' Log Odds ' if logOdds else ''
plt.title(key+" Migration Prob. "+token+" Matrix T-SVD R^2 Plot")
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_original/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_1/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_2/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_3/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_4/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_7/avg.csv')
avgME = read.csv('/Users/ikovsky/r_wd/data/MigrationME/MM_ME.csv',header=T)
ggplot(avg_long, aes(x=factor(US.State.), y=factor(US.State.2),
fill=probability)) + geom_tile() + scale_fill_gradientn(
colors=c('black','dark blue', 'blue','red','white'
)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Out of State') + ylab(
'Into State')+labs(fill='prob. in bps')
ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation),
fill=ML1)) + geom_tile() + scale_fill_gradientn(
colors=c('black','dark blue', 'blue','red','white'
)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Years') + ylab(
'SVD Truncation')+labs(fill='mean L1 error')
ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation),
fill=ML1)) + geom_tile() + scale_fill_gradientn(
colors=c('black','dark blue', 'blue','red','white'
)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Years') + ylab(
'SVD Truncation')+labs(fill='root mean squared error')
names(topTable) = c('Out Of','Into','Probability in bps')
ss<-tableGrob(topTable, theme=ttheme_default(base_size=9))
unique(topTable$'Out of')
rootDir = /Users/ikovsky/python_data/IRS
search =
findAllTokens =
excludeFile = pdf
includeFile = xls, zip
ignore1 = notice, work, response, appeal, phish, advocate, help, rights, treasury, chinese, korean, contact, us
ignore2 = identity, fear, freedom, privacy,Accessibility,filing,payments, refunds, vietnamese,spanish,FollowLinks
ignore3 = laws,shares, pros, wire, navigation, share, form, news
ignore = %(ignore1)s, %(ignore2)s, %(ignore3)s
followNumNodes = 1
prePend = 0
nodes_nickName =
logLevel = 2
url = ''
search = migration, wealth, individual, personal
loglevel = 2
findAllTokens = table, tr, td
excludeFile = pdf
followNumNodes = 5
[Personal Wealth]
search = Wealthholders
loglevel = 2
findAllTokens =
excludeFile = pdf
followNumNodes = 5
[Individual Income Tax]
search = wealth, state, zip, income, migration, estate
loglevel = 2
findAllTokens =
excludeFile = pdf, report
followNumNodes = 5
findAllTokens =
followNumNodes = 50
search = ^[12][0-9]{3}
prePend = 1
nodes_nickName = YYYY to YYYY+1
logLevel = 1
[YYYY to YYYY+1]
search = state, county
from bs4 import BeautifulSoup
from collections import OrderedDict
import urllib
import requests
import re,configparser
import argparse
# The main entry point of this module
def ScrapeURL(cfgFile='./ScrapeURL.cfg'):
cfg = configparser.ConfigParser()
url = cfg.get('ScrapeURL','url')
linkObj = LinkObj(cfg, name='TAX-STATS',url=url,ancesters=[])
# The Node Object abstracting a link, storing the related information related to scraping
# The program will follow the instruction from the user and follow the link to scrape/download
# the files at the terminal nodes.
# To avoid scraping too much garbage, we need to design letting the user decide what key words
# of interest to follow, what to avoid
class LinkObj(object):
def __init__(self,cfg,name,url,ancesters=[],nodes_nickName=''):
self.cfg = cfg
self.nameSpace = ''
x = str.split(name,'/')
if len(x)>1: = x[-1]
self.nameSpace = x[0]
else: = name
if nodes_nickName is not None and nodes_nickName != '':
self.nickName = nodes_nickName
else: self.nickName =
if not self.cfg.has_section(self.nickName):self.cfg.add_section(self.nickName)
self.url = url
self.prePend = bool(int(self.cfg.get(self.nickName,'prePend')))
self.ancesters = ancesters
self.linkDict = OrderedDict()
x=str.split(re.sub(' ','',self.cfg.get(self.nickName, 'search')),',')
self.findAllTokens = str.split(re.sub(' ','',self.cfg.get(self.nickName,'findAllTokens')),',')
self.findAllTokens = [x for x in self.findAllTokens if x!='']
self.searchToken = '|'.join(x)
ignore = str.split(re.sub(' ','',self.cfg.get(self.nickName,'ignore')),',')
self.ignore = '|'.join(ignore)
self.nodes = []
self.nodes_nickName = self.cfg.get(self.nickName,'nodes_nickName')
self.excludeFile = str.split(re.sub(' ','',self.cfg.get(self.nickName, 'excludeFile')),',')
self.excludeFile = [x for x in self.excludeFile if x!='']
self.excludeFile = "|".join([t+"$" for t in self.excludeFile])
self.followNumNodes = int(self.cfg.get(self.nickName,'followNumNodes'))
self.includeFile = str.split(re.sub(' ','',self.cfg.get(self.nickName, 'includeFile')),',')
self.includeFile = [x for x in self.includeFile if x!='']
self.includeFile = "|".join([t+"$" for t in self.includeFile])
self.rootDir = self.cfg.get(self.nickName,'rootDir')
self.terminal = bool(,self.url))
self.logLevel = int(self.cfg.get(self.nickName,'loglevel'))
def ScrapeLinks(self):
if self.terminal:
if self.logLevel == 1: print(" ScrapeLinks")
r = urllib.request.urlopen(self.url)
soup = BeautifulSoup(r,'lxml')
A = soup.find_all('a',href=True)
def FollowLinks(self):
if self.terminal: return
followNumNodes = self.followNumNodes if self.followNumNodes>=0 else len(self.nodes)
print( + " FollowLinks")
count = 0
for token, url in self.linkDict.items():
if count >= self.followNumNodes: break
def IterFindAll(self, X, level):
if self.logLevel==1: print( + " IterFindAll")
depth = len(self.findAllTokens)
if (depth>=level+1):
Y = X.find_all(self.findAllTokens[level])
for y in Y:
Y = X.find_all(['a','b'])
#Y = X.find_all('a',href=True)
strong = ''
for a in Y:
text = a.string
if a.string is not None and'^[A-Za-z]+[\w_\- ]*[\w]',a.string):
strong = a.string
elif a.get('href') is not None: link = a['href']
else: continue
if text is None: text = str.split(link,'/')[-1]
if self.searchToken=='':
if self.logLevel == 1: print( + "no search Token")
elif (len(self.excludeFile)>0 and,link,flags=re.I|re.X)): continue
if strong != '' and self.prePend: text = strong + "/" + text
self.linkDict[text] = urllib.parse.urljoin(self.url,link)
def PrintResultSet(self,A):
if self.logLevel > 1: return
for item in A:
link = item['href']
text = str.split(link,'/')[-1] if item.string is None else item.string
if,text,flags=re.I|re.X): continue
if,link,flags=re.I|re.X): continue
def DownLoadLink(self):
path=self.rootDir + "/" + "/".join(self.ancesters) + "/" + self.nameSpace+"/""/"
fileName = self.url.split('/')[-1]
if not,fileName): return
import os
if not os.path.exists(path):
elif os.path.exists(path+fileName):
print(path+fileName+" exists!")
if __name__ == "__main__":
parser = argparse.ArgumentParser('use ScrapeURL to scrape and download a tree of links')
parser.add_argument('-cfg',type=str,help='command line option to control the ScrapeURL by passing a config fileName')
args = parser.parse_args()
if args.cfg is not None:
import pandas as pd
import numpy as np
import os
#The main function to process a dictionary of Migration probability Matrices
#and return the truncated SVD, the singular values, log-odd domain residuals, the residuals
#in the sigmoid transformed (original) domain.
# numFct controls the cut off.
def SVD_MM(MM,logOdds=True,numFct=4):
MM2,states = ReturnAvg(MM)
if logOdds: MM_logOdds = GetLogOdds(MM2)
ans,singularValues,residuals,residuals2 = \
GetTSVD(MM2 if not logOdds else MM_logOdds,logOdds=logOdds,truncated=numFct)
ans = DressUpToDF(ans,states)
residuals = DressUpToDF(residuals,states)
#Save the end result, pandas data frames, of the TSVD to csv files
def SaveDF2CSV(ans,dirName='/Users/ikovsky/r_wd/data/MigrationData/'):
if not os.path.isdir(dirName): raise IOError(dirName+" not a valid directory name")
for key, value in ans.items():
fileName = dirName+"/"+key + ".csv"
# Compute the average MM of the whole 1991-2011 period
def ReturnAvg(MM):
MM2 = {}
avg = 0
colNames = []
for key, value in MM.items():
avg += value.as_matrix()
MM2[key] = value.as_matrix()
if len(colNames)==0: colNames = value.columns
MM2['avg'] = avg/len(MM.keys())
# Perform the log odd transformation, Notice that
# our original MMs are scaled up by 1e4=10000 because the percentages of
# people migrating are very tiny.
def GetLogOdds(MM2):
ans = {}
x = np.arange(MM2['avg'].shape[0]).reshape(1,-1)
y = x.reshape((-1,1))
dia = x==y
for key, value in MM2.items():
T = np.log(1e-4*value/(1-1e-4*value)) # we had scale up the transfer matrix by 1e4 for the ease of
# visualization, we need to scale it back.
T[dia] = 0.0
T[np.isinf(T)] = np.log(1e-5)
ans[key] = T
# Get the truncated SVD
def GetTSVD(MM,truncated=None,logOdds=True):
ans = {}
residuals = {}
residuals2 = {}
singularValues = {}
x = np.arange(MM['avg'].shape[0]).reshape(1,-1)
y = x.reshape((-1,1))
dia = x==y
for key, value in MM.items():
singularValues[key] = S.copy()
S1 = S.copy()
S2 = S.copy()
S1[truncated:] = 0
S2[:truncated] = 0
Lambda1 = np.diag(S1)
A = np.matrix(U)*np.matrix(Lambda1)*np.matrix(V)
Lambda2 = np.diag(S2)
B = np.matrix(U)*np.matrix(Lambda2)*np.matrix(V)
B[dia] = 0
residuals[key] = B
if logOdds:
A = 1e4/(1+np.exp(-A))
A[dia] = 0
ans[key] = A
rA = 1e4/(1+np.exp(-value))-A
rA[dia] = 0
residuals2[key] = pd.DataFrame(rA)
# In the SVD computation, we need to convert the Data Frames into numpy matrices
# For display or for storage into csv, we need to convert them back to DataFrame
# This is the funtionality of this function
def DressUpToDF(MMDict,states):
ans = {}
for key, value in MMDict.items():
ans[key] = pd.DataFrame(value,index=states,columns=states)
# Given a dictionary of the numpy matrices of the same size, extract the
# off diagonals and collect them into a 1D numpy array
def PickUpOffDiag(MMDict):
K = MMDict['avg'].shape[0]
x = np.arange(K).reshape(-1,1)
y = np.arange(K).reshape(1,-1)
offDiag = x != y
collection = []
for key, value in MMDict.items():
if key == 'avg': continue
# Check if there is evidence against the IID assumption
def CheckIID(MMDict,numFct=4,Time=False,cov=False):
years = list(MMDict.keys())
K = MMDict['avg'].shape[0]-numFct
x = np.arange(K).reshape(-1,1)
y = np.arange(K).reshape(1,-1)
offDiag = x != y
u = np.arange(K*(K-1)).reshape(-1,1)
v = np.arange(K*(K-1)).reshape(1,-1)
upperDiag1 = u > v
T = len(years)-1
a = np.arange(T).reshape(-1,1)
b = np.arange(T).reshape(1,-1)
upperDiag2 = a > b
resMatrix = np.zeros((np.sum(offDiag),T))
for idx, key in enumerate(years):
if key == 'avg': continue
value = MMDict[key].as_matrix()
R = np.matrix(U).transpose()*value*np.matrix(V).transpose()
R = R[numFct:,numFct:]
resMatrix[:,idx] = np.array(R[offDiag]).flatten()
if Time is False:
X = (np.matrix(resMatrix) * np.matrix(resMatrix).transpose())
if not cov:
X = np.array(X)/np.sqrt(np.diag(X).reshape(1,-1)*np.diag(X).reshape(-1,1))
else: X = np.array(X)/T
X = X[upperDiag1]
#Z = np.matrix(resMatrix-np.mean(resMatrix,axis=0).reshape(1,-1))
Z = np.matrix(resMatrix)
X = np.array(Z.transpose() * Z)
if not cov:
X = np.array(X)/np.sqrt(np.diag(X).reshape(1,-1)*np.diag(X).reshape(-1,1))
X = np.array(X)/Z.shape[0]
X = X[upperDiag2]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment