ikovsky/ConstructMMDict.py Secret

## ConstructMMDict.py
import pandas as pd
import numpy  as np
import string

#MM means Migration Probability Matrix
#In this utility function we input the raw migration pandas dataframes as an ordered dict
#Then we normalize by the non-immigrant tax population to get the migration probability matrices
def ConstructMM(dataDict, outputByYear=False):

    postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv'

    postAbb = pd.read_csv(postAbbFile,sep='\t')
    numStates  = postAbb.shape[0]
    stateNames = postAbb.iloc[:,0]

    years      = [str(yyyy) for yyyy in range(1991,2012)]


    non_migrants = {}
    yearlyMM = {}

    for year in years:

        yearlyMM[year] = pd.DataFrame(np.zeros((numStates,numStates)))
        yearlyMM[year].columns = postAbb.iloc[:,0]
        yearlyMM[year].index   = postAbb.iloc[:,0]
        non_migrants[year] = pd.DataFrame(np.zeros((numStates,1)),index=postAbb.iloc[:,0])

    for key, DF in dataDict.items():

        region, inOrout, yy = key.split(':')
        if inOrout=='OUT': continue
        if sum(stateNames.isin([region]))<1: continue

        if yy[0]=='9': year = '19'+yy
        else:          year = '20'+yy

        DF = DF.set_index(['Name'])

        hasTotal = DF.index.str.contains('Non-Migrant',case=False)
        rowIdxes = np.arange(DF.shape[0])[hasTotal]

        if len(rowIdxes)<1:raise ValueError("no non-imigrant row")

        non_migrants[year].ix[region,0] = DF.iloc[rowIdxes[0],0]


    for key, DF in dataDict.items():

        region, inOrout, yy = key.split(':')

        if inOrout=='IN': continue
        if sum(stateNames.isin([region]))<1:
            continue


        if yy[0]=='9': year = '19'+yy
        else:          year = '20'+yy

        DF = DF.set_index(['Name'])

        #hasTotal = DF.index.str.contains('Non-Migrant',case=False)
        #rowIdxes = np.arange(DF.shape[0])[hasTotal]

        #if len(rowIdxes)<1:raise ValueError("no non-imigrant row")

        for state in stateNames:

            if state==region:continue
            stateLoc = DF.index.str.contains(state,case=False)
            if sum(stateLoc)<1: continue
            value = DF[stateLoc].ix[0,0]
            if isinstance(value,str): continue
            meanPopulation = 0.5 * (non_migrants[year].ix[region,0] + non_migrants[year].ix[state,0])
            yearlyMM[year].ix[region, state] = value*1.0/meanPopulation*10000


    return yearlyMM

## DataCleaning.py
import pandas as pd
import numpy as np
import os, re, sys
# The state2state migration data consists of 2000+ small files of 20+ years of yearly
# state 2 state migration data (tracked by the tax returns filed to IRS).
# The data is very dirty in that the format of each file is not always the same, the state
#abbreviations are named almost semi-randomly.
#We put the soft links of these 2000+ files into a single directory.
#Then we load them, mapping to the state abbreviations back to the state names and store into
#pandas data frames.

def LoadMigration():

    postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv'
    postAbb     = pd.read_csv(postAbbFile,sep='\t')
    country     =['United States','ForeignOverseas']
    dataDir     = '/Users/ikovsky/python_data/IRS/TAX-STATS/soi-tax-stats-migration-data/LinkedData'

    translate   = {'^oeg':'oregon','^az':'arizona','^DiCo':'Distr','^dico':'Distr','^vrg':'Virginia','^Miso':'Misso',\
    '^miso':'Misso','^wsc':'Wiscon', '^wiso':'Wiscon','^RhIs':'Rhode','^aka':'arkan','^arkas':'arkan'\
    ,'^Misi':'missi','^nhio':'ohio','^nrbt':'nebra'}

    fileNameList = os.listdir(dataDir)
    p = re.compile('\d+')
    q = re.compile('^[0-9]{4}')
    key = None
    stateDict = StateDict(postAbb)
    data = {}
    oddFormat = False

    for name in fileNameList:

             if not re.search('.xls$',name): continue

             if q.search(name): oddFormat = True
             else:  oddFormat = False
             nameTokens = p.split(name)
             token  = "^"+nameTokens[0]

             failed = True
             key    = None

             for i, state in enumerate(postAbb.iloc[:,0]):

                    if oddFormat:
                                 break

                    result = re.search(token,state,flags=re.I)
                    if result is not None: key = state
                    if result is None:
                              state2 = re.sub(' ','',state)
                              result = re.search(token,state2,flags=re.I)
                              key    = state
                    if result is None:
                              state2 = re.sub('West ','W',state)
                              result = re.search(token,state2,flags=re.I)
                              key    = state
                    if result is None:
                              state2 = re.sub('West ','We',state)
                              result = re.search(token,state2,flags=re.I)
                              key    = state
                    if result is None:
                              state2 = re.sub('South ','S',state)
                              result = re.search(token,state2,flags=re.I)
                              key    = state
                    if result is None:
                              state2 = re.sub('South ','So',state)
                              result = re.search(token,state2,flags=re.I)
                              key    = state
                    if result is None:
                              state2 = re.sub('North ','no',state)
                              result = re.search(token,state2,flags=re.I)
                              key    = state
                    if result is None:
                              state2 = re.sub('North ','n',state)
                              result = re.search(token,state2,flags=re.I)
                              key    = state
                    if result is None:
                              state2 = re.sub('New ','ne',state)
                              result = re.search(token,state2,flags=re.I)
                              key    = state
                    if result is None and translate.get(token) is not None:
                              result = re.search(translate[token],state,flags=re.I)
                              key    = state
                    if result is not None: break


             if not oddFormat and result is None:
                         result = re.search(token,country[0],flags=re.I)
                         key    = country[0]
             if not oddFormat and result is None:
                         result = re.search(token,country[1],flags=re.I)
                         key    = country[1]

             if not oddFormat and result:
                       failed = False


             if not oddFormat and failed:
                      print(nameTokens+[name])
                      print(len(nameTokens[0]))

             name1 = re.sub("^"+nameTokens[0],'',name)
             inOrout = '' if oddFormat else InOrOut(name)
             year  = re.sub(nameTokens[-1]+"$",'',name1)
             year  = name[2:4] if oddFormat else MapYear(year)

             if oddFormat: key = HandleOddFormatName(name,stateDict)
             data[key+inOrout+":"+year] = LoadFileToDF(dataDir, name)

    return(data)

def LoadFileToDF(dirName, fileName):

    DF = pd.read_excel(dirName+"/"+fileName)
    if DF.shape[1] not in [6,7,8]: raise ValueError("bad column size")
    DF[DF==''] = np.NaN
    naCols = DF.isnull().all(axis=0)
    DF     = DF.loc[:,~naCols]
    redCol  = [t for t in DF.columns if not DF[t].str.contains('PERCENT',case=False).any()]
    DF      = DF[redCol]
    columns = DF.columns
    gross   = [idx for idx, t in enumerate(DF.columns) if \
    (DF[t].str.contains('INCOME',case=False)[:8].any() and DF[t].str.contains('AGGREGATE',case=False)[:8].any())\
    or (DF[t].str.contains('INCOME',case=False)[:8].any() and DF[t].str.contains('MONEY',case=False)[:8].any())]
    if len(gross)>0:
              idx=gross[-1]
              if idx==0: raise ValueError("")
              DF = DF.iloc[:,idx-3:idx+1]
              DF.columns = ['Name','Num Returns','Num Exempt','AAGI']
    else:
              DF = DF.iloc[:,-3:]
              DF.columns = ['Name','Num Returns','Num Exempt']
    missingRow = DF.isnull().any(axis=1)
    DF         = DF.loc[~missingRow,:]

    return(DF.apply(pd.to_numeric,errors='ignore'))

def InOrOut(name):

    token = name[-7:-4].lower()
    token = re.sub('[0-9]+','',token)
    if re.search('i|n',token): return(':IN')
    elif re.search('o',token): return(':OUT')
    raise ValueError(name+' makes me confused')

def HandleOddFormatName(name, stateDict):

    name1 = name[-9:-4]
    stateAbb  = name1[-2:].upper()
    inOrout   = name1[:3].upper()
    if (inOrout == 'GIN'): inOrout = 'IN'
    stateName = stateDict[stateAbb]
    return(stateName+":"+inOrout)

def StateDict(postAbb):

    stateNames = postAbb.iloc[:,0]
    stateAbbs  = postAbb.iloc[:,1]

    ans = dict()
    for idx, abb in enumerate(stateAbbs):
        ans[abb] = stateNames[idx]

    return(ans)


def MapYear(year):

             if len(year) == 2:
                           pass
             elif len(year) == 4:
                           year = year[2:4]
             if len(year) not in [2,4]:
                           #print(year)
                           #print(name)
                           #print(nameTokens)
                           #956 -> 95-96
                           #8   -> 98
                           if year == '8': year = '98'
                           elif year == '956': year = '96'
                           else: print(year)

             return(year)

## PlotMigration.py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.random as random

# Plot the descending sorted SVD singular values
def PlotSingularValues(singularValues,key='avg',logOdds=True):

    token = ' Log Odd ' if logOdds else 'Original'
    numStates = singularValues[key].shape[0]
    plt.plot(np.arange(numStates),singularValues[key])
    plt.xlabel("Descending Sorted Order")
    plt.ylabel("Singular Values of %s Migration Prob. SVD" %(token))

# Plot the SVD descending sorted singular values from a random matrix
def PlotRandomSVD(nrow=51,ncol=51,std=1,dist='normal'):

    if dist=='normal':
       Q = random.normal(0,std,nrow * ncol).reshape(nrow,ncol)
    elif dist=='chisq':
       Q = random.chisquare(std,nrow * ncol).reshape(nrow,ncol)
    else:  raise ValueError(dist + " Unknown dist choice")

    U,S,V = np.linalg.svd(np.matrix(Q),full_matrices=True)

    plt.plot(np.arange(S.shape[0]),S)

    plt.xlabel('Descending Sorted Order')
    plt.ylabel('%s Random Eigen Values'%(dist.upper()))

# Plot the average of L1 norm of the matrix entries
def PlotMeanResiduals(residuals):

    N = residuals['avg'].shape[0]
    x = np.arange(N).reshape(1,-1)
    y = np.arange(N).reshape(-1,1)

    years = list(residuals.keys())
    years = [y for y in years if y != 'avg']
    years.sort()
    mL1   = np.zeros(len(years))
    l1    = 0.0

    for yIdx, key in enumerate(years):

        if key == 'avg': continue
        value  = residuals[key].as_matrix()
        mL1[yIdx] = np.mean(np.abs(value[x!=y]))
        l1       += np.abs(value[x!=y])

    l1/=len(years)
    years = np.array([int(y) for y in years])
    plt.plot(years, mL1)

    return(l1.mean())

#Plot the collection of the correlations values between the K(K-1) residuals of different years
def PlotTSCorrDist(X,ResFct,Time=21):

    if X.shape[0] != 0.5*Time*(Time-1): raise ValueError("X size not compatible with Time variable")

    sns.kdeplot(X)
    plt.xlabel('Cosine values of the residuals from different years')
    plt.ylabel('Distribution Density')
    plt.title('Correlation Value KDE Plot of %d degree of freedom' %(ResFct*(ResFct-1)))
    Q = random.normal(0,1,ResFct*(ResFct-1)*Time).reshape(Time,-1)
    A = np.matrix(Q) * np.matrix(Q).transpose()
    A = A/np.sqrt(np.diag(A).reshape(1,-1)*np.diag(A).reshape(-1,1))
    x = np.arange(Time).reshape(-1,1)
    y = np.arange(Time).reshape(1,-1)
    sns.kdeplot(np.array(A)[x>y].flatten())

#Plot the ratio of cumulative lambda*2 covered by the truncated SVD
def PlotRSquare(singularValues,key='avg', logOdds=True):

    eigen = singularValues.get(key)
    if eigen is None: raise ValueError("Invalid key "+key+"!")
    X = eigen
    z = np.cumsum(X**2)/np.sum(X**2)
    plt.plot(np.arange(z.shape[0]),z)
    token = ' Log Odds ' if logOdds else ''
    plt.title(key+" Migration Prob. "+token+" Matrix T-SVD R^2 Plot")
    plt.grid(True)

## PlotMigrationHeatMap.R
library(reshape2)
library(gridExtra)

avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_original/avg.csv')

avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_1/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_2/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_3/avg.csv')
avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_4/avg.csv')

avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_7/avg.csv')

avgME = read.csv('/Users/ikovsky/r_wd/data/MigrationME/MM_ME.csv',header=T)
#rownames(avgME)<-avgME$X
#avgME$X<-NULL
avgME_long<-melt(avgME,id='X',variable.name='Truncation',value.name='ML1')

avg_long<-melt(avg,id=c('US.State.'),variable.name='US.State.2',value.name='probability')


ggplot(avg_long, aes(x=factor(US.State.), y=factor(US.State.2),
                     fill=probability)) + geom_tile() + scale_fill_gradientn(
                       colors=c('black','dark blue', 'blue','red','white'
                       )) + theme(axis.text.x = element_text(angle = 90)) + xlab('Out of State') + ylab(
                         'Into State')+labs(fill='prob. in bps')

ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation),
                     fill=ML1)) + geom_tile() + scale_fill_gradientn(
                       colors=c('black','dark blue', 'blue','red','white'
                       )) + theme(axis.text.x = element_text(angle = 90)) + xlab('Years') + ylab(
                         'SVD Truncation')+labs(fill='mean L1 error')

ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation),
                       fill=ML1)) + geom_tile() + scale_fill_gradientn(
                         colors=c('black','dark blue', 'blue','red','white'
                         )) + theme(axis.text.x = element_text(angle = 90)) + xlab('Years') + ylab(
                           'SVD Truncation')+labs(fill='root mean squared error')


topTable=head(arrange(avg_long,desc(probability)),20)
names(topTable) = c('Out Of','Into','Probability in bps')
ss<-tableGrob(topTable, theme=ttheme_default(base_size=9))
grid.arrange(ss)

unique(topTable$'Into')
unique(topTable$'Out of')

## ScrapeURL.cfg
[DEFAULT]


rootDir        = /Users/ikovsky/python_data/IRS
search =
findAllTokens  =
excludeFile    = pdf
includeFile    = xls, zip
ignore1        =  notice, work, response, appeal, phish, advocate, help, rights, treasury, chinese, korean, contact, us
ignore2        =  identity, fear, freedom, privacy,Accessibility,filing,payments, refunds, vietnamese,spanish,FollowLinks
ignore3        =  laws,shares, pros, wire, navigation, share, form, news

ignore         = %(ignore1)s, %(ignore2)s, %(ignore3)s
followNumNodes = 1
prePend        = 0
nodes_nickName =
logLevel  = 2

[ScrapeURL]

url = 'https://www.irs.gov/uac/tax-stats'


[TAX-STATS]
search    = migration, wealth, individual, personal
loglevel  = 2
findAllTokens = table, tr, td
excludeFile = pdf
followNumNodes = 5

[Personal Wealth]

search = Wealthholders
loglevel  = 2
findAllTokens =
excludeFile = pdf
followNumNodes = 5

[Individual Income Tax]

search = wealth, state, zip, income, migration, estate
loglevel  = 2
findAllTokens =
excludeFile = pdf, report
followNumNodes = 5


[soi-tax-stats-migration-data]

findAllTokens =
followNumNodes = 50
search = ^[12][0-9]{3}
prePend        = 1
nodes_nickName = YYYY to YYYY+1
logLevel = 1

[YYYY to YYYY+1]

search = state, county

## ScrapeURL.py
from bs4 import BeautifulSoup
from collections import OrderedDict
import urllib
import requests
import re,configparser
import argparse

# The main entry point of this module
def ScrapeURL(cfgFile='./ScrapeURL.cfg'):

    cfg = configparser.ConfigParser()
    cfg.read(cfgFile)
    url = cfg.get('ScrapeURL','url')
    linkObj = LinkObj(cfg, name='TAX-STATS',url=url,ancesters=[])
    linkObj.ScrapeLinks()
    linkObj.FollowLinks()
    return(linkObj)

# The Node Object abstracting a link, storing the related information related to scraping
# The program will follow the instruction from the user and follow the link to scrape/download
# the files at the terminal nodes.
# To avoid scraping too much garbage, we need to design letting the user decide what key words
# of interest to follow, what to avoid
class LinkObj(object):

      def __init__(self,cfg,name,url,ancesters=[],nodes_nickName=''):

          self.cfg       = cfg
          self.nameSpace = ''
          x = str.split(name,'/')
          if len(x)>1:
             self.name = x[-1]
             self.nameSpace = x[0]
          else: self.name      = name
          if nodes_nickName is not None and nodes_nickName != '':
                   self.nickName = nodes_nickName
          else: self.nickName = self.name
          if not self.cfg.has_section(self.nickName):self.cfg.add_section(self.nickName)
          self.url       = url
          self.prePend   = bool(int(self.cfg.get(self.nickName,'prePend')))
          self.ancesters = ancesters
          self.linkDict = OrderedDict()
          x=str.split(re.sub(' ','',self.cfg.get(self.nickName, 'search')),',')
          self.findAllTokens = str.split(re.sub(' ','',self.cfg.get(self.nickName,'findAllTokens')),',')
          self.findAllTokens = [x for x in self.findAllTokens if x!='']
          self.searchToken = '|'.join(x)
          ignore = str.split(re.sub(' ','',self.cfg.get(self.nickName,'ignore')),',')
          self.ignore      = '|'.join(ignore)
          self.nodes       = []
          self.nodes_nickName = self.cfg.get(self.nickName,'nodes_nickName')
          self.excludeFile = str.split(re.sub(' ','',self.cfg.get(self.nickName, 'excludeFile')),',')
          self.excludeFile = [x for x in self.excludeFile if x!='']
          self.excludeFile = "|".join([t+"$" for t in self.excludeFile])
          self.followNumNodes = int(self.cfg.get(self.nickName,'followNumNodes'))
          self.includeFile = str.split(re.sub(' ','',self.cfg.get(self.nickName, 'includeFile')),',')
          self.includeFile = [x for x in self.includeFile if x!='']
          self.includeFile = "|".join([t+"$" for t in self.includeFile])
          self.rootDir     = self.cfg.get(self.nickName,'rootDir')
          self.terminal    = bool(re.search(self.includeFile,self.url))
          self.logLevel    = int(self.cfg.get(self.nickName,'loglevel'))


      def ScrapeLinks(self):

          if self.terminal:
                    self.DownLoadLink()
                    return
          if self.logLevel == 1: print(self.name+" ScrapeLinks")
          r = urllib.request.urlopen(self.url)
          soup  = BeautifulSoup(r,'lxml')
          A     = soup.find_all('a',href=True)
          self.PrintResultSet(A)
          self.IterFindAll(soup,level=0)

      def FollowLinks(self):

          if self.terminal: return
          followNumNodes = self.followNumNodes if self.followNumNodes>=0 else len(self.nodes)
          print(self.name + " FollowLinks")
          count = 0
          for token, url in self.linkDict.items():
              if count >= self.followNumNodes: break
              self.nodes.append(LinkObj(self.cfg,name=token,url=url,ancesters=self.ancesters+[self.name],nodes_nickName=self.nodes_nickName))
              self.nodes[-1].ScrapeLinks()
              self.nodes[-1].FollowLinks()
              count+=1


      def IterFindAll(self, X, level):

          if self.logLevel==1: print(self.name + " IterFindAll")
          depth = len(self.findAllTokens)
          if (depth>=level+1):
             Y = X.find_all(self.findAllTokens[level])
             for y in Y:
                      self.IterFindAll(y,level+1)
          else:
             Y = X.find_all(['a','b'])
             #Y = X.find_all('a',href=True)
             strong = ''
             for a in Y:
                   text = a.string
                   if a.name=='b':
                                     if a.string is not None and re.search('^[A-Za-z]+[\w_\- ]*[\w]',a.string):
                                           strong = a.string
                                     #print(strong)
                                     continue
                   elif a.get('href') is not None: link = a['href']
                   else: continue
                   if text is None:  text = str.split(link,'/')[-1]
                   if self.searchToken=='':
                                        if self.logLevel == 1: print(self.name + "no search Token")
                                        continue
                   elif (len(self.excludeFile)>0 and re.search(self.excludeFile,link,flags=re.I|re.X)): continue
                   elif re.search(self.searchToken,text,flags=re.I|re.X):
                                        if strong != '' and self.prePend: text = strong + "/" + text
                                        self.linkDict[text] = urllib.parse.urljoin(self.url,link)


      def PrintResultSet(self,A):

            if self.logLevel > 1: return
            for item in A:
                link = item['href']
                text = str.split(link,'/')[-1] if item.string is None else item.string
                if re.search(self.ignore,text,flags=re.I|re.X): continue
                if re.search(self.ignore,link,flags=re.I|re.X): continue
                print(self.name+"<<"+text+":"+link)

      def DownLoadLink(self):

          path=self.rootDir + "/" + "/".join(self.ancesters) + "/" + self.nameSpace+"/"+self.name+"/"
          fileName = self.url.split('/')[-1]
          if not re.search(self.searchToken,fileName): return
          print(path+fileName)
          import os
          if not os.path.exists(path):
                 os.makedirs(path)
          elif os.path.exists(path+fileName):
                print(path+fileName+" exists!")
                return
          urllib.request.urlretrieve(self.url,path+fileName)

if __name__ == "__main__":

   parser = argparse.ArgumentParser('use ScrapeURL to scrape and download a tree of links')
   parser.add_argument('-cfg',type=str,help='command line option to control the ScrapeURL by passing a config fileName')
   args  = parser.parse_args()
   if args.cfg is not None:
           ScrapeURL(cfgFile=args.cfg)

## SVD_MM.py
import pandas as pd
import numpy  as np
import os

#The main function to process a dictionary of Migration probability Matrices
#and return the truncated SVD, the singular values, log-odd domain residuals, the residuals
#in the sigmoid transformed (original) domain.
# numFct controls the cut off.

def SVD_MM(MM,logOdds=True,numFct=4):

    MM2,states    = ReturnAvg(MM)

    if logOdds:   MM_logOdds = GetLogOdds(MM2)


    ans,singularValues,residuals,residuals2 = \
    GetTSVD(MM2 if not logOdds else MM_logOdds,logOdds=logOdds,truncated=numFct)
    ans       = DressUpToDF(ans,states)
    residuals = DressUpToDF(residuals,states)
    return(ans,singularValues,residuals,residuals2)

#Save the end result, pandas data frames, of the TSVD to csv files
def SaveDF2CSV(ans,dirName='/Users/ikovsky/r_wd/data/MigrationData/'):

    if not os.path.isdir(dirName): raise IOError(dirName+" not a valid directory name")
    for key, value in ans.items():

        fileName = dirName+"/"+key + ".csv"
        value.to_csv(fileName)

# Compute the average MM of the whole 1991-2011 period
def ReturnAvg(MM):

    MM2 = {}
    avg = 0
    colNames = []
    for key, value in MM.items():
        avg += value.as_matrix()
        MM2[key] = value.as_matrix()
        if len(colNames)==0: colNames = value.columns

    MM2['avg'] = avg/len(MM.keys())
    return(MM2,colNames)


# Perform the log odd transformation, Notice that
# our original MMs are scaled up by 1e4=10000 because the percentages of
# people migrating are very tiny.
def GetLogOdds(MM2):

    ans = {}
    x   = np.arange(MM2['avg'].shape[0]).reshape(1,-1)
    y   = x.reshape((-1,1))
    dia = x==y

    for key, value in MM2.items():

        T = np.log(1e-4*value/(1-1e-4*value)) # we had scale up the transfer matrix by 1e4 for the ease of
        # visualization, we need to scale it back.
        T[dia] = 0.0
        T[np.isinf(T)] = np.log(1e-5)
        ans[key] = T

    return(ans)

# Get the truncated SVD
def GetTSVD(MM,truncated=None,logOdds=True):

    ans         = {}
    residuals   = {}
    residuals2  = {}
    singularValues = {}

    x   = np.arange(MM['avg'].shape[0]).reshape(1,-1)
    y   = x.reshape((-1,1))
    dia = x==y

    for key, value in MM.items():

        U,S,V=np.linalg.svd(value,full_matrices=True)
        singularValues[key] = S.copy()
        S1               = S.copy()
        S2               = S.copy()
        S1[truncated:]   = 0
        S2[:truncated]   = 0
        Lambda1          = np.diag(S1)
        A = np.matrix(U)*np.matrix(Lambda1)*np.matrix(V)
        Lambda2          = np.diag(S2)
        B                = np.matrix(U)*np.matrix(Lambda2)*np.matrix(V)
        B[dia]           = 0
        residuals[key]   = B

        if logOdds:
           A = 1e4/(1+np.exp(-A))
        A[dia]   = 0
        ans[key] = A
        rA       = 1e4/(1+np.exp(-value))-A
        rA[dia]  = 0
        residuals2[key]  = pd.DataFrame(rA)

    return(ans,singularValues,residuals,residuals2)

# In the SVD computation, we need to convert the Data Frames into numpy matrices
# For display or for storage into csv, we need to convert them back to DataFrame
# This is the funtionality of this function
def DressUpToDF(MMDict,states):

    ans = {}

    for key, value in MMDict.items():

        ans[key] = pd.DataFrame(value,index=states,columns=states)

    return(ans)

# Given a dictionary of the numpy matrices of the same size, extract the
# off diagonals and collect them into a 1D numpy array
def PickUpOffDiag(MMDict):

    K = MMDict['avg'].shape[0]
    x = np.arange(K).reshape(-1,1)
    y = np.arange(K).reshape(1,-1)
    offDiag = x != y
    collection = []


    for key, value in MMDict.items():
             if key == 'avg': continue
             collection.extend(value.as_matrix()[offDiag].flatten())

    return(np.array(collection))

# Check if there is evidence against the IID assumption
def CheckIID(MMDict,numFct=4,Time=False,cov=False):


    years = list(MMDict.keys())
    years.sort()
    K = MMDict['avg'].shape[0]-numFct
    U,S,V=np.linalg.svd(MMDict['avg'].as_matrix(),full_matrices=True)

    x = np.arange(K).reshape(-1,1)
    y = np.arange(K).reshape(1,-1)
    offDiag = x != y

    u = np.arange(K*(K-1)).reshape(-1,1)
    v = np.arange(K*(K-1)).reshape(1,-1)
    upperDiag1 = u > v

    T = len(years)-1
    a = np.arange(T).reshape(-1,1)
    b = np.arange(T).reshape(1,-1)
    upperDiag2 = a > b


    resMatrix = np.zeros((np.sum(offDiag),T))


    for idx, key in enumerate(years):

        if key == 'avg': continue
        value = MMDict[key].as_matrix()
        R     = np.matrix(U).transpose()*value*np.matrix(V).transpose()
        R     = R[numFct:,numFct:]
        resMatrix[:,idx] = np.array(R[offDiag]).flatten()

    if Time is False:
       X = (np.matrix(resMatrix) * np.matrix(resMatrix).transpose())
       if not cov:
          X = np.array(X)/np.sqrt(np.diag(X).reshape(1,-1)*np.diag(X).reshape(-1,1))
       else: X = np.array(X)/T
       X = X[upperDiag1]
    else:
       #Z = np.matrix(resMatrix-np.mean(resMatrix,axis=0).reshape(1,-1))
       Z = np.matrix(resMatrix)
       X = np.array(Z.transpose() * Z)
       if not cov:
          X = np.array(X)/np.sqrt(np.diag(X).reshape(1,-1)*np.diag(X).reshape(-1,1))
       else:
          X = np.array(X)/Z.shape[0]
       X = X[upperDiag2]
    return(np.array(X).flatten())
	import pandas as pd
	import numpy as np
	import string

	#MM means Migration Probability Matrix
	#In this utility function we input the raw migration pandas dataframes as an ordered dict
	#Then we normalize by the non-immigrant tax population to get the migration probability matrices
	def ConstructMM(dataDict, outputByYear=False):

	postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv'

	postAbb = pd.read_csv(postAbbFile,sep='\t')
	numStates = postAbb.shape[0]
	stateNames = postAbb.iloc[:,0]

	years = [str(yyyy) for yyyy in range(1991,2012)]



	non_migrants = {}
	yearlyMM = {}

	for year in years:

	yearlyMM[year] = pd.DataFrame(np.zeros((numStates,numStates)))
	yearlyMM[year].columns = postAbb.iloc[:,0]
	yearlyMM[year].index = postAbb.iloc[:,0]
	non_migrants[year] = pd.DataFrame(np.zeros((numStates,1)),index=postAbb.iloc[:,0])

	for key, DF in dataDict.items():

	region, inOrout, yy = key.split(':')
	if inOrout=='OUT': continue
	if sum(stateNames.isin([region]))<1: continue

	if yy[0]=='9': year = '19'+yy
	else: year = '20'+yy

	DF = DF.set_index(['Name'])

	hasTotal = DF.index.str.contains('Non-Migrant',case=False)
	rowIdxes = np.arange(DF.shape[0])[hasTotal]

	if len(rowIdxes)<1:raise ValueError("no non-imigrant row")

	non_migrants[year].ix[region,0] = DF.iloc[rowIdxes[0],0]





	for key, DF in dataDict.items():

	region, inOrout, yy = key.split(':')

	if inOrout=='IN': continue
	if sum(stateNames.isin([region]))<1:
	continue


	if yy[0]=='9': year = '19'+yy
	else: year = '20'+yy

	DF = DF.set_index(['Name'])

	#hasTotal = DF.index.str.contains('Non-Migrant',case=False)
	#rowIdxes = np.arange(DF.shape[0])[hasTotal]

	#if len(rowIdxes)<1:raise ValueError("no non-imigrant row")

	for state in stateNames:

	if state==region:continue
	stateLoc = DF.index.str.contains(state,case=False)
	if sum(stateLoc)<1: continue
	value = DF[stateLoc].ix[0,0]
	if isinstance(value,str): continue
	meanPopulation = 0.5 * (non_migrants[year].ix[region,0] + non_migrants[year].ix[state,0])
	yearlyMM[year].ix[region, state] = value1.0/meanPopulation10000



	return yearlyMM
	import pandas as pd
	import numpy as np
	import os, re, sys
	# The state2state migration data consists of 2000+ small files of 20+ years of yearly
	# state 2 state migration data (tracked by the tax returns filed to IRS).
	# The data is very dirty in that the format of each file is not always the same, the state
	#abbreviations are named almost semi-randomly.
	#We put the soft links of these 2000+ files into a single directory.
	#Then we load them, mapping to the state abbreviations back to the state names and store into
	#pandas data frames.

	def LoadMigration():

	postAbbFile = '/Users/ikovsky/r_wd/data/statePostCode.csv'
	postAbb = pd.read_csv(postAbbFile,sep='\t')
	country =['United States','ForeignOverseas']
	dataDir = '/Users/ikovsky/python_data/IRS/TAX-STATS/soi-tax-stats-migration-data/LinkedData'

	translate = {'^oeg':'oregon','^az':'arizona','^DiCo':'Distr','^dico':'Distr','^vrg':'Virginia','^Miso':'Misso',\
	'^miso':'Misso','^wsc':'Wiscon', '^wiso':'Wiscon','^RhIs':'Rhode','^aka':'arkan','^arkas':'arkan'\
	,'^Misi':'missi','^nhio':'ohio','^nrbt':'nebra'}

	fileNameList = os.listdir(dataDir)
	p = re.compile('\d+')
	q = re.compile('^[0-9]{4}')
	key = None
	stateDict = StateDict(postAbb)
	data = {}
	oddFormat = False

	for name in fileNameList:

	if not re.search('.xls$',name): continue

	if q.search(name): oddFormat = True
	else: oddFormat = False
	nameTokens = p.split(name)
	token = "^"+nameTokens[0]

	failed = True
	key = None

	for i, state in enumerate(postAbb.iloc[:,0]):

	if oddFormat:
	break

	result = re.search(token,state,flags=re.I)
	if result is not None: key = state
	if result is None:
	state2 = re.sub(' ','',state)
	result = re.search(token,state2,flags=re.I)
	key = state
	if result is None:
	state2 = re.sub('West ','W',state)
	result = re.search(token,state2,flags=re.I)
	key = state
	if result is None:
	state2 = re.sub('West ','We',state)
	result = re.search(token,state2,flags=re.I)
	key = state
	if result is None:
	state2 = re.sub('South ','S',state)
	result = re.search(token,state2,flags=re.I)
	key = state
	if result is None:
	state2 = re.sub('South ','So',state)
	result = re.search(token,state2,flags=re.I)
	key = state
	if result is None:
	state2 = re.sub('North ','no',state)
	result = re.search(token,state2,flags=re.I)
	key = state
	if result is None:
	state2 = re.sub('North ','n',state)
	result = re.search(token,state2,flags=re.I)
	key = state
	if result is None:
	state2 = re.sub('New ','ne',state)
	result = re.search(token,state2,flags=re.I)
	key = state
	if result is None and translate.get(token) is not None:
	result = re.search(translate[token],state,flags=re.I)
	key = state
	if result is not None: break


	if not oddFormat and result is None:
	result = re.search(token,country[0],flags=re.I)
	key = country[0]
	if not oddFormat and result is None:
	result = re.search(token,country[1],flags=re.I)
	key = country[1]

	if not oddFormat and result:
	failed = False


	if not oddFormat and failed:
	print(nameTokens+[name])
	print(len(nameTokens[0]))

	name1 = re.sub("^"+nameTokens[0],'',name)
	inOrout = '' if oddFormat else InOrOut(name)
	year = re.sub(nameTokens[-1]+"$",'',name1)
	year = name[2:4] if oddFormat else MapYear(year)

	if oddFormat: key = HandleOddFormatName(name,stateDict)
	data[key+inOrout+":"+year] = LoadFileToDF(dataDir, name)

	return(data)

	def LoadFileToDF(dirName, fileName):

	DF = pd.read_excel(dirName+"/"+fileName)
	if DF.shape[1] not in [6,7,8]: raise ValueError("bad column size")
	DF[DF==''] = np.NaN
	naCols = DF.isnull().all(axis=0)
	DF = DF.loc[:,~naCols]
	redCol = [t for t in DF.columns if not DF[t].str.contains('PERCENT',case=False).any()]
	DF = DF[redCol]
	columns = DF.columns
	gross = [idx for idx, t in enumerate(DF.columns) if \
	(DF[t].str.contains('INCOME',case=False)[:8].any() and DF[t].str.contains('AGGREGATE',case=False)[:8].any())\
	or (DF[t].str.contains('INCOME',case=False)[:8].any() and DF[t].str.contains('MONEY',case=False)[:8].any())]
	if len(gross)>0:
	idx=gross[-1]
	if idx==0: raise ValueError("")
	DF = DF.iloc[:,idx-3:idx+1]
	DF.columns = ['Name','Num Returns','Num Exempt','AAGI']
	else:
	DF = DF.iloc[:,-3:]
	DF.columns = ['Name','Num Returns','Num Exempt']
	missingRow = DF.isnull().any(axis=1)
	DF = DF.loc[~missingRow,:]

	return(DF.apply(pd.to_numeric,errors='ignore'))

	def InOrOut(name):

	token = name[-7:-4].lower()
	token = re.sub('[0-9]+','',token)
	if re.search('i\|n',token): return(':IN')
	elif re.search('o',token): return(':OUT')
	raise ValueError(name+' makes me confused')

	def HandleOddFormatName(name, stateDict):

	name1 = name[-9:-4]
	stateAbb = name1[-2:].upper()
	inOrout = name1[:3].upper()
	if (inOrout == 'GIN'): inOrout = 'IN'
	stateName = stateDict[stateAbb]
	return(stateName+":"+inOrout)

	def StateDict(postAbb):

	stateNames = postAbb.iloc[:,0]
	stateAbbs = postAbb.iloc[:,1]

	ans = dict()
	for idx, abb in enumerate(stateAbbs):
	ans[abb] = stateNames[idx]

	return(ans)


	def MapYear(year):

	if len(year) == 2:
	pass
	elif len(year) == 4:
	year = year[2:4]
	if len(year) not in [2,4]:
	#print(year)
	#print(name)
	#print(nameTokens)
	#956 -> 95-96
	#8 -> 98
	if year == '8': year = '98'
	elif year == '956': year = '96'
	else: print(year)

	return(year)
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import numpy.random as random

	# Plot the descending sorted SVD singular values
	def PlotSingularValues(singularValues,key='avg',logOdds=True):

	token = ' Log Odd ' if logOdds else 'Original'
	numStates = singularValues[key].shape[0]
	plt.plot(np.arange(numStates),singularValues[key])
	plt.xlabel("Descending Sorted Order")
	plt.ylabel("Singular Values of %s Migration Prob. SVD" %(token))

	# Plot the SVD descending sorted singular values from a random matrix
	def PlotRandomSVD(nrow=51,ncol=51,std=1,dist='normal'):

	if dist=='normal':
	Q = random.normal(0,std,nrow * ncol).reshape(nrow,ncol)
	elif dist=='chisq':
	Q = random.chisquare(std,nrow * ncol).reshape(nrow,ncol)
	else: raise ValueError(dist + " Unknown dist choice")

	U,S,V = np.linalg.svd(np.matrix(Q),full_matrices=True)

	plt.plot(np.arange(S.shape[0]),S)

	plt.xlabel('Descending Sorted Order')
	plt.ylabel('%s Random Eigen Values'%(dist.upper()))

	# Plot the average of L1 norm of the matrix entries
	def PlotMeanResiduals(residuals):

	N = residuals['avg'].shape[0]
	x = np.arange(N).reshape(1,-1)
	y = np.arange(N).reshape(-1,1)

	years = list(residuals.keys())
	years = [y for y in years if y != 'avg']
	years.sort()
	mL1 = np.zeros(len(years))
	l1 = 0.0

	for yIdx, key in enumerate(years):

	if key == 'avg': continue
	value = residuals[key].as_matrix()
	mL1[yIdx] = np.mean(np.abs(value[x!=y]))
	l1 += np.abs(value[x!=y])

	l1/=len(years)
	years = np.array([int(y) for y in years])
	plt.plot(years, mL1)

	return(l1.mean())

	#Plot the collection of the correlations values between the K(K-1) residuals of different years
	def PlotTSCorrDist(X,ResFct,Time=21):

	if X.shape[0] != 0.5Time(Time-1): raise ValueError("X size not compatible with Time variable")

	sns.kdeplot(X)
	plt.xlabel('Cosine values of the residuals from different years')
	plt.ylabel('Distribution Density')
	plt.title('Correlation Value KDE Plot of %d degree of freedom' %(ResFct*(ResFct-1)))
	Q = random.normal(0,1,ResFct(ResFct-1)Time).reshape(Time,-1)
	A = np.matrix(Q) * np.matrix(Q).transpose()
	A = A/np.sqrt(np.diag(A).reshape(1,-1)*np.diag(A).reshape(-1,1))
	x = np.arange(Time).reshape(-1,1)
	y = np.arange(Time).reshape(1,-1)
	sns.kdeplot(np.array(A)[x>y].flatten())

	#Plot the ratio of cumulative lambda*2 covered by the truncated SVD
	def PlotRSquare(singularValues,key='avg', logOdds=True):

	eigen = singularValues.get(key)
	if eigen is None: raise ValueError("Invalid key "+key+"!")
	X = eigen
	z = np.cumsum(X2)/np.sum(X2)
	plt.plot(np.arange(z.shape[0]),z)
	token = ' Log Odds ' if logOdds else ''
	plt.title(key+" Migration Prob. "+token+" Matrix T-SVD R^2 Plot")
	plt.grid(True)
	library(reshape2)
	library(gridExtra)

	avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_original/avg.csv')

	avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_1/avg.csv')
	avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_2/avg.csv')
	avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_3/avg.csv')
	avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_4/avg.csv')

	avg = read.csv('/Users/ikovsky/r_wd/data/MigrationData_7/avg.csv')

	avgME = read.csv('/Users/ikovsky/r_wd/data/MigrationME/MM_ME.csv',header=T)
	#rownames(avgME)<-avgME$X
	#avgME$X<-NULL
	avgME_long<-melt(avgME,id='X',variable.name='Truncation',value.name='ML1')

	avg_long<-melt(avg,id=c('US.State.'),variable.name='US.State.2',value.name='probability')


	ggplot(avg_long, aes(x=factor(US.State.), y=factor(US.State.2),
	fill=probability)) + geom_tile() + scale_fill_gradientn(
	colors=c('black','dark blue', 'blue','red','white'
	)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Out of State') + ylab(
	'Into State')+labs(fill='prob. in bps')

	ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation),
	fill=ML1)) + geom_tile() + scale_fill_gradientn(
	colors=c('black','dark blue', 'blue','red','white'
	)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Years') + ylab(
	'SVD Truncation')+labs(fill='mean L1 error')

	ggplot(avgME_long, aes(x=factor(X), y=factor(Truncation),
	fill=ML1)) + geom_tile() + scale_fill_gradientn(
	colors=c('black','dark blue', 'blue','red','white'
	)) + theme(axis.text.x = element_text(angle = 90)) + xlab('Years') + ylab(
	'SVD Truncation')+labs(fill='root mean squared error')


	topTable=head(arrange(avg_long,desc(probability)),20)
	names(topTable) = c('Out Of','Into','Probability in bps')
	ss<-tableGrob(topTable, theme=ttheme_default(base_size=9))
	grid.arrange(ss)

	unique(topTable$'Into')
	unique(topTable$'Out of')
	[DEFAULT]


	rootDir = /Users/ikovsky/python_data/IRS
	search =
	findAllTokens =
	excludeFile = pdf
	includeFile = xls, zip
	ignore1 = notice, work, response, appeal, phish, advocate, help, rights, treasury, chinese, korean, contact, us
	ignore2 = identity, fear, freedom, privacy,Accessibility,filing,payments, refunds, vietnamese,spanish,FollowLinks
	ignore3 = laws,shares, pros, wire, navigation, share, form, news

	ignore = %(ignore1)s, %(ignore2)s, %(ignore3)s
	followNumNodes = 1
	prePend = 0
	nodes_nickName =
	logLevel = 2

	[ScrapeURL]

	url = 'https://www.irs.gov/uac/tax-stats'


	[TAX-STATS]
	search = migration, wealth, individual, personal
	loglevel = 2
	findAllTokens = table, tr, td
	excludeFile = pdf
	followNumNodes = 5

	[Personal Wealth]

	search = Wealthholders
	loglevel = 2
	findAllTokens =
	excludeFile = pdf
	followNumNodes = 5

	[Individual Income Tax]

	search = wealth, state, zip, income, migration, estate
	loglevel = 2
	findAllTokens =
	excludeFile = pdf, report
	followNumNodes = 5


	[soi-tax-stats-migration-data]

	findAllTokens =
	followNumNodes = 50
	search = ^[12][0-9]{3}
	prePend = 1
	nodes_nickName = YYYY to YYYY+1
	logLevel = 1

	[YYYY to YYYY+1]

	search = state, county
	from bs4 import BeautifulSoup
	from collections import OrderedDict
	import urllib
	import requests
	import re,configparser
	import argparse

	# The main entry point of this module
	def ScrapeURL(cfgFile='./ScrapeURL.cfg'):

	cfg = configparser.ConfigParser()
	cfg.read(cfgFile)
	url = cfg.get('ScrapeURL','url')
	linkObj = LinkObj(cfg, name='TAX-STATS',url=url,ancesters=[])
	linkObj.ScrapeLinks()
	linkObj.FollowLinks()
	return(linkObj)

	# The Node Object abstracting a link, storing the related information related to scraping
	# The program will follow the instruction from the user and follow the link to scrape/download
	# the files at the terminal nodes.
	# To avoid scraping too much garbage, we need to design letting the user decide what key words
	# of interest to follow, what to avoid
	class LinkObj(object):

	def __init__(self,cfg,name,url,ancesters=[],nodes_nickName=''):

	self.cfg = cfg
	self.nameSpace = ''
	x = str.split(name,'/')
	if len(x)>1:
	self.name = x[-1]
	self.nameSpace = x[0]
	else: self.name = name
	if nodes_nickName is not None and nodes_nickName != '':
	self.nickName = nodes_nickName
	else: self.nickName = self.name
	if not self.cfg.has_section(self.nickName):self.cfg.add_section(self.nickName)
	self.url = url
	self.prePend = bool(int(self.cfg.get(self.nickName,'prePend')))
	self.ancesters = ancesters
	self.linkDict = OrderedDict()
	x=str.split(re.sub(' ','',self.cfg.get(self.nickName, 'search')),',')
	self.findAllTokens = str.split(re.sub(' ','',self.cfg.get(self.nickName,'findAllTokens')),',')
	self.findAllTokens = [x for x in self.findAllTokens if x!='']
	self.searchToken = '\|'.join(x)
	ignore = str.split(re.sub(' ','',self.cfg.get(self.nickName,'ignore')),',')
	self.ignore = '\|'.join(ignore)
	self.nodes = []
	self.nodes_nickName = self.cfg.get(self.nickName,'nodes_nickName')
	self.excludeFile = str.split(re.sub(' ','',self.cfg.get(self.nickName, 'excludeFile')),',')
	self.excludeFile = [x for x in self.excludeFile if x!='']
	self.excludeFile = "\|".join([t+"$" for t in self.excludeFile])
	self.followNumNodes = int(self.cfg.get(self.nickName,'followNumNodes'))
	self.includeFile = str.split(re.sub(' ','',self.cfg.get(self.nickName, 'includeFile')),',')
	self.includeFile = [x for x in self.includeFile if x!='']
	self.includeFile = "\|".join([t+"$" for t in self.includeFile])
	self.rootDir = self.cfg.get(self.nickName,'rootDir')
	self.terminal = bool(re.search(self.includeFile,self.url))
	self.logLevel = int(self.cfg.get(self.nickName,'loglevel'))


	def ScrapeLinks(self):

	if self.terminal:
	self.DownLoadLink()
	return
	if self.logLevel == 1: print(self.name+" ScrapeLinks")
	r = urllib.request.urlopen(self.url)
	soup = BeautifulSoup(r,'lxml')
	A = soup.find_all('a',href=True)
	self.PrintResultSet(A)
	self.IterFindAll(soup,level=0)

	def FollowLinks(self):

	if self.terminal: return
	followNumNodes = self.followNumNodes if self.followNumNodes>=0 else len(self.nodes)
	print(self.name + " FollowLinks")
	count = 0
	for token, url in self.linkDict.items():
	if count >= self.followNumNodes: break
	self.nodes.append(LinkObj(self.cfg,name=token,url=url,ancesters=self.ancesters+[self.name],nodes_nickName=self.nodes_nickName))
	self.nodes[-1].ScrapeLinks()
	self.nodes[-1].FollowLinks()
	count+=1


	def IterFindAll(self, X, level):

	if self.logLevel==1: print(self.name + " IterFindAll")
	depth = len(self.findAllTokens)
	if (depth>=level+1):
	Y = X.find_all(self.findAllTokens[level])
	for y in Y:
	self.IterFindAll(y,level+1)
	else:
	Y = X.find_all(['a','b'])
	#Y = X.find_all('a',href=True)
	strong = ''
	for a in Y:
	text = a.string
	if a.name=='b':
	if a.string is not None and re.search('^[A-Za-z]+[\w_\- ]*[\w]',a.string):
	strong = a.string
	#print(strong)
	continue
	elif a.get('href') is not None: link = a['href']
	else: continue
	if text is None: text = str.split(link,'/')[-1]
	if self.searchToken=='':
	if self.logLevel == 1: print(self.name + "no search Token")
	continue
	elif (len(self.excludeFile)>0 and re.search(self.excludeFile,link,flags=re.I\|re.X)): continue
	elif re.search(self.searchToken,text,flags=re.I\|re.X):
	if strong != '' and self.prePend: text = strong + "/" + text
	self.linkDict[text] = urllib.parse.urljoin(self.url,link)


	def PrintResultSet(self,A):

	if self.logLevel > 1: return
	for item in A:
	link = item['href']
	text = str.split(link,'/')[-1] if item.string is None else item.string
	if re.search(self.ignore,text,flags=re.I\|re.X): continue
	if re.search(self.ignore,link,flags=re.I\|re.X): continue
	print(self.name+"<<"+text+":"+link)

	def DownLoadLink(self):

	path=self.rootDir + "/" + "/".join(self.ancesters) + "/" + self.nameSpace+"/"+self.name+"/"
	fileName = self.url.split('/')[-1]
	if not re.search(self.searchToken,fileName): return
	print(path+fileName)
	import os
	if not os.path.exists(path):
	os.makedirs(path)
	elif os.path.exists(path+fileName):
	print(path+fileName+" exists!")
	return
	urllib.request.urlretrieve(self.url,path+fileName)

	if __name__ == "__main__":

	parser = argparse.ArgumentParser('use ScrapeURL to scrape and download a tree of links')
	parser.add_argument('-cfg',type=str,help='command line option to control the ScrapeURL by passing a config fileName')
	args = parser.parse_args()
	if args.cfg is not None:
	ScrapeURL(cfgFile=args.cfg)