Created
November 25, 2017 23:12
-
-
Save renjiege/bf838fd9b6a7e6e5b73316b756a1d4b4 to your computer and use it in GitHub Desktop.
Parsing textual data and extracting information with regular expression.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cd /Users/renjiege/Dropbox/workslot/mystata/Fernanda_Worldbank | |
set more off | |
use ida15_zichao.dta, clear | |
sort country | |
save ida15_zichao.dta, replace | |
*insheet using "/Users/renjiege/Downloads/ppijuly8.csv", clear | |
cd /Users/renjiege/Downloads/ | |
use ppijuly8_zichao.dta, clear | |
/*Drop the telecome sector*/ | |
drop if primarysector=="Information and communication technology (ICT)" | |
drop if projectstatus=="Advanced Development" | |
/*Drop the ones that are not reaching FC projects */ | |
destring projectid, replace force | |
drop if projectid==6523 | projectid==8404 | |
/*change text to number*/ | |
destring totalinvestment, replace force | |
/*Cross border Issue*/ | |
sort projectid | |
merge m:1 projectid using borderprojects.dta | |
gen shareborder="Yes" if _merge==3 | |
replace shareborder="No" if _merge==1 | |
drop _merge | |
gen totalinvestment_raw=totalinvestment | |
replace totalinvestment=totalinvestment/2 if projectid!=2214 & projectid!=4449 & shareborder=="Yes" | |
replace totalinvestment=totalinvestment/3 if projectid==2214 | |
replace totalinvestment=totalinvestment/4 if projectid==4449 | |
rename investmentyear year | |
destring year, replace force | |
keep if year>=2011 & year<=2015 | |
sort country | |
merge m:1 country using ida15_zichao.dta | |
keep if ida15==1 | |
/* Figure 2.8: MDB Support, by Type of Support */ | |
local mdblist = "IFC IDA ADB MIGA IBRD EIB IDB EBRD BSTDB IADB IAIC CAF BCIE Caribank NADB AFDB BOAD" // list of mdb providers | |
foreach type in `mdblist' { | |
gen `type'6=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") | |
gen `type'5=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") | |
gen `type'4=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'") | |
gen `type'3=regexs(2) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'") | |
gen `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") | |
gen `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") | |
replace `type'5=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'5=="" | |
replace `type'4=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") & `type'4=="" | |
replace `type'3=regexs(2) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'") & `type'3=="" | |
replace `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'") & `type'2=="" | |
replace `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'1=="" | |
replace `type'4=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'4=="" | |
replace `type'3=regexs(2) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") & `type'3=="" | |
replace `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'") & `type'2=="" | |
replace `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'") & `type'1=="" | |
replace `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'") & `type'1=="" | |
replace `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") & `type'2=="" | |
replace `type'3=regexs(2) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'3=="" | |
replace `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") & `type'1=="" | |
replace `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'2=="" | |
replace `type'1= regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'1=="" | |
} | |
foreach type in `mdblist' { | |
gen `type'7=regexs(3) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+)") & IFC6~="" | |
replace `type'6=regexs(3) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'") & IFC6~="" | |
replace `type'5=regexs(3) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'") & IFC6~="" | |
replace `type'4=regexs(3) if regexm(multilater, "`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'.*`type'") & IFC6~="" | |
replace `type'3=regexs(3) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & IFC6~="" | |
replace `type'2=regexs(3) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") & IFC6~="" | |
replace `type'1=regexs(3) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") & IFC6~="" | |
} | |
foreach type in `mdblist' { | |
destring `type'*, replace | |
egen `type'=rowtotal(`type'*) | |
la var `type' "US$ from `type'" | |
} | |
egen mdbsum=rowtotal(`mdblist') // US$ sum from the MDBs providers | |
la var mdbsum "US$ sum from all MDBs providers" | |
xx | |
*/ | |
// Generate the amount of money for each type of investment | |
foreach type in Loan Guarantee Syndication Equity equity Insurance { | |
gen `type'9=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") | |
gen `type'8=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") | |
gen `type'7=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") | |
gen `type'6=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") | |
gen `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") | |
gen `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") | |
gen `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") | |
gen `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") | |
gen `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") | |
replace `type'8=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'8=="" | |
replace `type'7=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'7=="" | |
replace `type'6=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'6=="" | |
replace `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'5=="" | |
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'4=="" | |
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") & `type'3=="" | |
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") & `type'2=="" | |
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") & `type'1=="" | |
replace `type'7=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'7=="" | |
replace `type'6=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'6=="" | |
replace `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'5=="" | |
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'4=="" | |
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'3=="" | |
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") & `type'2=="" | |
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") & `type'1=="" | |
replace `type'6=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'6=="" | |
replace `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'5=="" | |
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'4=="" | |
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'3=="" | |
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'2=="" | |
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") & `type'1=="" | |
replace `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type.*`type'.? +/ +[\$]([0-9]+)'") & `type'5=="" | |
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'4=="" | |
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'3=="" | |
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'2=="" | |
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'1=="" | |
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'4=="" | |
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'3=="" | |
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'2=="" | |
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'1=="" | |
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'1=="" | |
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'2=="" | |
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'3=="" | |
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'") & `type'1=="" | |
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'2=="" | |
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+)") & `type'1=="" | |
} | |
destring Loan* Guarantee* Syndication* Equity* equity* Insurance*, replace | |
// Aggregate each type of financial sources for each project | |
foreach type in Loan Guarantee Syndication Equity equity Insurance { | |
egen `type'sum=rowtotal(`type'*) | |
} | |
// Aggregate each type of financial sources in total | |
collapse(sum) Loansum Syndicationsum Guaranteesum Equitysum equitysum Insurancesum | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment