Skip to content

Instantly share code, notes, and snippets.

@renjiege
Created November 25, 2017 23:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save renjiege/bf838fd9b6a7e6e5b73316b756a1d4b4 to your computer and use it in GitHub Desktop.
Save renjiege/bf838fd9b6a7e6e5b73316b756a1d4b4 to your computer and use it in GitHub Desktop.
Parsing textual data and extracting information with regular expression.
cd /Users/renjiege/Dropbox/workslot/mystata/Fernanda_Worldbank
set more off
use ida15_zichao.dta, clear
sort country
save ida15_zichao.dta, replace
*insheet using "/Users/renjiege/Downloads/ppijuly8.csv", clear
cd /Users/renjiege/Downloads/
use ppijuly8_zichao.dta, clear
/*Drop the telecome sector*/
drop if primarysector=="Information and communication technology (ICT)"
drop if projectstatus=="Advanced Development"
/*Drop the ones that are not reaching FC projects */
destring projectid, replace force
drop if projectid==6523 | projectid==8404
/*change text to number*/
destring totalinvestment, replace force
/*Cross border Issue*/
sort projectid
merge m:1 projectid using borderprojects.dta
gen shareborder="Yes" if _merge==3
replace shareborder="No" if _merge==1
drop _merge
gen totalinvestment_raw=totalinvestment
replace totalinvestment=totalinvestment/2 if projectid!=2214 & projectid!=4449 & shareborder=="Yes"
replace totalinvestment=totalinvestment/3 if projectid==2214
replace totalinvestment=totalinvestment/4 if projectid==4449
rename investmentyear year
destring year, replace force
keep if year>=2011 & year<=2015
sort country
merge m:1 country using ida15_zichao.dta
keep if ida15==1
/* Figure 2.8: MDB Support, by Type of Support */
local mdblist = "IFC IDA ADB MIGA IBRD EIB IDB EBRD BSTDB IADB IAIC CAF BCIE Caribank NADB AFDB BOAD" // list of mdb providers
foreach type in `mdblist' {
gen `type'6=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)")
gen `type'5=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'")
gen `type'4=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'")
gen `type'3=regexs(2) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'")
gen `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'")
gen `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'")
replace `type'5=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'5==""
replace `type'4=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") & `type'4==""
replace `type'3=regexs(2) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'") & `type'3==""
replace `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'") & `type'2==""
replace `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'1==""
replace `type'4=regexs(2) if regexm(multilater, "`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'4==""
replace `type'3=regexs(2) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") & `type'3==""
replace `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'") & `type'2==""
replace `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'.*`type'") & `type'1==""
replace `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'.*`type'") & `type'1==""
replace `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") & `type'2==""
replace `type'3=regexs(2) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'3==""
replace `type'1=regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+).*`type'") & `type'1==""
replace `type'2=regexs(2) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'2==""
replace `type'1= regexs(2) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)? / [\$]([0-9]+)") & `type'1==""
}
foreach type in `mdblist' {
gen `type'7=regexs(3) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+)") & IFC6~=""
replace `type'6=regexs(3) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'") & IFC6~=""
replace `type'5=regexs(3) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'") & IFC6~=""
replace `type'4=regexs(3) if regexm(multilater, "`type'.*`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'.*`type'") & IFC6~=""
replace `type'3=regexs(3) if regexm(multilater, "`type'.*`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & IFC6~=""
replace `type'2=regexs(3) if regexm(multilater, "`type'.*`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") & IFC6~=""
replace `type'1=regexs(3) if regexm(multilater, "`type' ?\([a-zA-Z]+-?([a-zA-Z]+)?(.*Insurance\))? / [\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") & IFC6~=""
}
foreach type in `mdblist' {
destring `type'*, replace
egen `type'=rowtotal(`type'*)
la var `type' "US$ from `type'"
}
egen mdbsum=rowtotal(`mdblist') // US$ sum from the MDBs providers
la var mdbsum "US$ sum from all MDBs providers"
xx
*/
// Generate the amount of money for each type of investment
foreach type in Loan Guarantee Syndication Equity equity Insurance {
gen `type'9=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)")
gen `type'8=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'")
gen `type'7=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'")
gen `type'6=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'")
gen `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'")
gen `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'")
gen `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'")
gen `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'")
gen `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'")
replace `type'8=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'8==""
replace `type'7=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'7==""
replace `type'6=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'6==""
replace `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'5==""
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'4==""
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") & `type'3==""
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") & `type'2==""
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") & `type'1==""
replace `type'7=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'7==""
replace `type'6=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'6==""
replace `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'5==""
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'4==""
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'3==""
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") & `type'2==""
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'.*`type'") & `type'1==""
replace `type'6=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'6==""
replace `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'5==""
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'4==""
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'3==""
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'2==""
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'.*`type'") & `type'1==""
replace `type'5=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type.*`type'.? +/ +[\$]([0-9]+)'") & `type'5==""
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'4==""
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'3==""
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'2==""
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'.*`type'") & `type'1==""
replace `type'4=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'4==""
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'3==""
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'2==""
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'.*`type'") & `type'1==""
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'.*`type'") & `type'1==""
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+).*`type'") & `type'2==""
replace `type'3=regexs(1) if regexm(multilater, "`type'.*`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'3==""
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+).*`type'") & `type'1==""
replace `type'2=regexs(1) if regexm(multilater, "`type'.*`type'.? +/ +[\$]([0-9]+)") & `type'2==""
replace `type'1=regexs(1) if regexm(multilater, "`type'.? +/ +[\$]([0-9]+)") & `type'1==""
}
destring Loan* Guarantee* Syndication* Equity* equity* Insurance*, replace
// Aggregate each type of financial sources for each project
foreach type in Loan Guarantee Syndication Equity equity Insurance {
egen `type'sum=rowtotal(`type'*)
}
// Aggregate each type of financial sources in total
collapse(sum) Loansum Syndicationsum Guaranteesum Equitysum equitysum Insurancesum
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment