Skip to content

Instantly share code, notes, and snippets.

@rmoff
Last active December 15, 2016 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rmoff/17025830c81e60d6446e34a37273f705 to your computer and use it in GitHub Desktop.
Save rmoff/17025830c81e60d6446e34a37273f705 to your computer and use it in GitHub Desktop.
##
## @rmoff July 8, 2016
##
## See https://community.oracle.com/docs/DOC-1006400 for more details
## Graph Analysis of the Panama Papers with Oracle Big Data Spatial and Graph
##
## Source dataset: https://offshoreleaks.icij.org/pages/database
##
## Vertex format: https://docs.oracle.com/bigdata/bda45/BDSPA/using-pgs-bigdata.htm#BDSPA256
## Edge format: https://docs.oracle.com/bigdata/bda45/BDSPA/using-pgs-bigdata.htm#BDSPA258
##----
## If necessary, these can be installed using the command:
## install.packages(c('readr','dplyr','stringr'))
library('readr')
library('dplyr')
library('stringr')
## --
## Helper function, from Jordan Meyer
rename_fields <- function(df, df.name = NA){
if(!is.na(df.name)){
names(df) %>%
tolower %>%
str_replace_all("_", ".") %>%
paste(df.name, ., sep=".")
} else {
names(df) %>%
tolower %>%
str_replace_all("_", ".")
}
}
## --
## Load Addresses
addresses <- read_csv("Addresses.csv",
trim_ws = TRUE,
col_names = c("name","icij_id","valid_until","country_codes","countries","node_id","sourceID"),
skip=1)
names(addresses) <- rename_fields(addresses)
## > str(addresses)
## Classes 'tbl_df', 'tbl' and 'data.frame': 151054 obs. of 7 variables:
## $ name : chr "One Bearer Secured Debenture" "11 Coomber Road, The Peak, Hong Kong" "4 Irish Place 2nd Floor, Gibraltar." "11A, Branksome 3 Tregunter Path Mid-Levels Hong Kong" ...
## $ icij.id : chr NA NA NA NA ...
## $ valid.until : chr "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" ...
## $ country.codes: chr "XXX" "HKG" "GIB" "HKG" ...
## $ countries : chr "Not identified" "Hong Kong" "Gibraltar" "Hong Kong" ...
## $ node.id : int 2004267 67277 67346 67278 67282 67293 67294 67297 67298 67299 ...
## $ sourceid : chr "Offshore Leaks" "Offshore Leaks" "Offshore Leaks" "Offshore Leaks" ...
## >
## --
## We need to reshape the data, pushing columns into rows for the various types of information
## to store about each node.
##
## Node instances:
## 1. Address
## node_id, name
## 2. Country Codes
## node_id, country_code
## 3. Countries
## node_id,countries
## 4. Source
## node_id,sourceID
## First, create a dataframe with each set of data
address.type <- addresses %>% select(node.id) %>% mutate(key='Type') %>% mutate(datatype=1) %>% mutate(strval = 'Address')
address.nodes <- addresses %>% select(node.id) %>% mutate(key='Name') %>% mutate(datatype=1) %>% bind_cols(select(addresses,c(name))) %>% rename(strval = name)
country.code.nodes <- addresses %>% select(node.id) %>% mutate(key='Country Codes') %>% mutate(datatype=1) %>% bind_cols(select(addresses,c(country.codes))) %>% rename(strval = country.codes)
country.nodes <- addresses %>% select(node.id) %>% mutate(key='Country') %>% mutate(datatype=1) %>% bind_cols(select(addresses,c(countries))) %>% rename(strval = countries)
source.nodes <- addresses %>% select(node.id) %>% mutate(key='Source ID') %>% mutate(datatype=1) %>% bind_cols(select(addresses,c(sourceid))) %>% rename(strval = sourceid)
## Join all the instances together 'vertically' (as if a SQL union, in effect)
nodes.1 <-
bind_rows(address.type,address.nodes,country.code.nodes,country.nodes,source.nodes)
## ------------------------------------------------------
## ------------------------------------------------------
## --
## Load Entities
entities <- read_csv("Entities.csv",
trim_ws = TRUE,
col_names = c("name","original_name","former_name","jurisdiction","jurisdiction_description","company_type","address","internal_id","incorporation_date","inactivation_date","struck_off_date","dorm_date","status","service_provider","ibcRUC","country_codes","countries","note","valid_until","node_id","sourceID"),
skip=1)
names(entities) <- rename_fields(entities)
## > str(entities)
## Classes 'tbl_df', 'tbl' and 'data.frame': 319150 obs. of 21 variables:
## $ name : chr "ZODIAK LTD" "Zven Ltd." "Anson 11A Ltd." "GIADA LTD" ...
## $ original.name : chr NA NA NA NA ...
## $ former.name : chr NA NA NA NA ...
## $ jurisdiction : chr "XXX" "XXX" "XXX" "XXX" ...
## $ jurisdiction.description: chr "Undetermined" "Undetermined" "Undetermined" "Undetermined" ...
## $ company.type : chr "Business Company Limited by Shares" "Business Company Limited by Shares" "The GRA group of Companies for express Co." "Business Company Limited by Shares" ...
## $ address : chr "Christabel Corporate Services Limited Christabel House 118 Agias Fylaxeos 3087 Limassol Cyprus" "Alesta Consulting, S.A. INVOICE BY EMAIL ONLY" "Express Co Registration Pte Ltd 138 Cecil Street #18-00 Cecil Court Singapore 069538" "Christabel Corporate Services Limited Christabel House 118 Agias Fylaxeos 3087 Limassol Cyprus" ...
## $ internal.id : chr NA NA NA NA ...
## $ incorporation.date : chr "24-AUG-2007" "29-JAN-2007" "09-MAY-2000" "14-JUN-2007" ...
## $ inactivation.date : chr NA NA NA NA ...
## $ struck.off.date : chr NA NA NA NA ...
## $ dorm.date : chr "01-MAY-2009" "01-NOV-2008" NA NA ...
## $ status : chr "Dead" "Dead" "Liquidated" "Transferred OUT" ...
## $ service.provider : chr "Commonwealth Trust Limited" "Commonwealth Trust Limited" "Commonwealth Trust Limited" "Commonwealth Trust Limited" ...
## $ ibcruc : chr "1427813" "1382756" "386675" "1411204" ...
## $ country.codes : chr "VGB;CYP" "XXX;VGB" "SGP;VGB" "CYP;VGB" ...
## $ countries : chr "Virgin Islands, British;Cyprus" "Not identified;Virgin Islands, British" "Singapore;Virgin Islands, British" "Cyprus;Virgin Islands, British" ...
## $ note : chr NA NA NA NA ...
## $ valid.until : chr "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" ...
## $ node.id : int 67028 67243 67258 67266 108050 108051 108110 108141 108147 108148 ...
## $ sourceid : chr "Offshore Leaks" "Offshore Leaks" "Offshore Leaks" "Offshore Leaks" ...
## Create a dataframe with each set of data
entity.type <- entities %>% select(node.id) %>% mutate(key='Type') %>% mutate(datatype=1) %>% mutate(strval = 'Entity')
entity.source.nodes <- entities %>% select(node.id) %>% mutate(key='Source ID') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(sourceid))) %>% rename(strval = sourceid)
entity.name.nodes <- entities %>% select(node.id) %>% mutate(key='Name') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(name))) %>% rename(strval = name)
entity.jurisdiction <- entities %>% select(node.id) %>% mutate(key='Entity jurisdiction') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(jurisdiction))) %>% rename(strval = jurisdiction)
entity.jurisdiction.description <- entities %>% select(node.id) %>% mutate(key='Entity jurisdiction.description') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(jurisdiction.description))) %>% rename(strval = jurisdiction.description)
entity.company.type <- entities %>% select(node.id) %>% mutate(key='Entity company.type') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(company.type))) %>% rename(strval = company.type)
entity.address <- entities %>% select(node.id) %>% mutate(key='Entity address') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(address))) %>% rename(strval = address)
entity.incorporation.date <- entities %>% select(node.id) %>% mutate(key='Entity incorporation.date') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(incorporation.date))) %>% rename(strval = incorporation.date)
entity.dorm.date <- entities %>% select(node.id) %>% mutate(key='Entity dorm.date') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(dorm.date))) %>% rename(strval = dorm.date)
entity.struck.off.date <- entities %>% select(node.id) %>% mutate(key='Entity struck.off.date') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(struck.off.date))) %>% rename(strval = struck.off.date)
entity.status <- entities %>% select(node.id) %>% mutate(key='Entity status') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(status))) %>% rename(strval = status)
entity.country.codes <- entities %>% select(node.id) %>% mutate(key='Country Codes') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(country.codes))) %>% rename(strval = country.codes)
entity.countries <- entities %>% select(node.id) %>% mutate(key='Countries') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(countries))) %>% rename(strval = countries)
entity.note <- entities %>% select(node.id) %>% mutate(key='Entity note') %>% mutate(datatype=1) %>% bind_cols(select(entities,c(note))) %>% rename(strval = note)
## Join all the instances together (as if a SQL union, in effect)
nodes.2 <-
bind_rows(entity.type,entity.source.nodes,entity.name.nodes ,entity.jurisdiction ,entity.jurisdiction.description ,entity.company.type ,entity.address ,entity.incorporation.date ,entity.dorm.date ,entity.struck.off.date ,entity.status ,entity.country.codes ,entity.countries ,entity.note )
## ------------------------------------------------------
## ------------------------------------------------------
## --
## Load Officers
officers <- read_csv("Officers.csv",
trim_ws = TRUE,
col_names = c("name","icij_id","valid_until","country_codes","countries","node_id","sourceID"),
skip=1)
names(officers) <- rename_fields(officers)
## > str(officers)
## Classes 'tbl_df', 'tbl' and 'data.frame': 345594 obs. of 7 variables:
## $ name : chr "Bisbro Investment Company Ltd." "Mohamed Mustafa Bin A Mohamed Ismail" "Teh Hong Eng" "New Sights Enterprises Limited" ...
## $ icij.id : chr NA NA NA NA ...
## $ valid.until : chr "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" ...
## $ country.codes: chr "VGB" "MYS" "HKG" "VGB" ...
## $ countries : chr "Virgin Islands, British" "Malaysia" "Hong Kong" "Virgin Islands, British" ...
## $ node.id : int 51113 51114 51115 51116 51117 51118 60877 60879 60880 51119 ...
## $ sourceid : chr "Offshore Leaks" "Offshore Leaks" "Offshore Leaks" "Offshore Leaks" ...
## Create a dataframe with each set of data
officer.type <- officers %>% select(node.id) %>% mutate(key='Type') %>% mutate(datatype=1) %>% mutate(strval = 'Officer')
officer.name <- officers %>% select(node.id) %>% mutate(key='Name') %>% mutate(datatype=1) %>% bind_cols(select(officers,c(name))) %>% rename(strval = name)
officer.icij.id <- officers %>% select(node.id) %>% mutate(key='Officer icij.id') %>% mutate(datatype=1) %>% bind_cols(select(officers,c(icij.id))) %>% rename(strval = icij.id)
officer.country.codes <- officers %>% select(node.id) %>% mutate(key='Country Codes') %>% mutate(datatype=1) %>% bind_cols(select(officers,c(country.codes))) %>% rename(strval = country.codes)
officer.countries <- officers %>% select(node.id) %>% mutate(key='Countries') %>% mutate(datatype=1) %>% bind_cols(select(officers,c(countries))) %>% rename(strval = countries)
officer.sourceid <- officers %>% select(node.id) %>% mutate(key='Source ID') %>% mutate(datatype=1) %>% bind_cols(select(officers,c(sourceid))) %>% rename(strval = sourceid)
## Join all the instances together (as if a SQL union, in effect)
nodes.3 <-
bind_rows(officer.type,officer.name,officer.icij.id,officer.country.codes,officer.countries,officer.sourceid )
## ------------------------------------------------------
## ------------------------------------------------------
## --
## Load Intermediaries
intermediaries <- read_csv("Intermediaries.csv",
trim_ws = TRUE,
col_names = c("name","internal_id","address","valid_until","country_codes","countries","status","node_id","sourceID"),
skip=1)
names(intermediaries) <- rename_fields(intermediaries)
## > str(intermediaries)
## Classes 'tbl_df', 'tbl' and 'data.frame': 23636 obs. of 9 variables:
## $ name : chr "Peng, Wan-Hsiung" "Shin Tae Young" "Lin Celina" "R E Douglas B Juday" ...
## $ internal.id : chr NA NA NA NA ...
## $ address : chr NA NA NA NA ...
## $ valid.until : chr "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" "The Offshore Leaks data is current through 2010" ...
## $ country.codes: chr "TWN" "ARE" "HKG" "USA" ...
## $ countries : chr "Taiwan, Province of China" "United Arab Emirates" "Hong Kong" "United States" ...
## $ status : chr NA NA NA NA ...
## $ node.id : int 51122 51149 51162 51224 51270 51418 51436 51457 51479 51496 ...
## $ sourceid : chr "Offshore Leaks" "Offshore Leaks" "Offshore Leaks" "Offshore Leaks" ...
## >
## Create a dataframe with each set of data
intermediary.type <- intermediaries %>% select(node.id) %>% mutate(key='Type') %>% mutate(datatype=1) %>% mutate(strval = 'Intermediary')
intermediary.name <- intermediaries %>% select(node.id) %>% mutate(key='Name') %>% mutate(datatype=1) %>% bind_cols(select(intermediaries,c(name))) %>% rename(strval = name)
intermediary.country.codes <- intermediaries %>% select(node.id) %>% mutate(key='Country Codes') %>% mutate(datatype=1) %>% bind_cols(select(intermediaries,c(country.codes))) %>% rename(strval = country.codes)
intermediary.countries <- intermediaries %>% select(node.id) %>% mutate(key='Countries') %>% mutate(datatype=1) %>% bind_cols(select(intermediaries,c(countries))) %>% rename(strval = countries)
intermediary.sourceid <- intermediaries %>% select(node.id) %>% mutate(key='Source ID') %>% mutate(datatype=1) %>% bind_cols(select(intermediaries,c(sourceid))) %>% rename(strval = sourceid)
## Join all the instances together (as if a SQL union, in effect)
nodes.4 <-
bind_rows(intermediary.type,intermediary.name,intermediary.country.codes,intermediary.countries,intermediary.sourceid )
## ------------------------------------------------------
## ------------------------------------------------------
## Join the four datasets
nodes <-
nodes.1 %>%
bind_rows(nodes.2) %>%
bind_rows(nodes.3) %>%
bind_rows(nodes.4)
## Tidy up the data
## The DataLoader gets upset with commas in field values (even if quote-enclosed)
nodes$strval <- gsub(',','-',nodes$strval)
## Newlines in field values really knacker things up, so replace them with spaces
nodes$strval <- gsub('\r\n',' ',nodes$strval)
## Add empty columns for numeric/date values
nodes <-
nodes %>%
mutate(numval=NA) %>%
mutate(dateval=NA)
## Add ID value for each node itself (to make searching possible)
addresses.ids <- addresses %>% select(node.id) %>% mutate(key='ID') %>% mutate(datatype=2) %>% mutate(strval='') %>%bind_cols(select(addresses,c(node.id)))
colnames(addresses.ids)[5] <- 'numval'
officers.ids <- officers %>% select(node.id) %>% mutate(key='ID') %>% mutate(datatype=2) %>% mutate(strval='') %>%bind_cols(select(officers,c(node.id)))
colnames(officers.ids)[5] <- 'numval'
intermediaries.ids <- intermediaries %>% select(node.id) %>% mutate(key='ID') %>% mutate(datatype=2) %>% mutate(strval='') %>%bind_cols(select(intermediaries,c(node.id)))
colnames(intermediaries.ids)[5] <- 'numval'
entities.ids <- entities %>% select(node.id) %>% mutate(key='ID') %>% mutate(datatype=2) %>% mutate(strval='') %>%bind_cols(select(entities,c(node.id)))
colnames(entities.ids)[5] <- 'numval'
id.nodes <-
bind_rows(addresses.ids) %>%
bind_rows(entities.ids) %>%
bind_rows(intermediaries.ids) %>%
bind_rows(officers.ids) %>%
mutate(dateval=NA)
## Join node values with IDs
nodes <-
nodes %>%
bind_rows(id.nodes)
## Sort the dataset by node.id
nodes <-
nodes %>%
arrange(node.id)
## Write to file
write_csv(nodes,
path='panama_nodes.opv',
col_names=FALSE,
na='')
## ------------------------------------------------------
## ------------------------------------------------------
edges <- read_csv("all_edges.csv",
trim_ws = TRUE,
col_names = c("node_1","rel_type","node_2"),
skip=1)
names(edges) <- rename_fields(edges)
## > str(edges)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1265690 obs. of 3 variables:
## $ node.1 : int 11000001 11000001 11000001 11000001 11000001 11000002 11000002 11000002 11000002 11000002 ...
## $ rel.type: chr "intermediary_of" "intermediary_of" "intermediary_of" "intermediary_of" ...
## $ node.2 : int 10208879 10198662 10159927 10165779 10152967 10005174 10008130 10011418 10010781 10188436 ...
## >
##
## Target output is: (no edge properties)
## edge ID,node 1,node 2,label,%20,,,,
## e.g.
## 1009,6,37,mentions,%20,,,,
## Need to generate a sequence column as the artifical key for the edges
edges <-
edges %>%
mutate(id = row_number(edges$node.1))
## Put the columns in the correct order:
reordered.edges <-
edges %>%
select(id) %>%
bind_cols(select(edges,node.1)) %>%
bind_cols(select(edges,node.2)) %>%
bind_cols(select(edges,rel.type)) %>%
mutate(property.key='%20') %>%
mutate(empty1='') %>%
mutate(empty2='') %>%
mutate(empty3='') %>%
mutate(empty4='')
## Write the data to CSV.
write_csv(reordered.edges,
path='panama_edges.ope',
col_names=FALSE)
## ------------------------------------------------------
## ------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment