Skip to content

Instantly share code, notes, and snippets.

@eirikbakke
Created May 10, 2016 03:31
Show Gist options
  • Save eirikbakke/4b77eeea5fbc79511f563f5dd15a0076 to your computer and use it in GitHub Desktop.
Save eirikbakke/4b77eeea5fbc79511f563f5dd15a0076 to your computer and use it in GitHub Desktop.
Panamapapers PostgreSQL import script
-- PostgreSQL import script for panamapapers CSV files available from https://panamapapers.icij.org/blog/20160509-offshore-database-release.html
-- First convert CSV files from MacRoman to UTF-8 encoding:
-- iconv -f MACROMAN -t UTF-8 Entities.csv > Entities-utf8.csv
-- iconv -f MACROMAN -t UTF-8 Addresses.csv > Addresses-utf8.csv
-- iconv -f MACROMAN -t UTF-8 Intermediaries.csv > Intermediaries-utf8.csv
-- iconv -f MACROMAN -t UTF-8 Officers.csv > Officers-utf8.csv
-- iconv -f MACROMAN -t UTF-8 all_edges.csv > all_edges-utf8.csv
drop table if exists addresses cascade;
drop table if exists countries cascade;
create table addresses(
"address" varchar(1024) not null,
icij_id character(32),
valid_until character(47),
country_code character(3),
country varchar(64),
node_id integer not null,
source_id character(16) not null
);
copy addresses("address", icij_id, valid_until, country_code, country, node_id, source_id) from
'/Path/To/Files/offshore_leaks_csvs/Addresses-utf8.csv'
with delimiter E','
header
null as ''
csv quote as E'"'
ENCODING 'UTF-8';
select distinct country_code as "code", country as "name" into countries from addresses;
delete from countries where "code" is null and "name" is null;
alter table countries alter column "code" set not null;
alter table countries alter column "name" set not null;
alter table countries add primary key("code");
alter table addresses add constraint addresses_country_fk foreign key (country_code) references countries("code");
alter table addresses drop column country;
alter table addresses owner to duser;
alter table countries owner to duser;
alter table addresses add primary key(node_id);
drop table if exists entities cascade;
drop table if exists entities_countries cascade;
create table entities(
"name" varchar(256),
original_name varchar(256),
former_name varchar(256),
jurisdiction character(5) not null,
jurisdiction_description varchar(64) not null,
company_type varchar(64),
"address" varchar(1024),
internal_id character(8),
incorporation_date text,
inactivation_date text,
struck_off_date text,
dorm_date text,
"status" varchar(64),
service_provider varchar(32),
ibcRUC varchar(32),
country_codes text,
countries text,
note varchar(256),
valid_until character(47) not null,
node_id integer not null,
source_id character(16) not null
);
copy entities("name", original_name, former_name, jurisdiction, jurisdiction_description, company_type, "address", internal_id, incorporation_date, inactivation_date, struck_off_date, dorm_date, "status", service_provider, ibcRUC, country_codes, countries, note, valid_until, node_id, source_id) from
'/Path/To/Files/offshore_leaks_csvs/Entities-utf8.csv'
with delimiter E','
header
null as ''
csv quote as E'"'
ENCODING 'UTF-8';
alter table entities owner to duser;
ALTER TABLE entities ALTER COLUMN "incorporation_date" TYPE DATE
using to_date(incorporation_date, 'd-Mon-yyyy');
ALTER TABLE entities ALTER COLUMN "inactivation_date" TYPE DATE
using to_date(inactivation_date, 'd-Mon-yyyy');
ALTER TABLE entities ALTER COLUMN "struck_off_date" TYPE DATE
using to_date(struck_off_date, 'd-Mon-yyyy');
ALTER TABLE entities ALTER COLUMN "dorm_date" TYPE DATE
using to_date(dorm_date, 'd-Mon-yyyy');
create index entities_incorporation_date_idx on entities(incorporation_date);
create index entities_inactivation_date_idx on entities(inactivation_date);
create index entities_struck_off_date_idx on entities(struck_off_date);
create index entities_dorm_date_idx on entities(dorm_date);
alter table entities add primary key(node_id);
select distinct node_id, regexp_split_to_table(country_codes,';') as country_code into entities_countries from entities;
alter table entities_countries owner to duser;
alter table entities_countries add primary key("node_id", "country_code");
alter table entities_countries add constraint entities_countries_node_id_fk foreign key (node_id) references entities("node_id");
alter table entities_countries add constraint entities_countries_country_fk foreign key (country_code) references countries("code");
alter table entities drop column country_codes;
alter table entities drop column countries;
drop table if exists intermediaries cascade;
drop table if exists intermediaries_countries cascade;
create table intermediaries(
"name" varchar(128),
internal_id character(6),
"address" varchar(256),
valid_until character(47) not null,
country_codes text,
countries text,
"status" varchar(64),
node_id integer not null,
source_id character(16) not null
);
copy intermediaries("name", internal_id, "address", "valid_until", "country_codes", "countries", "status", node_id, source_id) from
'/Path/To/Files/offshore_leaks_csvs/Intermediaries-utf8.csv'
with delimiter E','
header
null as ''
csv quote as E'"'
ENCODING 'UTF-8';
alter table intermediaries owner to duser;
alter table intermediaries add primary key(node_id);
select distinct node_id, regexp_split_to_table(country_codes,';') as country_code into intermediaries_countries from intermediaries;
alter table intermediaries_countries owner to duser;
alter table intermediaries_countries add primary key("node_id", "country_code");
alter table intermediaries_countries add constraint intermediaries_countries_node_id_fk foreign key (node_id) references intermediaries("node_id");
alter table intermediaries_countries add constraint intermediaries_countries_country_fk foreign key (country_code) references countries("code");
alter table intermediaries drop column country_codes;
alter table intermediaries drop column countries;
drop table if exists officers cascade;
drop table if exists officers_countries cascade;
create table officers(
"name" varchar(256),
icij_id character(32),
valid_until character(47) not null,
country_codes text,
countries text,
node_id integer not null,
source_id character(16) not null
);
alter table officers owner to duser;
copy officers("name", icij_id, valid_until, country_codes, countries, node_id, source_id) from
'/Path/To/Files/offshore_leaks_csvs/Officers-utf8.csv'
with delimiter E','
header
null as ''
csv quote as E'"'
ENCODING 'UTF-8';
alter table officers add primary key(node_id);
select distinct node_id, regexp_split_to_table(country_codes,';') as country_code into officers_countries from officers;
alter table officers_countries owner to duser;
alter table officers_countries add primary key("node_id", "country_code");
alter table officers_countries add constraint officers_countries_node_id_fk foreign key (node_id) references officers("node_id");
alter table officers_countries add constraint officers_countries_country_fk foreign key (country_code) references countries("code");
alter table officers drop column country_codes;
alter table officers drop column countries;
drop table if exists all_edges_tmp cascade;
drop table if exists all_edges cascade;
create table all_edges_tmp(
node1 integer not null,
rel_type character(18) not null,
node2 integer not null
);
copy all_edges_tmp(node1, rel_type, node2) from
'/Path/To/Files/offshore_leaks_csvs/all_edges-utf8.csv'
with delimiter E','
header
null as ''
csv quote as E'"'
ENCODING 'UTF-8';
select distinct * into all_edges from all_edges_tmp;
drop table all_edges_tmp cascade;
alter table all_edges owner to duser;
-- Note: Duplicate (node1,node2) pairs known to exist.
alter table all_edges add primary key(node1, rel_type, node2);
create index all_edges_rel_type_idx on all_edges(rel_type);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment