# Panama Papers Import Scripts for Neo4j
# --- Variant 1: generic bulk import (single relationship file, TYPE column) ---

# Default Neo4j install location; override by exporting NEO4J_HOME first.
export NEO4J_HOME=${NEO4J_HOME-~/Downloads/neo4j-community-3.0.1}

# Download the ICIJ offshore-leaks CSV dump only once.
if [ ! -f data-csv.zip ]; then
  curl -OL https://cloudfront-files-1.publicintegrity.org/offshoreleaks/data-csv.zip
fi

# Full path required: neo4j-import mishandles relative paths.
export DATA="${PWD}/import"
rm -rf "$DATA"
unzip -o -j data-csv.zip -d "$DATA"
wc -l "$DATA"/*.csv

# Strip stray backslashes that break CSV quoting in the address file.
tr -d '\\' < "$DATA/Addresses.csv" > "$DATA/Addresses_fixed.csv"

# Mark node_id as the :ID column in every node CSV header (BSD sed: -i '').
for i in "$DATA"/[AIEO]*.csv; do
  echo "$i"
  sed -i '' -e '1,1 s/node_id/node_id:ID/' "$i"
done

# Drop the original edges header; a custom one is written below.
sed -i '' -e '1 d' "$DATA/all_edges.csv"

# Normalize relationship names into valid relationship TYPE tokens:
# uppercase, strip punctuation, collapse spaces into underscores.
tr '[:lower:]' '[:upper:]' < "$DATA/all_edges.csv" \
  | sed -e 's/[^A-Z0-9,_ ]//g' -e 's/ */_/g' -e 's/,_/_/g' \
  > "$DATA/all_edges_cleaned.csv"
echo 'node_id:START_ID,rel_type:TYPE,node_id:END_ID' > "$DATA/all_edges_header.csv"

rm -rf "$DATA/panama.db"
head -1 "$DATA"/*.csv

"$NEO4J_HOME/bin/neo4j-import" --into "$DATA/panama.db" \
  --nodes:Address "$DATA/Addresses_fixed.csv" \
  --nodes:Entity "$DATA/Entities.csv" \
  --nodes:Intermediary "$DATA/Intermediaries.csv" \
  --nodes:Officer "$DATA/Officers.csv" \
  --relationships "$DATA/all_edges_header.csv,$DATA/all_edges_cleaned.csv" \
  --ignore-empty-strings true --skip-duplicate-nodes true \
  --skip-bad-relationships true --bad-tolerance 1000000 --multiline-fields=true

# Post-import sanity queries.
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n) RETURN count(*) as nodes;'
# BUGFIX: this line previously began with a stray "x" ("x$NEO4J_HOME/...") and could not run.
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n) RETURN labels(n),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n) RETURN count(*) as nodes;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH ()-[r]->() RETURN type(r),r.detail,count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),labels(m),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),labels(m),count(*) ORDER BY count(*) DESC;'
# IMPORT DONE in 23s 391ms. Imported:
#   839434 nodes
#   1269796 relationships
#   8211010 properties
# --- Variant 2: typed import (per-label ID spaces, one CSV pair per edge kind) ---

# Default Neo4j install location; override by exporting NEO4J_HOME first.
export NEO4J_HOME=${NEO4J_HOME-~/Downloads/neo4j-community-3.0.1}

# Download the ICIJ offshore-leaks CSV dump only once.
if [ ! -f data-csv.zip ]; then
  echo Downloading ...
  curl -OL https://cloudfront-files-1.publicintegrity.org/offshoreleaks/data-csv.zip
fi

# Need full path due to BUG in import tool with relative paths
export DATA="${PWD}/import"

echo Extracting, Preparing, Cleaning up data ...
unzip -o -j data-csv.zip -d "$DATA"

# Strip stray backslashes that break CSV quoting in the address file.
tr -d '\\' < "$DATA/Addresses.csv" > "$DATA/Addresses_fixed.csv"

# Give every node CSV its own ID namespace (BSD sed: -i '' edits in place).
sed -i '' -e '1,1 s/node_id/node_id:ID(Address)/' "$DATA/Addresses_fixed.csv"
sed -i '' -e '1,1 s/node_id/node_id:ID(Officer)/' "$DATA/Officers.csv"
sed -i '' -e '1,1 s/node_id/node_id:ID(Entity)/' "$DATA/Entities.csv"
sed -i '' -e '1,1 s/node_id/node_id:ID(Intermediary)/' "$DATA/Intermediaries.csv"

# Drop the original edges header; per-relationship headers are generated below.
sed -i '' -e '1 d' "$DATA/all_edges.csv"

# REGISTERED_ADDRESS: the source may be an Entity, Officer or Intermediary.
for i in Entity Officer Intermediary; do
  echo "node_id:START_ID($i),detail:IGNORE,node_id:END_ID(Address)" > "$DATA/registered_address_$i.csv"
done
grep ',registered address,' "$DATA/all_edges.csv" > "$DATA/registered_address.csv"

# SIMILAR: Officers and Intermediaries in every source/target combination.
for i in Officer Intermediary; do
  for j in Officer Intermediary; do
    echo "node_id:START_ID(${i}),detail:IGNORE,node_id:END_ID(${j})" > "$DATA/similar_${i}_${j}.csv"
  done
done
grep ',similar name and address as,' "$DATA/all_edges.csv" > "$DATA/similar.csv"

# RELATED: Entity-to-Entity, keeping the original text in the detail property.
echo 'node_id:START_ID(Entity),detail,node_id:END_ID(Entity)' > "$DATA/related.csv"
grep ',\(related entity\|same name and registration date as\),' "$DATA/all_edges.csv" >> "$DATA/related.csv"

# OFFICER_OF: everything that is not one of the other known edge kinds,
# lowercased so the many capitalization variants collapse together.
for i in Entity Intermediary; do
  echo "node_id:START_ID(Officer),detail,node_id:END_ID($i)" > "$DATA/officer_of_$i.csv"
done
tr '[:upper:]' '[:lower:]' < "$DATA/all_edges.csv" \
  | grep -v ',\(intermediary of\|registered address\|similar name and address as\|same name and registration date as\|same address as\|related entity\),' \
  > "$DATA/officer_of.csv"

# INTERMEDIARY_OF: blank out the constant detail column so no property is stored.
for i in Entity; do
  echo "node_id:START_ID(Intermediary),detail,node_id:END_ID($i)" > "$DATA/intermediary_of_$i.csv"
done
sed -e 's/,intermediary of,/,,/' < "$DATA/all_edges.csv" > "$DATA/intermediary_of.csv"

echo CSV Overview ...
head -1 "$DATA"/*.csv

echo Importing ...
rm -rf "$DATA/panama.db"
"$NEO4J_HOME/bin/neo4j-import" --into "$DATA/panama.db" \
  --nodes:Address "$DATA/Addresses_fixed.csv" \
  --nodes:Entity "$DATA/Entities.csv" \
  --nodes:Intermediary "$DATA/Intermediaries.csv" \
  --nodes:Officer "$DATA/Officers.csv" \
  --relationships:REGISTERED_ADDRESS "$DATA/registered_address_Officer.csv,$DATA/registered_address.csv" \
  --relationships:REGISTERED_ADDRESS "$DATA/registered_address_Entity.csv,$DATA/registered_address.csv" \
  --relationships:REGISTERED_ADDRESS "$DATA/registered_address_Intermediary.csv,$DATA/registered_address.csv" \
  --relationships:RELATED "$DATA/related.csv" \
  --relationships:OFFICER_OF "$DATA/officer_of_Entity.csv,$DATA/officer_of.csv" \
  --relationships:OFFICER_OF "$DATA/officer_of_Intermediary.csv,$DATA/officer_of.csv" \
  --relationships:INTERMEDIARY_OF "$DATA/intermediary_of_Entity.csv,$DATA/intermediary_of.csv" \
  --relationships:SIMILAR "$DATA/similar_Officer_Officer.csv,$DATA/similar.csv" \
  --relationships:SIMILAR "$DATA/similar_Officer_Intermediary.csv,$DATA/similar.csv" \
  --relationships:SIMILAR "$DATA/similar_Intermediary_Officer.csv,$DATA/similar.csv" \
  --relationships:SIMILAR "$DATA/similar_Intermediary_Intermediary.csv,$DATA/similar.csv" \
  --ignore-empty-strings true --skip-duplicate-nodes true \
  --skip-bad-relationships true --bad-tolerance 10000000 --multiline-fields=true

echo Imported Data Overview ...
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n) RETURN count(*) as nodes;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH ()-->() RETURN count(*) as relationships;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n) RETURN labels(n),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),labels(m),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),labels(m),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;'
"$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c 'MATCH ()-[r]->() RETURN type(r),r.detail,count(*) ORDER BY count(*) DESC;'
# IMPORT DONE in 23s 361ms. Imported:
#   839434 nodes
#   1265690 relationships
#   8211012 properties
#
# Raw relationship-name distribution, for reference:
# cut -d, -f 2 all_edges.csv | sort | uniq -c | sort -nr
#  319121 intermediary of
#  316472 registered address
#  277380 shareholder of
#  118589 Director of
#  105408 Shareholder of
#   46761 similar name and address as
#   36318 Records & Registers of
#   15151 beneficiary of
#   14351 Secretary of
#    4031 Beneficiary of
#    3146 same name and registration date as
#    1847 Beneficial Owner of
#    1418 Trustee of Trust of
#    1234 Trust Settlor of
#    1229 Authorised Person / Signatory of
#    1198 Protector of
#    1130 Nominee Shareholder of
#     960 same address as
#     622 related entity
#     583 Assistant Secretary of
#     409 Alternate Director of
#     320 Co-Trustee of Trust of
#     281 Officer of
#     272 Resident Director of
#     207 Auditor of
#     173 Correspondent Addr. of
#     123 Bank Signatory of
#     120 General Accountant of
#     101 Nominated Person of
# --- Variant 3: Dockerized Neo4j + Cypher LOAD CSV import (requires APOC) ---

mkdir -p panama
cd panama || exit 1
# load apoc from https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/1.0.0
mkdir -p plugins
curl -L https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/1.0.0/apoc-1.0.0.jar -o plugins/apoc-1.0.0.jar
# load public panama-papers files from: https://offshoreleaks.icij.org/pages/database
if [ ! -f data-csv.zip ]; then
  curl -OL https://cloudfront-files-1.publicintegrity.org/offshoreleaks/data-csv.zip
fi
unzip -o -j data-csv.zip -d import
# Strip stray backslashes that break CSV quoting in the address file.
tr -d '\\' < import/Addresses.csv > import/Addresses_fixed.csv

# Pseudo-random host port derived from the current seconds (00123..59123).
# NOTE(review): ports below 1024 are impossible here, but collisions are —
# two runs in the same second pick the same port.
export PORT="$(date +%S)123"
echo "$PORT"
export HERE="$(pwd)"
mkdir -p "$HERE/data"
# ${HERE:?} aborts if HERE is empty/unset instead of wiping /data/*.
rm -rf "${HERE:?}/data/"*

export CONTAINER="$(docker run \
  --name neo4j-panama \
  --detach \
  --publish="$PORT:7474" \
  --volume="$HERE/data:/data" \
  --volume="$HERE/import:/var/lib/neo4j/import" \
  --volume="$HERE/plugins:/plugins" \
  --ulimit=nofile=40000:40000 \
  --env=NEO4J_dbms_memory_heap_maxSize=5000 \
  --env=NEO4J_dbms_memory_pagecache_size=500M \
  neo4j:3.0)"
docker ps -f name=neo4j-panama
# Give the server a moment to come up before piping Cypher into it.
sleep 5

# Quoted 'EOF' keeps the Cypher below literal (no shell expansion; the body
# contains none today, so behavior is unchanged).
docker exec -i "$CONTAINER" /var/lib/neo4j/bin/neo4j-shell -f << 'EOF'
match (n) detach delete n;
create constraint on (n:Node) assert n.node_id is unique;
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Addresses_fixed.csv" AS row MERGE (n:Node {node_id:row.node_id}) ON CREATE SET n = row, n:Address;
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Intermediaries.csv" AS row MERGE (n:Node {node_id:row.node_id}) ON CREATE SET n = row, n:Intermediary;
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Entities.csv" AS row MERGE (n:Node {node_id:row.node_id}) ON CREATE SET n = row, n:Entity;
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Officers.csv" AS row MERGE (n:Node {node_id:row.node_id}) ON CREATE SET n = row, n:Officer;
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row
WITH row WHERE row.rel_type = "intermediary_of"
MATCH (n1:Node) WHERE n1.node_id = row.node_1
MATCH (n2:Node) WHERE n2.node_id = row.node_2
CREATE (n1)-[:INTERMEDIARY_OF]->(n2);
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row
WITH row WHERE row.rel_type = "officer_of"
MATCH (n1:Node) WHERE n1.node_id = row.node_1
MATCH (n2:Node) WHERE n2.node_id = row.node_2
CREATE (n1)-[:OFFICER_OF]->(n2);
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row
WITH row WHERE row.rel_type = "registered_address"
MATCH (n1:Node) WHERE n1.node_id = row.node_1
MATCH (n2:Node) WHERE n2.node_id = row.node_2
CREATE (n1)-[:REGISTERED_ADDRESS]->(n2);
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row
WITH row WHERE row.rel_type = "similar"
MATCH (n1:Node) WHERE n1.node_id = row.node_1
MATCH (n2:Node) WHERE n2.node_id = row.node_2
CREATE (n1)-[:SIMILAR]->(n2);
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row
WITH row WHERE row.rel_type = "underlying"
MATCH (n1:Node) WHERE n1.node_id = row.node_1
MATCH (n2:Node) WHERE n2.node_id = row.node_2
CREATE (n1)-[:UNDERLYING]->(n2);
DROP CONSTRAINT ON (n:Node) ASSERT n.node_id IS UNIQUE;
MATCH (n) REMOVE n:Node;
CREATE INDEX ON :Officer(name);
CREATE INDEX ON :Entity(name);
CREATE INDEX ON :Entity(address);
CREATE INDEX ON :Intermediary(name);
CREATE INDEX ON :Address(address);
// stats
MATCH (n)-[r]->(m)
RETURN labels(n),type(r),labels(m),count(*)
ORDER BY count(*) DESC;
schema await
EOF

echo "Neo4j running on $PORT mounted $HERE/data and $HERE/import container: $CONTAINER"
echo "To kill run: docker rm -f $CONTAINER"
// Post-import schema and enrichment statements (run via neo4j-shell / browser).

// Name and address lookup indexes for every node label.
CREATE INDEX ON :Officer(name);
CREATE INDEX ON :Intermediary(name);
CREATE INDEX ON :Entity(name);
CREATE INDEX ON :Entity(address);
CREATE INDEX ON :Address(address);

// Entity attribute indexes used by typical filter queries.
CREATE INDEX ON :Entity(jurisdiction);
CREATE INDEX ON :Entity(incorporation_date);
CREATE INDEX ON :Entity(inactivation_date);
CREATE INDEX ON :Entity(struck_off_date);
CREATE INDEX ON :Entity(service_provider);
CREATE INDEX ON :Entity(original_name);
CREATE INDEX ON :Entity(status);

// Country-code indexes across all labels.
CREATE INDEX ON :Entity(country_codes);
CREATE INDEX ON :Address(country_codes);
CREATE INDEX ON :Intermediary(country_codes);
CREATE INDEX ON :Officer(country_codes);

// everything below is optional for fun

// Tag officers whose name looks like a corporate entity with :Company.
UNWIND [" LTD","SURVIVORSHIP"," CORP","LIMITED","INC","FOUNDATION"," S.A.","PORTADOR","TRUST","BEARER","INTERNATIONAL","COMPANY","ANSTALT","INVESTMENTS"," B.V."," AG"] AS designation
MATCH (o:Officer)
WHERE NOT o:Company AND toUpper(o.name) CONTAINS designation
SET o:Company;

// Mirror the sourceID property as a label for faster filtering.
MATCH (n) WHERE n.sourceID = "Panama Papers" and NOT n:PP
SET n:PP;
MATCH (n) WHERE n.sourceID = "Offshore Leaks" and NOT n:OSL
SET n:OSL;

// Pull country codes out into dedicated :Country nodes, in batches of 50k.
CREATE CONSTRAINT ON (c:Country) ASSERT c.code IS UNIQUE;
CALL apoc.periodic.commit("
MATCH (n) WHERE exists(n.country_codes)
WITH n limit 50000
WITH n, split(n.country_codes,';') as codes,split(n.countries,';') as countries
FOREACH (idx in range(0,size(codes)-1) |
  MERGE (country:Country {code:codes[idx]}) ON CREATE SET country.name = countries[idx]
  MERGE (n)-[:LOCATED_IN]->(country)
)
REMOVE n.country_codes, n.countries
RETURN count(*)
",{});

// Full-text (Lucene) index over the searchable name/address fields.
CALL apoc.index.addAllNodes('offshore',{
  Officer: ["name"],
  Intermediary: ["name","address"],
  Address: ["address"],
  Entity: ["name", "address", "service_provider", "former_name", "company_type"]});

// Top non-company Panama-Papers officers by OFFICER_OF out-degree.
MATCH (o:Officer:PP)
WHERE NOT o:Company
RETURN o.name,
       size( (o)-[:OFFICER_OF]->()) AS deg
ORDER BY deg DESC LIMIT 100;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment