Last active
May 10, 2021 20:29
-
-
Save jexp/4a9dfd20cff6e270ea92dd3397ffb4ba to your computer and use it in GitHub Desktop.
Panama Papers Import Scripts for Neo4j
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
export NEO4J_HOME=${NEO4J_HOME-~/Downloads/neo4j-community-3.0.1} | |
if [ ! -f data-csv.zip ]; then | |
curl -OL https://cloudfront-files-1.publicintegrity.org/offshoreleaks/data-csv.zip | |
fi | |
export DATA=${PWD}/import | |
rm -rf $DATA | |
unzip -o -j data-csv.zip -d $DATA | |
wc -l $DATA/*.csv | |
tr -d '\\' < $DATA/Addresses.csv > $DATA/Addresses_fixed.csv | |
for i in $DATA/[AIEO]*.csv; do echo $i; sed -i '' -e '1,1 s/node_id/node_id:ID/' $i; done | |
sed -i '' -e '1 d' $DATA/all_edges.csv | |
tr '[:lower:]' '[:upper:]' < $DATA/all_edges.csv | sed -e 's/[^A-Z0-9,_ ]//g' -e 's/ */_/g' -e 's/,_/_/g' > $DATA/all_edges_cleaned.csv | |
echo 'node_id:START_ID,rel_type:TYPE,node_id:END_ID' > $DATA/all_edges_header.csv | |
rm -rf $DATA/panama.db | |
head -1 $DATA/*.csv | |
$NEO4J_HOME/bin/neo4j-import --into $DATA/panama.db --nodes:Address $DATA/Addresses_fixed.csv --nodes:Entity $DATA/Entities.csv --nodes:Intermediary $DATA/Intermediaries.csv --nodes:Officer $DATA/Officers.csv \ | |
--relationships $DATA/all_edges_header.csv,$DATA/all_edges_cleaned.csv --ignore-empty-strings true --skip-duplicate-nodes true --skip-bad-relationships true --bad-tolerance 1000000 --multiline-fields=true | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n) RETURN count(*) as nodes;' | |
x$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n) RETURN labels(n),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n) RETURN count(*) as nodes;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH ()-[r]->() RETURN type(r),r.detail,count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),labels(m),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),labels(m),count(*) ORDER BY count(*) DESC;' | |
# IMPORT DONE in 23s 391ms. Imported: | |
# 839434 nodes | |
# 1269796 relationships | |
# 8211010 properties |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
export NEO4J_HOME=${NEO4J_HOME-~/Downloads/neo4j-community-3.0.1} | |
if [ ! -f data-csv.zip ]; then | |
echo Downloading ... | |
curl -OL https://cloudfront-files-1.publicintegrity.org/offshoreleaks/data-csv.zip | |
fi | |
# Need full path due to BUG in import tool with relative paths | |
export DATA=${PWD}/import | |
echo Extracting, Preparing, Cleaning up data ... | |
unzip -o -j data-csv.zip -d $DATA | |
tr -d '\\' < $DATA/Addresses.csv > $DATA/Addresses_fixed.csv | |
sed -i '' -e '1,1 s/node_id/node_id:ID(Address)/' $DATA/Addresses_fixed.csv | |
sed -i '' -e '1,1 s/node_id/node_id:ID(Officer)/' $DATA/Officers.csv | |
sed -i '' -e '1,1 s/node_id/node_id:ID(Entity)/' $DATA/Entities.csv | |
sed -i '' -e '1,1 s/node_id/node_id:ID(Intermediary)/' $DATA/Intermediaries.csv | |
sed -i '' -e '1 d' $DATA/all_edges.csv | |
for i in Entity Officer Intermediary; do | |
echo "node_id:START_ID($i),detail:IGNORE,node_id:END_ID(Address)" > $DATA/registered_address_$i.csv | |
done | |
grep ',registered address,' $DATA/all_edges.csv > $DATA/registered_address.csv | |
for i in Officer Intermediary; do | |
for j in Officer Intermediary; do | |
echo "node_id:START_ID(${i}),detail:IGNORE,node_id:END_ID(${j})" > $DATA/similar_${i}_${j}.csv | |
done | |
done | |
grep ',similar name and address as,' $DATA/all_edges.csv > $DATA/similar.csv | |
echo 'node_id:START_ID(Entity),detail,node_id:END_ID(Entity)' > $DATA/related.csv | |
grep ',\(related entity\|same name and registration date as\),' $DATA/all_edges.csv >> $DATA/related.csv | |
for i in Entity Intermediary; do | |
echo "node_id:START_ID(Officer),detail,node_id:END_ID($i)" > $DATA/officer_of_$i.csv | |
done | |
tr '[:upper:]' '[:lower:]' < $DATA/all_edges.csv | grep -v ',\(intermediary of\|registered address\|similar name and address as\|same name and registration date as\|same address as\|related entity\),' > $DATA/officer_of.csv | |
for i in Entity; do | |
echo "node_id:START_ID(Intermediary),detail,node_id:END_ID($i)" > $DATA/intermediary_of_$i.csv | |
done | |
sed -e 's/,intermediary of,/,,/' < $DATA/all_edges.csv > $DATA/intermediary_of.csv | |
echo CSV Overview ... | |
head -1 $DATA/*.csv | |
echo Importing ... | |
rm -rf $DATA/panama.db; $NEO4J_HOME/bin/neo4j-import --into $DATA/panama.db --nodes:Address $DATA/Addresses_fixed.csv --nodes:Entity $DATA/Entities.csv --nodes:Intermediary $DATA/Intermediaries.csv --nodes:Officer $DATA/Officers.csv \ | |
--relationships:REGISTERED_ADDRESS $DATA/registered_address_Officer.csv,$DATA/registered_address.csv \ | |
--relationships:REGISTERED_ADDRESS $DATA/registered_address_Entity.csv,$DATA/registered_address.csv \ | |
--relationships:REGISTERED_ADDRESS $DATA/registered_address_Intermediary.csv,$DATA/registered_address.csv \ | |
--relationships:RELATED $DATA/related.csv \ | |
--relationships:OFFICER_OF $DATA/officer_of_Entity.csv,$DATA/officer_of.csv \ | |
--relationships:OFFICER_OF $DATA/officer_of_Intermediary.csv,$DATA/officer_of.csv \ | |
--relationships:INTERMEDIARY_OF $DATA/intermediary_of_Entity.csv,$DATA/intermediary_of.csv \ | |
--relationships:SIMILAR $DATA/similar_Officer_Officer.csv,$DATA/similar.csv \ | |
--relationships:SIMILAR $DATA/similar_Officer_Intermediary.csv,$DATA/similar.csv \ | |
--relationships:SIMILAR $DATA/similar_Intermediary_Officer.csv,$DATA/similar.csv \ | |
--relationships:SIMILAR $DATA/similar_Intermediary_Intermediary.csv,$DATA/similar.csv \ | |
--ignore-empty-strings true --skip-duplicate-nodes true --skip-bad-relationships true --bad-tolerance 10000000 --multiline-fields=true | |
echo Imported Data Overview ... | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n) RETURN count(*) as nodes;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH ()-->() RETURN count(*) as relationships;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n) RETURN labels(n),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),labels(m),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),labels(m),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;' | |
$NEO4J_HOME/bin/neo4j-shell -path $DATA/panama.db -c 'MATCH ()-[r]->() RETURN type(r),r.detail,count(*) ORDER BY count(*) DESC;' | |
# IMPORT DONE in 23s 361ms. Imported: | |
# 839434 nodes | |
# 1265690 relationships | |
# 8211012 properties | |
# cut -d, -f 2 all_edges.csv | sort | uniq -c | sort -nr | |
# 319121 intermediary of | |
# 316472 registered address | |
# 277380 shareholder of | |
# 118589 Director of | |
# 105408 Shareholder of | |
# 46761 similar name and address as | |
# 36318 Records & Registers of | |
# 15151 beneficiary of | |
# 14351 Secretary of | |
# 4031 Beneficiary of | |
# 3146 same name and registration date as | |
# 1847 Beneficial Owner of | |
# 1418 Trustee of Trust of | |
# 1234 Trust Settlor of | |
# 1229 Authorised Person / Signatory of | |
# 1198 Protector of | |
# 1130 Nominee Shareholder of | |
# 960 same address as | |
# 622 related entity | |
# 583 Assistant Secretary of | |
# 409 Alternate Director of | |
# 320 Co-Trustee of Trust of | |
# 281 Officer of | |
# 272 Resident Director of | |
# 207 Auditor of | |
# 173 Correspondent Addr. of | |
# 123 Bank Signatory of | |
# 120 General Accountant of | |
# 101 Nominated Person of |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mkdir -p panama | |
cd panama | |
# load apoc from https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/1.0.0 | |
mkdir -p plugins | |
curl -L https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/1.0.0/apoc-1.0.0.jar -o plugins/apoc-1.0.0.jar | |
# load public panama-papers files from: https://offshoreleaks.icij.org/pages/database | |
if [ ! -f data-csv.zip ]; then curl -OL https://cloudfront-files-1.publicintegrity.org/offshoreleaks/data-csv.zip; fi | |
unzip -o -j data-csv.zip -d import | |
tr -d '\\' < import/Addresses.csv > import/Addresses_fixed.csv | |
export PORT=`date +%S`123 | |
echo $PORT | |
export HERE=`pwd` | |
mkdir -p $HERE/data | |
rm -rf $HERE/data/* | |
export CONTAINER=`docker run \ | |
--name neo4j-panama \ | |
--detach \ | |
--publish=$PORT:7474 \ | |
--volume=$HERE/data:/data \ | |
--volume=$HERE/import:/var/lib/neo4j/import \ | |
--volume=$HERE/plugins:/plugins \ | |
--ulimit=nofile=40000:40000 \ | |
--env=NEO4J_dbms_memory_heap_maxSize=5000 \ | |
--env=NEO4J_dbms_memory_pagecache_size=500M \ | |
neo4j:3.0` | |
docker ps -f name=neo4j-panama | |
sleep 5 | |
docker exec -i $CONTAINER /var/lib/neo4j/bin/neo4j-shell -f << EOF | |
match (n) detach delete n; | |
create constraint on (n:Node) assert n.node_id is unique; | |
USING PERIODIC COMMIT 10000 | |
LOAD CSV WITH HEADERS FROM "file:///Addresses_fixed.csv" AS row MERGE (n:Node {node_id:row.node_id}) ON CREATE SET n = row, n:Address; | |
USING PERIODIC COMMIT 10000 | |
LOAD CSV WITH HEADERS FROM "file:///Intermediaries.csv" AS row MERGE (n:Node {node_id:row.node_id}) ON CREATE SET n = row, n:Intermediary; | |
USING PERIODIC COMMIT 10000 | |
LOAD CSV WITH HEADERS FROM "file:///Entities.csv" AS row MERGE (n:Node {node_id:row.node_id}) ON CREATE SET n = row, n:Entity; | |
USING PERIODIC COMMIT 10000 | |
LOAD CSV WITH HEADERS FROM "file:///Officers.csv" AS row MERGE (n:Node {node_id:row.node_id}) ON CREATE SET n = row, n:Officer; | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row | |
WITH row WHERE row.rel_type = "intermediary_of" | |
MATCH (n1:Node) WHERE n1.node_id = row.node_1 | |
MATCH (n2:Node) WHERE n2.node_id = row.node_2 | |
CREATE (n1)-[:INTERMEDIARY_OF]->(n2); | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row | |
WITH row WHERE row.rel_type = "officer_of" | |
MATCH (n1:Node) WHERE n1.node_id = row.node_1 | |
MATCH (n2:Node) WHERE n2.node_id = row.node_2 | |
CREATE (n1)-[:OFFICER_OF]->(n2); | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row | |
WITH row WHERE row.rel_type = "registered_address" | |
MATCH (n1:Node) WHERE n1.node_id = row.node_1 | |
MATCH (n2:Node) WHERE n2.node_id = row.node_2 | |
CREATE (n1)-[:REGISTERED_ADDRESS]->(n2); | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row | |
WITH row WHERE row.rel_type = "similar" | |
MATCH (n1:Node) WHERE n1.node_id = row.node_1 | |
MATCH (n2:Node) WHERE n2.node_id = row.node_2 | |
CREATE (n1)-[:SIMILAR]->(n2); | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM "file:///all_edges.csv" AS row | |
WITH row WHERE row.rel_type = "underlying" | |
MATCH (n1:Node) WHERE n1.node_id = row.node_1 | |
MATCH (n2:Node) WHERE n2.node_id = row.node_2 | |
CREATE (n1)-[:UNDERLYING]->(n2); | |
DROP CONSTRAINT ON (n:Node) ASSERT n.node_id IS UNIQUE; | |
MATCH (n) REMOVE n:Node; | |
CREATE INDEX ON :Officer(name); | |
CREATE INDEX ON :Entity(name); | |
CREATE INDEX ON :Entity(address); | |
CREATE INDEX ON :Intermediary(name); | |
CREATE INDEX ON :Address(address); | |
// stats | |
MATCH (n)-[r]->(m) | |
RETURN labels(n),type(r),labels(m),count(*) | |
ORDER BY count(*) DESC; | |
schema await | |
EOF | |
echo "Neo4j running on $PORT mounted $HERE/data and $HERE/import container: $CONTAINER" | |
echo "To kill run: docker rm -f $CONTAINER" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE INDEX ON :Intermediary(name); | |
CREATE INDEX ON :Address(address); | |
CREATE INDEX ON :Officer(name); | |
CREATE INDEX ON :Entity(name); | |
CREATE INDEX ON :Entity(address); | |
CREATE INDEX ON :Entity(jurisdiction); | |
CREATE INDEX ON :Entity(incorporation_date); | |
CREATE INDEX ON :Entity(inactivation_date); | |
CREATE INDEX ON :Entity(struck_off_date); | |
CREATE INDEX ON :Entity(service_provider); | |
CREATE INDEX ON :Entity(original_name); | |
CREATE INDEX ON :Entity(status); | |
CREATE INDEX ON :Entity(country_codes); | |
CREATE INDEX ON :Address(country_codes); | |
CREATE INDEX ON :Intermediary(country_codes); | |
CREATE INDEX ON :Officer(country_codes); | |
// everything below is optional for fun | |
// mark officers as companies | |
unwind [" LTD","SURVIVORSHIP"," CORP","LIMITED","INC","FOUNDATION"," S.A.","PORTADOR","TRUST","BEARER","INTERNATIONAL","COMPANY","ANSTALT","INVESTMENTS"," B.V."," AG"] as designation | |
match (o:Officer) | |
WHERE NOT o:Company AND toUpper(o.name) CONTAINS designation | |
SET o:Company; | |
// set sources as label for faster filtering | |
MATCH (n) WHERE n.sourceID = "Panama Papers" and NOT n:PP | |
SET n:PP; | |
MATCH (n) WHERE n.sourceID = "Offshore Leaks" and NOT n:OSL | |
SET n:OSL; | |
// extract country nodes | |
CREATE CONSTRAINT ON (c:Country) ASSERT c.code IS UNIQUE; | |
CALL apoc.periodic.commit(" | |
MATCH (n) WHERE exists(n.country_codes) | |
WITH n limit 50000 | |
WITH n, split(n.country_codes,';') as codes,split(n.countries,';') as countries | |
FOREACH (idx in range(0,size(codes)-1) | | |
MERGE (country:Country {code:codes[idx]}) ON CREATE SET country.name = countries[idx] | |
MERGE (n)-[:LOCATED_IN]->(country) | |
) | |
REMOVE n.country_codes, n.countries | |
RETURN count(*) | |
",{}); | |
// create a full-text index | |
CALL apoc.index.addAllNodes('offshore',{ | |
Officer: ["name"], | |
Intermediary: ["name","address"], | |
Address: ["address"], | |
Entity: ["name", "address", "service_provider", "former_name", "company_type"]}); | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// top-20 officers | |
match (o:Officer:PP) | |
WHERE NOT o:Company | |
return o.name, | |
size( (o)-[:OFFICER_OF]->()) as deg | |
order by deg desc limit 100; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment