Last active
March 17, 2021 16:14
-
-
Save amaaradji/f697839cfa58e7ee69001e4a3ba6b2ff to your computer and use it in GitHub Desktop.
Importing Panama Papers CSV files into Neo4j (version 4.2.3) The Panama papers CSV files are only a part of the "offsore leaks database"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//You can download the Panama Paper CSV files here https://offshoreleaks-data.icij.org/offshoreleaks/csv/csv_panama_papers.2018-02-14.zip | |
//Unzip and drop the CSV files in the 'import' folder of your Neo4j install folder | |
//Note that the panama_papers.nodes.address.csv and panama_papers.nodes.officer.csv contain two small errors you need to fix in order to be able to import them | |
//to fix panama_papers.nodes.address.csv go line 63038 col 89 remove the \ (escape character) | |
//to fix panama_papers.nodes.officer.csv go line 199625 col 20 remove the \ (escape character) | |
//It is required to install the APOC plug-in in Neo4j (refer to documentation here https://neo4j.com/labs/apoc/4.1/installation/) | |
//-----------------import CSV files---------------------- | |
LOAD CSV WITH HEADERS FROM "file:///panama_papers.nodes.intermediary.csv" AS row | |
CREATE (n:Intermediary) | |
SET n=row; | |
LOAD CSV WITH HEADERS FROM "file:///panama_papers.nodes.address.csv" AS row | |
CREATE (n:Address) | |
SET n=row; | |
LOAD CSV WITH HEADERS FROM "file:///panama_papers.nodes.officer.csv" AS row | |
CREATE (n:Officer) | |
SET n=row; | |
LOAD CSV WITH HEADERS FROM "file:///panama_papers.nodes.entity.csv" AS row | |
CREATE (n:Entity) | |
SET n=row; | |
LOAD CSV WITH HEADERS FROM "file:///panama_papers.edges.csv" AS row | |
CREATE (n:Edge) | |
SET n=row; | |
//--------------create parameterized relationtype from the imported panama_papers.edges.csv | |
MATCH (n1 {node_id:e.START_ID}),(e:Edge),(n2 {node_id:e.END_ID}) | |
CALL apoc.create.relationship(n1,toUpper(e.TYPE),{sourceID: e.sourceID, end_date: e.end_date, valid_until: e.valid_until, link: e.link, start_date: e.start_date},n2) YIELD rel | |
Return n1,rel,n2; | |
// you can delete the Edge nodes | |
//***************************************************************************************** | |
//-----------Query Snippets--------------------------------------------------------------- | |
//***************************************************************************************** | |
CALL db.schema.visualization(); | |
//explore (reconsider) relations | |
MATCH (n:Edge) RETURN DISTINCT n.link, count(*); | |
//create new relations | |
MATCH (n1)-[r:OFFICER_OF {link:'beneficiary, shareholder and director of'}]->(n2) | |
CREATE (n1)-[b:BENEFICIARY]->(n2) SET b=r | |
CREATE (n1)-[s:SHAREHOLDER_OF]->(n2) SET s=r | |
CREATE (n1)-[d:DIRECTOR_OF]->(n2) SET d=r | |
//explore connection with 2 hops | |
MATCH(a:Officer),(b:Officer) WHERE a.name CONTAINS 'Smith' AND b.name CONTAINS 'Grant' MATCH p=allShortestPaths((a)-[*..2]-(b)) RETURN p LIMIT 50 | |
//any VIP | |
MATCH p=(o:Officer)-->(:Address)<--(:Officer)-->(:Entity)<--(:Intermediary) | |
WHERE o.name CONTAINS 'Gunnlaugsson' | |
RETURN p LIMIT 100 | |
////********************************************************************** | |
//DUMP | |
./neo4j-admin dump --database=panama --to=/home/abdu/Dropbox/Cergy/M1-BDA+/Project-20210215/panama_edge_REL_props.dump | |
//LOAD | |
./neo4j-admin load --force --database=panama --from=/home/abdu/neo4j-community-4.2.3/import/panama_edge_REL_props.dump | |
////********************************************************************** | |
//selectig random nodes and creating new node | |
call apoc.cypher.run("MATCH (i:Intermediary) return i.node_id as id, rand() as rand ORDER BY rand ASC Limit 1",{} ) YIELD value as i1 | |
call apoc.cypher.run("MATCH (i:Intermediary) return i.node_id as id, rand() as rand ORDER BY rand ASC Limit 1",{} ) YIELD value as i2 | |
CREATE (e:Edge) | |
SET e.START_ID= i1.id, e.END_ID = i2.id, e.TYPE = "thisisnewrelation" | |
MATCH (n1 {node_id:e.START_ID}),(e:Edge),(n2 {node_id:e.END_ID}) | |
CALL apoc.create.relationship(n1,e.TYPE,{},n2) YIELD rel | |
Return n1,rel,n2 | |
MATCH (i1:Intermediary {node_id:e.START_ID}),(e:Edge),(i2:Intermediary {node_id:e.END_ID}) | |
CALL apoc.create.relationship(i1,e.TYPE,{},i2) YIELD rel | |
Return i1,rel,i2 | |
MATCH (i1:Intermediary),(e:Edge),(i2:Intermediary) | |
WHERE i1.node_id=e.START_ID and i2.node_id = e.END_ID | |
CALL apoc.create.relationship(i1,e.TYPE,{},i2) YIELD rel | |
Return i1,rel,i2 | |
//it is recommended to use community | |
//activate apoc by moving lab folder apoc jar file into plugin folder + add dbms.security.procedures.unrestricted=apoc.* to config file | |
//curate the csv files (removing the escape character \ before " or ,) | |
//import csv with headers | |
//add unique contrainst and index | |
//to explore use :play https://guides.neo4j.com/sandbox/icij-panama-papers/datashape.html | |
// and https://guides.neo4j.com/sandbox/icij-panama-papers | |
MATCH (n1 {node_id:e.START_ID}),(e:Edge),(n2 {node_id:e.END_ID}) | |
CALL apoc.create.relationship(n1,e.TYPE,{sourceID: e.sourceID, end_date: e.end_date, valid_until: e.valid_until, link: e.link, start_date: e.start_date},n2) YIELD rel | |
Return n1,rel,n2 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment