Skip to content

Instantly share code, notes, and snippets.

@GregUK
Last active May 24, 2016 10:21
Show Gist options
  • Save GregUK/118c6d06bab861c8c38507db5ca1845f to your computer and use it in GitHub Desktop.
Save GregUK/118c6d06bab861c8c38507db5ca1845f to your computer and use it in GitHub Desktop.
ICIJ Panama Papers and Offshore Leaks - Loading into Neo4j
ICIJ Offshore Leaks Database 2016 - Loading into Neo4j
Data is available from: https://offshoreleaks.icij.org/pages/database
#Most of the CSVs aren't that clean. Tidy them up first: replace the , delimiter with |
// Load :Address nodes from the pipe-delimited Addresses.csv.
// PERIODIC COMMIT flushes the import in batches (default 1000 rows) so the
// whole file does not have to fit in a single transaction.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/Addresses.csv' AS line
FIELDTERMINATOR '|'
CREATE (:Address {
    // id duplicates node_id; both are kept for backward compatibility with
    // queries that reference either property.
    id: toInt(line.node_id),
    address: line.address,
    icij_id: line.icij_id,
    valid_until: line.valid_until,
    country_codes: line.country_codes,
    countries: line.countries,
    node_id: toInt(line.node_id),
    sourceID: line.sourceID
})
RETURN COUNT(*);

// Index node_id so the relationship-loading MATCHes below run fast.
CREATE INDEX ON :Address(node_id);
// Load :Intermediaries nodes from the pipe-delimited Intermediaries.csv
// in batched transactions.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/Intermediaries.csv' AS line
FIELDTERMINATOR '|'
CREATE (:Intermediaries {
    // id duplicates node_id; both are kept for backward compatibility.
    id: toInt(line.node_id),
    name: line.name,
    address: line.address,
    internal_id: line.internal_id,
    valid_until: line.valid_until,
    country_codes: line.country_codes,
    countries: line.countries,
    node_id: toInt(line.node_id),
    sourceID: line.sourceID
})
RETURN COUNT(*);

// Index node_id so the relationship-loading MATCHes below run fast.
CREATE INDEX ON :Intermediaries(node_id);
// Load :Officers nodes from the pipe-delimited Officers.csv in batched
// transactions.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/Officers.csv' AS line
FIELDTERMINATOR '|'
CREATE (:Officers {
    // id duplicates node_id; both are kept for backward compatibility.
    id: toInt(line.node_id),
    name: line.name,
    icij_id: line.icij_id,
    valid_until: line.valid_until,
    country_codes: line.country_codes,
    countries: line.countries,
    node_id: toInt(line.node_id),
    sourceID: line.sourceID
})
RETURN COUNT(*);

// Index node_id so the relationship-loading MATCHes below run fast.
CREATE INDEX ON :Officers(node_id);
// Load :Entities (offshore companies/trusts) from the pipe-delimited
// Entities.csv in batched transactions. This is the widest of the four node
// files; all columns are kept as string properties except the id fields.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/Entities.csv' AS line
FIELDTERMINATOR '|'
CREATE (:Entities {
    // id duplicates node_id; both are kept for backward compatibility.
    id: toInt(line.node_id),
    name: line.name,
    original_name: line.original_name,
    former_name: line.former_name,
    address: line.address,
    jurisdiction: line.jurisdiction,
    jurisdiction_description: line.jurisdiction_description,
    company_type: line.company_type,
    internal_id: line.internal_id,
    incorporation_date: line.incorporation_date,
    inactivation_date: line.inactivation_date,
    struck_off_date: line.struck_off_date,
    dorm_date: line.dorm_date,
    status: line.status,
    service_provider: line.service_provider,
    ibcRUC: line.ibcRUC,
    country_codes: line.country_codes,
    countries: line.countries,
    note: line.note,
    valid_until: line.valid_until,
    node_id: toInt(line.node_id),
    sourceID: line.sourceID
})
RETURN COUNT(*);

// Index node_id so the relationship-loading MATCHes below run fast.
CREATE INDEX ON :Entities(node_id);
#Work on getting the links right.
#List the distinct edge types in the raw dump (the role is the 2nd CSV field).
cut -d , -f 2 all_edges.csv | sort -u

#Split all_edges.csv by edge type, prepending the header row that the
#Cypher LOAD CSV statements below expect: node_1,role,node_2
#Matching on the role field (not the whole line) avoids false hits from
#the node-id columns.
echo "node_1,role,node_2" > officer_edges.csv
awk -F, '$2 ~ /officer/' all_edges.csv >> officer_edges.csv

echo "node_1,role,node_2" > intermediary_edges.csv
awk -F, '$2 ~ /intermediary_of/' all_edges.csv >> intermediary_edges.csv

echo "node_1,role,node_2" > address_edges.csv
awk -F, '$2 ~ /registered_address/' all_edges.csv >> address_edges.csv

echo "node_1,role,node_2" > similar_edges.csv
awk -F, '$2 ~ /similar/' all_edges.csv >> similar_edges.csv

echo "node_1,role,node_2" > underlying_edges.csv
awk -F, '$2 ~ /underlying/' all_edges.csv >> underlying_edges.csv
// Link nodes using the split edge files derived from all_edges.csv.
// Create :INTERMEDIARY_OF relationships from intermediaries to the entities
// they act for. Relies on the :Intermediaries/:Entities node_id indexes.
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/intermediary_edges.csv' AS line
MATCH (n:Intermediaries { node_id: toInt(line.node_1) })
MATCH (m:Entities { node_id: toInt(line.node_2) })
CREATE (n)-[:INTERMEDIARY_OF]->(m)
RETURN COUNT(*);
// Create :OFFICER_OF relationships from officers to the entities they serve.
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/officer_edges.csv' AS line
MATCH (n:Officers { node_id: toInt(line.node_1) })
MATCH (m:Entities { node_id: toInt(line.node_2) })
CREATE (n)-[:OFFICER_OF]->(m)
RETURN COUNT(*);
// Create :REGISTERED_ADDRESS_OF relationships. Note the direction: the
// edge file lists entity,role,address, but the relationship points from
// the :Address node to the :Entities node it is registered for.
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/address_edges.csv' AS line
MATCH (n:Entities { node_id: toInt(line.node_1) })
MATCH (m:Address { node_id: toInt(line.node_2) })
CREATE (m)-[:REGISTERED_ADDRESS_OF]->(n)
RETURN COUNT(*);
// similar_edges.csv mixes node types; the same file is replayed once per
// label pair — rows whose endpoints are not both of the MATCHed label simply
// produce no relationship. This pass links similar :Intermediaries pairs.
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/similar_edges.csv' AS line
MATCH (n:Intermediaries { node_id: toInt(line.node_1) })
MATCH (m:Intermediaries { node_id: toInt(line.node_2) })
CREATE (m)-[:SIMILAR]->(n)
RETURN COUNT(*);
// Link similar :Officers pairs from the shared similar_edges.csv file.
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/similar_edges.csv' AS line
MATCH (n:Officers { node_id: toInt(line.node_1) })
MATCH (m:Officers { node_id: toInt(line.node_2) })
CREATE (m)-[:SIMILAR]->(n)
RETURN COUNT(*);
// Link similar :Entities pairs from the shared similar_edges.csv file.
// BUG FIX: the original filtered on line.rel_type, a column that does not
// exist in the split file (header is node_1,role,node_2), so the predicate
// was always null and no Entity SIMILAR relationships were ever created.
// The file is already restricted to 'similar' rows by the grep/awk split,
// so no extra filter is needed (matching the other SIMILAR passes).
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/similar_edges.csv' AS line
MATCH (n:Entities { node_id: toInt(line.node_1) })
MATCH (m:Entities { node_id: toInt(line.node_2) })
CREATE (m)-[:SIMILAR]->(n)
RETURN COUNT(*);
// Link similar :Address pairs from the shared similar_edges.csv file.
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/similar_edges.csv' AS line
MATCH (n:Address { node_id: toInt(line.node_1) })
MATCH (m:Address { node_id: toInt(line.node_2) })
CREATE (m)-[:SIMILAR]->(n)
RETURN COUNT(*);
// underlying_edges.csv also mixes node types; replay it once per label pair.
// This pass links :Officers underlying other :Officers.
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/underlying_edges.csv' AS line
MATCH (n:Officers { node_id: toInt(line.node_1) })
MATCH (m:Officers { node_id: toInt(line.node_2) })
CREATE (n)-[:UNDERLYING]->(m)
RETURN COUNT(*);
// Link :Entities underlying other :Entities from underlying_edges.csv.
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/underlying_edges.csv' AS line
MATCH (n:Entities { node_id: toInt(line.node_1) })
MATCH (m:Entities { node_id: toInt(line.node_2) })
CREATE (n)-[:UNDERLYING]->(m)
RETURN COUNT(*);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment