Last active
May 24, 2016 10:21
-
-
Save GregUK/118c6d06bab861c8c38507db5ca1845f to your computer and use it in GitHub Desktop.
ICIJ Panama Papers and Offshore Leaks - Loading into Neo4j
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ICIJ Offshore Leaks Database 2016 - Loading into Neo4j | |
Data is available from: https://offshoreleaks.icij.org/pages/database | |
#Most of the CSV's arent that clean. Tidy them up to replace , delimiiter with | | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/Addresses.csv' AS line | |
FIELDTERMINATOR '|' | |
WITH line | |
CREATE (:Address { id: toInt(line.node_id), address: line.address, icij_id: line.icij_id, valid_until: line.valid_until, country_codes: line.country_codes, countries: line.countries, node_id: toInt(line.node_id), sourceID: line.sourceID}) | |
RETURN COUNT(*) | |
CREATE INDEX ON :Address(node_id) | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/Intermediaries.csv' AS line | |
FIELDTERMINATOR '|' | |
WITH line | |
CREATE (:Intermediaries { id: toInt(line.node_id), name: line.name, address: line.address, internal_id: line.internal_id, valid_until: line.valid_until, country_codes: line.country_codes, countries: line.countries, node_id: toInt(line.node_id), sourceID: line.sourceID}) | |
RETURN COUNT(*) | |
CREATE INDEX ON :Intermediaries(node_id) | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/Officers.csv' AS line | |
FIELDTERMINATOR '|' | |
WITH line | |
CREATE (:Officers { id: toInt(line.node_id), name: line.name, icij_id: line.icij_id, valid_until: line.valid_until, country_codes: line.country_codes, countries: line.countries, node_id: toInt(line.node_id), sourceID: line.sourceID}) | |
RETURN COUNT(*) | |
CREATE INDEX ON :Officers(node_id) | |
USING PERIODIC COMMIT | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/Entities.csv' AS line | |
FIELDTERMINATOR '|' | |
WITH line | |
CREATE (:Entities { id: toInt(line.node_id), name: line.name, original_name: line.original_name, former_name: line.former_name, address: line.address, jurisdiction: line.jurisdiction, jurisdiction_description: line.jurisdiction_description, company_type: line.company_type, internal_id: line.internal_id, incorporation_date: line.incorporation_date, inactivation_date: line.inactivation_date, struck_off_date: line.struck_off_date, dorm_date: line.dorm_date, status: line.status, service_provider: line.service_provider, ibcRUC: line.ibcRUC, country_codes: line.country_codes, countries: line.countries, note: line.note, valid_until: line.valid_until, node_id: toInt(line.node_id), sourceID: line.sourceID}) | |
RETURN COUNT(*) | |
CREATE INDEX ON :Entities(node_id) | |
#Work on getting the links right. | |
#Check edge types | |
cat all_edges.csv | cut -d , -f 2 | sort | uniq | |
#Add header to each of the above files | |
node_1,role,node_2 | |
#Split the edges file by types | |
echo "node_1,role,node_2" > officer_edges.csv | |
grep officer all_edges.csv >> officer_edges.csv | |
echo "node_1,role,node_2" > intermediary_edges.csv | |
grep intermediary_of all_edges.csv >> intermediary_edges.csv | |
echo "node_1,role,node_2" > address_edges.csv | |
grep registered_address all_edges.csv >> address_edges.csv | |
echo "node_1,role,node_2" > similar_edges.csv | |
grep similar all_edges.csv >> similar_edges.csv | |
echo "node_1,role,node_2" > underlying_edges.csv | |
grep underlying all_edges.csv >> underlying_edges.csv | |
#Link via the all_edges.csv | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/intermediary_edges.csv' AS line | |
WITH line | |
MATCH (n:Intermediaries { node_id: toInt(line.node_1) }) | |
MATCH (m:Entities { node_id: toInt(line.node_2) }) | |
CREATE (n)-[:INTERMEDIARY_OF]->(m) | |
RETURN COUNT(*) | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/officer_edges.csv' AS line | |
WITH line | |
MATCH (n:Officers { node_id: toInt(line.node_1) }) | |
MATCH (m:Entities { node_id: toInt(line.node_2) }) | |
CREATE (n)-[:OFFICER_OF]->(m) | |
RETURN COUNT(*) | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/address_edges.csv' AS line | |
WITH line | |
MATCH (n:Entities { node_id: toInt(line.node_1) }) | |
MATCH (m:Address { node_id: toInt(line.node_2) }) | |
CREATE (m)-[:REGISTERED_ADDRESS_OF]->(n) | |
RETURN COUNT(*) | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/similar_edges.csv' AS line | |
WITH line | |
MATCH (n:Intermediaries { node_id: toInt(line.node_1) }) | |
MATCH (m:Intermediaries { node_id: toInt(line.node_2) }) | |
CREATE (m)-[:SIMILAR]->(n) | |
RETURN COUNT(*) | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/similar_edges.csv' AS line | |
WITH line | |
MATCH (n:Officers { node_id: toInt(line.node_1) }) | |
MATCH (m:Officers { node_id: toInt(line.node_2) }) | |
CREATE (m)-[:SIMILAR]->(n) | |
RETURN COUNT(*) | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/similar_edges.csv' AS line | |
WITH line | |
WHERE line.rel_type = 'similar' | |
MATCH (n:Entities { node_id: toInt(line.node_1) }) | |
MATCH (m:Entities { node_id: toInt(line.node_2) }) | |
CREATE (m)-[:SIMILAR]->(n) | |
RETURN COUNT(*) | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/similar_edges.csv' AS line | |
WITH line | |
MATCH (n:Address { node_id: toInt(line.node_1) }) | |
MATCH (m:Address { node_id: toInt(line.node_2) }) | |
CREATE (m)-[:SIMILAR]->(n) | |
RETURN COUNT(*) | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/underlying_edges.csv' AS line | |
WITH line | |
MATCH (n:Officers { node_id: toInt(line.node_1) }) | |
MATCH (m:Officers { node_id: toInt(line.node_2) }) | |
CREATE (n)-[:UNDERLYING]->(m) | |
RETURN COUNT(*) | |
USING PERIODIC COMMIT 500 | |
LOAD CSV WITH HEADERS FROM 'file:///offshore_leaks_csvs/underlying_edges.csv' AS line | |
WITH line | |
MATCH (n:Entities { node_id: toInt(line.node_1) }) | |
MATCH (m:Entities { node_id: toInt(line.node_2) }) | |
CREATE (n)-[:UNDERLYING]->(m) | |
RETURN COUNT(*) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment