Skip to content

Instantly share code, notes, and snippets.

@jexp
Forked from rvanbruggen/1-loadcsv
Created October 8, 2015 10:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jexp/d48a6dc4ccf36eb1aadb to your computer and use it in GitHub Desktop.
Save jexp/d48a6dc4ccf36eb1aadb to your computer and use it in GitHub Desktop.
Importing the Belgian Corporate Registry
//load the codes
create index on :CodeCategory(name);
using periodic commit 1000
load csv with headers from
"file:/<path>/sourcecsv/code.csv" as csv
with distinct csv.Category as Category
merge (:CodeCategory {name: Category});
create index on :Code(name);
using periodic commit
load csv with headers from
"file:/<path>/sourcecsv/code.csv" as csv
with distinct csv.Code as Code
merge (c:Code {name: Code});
using periodic commit
load csv with headers from
"file:/<path>/sourcecsv/code.csv" as csv
with distinct csv.Category as Category, csv.Code as Code
match (cc:CodeCategory {name: Category}), (c:Code {name: Code})
merge (cc)<-[:PART_OF]-(c);
create index on :CodeMeaning(description);
using periodic commit
load csv with headers from
"file:/<path>/sourcecsv/code.csv" as csv
merge (cm:CodeMeaning {language: csv.Language, description: csv.Description});
using periodic commit
load csv with headers from
"file:/<path>/sourcecsv/code.csv" as csv
match (cc:CodeCategory {name: csv.Category})<-[:PART_OF]-(c:Code {name: csv.Code}), (cm:CodeMeaning {language: csv.Language, description: csv.Description})
merge (c)<-[:MEANS]-(cm);
//load the enterprises
create constraint on (e:Enterprise)
assert e.EnterpriseNumber is unique;
using periodic commit 5000
load csv with headers from
"file:/<path>/sourcecsv/enterprise.csv" as csv
create (e:Enterprise {EnterpriseNumber: csv.EnterpriseNumber, Status: csv.Status, JuridicalSituation: csv.JuridicalSituation, TypeOfEnterprise: csv.TypeOfEnterprise, JuridicalForm: csv.JuridicalForm, StartDate: toInt(substring(csv.StartDate,0,2))+toInt(substring(csv.StartDate,3,2))*100 + toInt(substring(csv.StartDate,6,4))*10000});
//load the establishments
create constraint on (eb:Establishment)
assert eb.EstablishmentNumber is unique;
using periodic commit
load csv with headers from
"file:/<path>/sourcecsv/establishment.csv" as csv
create (es:Establishment {EstablishmentNumber: csv.EstablishmentNumber, StartDate: toInt(substring(csv.StartDate,0,2))+toInt(substring(csv.StartDate,3,2))*100+toInt(substring(csv.StartDate,6,4))*10000});
using periodic commit
load csv with headers from
"file:/<path>/sourcecsv/establishment.csv" as csv
match (e:Enterprise {EnterpriseNumber: csv.EnterpriseNumber}), (es:Establishment {EstablishmentNumber: csv.EstablishmentNumber})
create (es)-[:PART_OF]->(e);
//add contact information to enterprises and establishments
//add contact information to enterprises
using periodic commit
load csv with headers from
"file:/<path>/sourcecsv/contact.csv" as csv
match (e:Enterprise {EnterpriseNumber: csv.EntityNumber})
create (e)-[:HAS_CONTACT_INFO]->(ci:ContactInfo {type: csv.ContactType, value: csv.Value});
//add contact information to establishments
using periodic commit
load csv with headers from
"file:/<path>/sourcecsv/contact.csv" as csv
match (es:Establishment {EstablishmentNumber: csv.EntityNumber})
create (es)-[:HAS_CONTACT_INFO]->(ci:ContactInfo {type: csv.ContactType, value: csv.Value});
create constraint on (c:City)
assert c.name is unique;
create constraint on (z:Zip)
assert z.name is unique;
create index on :Street(name);
using periodic commit 100000
load csv with headers from
"file:/<path>/sourcecsv/address.csv" as csv
with distinct toUpper(csv.MunicipalityNL) as MunicipalityNL
merge (city:City {name: MunicipalityNL});
using periodic commit 100000
load csv with headers from
"file:/<path>/sourcecsv/address.csv" as csv
with distinct toUpper(csv.Zipcode) as Zipcode
merge (zip:Zip {name: Zipcode});
// connect the zips to the cities
using periodic commit 100000
load csv with headers from
"file:/<path>/sourcecsv/address.csv" as csv
with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.MunicipalityNL) as MunicipalityNL
match (city:City {name: MunicipalityNL}), (zip:Zip {name: Zipcode})
create unique (city)-[:HAS_ZIP_CODE]->(zip);
using periodic commit 50000
load csv with headers from
"file:/<path>/sourcecsv/address.csv" as csv
with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL
match (zip:Zip {name: Zipcode})
create unique (zip)<-[:PART_OF]-(str:Street {name: StreetNL, zip:Zipcode});
//add the houseNumbers
create index on :HouseNumber(houseNumber);
using periodic commit 1000
load csv with headers from
"file:/<path>/sourcecsv/address.csv" as csv
with toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL, toUpper(csv.HouseNumber) as HouseNumber
match (street:Street {name: StreetNL, zip:Zipcode})
using index street:Street(name)
CREATE UNIQUE (:HouseNumber {houseNumber: HouseNumber})-[:PART_OF]->(street);
//connect the enterprises to the addresses
// DOES NOT WORK - solved with PYTHON script
// using periodic commit 10000
// load csv with headers from
// "file:/<path>/sourcecsv/address.csv" as csv
// with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL, toUpper(csv.HouseNumber) as HouseNumber, csv.EntityNumber as EntityNumber
// match (e:Enterprise {EnterpriseNumber: EntityNumber}),
// (street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber})
// create (e)-[:HAS_ADDRESS]->(h);
//load the activities of Enterprises
using periodic commit 25000
load csv with headers from
"file:/<path>/sourcecsv/activity.csv" as csv
with csv limit 50000
match (e:Enterprise {EnterpriseNumber: csv.EntityNumber}),
(cc2:CodeCategory {name: "ActivityGroup"})--(c2:Code {name: csv.ActivityGroup})
merge (e)-[:HAS_ACTIVITY_GROUP {type: csv.Classification}]->(c2);
using periodic commit 25000
load csv with headers from
"file:/<path>/sourcecsv/activity.csv" as csv
with csv limit 1000
match (e:Enterprise {EnterpriseNumber: csv.EntityNumber}),
(cc1:CodeCategory {name: "Nace"+csv.NaceVersion})<-[:PART_OF]-(c1:Code {name: csv.NaceCode})
merge (e)-[hnc:HAS_NACE_CODE {type: csv.Classification}]->(c1);
//load the activities of Establishments
using periodic commit 25000
load csv with headers from
"file:/<path>/sourcecsv/activity.csv" as csv
match (e:Establishment {EstablishmentNumber: csv.EntityNumber}),
(cc2:CodeCategory {name: "ActivityGroup"})--(c2:Code {name: csv.ActivityGroup})
merge (e)-[:HAS_ACTIVITY_GROUP {type: csv.Classification}]->(c2);
using periodic commit 25000
load csv with headers from
"file:/<path>/sourcecsv/activity.csv" as csv
match (e:Establishment {EstablishmentNumber: csv.EntityNumber}),
(cc1:CodeCategory {name: "Nace"+csv.NaceVersion})--(c1:Code {name: csv.NaceCode})
merge (e)-[hnc:HAS_NACE_CODE]->(c1)
on create set hnc.type=csv.Classification
on match set hnc.type=csv.Classification;
import datetime
from py2neo import Graph
from py2neo.packages.httpstream import http
http.socket_timeout = 9999
graph = Graph()
print "Starting to process links between Enterprises and Addresses..."
for filenr in range(0,113):
tx1 = graph.cypher.begin()
statement1 = """
load csv with headers from
"file:/<path>/splitcsv/splitaddress/splitaddress_"""+str(filenr)+"""" as csv
with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL, toUpper(csv.HouseNumber) as HouseNumber, csv.EntityNumber as EntityNumber
match (e:Enterprise {EnterpriseNumber: EntityNumber}),
(street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber})
create (e)-[:HAS_ADDRESS]->(h);
"""
tx1.append(statement1)
tx1.process()
tx1.commit()
print "Enterprise Filenr: "+str(filenr)+" processed, at "+str(datetime.datetime.now())
print "Starting to process links between Establishments and Addresses..."
for filenr in range(0,113):
tx2 = graph.cypher.begin()
statement2 = """
load csv with headers from
"file:/<path>/splitcsv/splitaddress/splitaddress_"""+str(filenr)+"""" as csv
with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL, toUpper(csv.HouseNumber) as HouseNumber, csv.EntityNumber as EntityNumber
match (e:Establishment {EstablishmentNumber: EntityNumber}),
(street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber})
create (e)-[:HAS_ADDRESS]->(h);
"""
tx2.append(statement2)
tx2.process()
tx2.commit()
print "Establishment Filenr: "+str(filenr)+" processed, at "+str(datetime.datetime.now())
//THIS IS PART 1
//load the codes
create index on :CodeCategory(name);
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q with distinct {Category} as Category merge (:CodeCategory {name: Category});
create index on :Code(name);
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q with distinct {Code} as Code merge (c:Code {name: Code});
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q with distinct {Category} as Category, {Code} as Code match (cc:CodeCategory {name: Category}), (c:Code {name: Code}) merge (cc)<-[:PART_OF]-(c);
create index on :CodeMeaning(description);
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q merge (cm:CodeMeaning {language: {Language}, description: {Description}});
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q match (cc:CodeCategory {name: {Category}})<-[:PART_OF]-(c:Code {name: {Code}}), (cm:CodeMeaning {language: {Language}, description: {Description}}) merge (c)<-[:MEANS]-(cm);
//load the enterprises
create constraint on (e:Enterprise)
assert e.EnterpriseNumber is unique;
import-cypher -i /<path>/sourcecsv/enterprise.csv -b 10000 -d , -q create (e:Enterprise {EnterpriseNumber: {EnterpriseNumber}, Status: {Status}, JuridicalSituation: {JuridicalSituation}, TypeOfEnterprise: {TypeOfEnterprise}, JuridicalForm: {JuridicalForm}, StartDate: toInt(substring({StartDate},0,2))+toInt(substring({StartDate},3,2))*100 + toInt(substring({StartDate},6,4))*10000});
//load the establishments
create constraint on (eb:Establishment)
assert eb.EstablishmentNumber is unique;
import-cypher -i /<path>/sourcecsv/establishment.csv -b 10000 -d , -q create (es:Establishment {EstablishmentNumber: {EstablishmentNumber}, StartDate: toInt(substring({StartDate},0,2))+toInt(substring({StartDate},3,2))*100+toInt(substring({StartDate},6,4))*10000});
import-cypher -i /<path>/sourcecsv/establishment.csv -b 10000 -d , -q match (e:Enterprise {EnterpriseNumber: {EnterpriseNumber}}), (es:Establishment {EstablishmentNumber: {EstablishmentNumber}}) create (es)-[:PART_OF]->(e);
//add contact information to enterprises and establishments
//add contact information to enterprises
import-cypher -i /<path>/sourcecsv/contact.csv -b 10000 -d , -q match (e:Enterprise {EnterpriseNumber: {EntityNumber}}) create (e)-[:HAS_CONTACT_INFO]->(ci:ContactInfo {type: {ContactType}, value: {Value}});
//add contact information to establishments
import-cypher -i /<path>/sourcecsv/contact.csv -b 10000 -d , -q match (es:Establishment {EstablishmentNumber: {EntityNumber}}) create (es)-[:HAS_CONTACT_INFO]->(ci:ContactInfo {type: {ContactType}, value: {Value}});
//execution time: 7min 16sec
//3180356 nodes
//1602575 rels
//THIS IS PART 2
return timestamp();
create constraint on (c:City)
assert c.name is unique;
create constraint on (z:Zip)
assert z.name is unique;
create index on :Street(name);
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({MunicipalityNL}) as MunicipalityNL merge (city:City {name: MunicipalityNL});
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({Zipcode}) as Zipcode merge (zip:Zip {name: Zipcode});
// connect the zips to the cities
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({Zipcode}) as Zipcode, toUpper({MunicipalityNL}) as MunicipalityNL match (city:City {name: MunicipalityNL}), (zip:Zip {name: Zipcode}) create unique (city)-[:HAS_ZIP_CODE]->(zip);
import-cypher -i file:/<path>/sourcecsv/address.csv -b 5000 -d , -q with distinct toUpper({Zipcode}) as Zipcode, toUpper({StreetNL}) as StreetNL match (zip:Zip {name: Zipcode}) create unique (zip)<-[:PART_OF]-(str:Street {name: StreetNL, zip: Zipcode});
//add the houseNumbers
create index on :HouseNumber(houseNumber);
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with toUpper({Zipcode}) as Zipcode, toUpper({StreetNL}) as StreetNL, toUpper({HouseNumber}) as HouseNumber match (street:Street {name: StreetNL, zip: Zipcode}) using index street:Street(name) CREATE UNIQUE (:HouseNumber {houseNumber: HouseNumber})-[:PART_OF]->(street);
//connect the enterprises to the addresses
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({Zipcode}) as Zipcode, toUpper({StreetNL}) as StreetNL, toUpper({HouseNumber}) as HouseNumber, {EntityNumber} as EntityNumber match (e:Enterprise {EnterpriseNumber: EntityNumber}), (street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber}) create (e)-[:HAS_ADDRESS]->(h);
//connect the Establishments to the addresses
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({Zipcode}) as Zipcode, toUpper({StreetNL}) as StreetNL, toUpper({HouseNumber}) as HouseNumber, {EntityNumber} as EntityNumber match (e:Establishment {EstablishmentNumber: EntityNumber}), (street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber}) create (e)-[:HAS_ADDRESS]->(h);
//load the activities of Enterprises
import-cypher -i /<path>/sourcecsv/activity.csv -b 10000 -d , -q match (e:Enterprise {EnterpriseNumber: {EntityNumber}}), (cc2:CodeCategory {name: "ActivityGroup"})--(c2:Code {name: {ActivityGroup}}) merge (e)-[:HAS_ACTIVITY_GROUP {type: {Classification}}]->(c2);
MATCH (n:CodeCategory {name:"Nace2008"})--(c:Code)
set c.fullname="Nace2008-"+c.name;
MATCH (n:CodeCategory {name:"Nace2003"})--(c:Code)
set c.fullname="Nace2003-"+c.name;
create index on :Code(fullname);
import-cypher -i /<path>/sourcecsv/activity.csv -b 5000 -d , -q match (e:Enterprise {EnterpriseNumber: {EntityNumber}}), (c1:Code {fullname: "Nace"+{NaceVersion}+"-"+{NaceCode}}) create unique (e)-[hnc:HAS_NACE_CODE {type: {Classification}}]->(c1);
//load the activities of Establishments
import-cypher -i /<path>/sourcecsv/activity.csv -b 10000 -d , -q match (e:Establishment {EstablishmentNumber: {EntityNumber}}), (cc2:CodeCategory {name: "ActivityGroup"})--(c2:Code {name: {ActivityGroup}}) merge (e)-[:HAS_ACTIVITY_GROUP {type: {Classification}}]->(c2);
import-cypher -i /<path>/sourcecsv/activity.csv -b 5000 -d , -q match (e:Establishment {EstablishmentNumber: {EntityNumber}}), (c1:Code {fullname: "Nace"+{NaceVersion}+"-"+{NaceCode}}) create unique (e)-[hnc:HAS_NACE_CODE {type: {Classification}}]->(c1);
return timestamp();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment