Skip to content

Instantly share code, notes, and snippets.

@jvilledieu
Last active November 3, 2020 19:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jvilledieu/7845e027705f5d861736 to your computer and use it in GitHub Desktop.
Save jvilledieu/7845e027705f5d861736 to your computer and use it in GitHub Desktop.
Import script for Crunchbase. This is used to create a Neo4j data populated with the Crunchbase data (see blog post: http://linkurio.us/the-crunchbase-graph-data-modelling/). The data remains the property of Crunchbase and its use is limited by the following license: https://info.crunchbase.com/docs/licensing-policy-v1/
//Cleaning up the database
MATCH (n)
WITH n LIMIT 10000
OPTIONAL MATCH (n)-[r]->()
DELETE n,r;
//Importing the file Companies.csv
//Creating the constraints
create constraint on (a:COMPANY) assert a.permalink is unique;
create constraint on (b:CATEGORY) assert b.name is unique;
create constraint on (c:MARKET) assert c.name is unique;
create constraint on (d:STATUS) assert d.name is unique;
create constraint on (e:COUNTRY) assert e.name is unique;
create constraint on (f:STATE) assert f.name is unique;
create constraint on (g:REGION) assert g.name is unique;
create constraint on (h:CITY) assert h.name is unique;
//Creating the nodes
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:c:/Companies.csv" AS line
FIELDTERMINATOR ';'
MERGE (b:CATEGORY {name: line.category_list})
MERGE (c:MARKET {name: line.market})
MERGE (d:STATUS {name: line.status})
MERGE (e:COUNTRY {name: line.country_code})
MERGE (f:STATE {name: line.state_code})
MERGE (g:REGION {name: line.region})
MERGE (h:CITY {name: line.city});
USING PERIODIC COMMIT 2000
LOAD CSV WITH HEADERS FROM "file:c:/Companies.csv" AS line
FIELDTERMINATOR ';'
CREATE (a:COMPANY {
name: line.name,
permalink: line.permalink,
homepage_url: line.homepage_url,
funding_total: toInt(line.funding_total_usd),
funding_rounds: toInt(line.funding_rounds),
founded_at: line.founded_at,
founded_month: line.founded_month,
founded_quarter: line.founded_quarter,
founded_year: toInt(line.founded_year),
first_funding_at: line.first_funding_at,
last_funding_at: line.last_funding_at,
category: line.category_list,
market: line.market,
status: line.status,
country: line.country_code,
state: line.state_code,
region: line.region,
city: line.city,
url: 'http://www.crunchbase.com' + line.permalink,
logo: 'http://www.crunchbase.com' + line.permalink + '/primary-image/raw'
});
//Creating the edges
//Creating an index on the companies to speed up the import
DROP CONSTRAINT ON (a:COMPANY) ASSERT a.permalink IS UNIQUE;
CREATE INDEX ON :COMPANY(permalink);
//Companies & Categories
MATCH (a:COMPANY)
MATCH (b:CATEGORY {name: a.category})
CREATE (a)-[:HAS_CATEGORY]->(b);
//Companies & Market
MATCH (a:COMPANY)
MATCH (b:MARKET {name: a.market})
CREATE (a)-[:HAS_MARKET]->(b);
//Companies & Status
MATCH (a:COMPANY)
MATCH (b:STATUS {name: a.status})
CREATE (a)-[:HAS_STATUS]->(b);
//Companies & Country
MATCH (a:COMPANY)
MATCH (b:COUNTRY {name: a.country})
CREATE (a)-[:HAS_COUNTRY]->(b);
//Companies & State
MATCH (a:COMPANY)
MATCH (b:STATE {name: a.state})
CREATE (a)-[:HAS_STATE]->(b);
//Companies & Region
MATCH (a:COMPANY)
MATCH (b:REGION {name: a.region})
CREATE (a)-[:HAS_REGION]->(b);
//Companies & Cities
MATCH (a:COMPANY)
MATCH (b:CITY {name: a.city})
CREATE (a)-[:HAS_CITY]->(b);
//Importing the file Investments.csv
//Creating the constraints
create constraint on (a:INVESTOR) assert a.permalink is unique;
create constraint on (b:INVESTOR_CATEGORY) assert b.name is unique;
create constraint on (c:INVESTOR_MARKET) assert c.name is unique;
create constraint on (e:COUNTRY) assert e.name is unique;
create constraint on (f:STATE) assert f.name is unique;
create constraint on (g:REGION) assert g.name is unique;
create constraint on (h:CITY) assert h.name is unique;
create constraint on (i:FUNDING_ROUND) assert i.permalink is unique;
create constraint on (j:ROUND_CODE) assert j.name is unique;
create constraint on (k:ROUND_TYPE) assert k.name is unique;
//Creating the nodes
USING PERIODIC COMMIT 2000
LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line
FIELDTERMINATOR ';'
MATCH (a:COMPANY { permalink: line.investor_permalink })
SET a:INVESTOR;
USING PERIODIC COMMIT 2000
LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line
FIELDTERMINATOR ';'
MERGE (a:INVESTOR { permalink: line.investor_permalink })
ON CREATE SET a.name = line.investor_name,
a.category = line.investor_category_list,
a.market = line.investor_market,
a.country = line.investor_country_code,
a.state = line.investor_state_code,
a.region = line.investor_region,
a.city = line.investor_city,
a.logo = 'http://www.crunchbase.com' + line.permalink + '/primary-image/raw',
a.url = 'http://www.crunchbase.com' + line.investor_permalink;
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line
FIELDTERMINATOR ';'
MERGE (b:INVESTOR_CATEGORY {name: line.investor_category_list})
MERGE (c:INVESTOR_MARKET {name: line.investor_market})
MERGE (d:COUNTRY {name: line.investor_country_code})
MERGE (e:STATE {name: line.investor_state_code})
MERGE (f:REGION {name: line.investor_region})
MERGE (g:CITY {name: line.investor_city})
MERGE (h:FUNDING_ROUND {permalink: line.funding_round_permalink})
ON CREATE SET h.funded_at = line.funded_at, h.funded_month = line.funded_month, h.funded_quarter = line.funded_quarter, h.funded_year = toInt(line.funded_year), h.raised_amount_usd = toInt(line.raised_amount_usd), h. funding_round_code = line.funding_round_code, h.funding_round_type = line.funding_round_type, h.url = 'http://www.crunchbase.com' + line.permalink
MERGE (i:ROUND_CODE {name: line.funding_round_code})
MERGE (j:ROUND_TYPE {name: line.funding_round_type});
//Creating the edges
//Creating an index on the investors and funding rounds to speed up the import
DROP CONSTRAINT ON (a:INVESTOR) ASSERT a.permalink IS UNIQUE;
CREATE INDEX ON :INVESTOR(permalink);
DROP CONSTRAINT ON (a:FUNDING_ROUND) ASSERT a.permalink IS UNIQUE;
CREATE INDEX ON :FUNDING_ROUND(permalink);
//Investor & Category
MATCH (a:INVESTOR)
MATCH (b:INVESTOR_CATEGORY { name: a.category})
CREATE (a)-[:HAS_INVESTOR_CATEGORY]->(b);
//Investor & Market
MATCH (a:INVESTOR)
MATCH (b:INVESTOR_MARKET { name: a.market})
CREATE (a)-[:HAS_INVESTOR_MARKET]->(b);
//Investor and Country
MATCH (a:INVESTOR)
MATCH (b:COUNTRY { name: a.country})
CREATE (a)-[:HAS_COUNTRY]->(b);
//Investor & State
MATCH (a:INVESTOR)
MATCH (b:STATE { name: a.state})
CREATE (a)-[:HAS_STATE]->(b);
//Investor & Region
MATCH (a:INVESTOR)
MATCH (b:REGION { name: a.region})
CREATE (a)-[:HAS_REGION]->(b);
//Investor & City
MATCH (a:INVESTOR)
MATCH (b:CITY { name: a.city})
CREATE (a)-[:HAS_CITY]->(b);
//Investor & Funding Round
USING PERIODIC COMMIT 1000 LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line
FIELDTERMINATOR ';'
MATCH (a:INVESTOR { permalink: line.investor_permalink}),(b:FUNDING_ROUND { permalink: line.funding_round_permalink})
CREATE (a)-[:HAS_FUNDED]->(b);
//Startup & Funding Round
USING PERIODIC COMMIT 1000 LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line
FIELDTERMINATOR ';'
MATCH (a:COMPANY { permalink: line.company_permalink}), (b:FUNDING_ROUND { permalink: line.funding_round_permalink})
CREATE (a)-[:HAS_BEEN_FUNDED]->(b);
//Funding round and round code
MATCH (a:FUNDING_ROUND)
MATCH (b:ROUND_CODE { name: a.funding_round_code})
CREATE (a)-[:HAS_ROUND_CODE]->(b);
//Funding round and round type
MATCH (a:FUNDING_ROUND)
MATCH (b:ROUND_TYPE { name: a.funding_round_type})
CREATE (a)-[:HAS_ROUND_TYPE]->(b);
//Importing the file Acquisitions.csv
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:c:/Acquisitions.csv" AS line
FIELDTERMINATOR ';'
MATCH (a:COMPANY {permalink: line.company_permalink}), (b:COMPANY {permalink: line.acquirer_permalink})
CREATE (b)-[:HAS_ACQUIRED {acquisition_date : line.acquired_at, month: line.acquired_month, quarter: line.acquired_quarter, year: line.acquired_year, price_amount:
toInt(line.price_amount), currency: line.price_currency_code}]->(a);
//-----------------------
//Cleaning up
//-----------------------
MATCH (n:COMPANY)
WHERE n.state = ''
REMOVE n.state;
MATCH (n:COMPANY)
WHERE n.homepage_url = ''
REMOVE n.homepage_url;
MATCH (n:COMPANY)
WHERE n.funding_total = ''
REMOVE n.funding_total;
MATCH (n:COMPANY)
WHERE n.funding_rounds = ''
REMOVE n.funding_rounds;
MATCH (n:COMPANY)
WHERE n.founded_at = ''
REMOVE n.founded_at;
MATCH (n:COMPANY)
WHERE n.founded_month = ''
REMOVE n.founded_month;
MATCH (n:COMPANY)
WHERE n.founded_quarter = ''
REMOVE n.founded_quarter;
MATCH (n:COMPANY)
WHERE n.founded_year = ''
REMOVE n.founded_year;
MATCH (n:COMPANY)
WHERE n.first_funding_at = ''
REMOVE n.first_funding_at;
MATCH (n:COMPANY)
WHERE n.last_funding_at = ''
REMOVE n.last_funding_at;
MATCH (n:COMPANY)
WHERE n.category = ''
REMOVE n.category;
MATCH (n:COMPANY)
WHERE n.market = ''
REMOVE n.market;
MATCH (n:COMPANY)
WHERE n.status = ''
REMOVE n.status;
MATCH (n:COMPANY)
WHERE n.country = ''
REMOVE n.country;
MATCH (n:COMPANY)
WHERE n.state = ''
REMOVE n.state;
MATCH (n:COMPANY)
WHERE n.region = ''
REMOVE n.region;
MATCH (n:COMPANY)
REMOVE n.city;
MATCH (n:INVESTOR)
WHERE n.state = ''
REMOVE n.state;
MATCH (n:INVESTOR)
WHERE n.category = ''
REMOVE n.category;
MATCH (n:INVESTOR)
WHERE n.market = ''
REMOVE n.market;
MATCH (n:INVESTOR)
WHERE n.city = ''
REMOVE n.city;
MATCH (n:INVESTOR)
WHERE n.region = ''
REMOVE n.region;
MATCH (n:INVESTOR)
WHERE n.country = ''
REMOVE n.country;
optional match (n)-[r]-() where n.name = ''
DELETE n, r;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment