Skip to content

Instantly share code, notes, and snippets.

@jvilledieu
Last active August 18, 2016 12:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jvilledieu/4968309b478125bfbfe02576812fb304 to your computer and use it in GitHub Desktop.
Save jvilledieu/4968309b478125bfbfe02576812fb304 to your computer and use it in GitHub Desktop.
New crunchbase import script
/*
https://linkurio.us/the-crunchbase-graph-data-modelling/
https://linkurio.us/crunchbase-graph-importing-data-neo4j/
https://linkurio.us/crunchbase-graph-analysing-graph/
Check also Neo4j csv and etl guide
*/
// Verifying the CSV file
// assert correct line count
LOAD CSV FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line
RETURN count(*);
// check first few raw lines
LOAD CSV FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
RETURN line
LIMIT 1;
// check first 5 line-sample with header-mapping
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
RETURN line
LIMIT 1;
//-=-=-=-=-=-=-=
//Cleaning up the database
MATCH (n)
WITH n LIMIT 10000
OPTIONAL MATCH (n)-[r]->()
DELETE n,r;
//Importing the file Companies.csv
//Creating the constraints
create constraint on (a:COMPANY) assert a.permalink is unique;
create constraint on (b:CATEGORY) assert b.name is unique;
create constraint on (c:MARKET) assert c.name is unique;
create constraint on (d:STATUS) assert d.name is unique;
create constraint on (e:COUNTRY) assert e.name is unique;
create constraint on (f:STATE) assert f.name is unique;
create constraint on (g:REGION) assert g.name is unique;
create constraint on (h:CITY) assert h.name is unique;
//Creating the nodes
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
WHERE line.category_list IS NOT NULL
MERGE (b:CATEGORY {name: line.category_list});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
WHERE line.market IS NOT NULL
MERGE (c:MARKET {name: line.market});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
WHERE line.status IS NOT NULL
MERGE (d:STATUS {name: line.status});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
WHERE line.country_code IS NOT NULL
MERGE (e:COUNTRY {name: line.country_code});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
WHERE line.state_code IS NOT NULL
MERGE (f:STATE {name: line.state_code});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
WHERE line.region IS NOT NULL
MERGE (g:REGION {name: line.region});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line
WHERE line.city IS NOT NULL
MERGE (h:CITY {name: line.city});
// Check label counts:
MATCH n
RETURN DISTINCT count(labels(n)), labels(n);
USING PERIODIC COMMIT 2000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line
MERGE (a:COMPANY { permalink: line.permalink })
ON CREATE SET a.name = line.name,
a.homepage_url = line.homepage_url,
a.funding_total = toInt(line.funding_total_usd),
a.funding_rounds = toInt(line.funding_rounds),
a.founded_at = line.founded_at,
a.founded_month = line.founded_month,
a.founded_quarter = line.founded_quarter,
a.founded_year = toInt(line.founded_year),
a.first_funding_at = line.first_funding_at,
a.last_funding_at = line.last_funding_at,
a.category = line.category_list,
a.market = line.market,
a.status = line.status,
a.country = line.country_code,
a.state = line.state_code,
a.region = line.region,
a.city = line.city,
a.url = 'http://www.crunchbase.com' + line.permalink,
a.logo = 'http://www.crunchbase.com' + line.permalink + '/primary-image/raw';
//Creating the edges
//Creating an index on the companies to speed up the import
DROP CONSTRAINT ON (a:COMPANY) ASSERT a.permalink IS UNIQUE;
CREATE INDEX ON :COMPANY(permalink);
//Companies & Categories
MATCH (a:COMPANY)
MATCH (b:CATEGORY {name: a.category})
CREATE (a)-[:HAS_CATEGORY]->(b);
//Companies & Market
MATCH (a:COMPANY)
MATCH (b:MARKET {name: a.market})
CREATE (a)-[:HAS_MARKET]->(b);
//Companies & Status
MATCH (a:COMPANY)
MATCH (b:STATUS {name: a.status})
CREATE (a)-[:HAS_STATUS]->(b);
//Companies & Country
MATCH (a:COMPANY)
MATCH (b:COUNTRY {name: a.country})
CREATE (a)-[:HAS_COUNTRY]->(b);
//Companies & State
MATCH (a:COMPANY)
MATCH (b:STATE {name: a.state})
CREATE (a)-[:HAS_STATE]->(b);
//Companies & Region
MATCH (a:COMPANY)
MATCH (b:REGION {name: a.region})
CREATE (a)-[:HAS_REGION]->(b);
//Companies & Cities
MATCH (a:COMPANY)
MATCH (b:CITY {name: a.city})
CREATE (a)-[:HAS_CITY]->(b);
//Importing the file Investments.csv
//Creating the constraints
create constraint on (a:INVESTOR) assert a.permalink is unique;
create constraint on (b:INVESTOR_CATEGORY) assert b.name is unique;
create constraint on (c:INVESTOR_MARKET) assert c.name is unique;
create constraint on (e:COUNTRY) assert e.name is unique;
create constraint on (f:STATE) assert f.name is unique;
create constraint on (g:REGION) assert g.name is unique;
create constraint on (h:CITY) assert h.name is unique;
create constraint on (i:FUNDING_ROUND) assert i.permalink is unique;
create constraint on (j:ROUND_CODE) assert j.name is unique;
create constraint on (k:ROUND_TYPE) assert k.name is unique;
//Creating the nodes
//Relabel added companies that invested to become INVESTOR
USING PERIODIC COMMIT 2000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line
MATCH (a:COMPANY { permalink: line.investor_permalink })
SET a:INVESTOR;
USING PERIODIC COMMIT 2000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.investor_permalink IS NOT NULL
MERGE (a:INVESTOR { permalink: line.investor_permalink })
ON CREATE SET a.name = line.investor_name,
a.category = line.investor_category_list,
a.market = line.investor_market,
a.country = line.investor_country_code,
a.state = line.investor_state_code,
a.region = line.investor_region,
a.city = line.investor_city,
a.logo = 'http://www.crunchbase.com' + line.permalink + '/primary-image/raw',
a.url = 'http://www.crunchbase.com' + line.investor_permalink;
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.investor_category_list IS NOT NULL
MERGE (b:INVESTOR_CATEGORY {name: line.investor_category_list});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.investor_market IS NOT NULL
MERGE (c:INVESTOR_MARKET {name: line.investor_market});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.investor_country_code IS NOT NULL
MERGE (d:COUNTRY {name: line.investor_country_code});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.investor_state_code IS NOT NULL
MERGE (e:STATE {name: line.investor_state_code});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.investor_region IS NOT NULL
MERGE (f:REGION {name: line.investor_region});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.investor_city IS NOT NULL
MERGE (g:CITY {name: line.investor_city});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.funding_round_permalink IS NOT NULL
MERGE (h:FUNDING_ROUND {permalink: line.funding_round_permalink})
ON CREATE SET h.funded_at = line.funded_at,
h.funded_month = line.funded_month,
h.funded_quarter = line.funded_quarter,
h.funded_year = toInt(line.funded_year),
h.raised_amount_usd = toInt(line.raised_amount_usd),
h. funding_round_code = line.funding_round_code,
h.funding_round_type = line.funding_round_type,
h.url = 'http://www.crunchbase.com' + line.permalink;
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.funding_round_code IS NOT NULL
MERGE (i:ROUND_CODE {name: line.funding_round_code});
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.funding_round_type IS NOT NULL
MERGE (j:ROUND_TYPE {name: line.funding_round_type});
//Creating the edges
//Creating an index on the investors and funding rounds to speed up the import
DROP CONSTRAINT ON (a:INVESTOR) ASSERT a.permalink IS UNIQUE;
CREATE INDEX ON :INVESTOR(permalink);
DROP CONSTRAINT ON (a:FUNDING_ROUND) ASSERT a.permalink IS UNIQUE;
CREATE INDEX ON :FUNDING_ROUND(permalink);
//Investor & Category
MATCH (a:INVESTOR)
MATCH (b:INVESTOR_CATEGORY { name: a.category})
CREATE (a)-[:HAS_INVESTOR_CATEGORY]->(b);
//Investor & Market
MATCH (a:INVESTOR)
MATCH (b:INVESTOR_MARKET { name: a.market})
CREATE (a)-[:HAS_INVESTOR_MARKET]->(b);
//Investor and Country
MATCH (a:INVESTOR)
MATCH (b:COUNTRY { name: a.country})
CREATE (a)-[:HAS_COUNTRY]->(b);
//Investor & State
MATCH (a:INVESTOR)
MATCH (b:STATE { name: a.state})
CREATE (a)-[:HAS_STATE]->(b);
//Investor & Region
MATCH (a:INVESTOR)
MATCH (b:REGION { name: a.region})
CREATE (a)-[:HAS_REGION]->(b);
//Investor & City
MATCH (a:INVESTOR)
MATCH (b:CITY { name: a.city})
CREATE (a)-[:HAS_CITY]->(b);
//Investor & Funding Round
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.investor_permalink IS NOT NULL AND line.funding_round_permalink IS NOT NULL
MATCH (a:INVESTOR { permalink: line.investor_permalink}),(b:FUNDING_ROUND { permalink: line.funding_round_permalink})
CREATE (a)-[:HAS_FUNDED]->(b);
//Startup & Funding Round
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line
WHERE line.company_permalink IS NOT NULL AND line.funding_round_permalink IS NOT NULL
MATCH (a:COMPANY { permalink: line.company_permalink}), (b:FUNDING_ROUND { permalink: line.funding_round_permalink})
CREATE (a)-[:HAS_BEEN_FUNDED]->(b);
//Funding round and round code
MATCH (a:FUNDING_ROUND)
MATCH (b:ROUND_CODE { name: a.funding_round_code})
CREATE (a)-[:HAS_ROUND_CODE]->(b);
//Funding round and round type
MATCH (a:FUNDING_ROUND)
MATCH (b:ROUND_TYPE { name: a.funding_round_type})
CREATE (a)-[:HAS_ROUND_TYPE]->(b);
//Importing the file Acquisitions.csv
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Acquisitions.csv" AS line WITH line
WHERE line.company_permalink IS NOT NULL AND line.acquirer_permalink IS NOT NULL
MATCH (a:COMPANY {permalink: line.company_permalink}), (b:COMPANY {permalink: line.acquirer_permalink})
CREATE (b)-[:HAS_ACQUIRED {acquisition_date : line.acquired_at, month: line.acquired_month, quarter: line.acquired_quarter, year: line.acquired_year, price_amount: toInt(line.price_amount), currency: line.price_currency_code}]->(a);
//-----------------------
//Cleaning up
//-----------------------
MATCH (n:COMPANY)
WHERE n.state = ''
REMOVE n.state;
MATCH (n:COMPANY)
WHERE n.homepage_url = ''
REMOVE n.homepage_url;
MATCH (n:COMPANY)
WHERE n.funding_total = ''
REMOVE n.funding_total;
MATCH (n:COMPANY)
WHERE n.funding_rounds = ''
REMOVE n.funding_rounds;
MATCH (n:COMPANY)
WHERE n.founded_at = ''
REMOVE n.founded_at;
MATCH (n:COMPANY)
WHERE n.founded_month = ''
REMOVE n.founded_month;
MATCH (n:COMPANY)
WHERE n.founded_quarter = ''
REMOVE n.founded_quarter;
MATCH (n:COMPANY)
WHERE n.founded_year = ''
REMOVE n.founded_year;
MATCH (n:COMPANY)
WHERE n.first_funding_at = ''
REMOVE n.first_funding_at;
MATCH (n:COMPANY)
WHERE n.last_funding_at = ''
REMOVE n.last_funding_at;
MATCH (n:COMPANY)
WHERE n.category = ''
REMOVE n.category;
MATCH (n:COMPANY)
WHERE n.market = ''
REMOVE n.market;
MATCH (n:COMPANY)
WHERE n.status = ''
REMOVE n.status;
MATCH (n:COMPANY)
WHERE n.country = ''
REMOVE n.country;
MATCH (n:COMPANY)
WHERE n.state = ''
REMOVE n.state;
MATCH (n:COMPANY)
WHERE n.region = ''
REMOVE n.region;
MATCH (n:COMPANY)
REMOVE n.city;
MATCH (n:INVESTOR)
WHERE n.state = ''
REMOVE n.state;
MATCH (n:INVESTOR)
WHERE n.category = ''
REMOVE n.category;
MATCH (n:INVESTOR)
WHERE n.market = ''
REMOVE n.market;
MATCH (n:INVESTOR)
WHERE n.city = ''
REMOVE n.city;
MATCH (n:INVESTOR)
WHERE n.region = ''
REMOVE n.region;
MATCH (n:INVESTOR)
WHERE n.country = ''
REMOVE n.country;
optional match (n)-[r]-() where n.name = ''
DELETE n, r;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment