Here's a challenge to the KG Construction CG:
- Take Crunchbase: 10.5M rows, across 18 tables, served as CSV, updated daily.
- The data of some nodes comes from multiple tables (eg Organization from
organizations, org_parents, org_descriptions
) - RDFize and store the total dataset, in under 1-2 hours time
- Using the approach described here, GraphDB 9.11 with OntoRefine takes 76-119 minutes (1.3-2 hours) depending on hardware to produce and load 138M triples (19-30k triples per second)
- Update the data daily, replacing the data of recently updated rows.
- Using the approach described here, it takes about 15 minutes to update all of Crunchbase
- Do it with your favorite RDFization toolkit, and preferably do it declaratively
The ttl files here represent a model for converting each CSV table to RDF.
Source CSV fields are embedded in URLs and literals.
An overall model can be displayed by concatenating the individual ttl's, renaming some nodes to make them connected, and adding a few rdfpuml
instructions:
You can generate an ontology from cb-model.tsv
that describes all classes and fields.
- terms without prefix are in the
k2:
namespace CbIndustry
is the only multi-parented class: it's subClassOfskos:Concept,CbObject
Some values need special processing shown as "function calls":
urlify1(x)
: make a name usable in URL- Replace punctuations with one underscore; remove leading/trailing punctuation
- Support all Unicode alphanumeric chars
- Convert alphabetical chars to lowercase
urlify(x)
: same but also generates a bind tox_URLIFY
fixDate(x)
: replaceT
in timestamp to conform toxsd:dateTime
formatlcase(x)
: lowercasesplit1(x)
: split on comma and produce multiple bindings.splitArray(x)
: strip brackets and commas from["foo","bar"]
then split on commaifNotNull(x)
: filter out parasitic values ("other","not provided","unknown")ifNotSame(x,y)
: filter outx
values that are equal to?y
- Used to strip self-referential parent: CB
category
mentioning itself ascategory_group
- Used to strip self-referential parent: CB
booleanYesNo(x)
: map"Yes","No"
totrue,false
coalesce(x,...)
: coalesce the variable number of args (pick the first one that's bound)
This outlines Ontotext's approach, using OntoRefine for loading and updating, and generation of these transformations from models.
We use one named graph per table row.
Eg the longest SPARQL update query organiations.ru
looks like this:
delete {graph ?GRAPH {?s ?p ?o}}
where {
service <rdf-mapper:ontorefine:PROJECT_ID> {
bind(?c_updated_at as ?c_updated_at1)
bind(iri(concat("cb/graph/organizations/",?c_uuid)) as ?GRAPH)}
<cb> k2:updatedAt ?UPDATED_AT_DATETIME.
bind(replace(str(?UPDATED_AT_DATETIME),'T',' ') as ?UPDATED_AT)
filter(?c_updated_at1 > ?UPDATED_AT)
graph ?GRAPH {?s ?p ?o}};
insert {graph ?GRAPH {
?cb_agent_uuid_URL a k2:Organization;
k2:cbId ?uuid;
k2:name ?name;
k2:cbPermalink ?permalink;
k2:cbUrl ?CB_URL;
k2:rank ?RANK;
k2:createdAt ?CREATED_AT_FIXDATE;
k2:updatedAt ?UPDATED_AT_FIXDATE;
k2:legalName ?legal_name;
k2:organizationRole ?cb_organizationRole_roles_SPLIT1_URLIFY_URL;
k2:domain ?domain;
k2:homepageUrl ?HOMEPAGE_URL;
k2:countryCode ?country_code;
k2:stateCode ?state_code;
k2:region ?region;
k2:city ?city;
k2:address ?address;
k2:postalCode ?postal_code;
k2:status ?cb_organizationStatus_status_URLIFY_URL;
k2:shortDescription ?short_description;
k2:industry ?cb_industry_category_list_SPLIT1_URLIFY_URL;
k2:numFundingRounds ?NUM_FUNDING_ROUNDS;
k2:totalFundingUsd ?TOTAL_FUNDING_USD;
k2:totalFunding ?TOTAL_FUNDING;
k2:totalFundingCurrencyCode ?total_funding_currency_code;
k2:foundedOn ?FOUNDED_ON_FIXDATE;
k2:lastFundingOn ?LAST_FUNDING_ON_FIXDATE;
k2:closedOn ?CLOSED_ON_FIXDATE;
k2:employeeCount ?cb_employeeCount_employee_count_IFNOTNULL_URLIFY_URL;
k2:email ?email;
k2:phone ?phone;
k2:facebookUrl ?FACEBOOK_URL;
k2:linkedinUrl ?LINKEDIN_URL;
k2:twitterUrl ?TWITTER_URL;
k2:logoUrl ?LOGO_URL;
k2:alias ?alias1;
k2:alias ?alias2;
k2:alias ?alias3;
k2:primaryRole ?cb_organizationRole_primary_role_URLIFY_URL;
k2:numExits ?NUM_EXITS.
}}
where {
service <rdf-mapper:ontorefine:PROJECT_ID> {
bind(?c_uuid as ?uuid)
bind(?c_name as ?name)
bind(?c_permalink as ?permalink)
bind(?c_cb_url as ?cb_url)
bind(?c_rank as ?rank)
bind(?c_created_at as ?created_at)
bind(?c_updated_at as ?updated_at)
bind(?c_legal_name as ?legal_name)
bind(?c_roles as ?roles)
bind(?c_domain as ?domain)
bind(?c_homepage_url as ?homepage_url)
bind(?c_country_code as ?country_code)
bind(?c_state_code as ?state_code)
bind(?c_region as ?region)
bind(?c_city as ?city)
bind(?c_address as ?address)
bind(?c_postal_code as ?postal_code)
bind(?c_status as ?status)
bind(?c_short_description as ?short_description)
bind(?c_category_list as ?category_list)
bind(?c_num_funding_rounds as ?num_funding_rounds)
bind(?c_total_funding_usd as ?total_funding_usd)
bind(?c_total_funding as ?total_funding)
bind(?c_total_funding_currency_code as ?total_funding_currency_code)
bind(?c_founded_on as ?founded_on)
bind(?c_last_funding_on as ?last_funding_on)
bind(?c_closed_on as ?closed_on)
bind(?c_employee_count as ?employee_count)
bind(?c_email as ?email)
bind(?c_phone as ?phone)
bind(?c_facebook_url as ?facebook_url)
bind(?c_linkedin_url as ?linkedin_url)
bind(?c_twitter_url as ?twitter_url)
bind(?c_logo_url as ?logo_url)
bind(?c_alias1 as ?alias1)
bind(?c_alias2 as ?alias2)
bind(?c_alias3 as ?alias3)
bind(?c_primary_role as ?primary_role)
bind(?c_num_exits as ?num_exits)
bind(iri(concat("cb/agent/",?uuid)) as ?cb_agent_uuid_URL)
bind(strdt(?cb_url,xsd:anyURI) as ?CB_URL)
bind(strdt(?rank,xsd:integer) as ?RANK)
bind(REPLACE(?created_at,' ','T') as ?created_at_FIXDATE)
bind(strdt(?created_at_FIXDATE,xsd:dateTime) as ?CREATED_AT_FIXDATE)
bind(REPLACE(?updated_at,' ','T') as ?updated_at_FIXDATE)
bind(strdt(?updated_at_FIXDATE,xsd:dateTime) as ?UPDATED_AT_FIXDATE)
?roles_SPLIT1 spif:split (?roles ',').
bind(LCASE(REPLACE(REPLACE(REPLACE(?roles_SPLIT1, "[^\\p{L}0-9]", "_"), "_+", "_"), "^_|_$", "")) as ?roles_SPLIT1_URLIFY)
bind(iri(concat("cb/organizationRole/",?roles_SPLIT1_URLIFY)) as ?cb_organizationRole_roles_SPLIT1_URLIFY_URL)
bind(strdt(?homepage_url,xsd:anyURI) as ?HOMEPAGE_URL)
bind(LCASE(REPLACE(REPLACE(REPLACE(?status, "[^\\p{L}0-9]", "_"), "_+", "_"), "^_|_$", "")) as ?status_URLIFY)
bind(iri(concat("cb/organizationStatus/",?status_URLIFY)) as ?cb_organizationStatus_status_URLIFY_URL)
?category_list_SPLIT1 spif:split (?category_list ',').
bind(LCASE(REPLACE(REPLACE(REPLACE(?category_list_SPLIT1, "[^\\p{L}0-9]", "_"), "_+", "_"), "^_|_$", "")) as ?category_list_SPLIT1_URLIFY)
bind(iri(concat("cb/industry/",?category_list_SPLIT1_URLIFY)) as ?cb_industry_category_list_SPLIT1_URLIFY_URL)
bind(strdt(?num_funding_rounds,xsd:integer) as ?NUM_FUNDING_ROUNDS)
bind(strdt(?total_funding_usd,xsd:decimal) as ?TOTAL_FUNDING_USD)
bind(strdt(?total_funding,xsd:decimal) as ?TOTAL_FUNDING)
bind(REPLACE(?founded_on,' ','T') as ?founded_on_FIXDATE)
bind(strdt(?founded_on_FIXDATE,xsd:dateTime) as ?FOUNDED_ON_FIXDATE)
bind(REPLACE(?last_funding_on,' ','T') as ?last_funding_on_FIXDATE)
bind(strdt(?last_funding_on_FIXDATE,xsd:dateTime) as ?LAST_FUNDING_ON_FIXDATE)
bind(REPLACE(?closed_on,' ','T') as ?closed_on_FIXDATE)
bind(strdt(?closed_on_FIXDATE,xsd:dateTime) as ?CLOSED_ON_FIXDATE)
bind(if(?employee_count in ("other","not provided","unknown"),?UNDEF,?employee_count) as ?employee_count_IFNOTNULL)
bind(LCASE(REPLACE(REPLACE(REPLACE(?employee_count_IFNOTNULL, "[^\\p{L}0-9]", "_"), "_+", "_"), "^_|_$", "")) as ?employee_count_IFNOTNULL_URLIFY)
bind(iri(concat("cb/employeeCount/",?employee_count_IFNOTNULL_URLIFY)) as ?cb_employeeCount_employee_count_IFNOTNULL_URLIFY_URL)
bind(strdt(?facebook_url,xsd:anyURI) as ?FACEBOOK_URL)
bind(strdt(?linkedin_url,xsd:anyURI) as ?LINKEDIN_URL)
bind(strdt(?twitter_url,xsd:anyURI) as ?TWITTER_URL)
bind(strdt(?logo_url,xsd:anyURI) as ?LOGO_URL)
bind(LCASE(REPLACE(REPLACE(REPLACE(?primary_role, "[^\\p{L}0-9]", "_"), "_+", "_"), "^_|_$", "")) as ?primary_role_URLIFY)
bind(iri(concat("cb/organizationRole/",?primary_role_URLIFY)) as ?cb_organizationRole_primary_role_URLIFY_URL)
bind(strdt(?num_exits,xsd:integer) as ?NUM_EXITS)
bind(?c_updated_at as ?c_updated_at1)
bind(iri(concat("cb/graph/organizations/",?c_uuid)) as ?GRAPH)}
<cb> k2:updatedAt ?UPDATED_AT_DATETIME.
bind(replace(str(?UPDATED_AT_DATETIME),'T',' ') as ?UPDATED_AT)
filter(?c_updated_at1 > ?UPDATED_AT)}