Skip to content

Instantly share code, notes, and snippets.

@aneeshdurg
Last active April 9, 2024 15:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aneeshdurg/19441eff1a315aeec417509ecd0e297d to your computer and use it in GitHub Desktop.
Save aneeshdurg/19441eff1a315aeec417509ecd0e297d to your computer and use it in GitHub Desktop.
Load LDBC CSV files into JanusGraph
// Usage: bin/gremlin.sh -e load.groovy
// This file loads LDBC CSV files into a graph
import java.text.SimpleDateFormat
// Shared parsers for the date columns of the CSV files. Both are assigned
// without `def` or a type on purpose: that places them in the script binding,
// which is what makes them visible inside the script's methods (CSVToMap
// reads both of them).
// Pattern applied to the `birthday` column.
dateFormat = new SimpleDateFormat("yyyy-MM-dd");
// Pattern applied to the `creationDate` column.
// NOTE(review): DateFormat.parse(String) is allowed to stop before the end of
// its input, so anything following the seconds (fractional seconds, a zone
// offset) is ignored and the value is interpreted in the JVM default time
// zone - confirm that this is acceptable for the data being loaded.
timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss")
/**
 * Creates the schema used by this loader: vertex labels, edge labels,
 * property keys and one unique composite index on `id` per vertex label.
 *
 * Everything is created through a single management transaction. It is
 * committed at the end, or rolled back if any definition fails, so a failure
 * does not leave the management transaction open.
 *
 * @param graph an open JanusGraph instance (anything offering openManagement())
 */
def defineSchemaAndIndices(graph) {
    // `def` keeps the handle local; an undeclared assignment would publish it
    // as a global in the script binding.
    def mgmt = graph.openManagement()
    try {
        // Vertex labels.
        ["Place", "Comment", "Forum", "Person", "Post", "Tag", "TagClass",
         "University", "Company"].each { vertexLabel ->
            mgmt.makeVertexLabel(vertexLabel).make()
        }

        // Edge labels. SIMPLE multiplicity permits at most one edge of a given
        // label between any pair of vertices.
        ["containerOf", "hasCreator", "hasInterest", "hasMember", "hasModerator",
         "hasTag", "isLocatedIn", "isPartOf", "isSubclassOf", "likes", "knows",
         "replyOf", "studyAt", "workAt", "hasType"].each { edgeLabel ->
            mgmt.makeEdgeLabel(edgeLabel).multiplicity(Multiplicity.SIMPLE).make()
        }

        // Property keys as [name, data type, cardinality], in creation order.
        [
            // vertex properties
            ["id",           Long.class,    Cardinality.SINGLE], // Forum, Post/Comment (Message), Company/University (Organisation), Person, City/Country/Continent (Place), Tag, TagClass
            ["title",        String.class,  Cardinality.SINGLE], // Forum
            ["creationDate", Date.class,    Cardinality.SINGLE], // Forum, Post/Comment (Message), Person, :knows, :likes
            ["browserUsed",  String.class,  Cardinality.SINGLE], // Post/Comment (Message), Person
            ["locationIP",   String.class,  Cardinality.SINGLE], // Post/Comment (Message)
            ["content",      String.class,  Cardinality.SINGLE], // Post/Comment (Message)
            ["length",       Long.class,    Cardinality.SINGLE], // Post/Comment (Message)
            ["name",         String.class,  Cardinality.SINGLE], // Company/University (Organisation), City/Country/Continent (Place), Tag, TagClass
            ["url",          String.class,  Cardinality.SINGLE], // Company/University (Organisation), City/Country/Continent (Place), Tag, TagClass
            ["firstName",    String.class,  Cardinality.SINGLE], // Person
            ["lastName",     String.class,  Cardinality.SINGLE], // Person
            ["gender",       String.class,  Cardinality.SINGLE], // Person
            ["birthday",     Date.class,    Cardinality.SINGLE], // Person
            ["email",        String.class,  Cardinality.LIST],   // Person
            ["language",     String.class,  Cardinality.LIST],   // Person, Post (Message)
            ["imageFile",    String.class,  Cardinality.SINGLE], // Post (Message)
            ["type",         String.class,  Cardinality.SINGLE], // Company/University (Organisation), City/Country/Continent (Place)
            // edge properties
            ["joinDate",     Date.class,    Cardinality.SINGLE], // :hasMember
            ["classYear",    Integer.class, Cardinality.SINGLE], // :studyAt
            ["workFrom",     Integer.class, Cardinality.SINGLE], // :workAt
        ].each { keyName, dataType, cardinality ->
            mgmt.makePropertyKey(keyName).dataType(dataType).cardinality(cardinality).make()
        }

        // One unique composite index on `id` per vertex label, named by<Label>Id.
        def idKey = mgmt.getPropertyKey("id")
        ["Place", "Comment", "Company", "University", "Forum", "Person", "Post",
         "Tag", "TagClass"].each { indexedLabel ->
            def onlyLabel = mgmt.getVertexLabel(indexedLabel)
            // Plain concatenation so the index name is a String, not a GString.
            mgmt.buildIndex("by" + indexedLabel + "Id", Vertex.class).addKey(idKey).indexOnly(onlyLabel).unique().buildCompositeIndex()
        }

        // commit schema
        mgmt.commit()
    } catch (Exception e) {
        // Do not leave a half-applied management transaction open.
        if (mgmt.isOpen()) {
            mgmt.rollback()
        }
        throw e
    }
}
/**
 * Reads a pipe-delimited CSV file whose first line is a header and returns
 * one map per data line, keyed by the header names.
 *
 * Values are converted by column name: `id` and `length` become Long,
 * `classYear` and `workFrom` become Integer, `creationDate` is parsed with the
 * script-level `timestampFormat`, `birthday` with the script-level
 * `dateFormat`; every other column stays a String.
 *
 * Empty fields are kept in position while splitting, so later columns stay
 * aligned with the header, and are then left out of the row map instead of
 * being stored or converted. Blank lines are skipped. An empty or header-only
 * file yields an empty list.
 *
 * @param filename path of the CSV file to read
 * @return list of maps (column name to converted value), in file order
 */
def CSVToMap(filename) {
    def lines = new File(filename).readLines()
    def rows = []
    if (lines.isEmpty()) {
        return rows
    }
    // split(regex, -1) keeps empty fields; tokenize("|") would silently drop
    // them and shift every following value under the wrong header.
    def header = lines[0].split('\\|', -1)
    // drop(1) is safe for a header-only file, unlike the range [1..-1].
    lines.drop(1).each { line ->
        if (line.trim().isEmpty()) {
            return // blank line: nothing to load
        }
        def props = line.split('\\|', -1)
        def row = [:]
        // Ignore surplus fields that have no header to name them.
        int columns = Math.min(header.length, props.length)
        for (int i = 0; i < columns; i++) {
            def column = header[i]
            def raw = props[i]
            if (raw.isEmpty()) {
                continue // absent value: leave the key out of the row
            }
            switch (column) {
                case "id":
                case "length":
                    row.put(column, raw.toLong())
                    break
                case "creationDate":
                    row.put(column, timestampFormat.parse(raw))
                    break
                case "birthday":
                    row.put(column, dateFormat.parse(raw))
                    break
                case "classYear":
                case "workFrom":
                    row.put(column, raw.toInteger())
                    break
                default:
                    row.put(column, raw)
            }
        }
        rows << row
    }
    return rows
}
/**
 * Adds one vertex per property map and copies every entry of the map onto
 * the new vertex as a property.
 *
 * @param g           graph traversal source used to create the vertices
 * @param label       vertex label to use when typeAsLabel is false
 * @param allVertices collection of maps (property name to value), one per vertex
 * @param typeAsLabel when true, each vertex takes its label from its own
 *                    "type" entry instead of from `label`
 * @throws IllegalArgumentException if typeAsLabel is set and a row has no "type"
 */
def constructVertices(g, label, allVertices, typeAsLabel) {
    allVertices.each { vertexProps ->
        // Resolve the label per row in a local, leaving the `label` parameter alone.
        def vertexLabel = typeAsLabel ? vertexProps["type"] : label
        if (vertexLabel == null) {
            throw new IllegalArgumentException("No vertex label available for row: " + vertexProps)
        }
        // Declared locally so the vertex does not leak into the script binding.
        def vertex = g.addV(vertexLabel).next()
        vertexProps.each { key, val ->
            vertex.property(key, val)
        }
    }
}
/**
 * Adds one edge per property map. The two endpoint vertices are looked up by
 * label and `id`; every other entry of the map becomes an edge property.
 *
 * @param g         graph traversal source used for the lookups and inserts
 * @param label     edge label
 * @param fromlabel vertex label of the edge's source vertex
 * @param fromKey   name of the map entry holding the source vertex id
 * @param tolabel   vertex label of the edge's target vertex
 * @param toKey     name of the map entry holding the target vertex id
 * @param allEdges  collection of maps (column name to value), one per edge
 * @throws IllegalArgumentException if a row lacks one of the endpoint columns
 * @throws NoSuchElementException   if an endpoint vertex does not exist
 */
def constructEdges(g, label, fromlabel, fromKey, tolabel, toKey, allEdges) {
    // Callers may pass interpolated strings. A GString does not hash like a
    // String, so it would never match the String keys of the row map.
    def fromColumn = fromKey.toString()
    def toColumn = toKey.toString()

    // Finds the single vertex with the given label and id, failing with context.
    def findEndpoint = { vertexLabel, column, row ->
        def raw = row[column]
        if (raw == null) {
            throw new IllegalArgumentException("Edge row has no '" + column + "' column: " + row)
        }
        // Endpoint columns are not named "id", so they may still be Strings,
        // while vertex ids are looked up as Long values.
        def vertexId = (raw instanceof Number) ? raw.longValue() : raw.toString().toLong()
        def matches = g.V().has(vertexLabel, "id", vertexId)
        if (!matches.hasNext()) {
            throw new NoSuchElementException("No " + vertexLabel + " vertex with id " + vertexId)
        }
        return matches.next()
    }

    allEdges.each { edgeProps ->
        def fromVertex = findEndpoint(fromlabel, fromColumn, edgeProps)
        def toVertex = findEndpoint(tolabel, toColumn, edgeProps)
        def edge = g.V(fromVertex).addE(label).to(toVertex).next()
        edgeProps.each { key, val ->
            // The endpoint ids identify the vertices; they are not edge properties.
            if (key != fromColumn && key != toColumn) {
                edge.property(key, val)
            }
        }
    }
}
/**
 * Lists the entries directly inside a directory whose name ends with ".csv".
 *
 * @param dir path of the directory to scan (not recursive)
 * @return list of path strings, in the order the directory listing yields them
 */
def getAllCSVFilesInDirectory(dir) {
    def csvPaths = []
    new File(dir).eachFile { entry ->
        // Guard clause: anything that is not a .csv entry is ignored.
        if (!entry.name.endsWith(".csv")) {
            return
        }
        csvPaths.add(entry.toString())
    }
    return csvPaths
}
/**
 * Loads every CSV file found in <baseDir>/<type>/<label> as vertices.
 *
 * @param g           graph traversal source the vertices are added through
 * @param type        sub-directory under baseDir (the callers in this script
 *                    pass "static" or "dynamic")
 * @param label       directory name, and the vertex label unless typeAsLabel is set
 * @param typeAsLabel when true, each vertex is labelled by its own "type" column
 * @param baseDir     root directory of the CSV data; defaults to the location
 *                    this script has always used
 */
def loadLDBCVertices(g, type, label, typeAsLabel, baseDir = "/mygraph/ldbc") {
    // Locals are declared with `def` so they do not become script-wide globals.
    def path = "${baseDir}/${type}/${label}".toString()
    getAllCSVFilesInDirectory(path).each { csvFile ->
        constructVertices(g, label, CSVToMap(csvFile), typeAsLabel)
    }
    println "Loaded $label vertices"
}
/**
 * Loads every CSV file found in <baseDir>/dynamic/<inlabel>_<label>_<outlabel>
 * as edges.
 *
 * Despite the parameter names, `inlabel` is the label of the vertex each edge
 * starts from and `outlabel` the label of the vertex it points to: they are
 * handed to constructEdges as its from/to labels in that order.
 *
 * @param g        graph traversal source the edges are added through
 * @param label    edge label
 * @param inlabel  vertex label of the edge's source vertex
 * @param outlabel vertex label of the edge's target vertex
 * @param baseDir  root directory of the CSV data; defaults to the location
 *                 this script has always used
 */
def loadLDBCEdges(g, label, inlabel, outlabel, baseDir = "/mygraph/ldbc") {
    def path = "${baseDir}/dynamic/${inlabel}_${label}_${outlabel}".toString()
    // If both endpoints share a label the id columns are <label>1Id and
    // <label>2Id, otherwise they are <inlabel>Id and <outlabel>Id.
    // The keys are typed as String on purpose: an interpolated GString does
    // not hash like a String, so it would never match the String keys of
    // the parsed rows.
    boolean sameLabel = (inlabel == outlabel)
    String inKey = sameLabel ? inlabel + "1Id" : inlabel + "Id"
    String outKey = sameLabel ? inlabel + "2Id" : outlabel + "Id"
    getAllCSVFilesInDirectory(path).each { csvFile ->
        constructEdges(g, label, inlabel, inKey, outlabel, outKey, CSVToMap(csvFile))
    }
    println "Loaded $label edges"
}
/**
 * Defines the schema, loads all vertex files and then all edge files, commits
 * the load, and returns the traversal source that was used.
 *
 * Vertices are loaded before edges because every edge looks up its two
 * endpoint vertices by id.
 *
 * NOTE(review): the schema declares more edge labels than are loaded here
 * (for example containerOf, hasCreator, replyOf) - confirm that leaving them
 * out is intended.
 *
 * @param graph an open JanusGraph instance
 * @return the graph traversal source created from `graph`
 */
def loadLDBC(graph) {
    defineSchemaAndIndices(graph)
    println "Schema and Indices defined"
    // Local on purpose; the caller receives it through the return value.
    def g = graph.traversal()

    // [type directory, label, typeAsLabel]
    [
        ["static",  "Place",        false],
        ["static",  "Tag",          false],
        ["static",  "TagClass",     false],
        // Organisation can be either Company or University
        ["static",  "Organisation", true],
        ["dynamic", "Forum",        false],
        ["dynamic", "Post",         false],
        ["dynamic", "Comment",      false],
        ["dynamic", "Person",       false],
    ].each { type, vertexLabel, typeAsLabel ->
        loadLDBCVertices(g, type, vertexLabel, typeAsLabel)
    }

    // [edge label, source vertex label, target vertex label]
    [
        ["hasTag",      "Comment", "Tag"],
        ["knows",       "Person",  "Person"],
        ["hasMember",   "Forum",   "Person"],
        ["likes",       "Person",  "Post"],
        ["workAt",      "Person",  "Company"],
        ["hasTag",      "Post",    "Tag"],
        ["hasInterest", "Person",  "Tag"],
        ["hasTag",      "Forum",   "Tag"],
        ["studyAt",     "Person",  "University"],
        ["likes",       "Person",  "Comment"],
    ].each { edgeLabel, sourceLabel, targetLabel ->
        loadLDBCEdges(g, edgeLabel, sourceLabel, targetLabel)
    }

    // Without a commit the whole load lives only in the still-open
    // transaction of the loading thread.
    graph.tx().commit()
    return g
}
// Script entry point: open the graph described by the properties file
// (presumably an in-memory backend, judging by the file name), then define
// the schema and load the CSV data into it.
graph = JanusGraphFactory.open('conf/janusgraph-inmemory-server.properties')
g = loadLDBC(graph)
// Smoke test: print the property map of one vertex and of one edge.
// next() throws if the traversal is empty, so the script fails here when no
// vertex (or, after the vertex line has been printed, no edge) was loaded.
vmap = g.V().limit(1).valueMap().next()
println "Vertex: $vmap"
emap = g.E().limit(1).valueMap().next()
println "Edge: $emap"
// Save to Kryo - reading from kryo takes 6s on my machine compared to 15s for loading from CSV
// g.io('/mygraph/ldbc003.kryo').write().iterate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment