Skip to content

Instantly share code, notes, and snippets.

@amcp
Forked from dkuppitz/ml.groovy
Last active April 9, 2016 13:34
Show Gist options
  • Save amcp/15ad10ff1157ea7ab11f2232732af96f to your computer and use it in GitHub Desktop.
Save amcp/15ad10ff1157ea7ab11f2232732af96f to your computer and use it in GitHub Desktop.
// Usage script for the Gremlin console. It expects a variable named `graph`
// to have been assigned before these lines are run.
// Load the script that defines MovieLensParser.
:load ml.groovy
// Import the data files found in the "ml-1m" directory into `graph`.
MovieLensParser.load(graph, "ml-1m")
g = graph.traversal()
// Count all edges, then all vertices, each wrapped in clockWithResult with an
// argument of 1.
// NOTE(review): clockWithResult is not defined here; presumably it is the
// Gremlin console timing helper and 1 is the number of runs -- confirm.
clockWithResult(1){ g.E().count().tryNext().get() }
clockWithResult(1){ g.V().count().tryNext().get() }
/**
 * Imports a MovieLens data directory (movies.dat, users.dat, ratings.dat) into a
 * TinkerPop graph.
 *
 * Resulting model:
 *   genre / occupation / movie / person vertices, each with a 'uid' property
 *   movie  -hasGenre->      genre
 *   person -hasOccupation-> occupation
 *   person -rated->         movie   (edge properties: 'stars', 'time')
 *
 * Use {@link #load} as the entry point; {@link #parse} only creates elements and
 * never commits.
 */
class MovieLensParser {

    /** Occupation id as found in users.dat -> occupation name. */
    static Map occupations

    /** Every genre name that may appear in the genre column of movies.dat. */
    static List genres

    /**
     * Charset of movies.dat. The iconv command kept in the static initializer
     * shows the file is ISO-8859-1; reading it with another default charset
     * garbles accented titles.
     */
    private static final String MOVIES_CHARSET = 'ISO-8859-1'

    /** Splits "Title (1995)" into title (group 1) and year (group 2); compiled once. */
    private static final java.util.regex.Pattern TITLE_AND_YEAR = ~/(.*\b)\s*\((\d+)\)/

    static {
        occupations = [0: "other", 1: "academic/educator", 2: "artist",
                       3: "clerical/admin", 4: "college/grad student", 5: "customer service",
                       6: "doctor/health care", 7: "executive/managerial", 8: "farmer",
                       9: "homemaker", 10: "K-12 student", 11: "lawyer", 12: "programmer",
                       13: "retired", 14: "sales/marketing", 15: "scientist", 16: "self-employed",
                       17: "technician/engineer", 18: "tradesman/craftsman", 19: "unemployed", 20: "writer"]
        //iconv -f ISO-8859-1 -t UTF-8 movies.dat | sed 's/.*://' | tr "|" "\n" | sort | uniq | sed -e 's/^\(.*\)/\"\1\"/' | tr "\n" "@" | sed 's/@/,\ /g'
        genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary",
                  "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
                  "War", "Western"]
    }

    /**
     * Creates all vertices and edges for the data set in {@code graph}.
     * Does not commit; the caller owns the transaction.
     *
     * Movies whose title does not end in a "(year)" group are skipped, and so
     * are ratings that refer to such a movie. Blank lines are ignored.
     *
     * @param graph         graph (or Titan threaded transaction) that receives the elements
     * @param dataDirectory directory containing movies.dat, users.dat and ratings.dat
     */
    public static void parse(final org.apache.tinkerpop.gremlin.structure.Graph graph, final String dataDirectory) {
        def genremap = [:]
        genres.each {
            genremap.put(it, graph.addVertex(T.label, 'genre', 'uid', 'g' + it, 'name', it))
        }

        //occupations
        def occupationmap = [:]
        (0..20).each {
            occupationmap.put(it, graph.addVertex(T.label, 'occupation', 'uid', 'o' + it, 'jobId', it, 'name', occupations.get(it)))
        }

        // MovieID::Title::Genres
        def moviemap = [:]
        new File(dataDirectory + '/movies.dat').eachLine(MOVIES_CHARSET) { final String line ->
            if (line.trim().isEmpty()) return
            def components = line.split("::")
            def movieTitleYear = TITLE_AND_YEAR.matcher(components[1])
            if (!movieTitleYear.find()) return
            def movieId = components[0].toInteger()
            def movieTitle = movieTitleYear.group(1)
            def movieYear = movieTitleYear.group(2).toInteger()
            // Named movieGenres so it does not shadow the static 'genres' list.
            def movieGenres = components[2]
            def movieVertex = graph.addVertex(T.label, 'movie', 'uid', 'm' + movieId, 'movieId', movieId, 'name', movieTitle, 'year', movieYear)
            moviemap.put(movieId, movieVertex)
            movieGenres.split('\\|').each { def genre ->
                movieVertex.addEdge('hasGenre', genremap[genre])
            }
        }

        // UserID::Gender::Age::Occupation::Zip-code
        def usermap = [:]
        new File(dataDirectory + '/users.dat').eachLine { final String line ->
            if (line.trim().isEmpty()) return
            def components = line.split("::")
            def userId = components[0].toInteger()
            def userGender = components[1]
            def userAge = components[2].toInteger()
            def occupationId = components[3].toInteger()
            def userZipcode = components[4]
            def userVertex = graph.addVertex(T.label, 'person', 'uid', 'u' + userId, 'userId', userId, 'gender', userGender, 'age', userAge, 'zipcode', userZipcode)
            usermap.put(userId, userVertex)
            userVertex.addEdge('hasOccupation', occupationmap[occupationId])
        }

        // UserID::MovieID::Rating::Timestamp
        def addRating = { final String line ->
            if (line.trim().isEmpty()) return
            def components = line.split("::")
            def movieId = components[1].toInteger()
            if (!moviemap.containsKey(movieId)) {
                return
            }
            def userId = components[0].toInteger()
            def stars = components[2].toInteger()
            def time = components[3].toLong()
            usermap[userId].addEdge('rated', moviemap[movieId], 'stars', stars, 'time', time)
        }
        def ratingsFile = new File(dataDirectory + '/ratings.dat')
        if (graph instanceof com.thinkaurelius.titan.graphdb.transaction.StandardTitanTx) {
            // load() hands us a Titan threaded transaction, which is intended to be
            // shared by several threads, so edges can be added in parallel.
            ratingsFile.readLines().parallelStream().forEach(addRating as java.util.function.Consumer)
        } else {
            // Any other graph (e.g. TinkerGraph) is not known to tolerate concurrent
            // mutation, so ratings are added sequentially on the calling thread.
            ratingsFile.eachLine(addRating)
        }
    }

    /**
     * Prepares the schema/index for {@code graph}, imports the data set and
     * prints how long element creation and the commit took.
     *
     * For a Titan graph the property keys, labels and a unique composite index on
     * 'uid' are created first and the import runs inside a threaded transaction
     * that is committed at the end (or rolled back if the import fails). For any
     * other graph {@code createIndex('uid', Vertex.class)} is called and the
     * import runs directly against the graph.
     *
     * @param graph         target graph
     * @param dataDirectory directory containing movies.dat, users.dat and ratings.dat
     */
    public static void load(final org.apache.tinkerpop.gremlin.structure.Graph graph, final String dataDirectory) {
        def isTitan = graph instanceof com.thinkaurelius.titan.graphdb.database.StandardTitanGraph
        if (isTitan) {
            def mgmt = ((com.thinkaurelius.titan.graphdb.database.StandardTitanGraph) graph).openManagement()
            try {
                ["movieId", "year", "stars", "userId", "age", "jobId"].each {
                    mgmt.makePropertyKey(it).dataType(Integer.class).make()
                }
                ["name", "gender", "zipcode"].each {
                    mgmt.makePropertyKey(it).dataType(String.class).make()
                }
                mgmt.makePropertyKey("time").dataType(Long.class).make()
                def uidKey = mgmt.makePropertyKey('uid').dataType(String.class).make()
                mgmt.buildIndex('byUid', org.apache.tinkerpop.gremlin.structure.Vertex.class).addKey(uidKey).unique().buildCompositeIndex()
                ["rated", "hasOccupation", "hasGenre"].each {
                    mgmt.makeEdgeLabel(it).make()
                }
                ["person", "genre", "occupation", "movie"].each {
                    mgmt.makeVertexLabel(it).make()
                }
            } catch (Throwable t) {
                // Do not leave a half-defined schema transaction open.
                mgmt.rollback()
                throw t
            }
            mgmt.commit()
            graph.tx().commit()
        } else {
            graph.createIndex('uid', org.apache.tinkerpop.gremlin.structure.Vertex.class)
        }

        def start = System.currentTimeMillis()
        def actualGraph = isTitan ? graph.tx().createThreadedTx() : graph
        def isThreadedTx = actualGraph instanceof com.thinkaurelius.titan.graphdb.transaction.StandardTitanTx
        try {
            parse(actualGraph, dataDirectory)
        } catch (Throwable t) {
            // Release the threaded transaction instead of leaking it on failure.
            if (isThreadedTx) actualGraph.tx().rollback()
            throw t
        }
        def creating = System.currentTimeMillis()
        println "Creating objects took (ms): " + (creating - start)
        if (isThreadedTx) actualGraph.tx().commit()
        println "Committing took (ms): " + (System.currentTimeMillis() - creating)
    }
}
graph = TinkerGraph.open()
# Followed common-usage.txt to get the following results:
Creating objects took (ms): 3233
Committing took (ms): 8
Time to traverse 969719 edges: 111 ms
Time to traverse 9625 vertices: 2 ms
graph = com.thinkaurelius.titan.core.TitanFactory.build().set("storage.backend", "berkeleyje").
set("storage.directory", "bdb").set("storage.buffer-size", Integer.MAX_VALUE).open()
# Followed common-usage.txt to get the following results:
Creating objects took (ms): 14789
Committing took (ms): 15790
Time to traverse 969719 edges: 2742 ms
Time to traverse 9625 vertices: 1594 ms
graph = com.thinkaurelius.titan.core.TitanFactory.build().set("storage.backend", "inmemory").open()
# Followed common-usage.txt to get the following results:
Creating objects took (ms): 13136
Committing took (ms): 6180
Time to traverse 969719 edges: 1051 ms
Time to traverse 9625 vertices: 17 ms
graph = com.thinkaurelius.titan.core.TitanFactory.build().
set("storage.backend", "jp.classmethod.titan.diskstorage.tupl.TuplStoreManager").
set("storage.tupl.min-cache-size", "1000000000").
set("storage.tupl.map-data-files", "true").set("storage.tupl.direct-page-access", "true").
set("storage.buffer-size", Integer.MAX_VALUE).open()
# Followed common-usage.txt to get the following results:
Creating objects took (ms): 13614
Committing took (ms): 8692
Time to traverse 969719 edges: 1155 ms
Time to traverse 9625 vertices: 191 ms
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment