Skip to content

Instantly share code, notes, and snippets.

@dkuppitz
Last active December 15, 2015 18:29
Show Gist options
  • Save dkuppitz/5304264 to your computer and use it in GitHub Desktop.
Save dkuppitz/5304264 to your computer and use it in GitHub Desktop.
Titan Movie Recommendation
#
# First: Start a new m1.large EC2 instance with Amazon Linux AMI
# Next: Copy/Paste all the following code
#
# Download the MovieLens 10M archive and stage its .dat files in /tmp,
# which is where the Gremlin loader further down reads them from.
wget http://www.grouplens.org/sites/www.grouplens.org/external_files/data/ml-10m.zip
unzip ml-10m.zip
cp ml-10M100K/*.dat /tmp
# The archive and the extracted directory are no longer needed once the
# .dat files have been copied.
rm -f ml-10m.zip
rm -rf ml-10M100K/
# Download and unpack the Titan 0.3.0 distribution, then work from inside it.
wget http://s3.thinkaurelius.com/downloads/titan/titan-all-0.3.0.zip
unzip titan-all-0.3.0.zip
rm -f titan-all-0.3.0.zip
cd titan-all-0.3.0
# optionally adjust JVM settings, especially the Xmx option
# vi bin/gremlin.sh
# Start the Gremlin console; the statements that follow this command are
# Gremlin/Groovy and are meant to be entered at its prompt, not in the shell.
bin/gremlin.sh
// Open a Titan graph on the embedded Cassandra backend and declare the
// schema (property keys and edge labels) before any data is loaded.
conf = new BaseConfiguration()
conf.setProperty("storage.backend", "embeddedcassandra")
graph = TitanFactory.open(conf)
// Property keys. Keys declared with indexed(Vertex.class) get a vertex index;
// 'title', 'timestamp' and 'stars' are stored without one.
graph.makeType().name('type').indexed(Vertex.class).unique(OUT).dataType(String.class).makePropertyKey()
graph.makeType().name('title').unique(OUT).dataType(String.class).makePropertyKey()
graph.makeType().name('genre').indexed(Vertex.class).unique(OUT).dataType(String.class).makePropertyKey()
graph.makeType().name('tag').indexed(Vertex.class).unique(OUT).dataType(String.class).makePropertyKey()
graph.makeType().name('movieId').indexed(Vertex.class).unique(OUT).dataType(Long.class).makePropertyKey()
graph.makeType().name('userId').indexed(Vertex.class).unique(OUT).dataType(Long.class).makePropertyKey()
graph.makeType().name('timestamp').unique(OUT).dataType(Long.class).makePropertyKey()
stars = graph.makeType().name('stars').unique(OUT).dataType(Integer.class).makePropertyKey()
// Edge labels. 'stars' is only ever written on 'rated' edges and is the
// property the likes/likedBy steps filter on, so it is the primary key of
// 'rated'. 'tagged' edges carry 'timestamp' and 'tag' only and therefore
// get a plain label.
graph.makeType().name('rated').primaryKey(stars).makeEdgeLabel()
graph.makeType().name('tagged').makeEdgeLabel()
graph.makeType().name('hasGenera').makeEdgeLabel()
graph.makeType().name('hasTag').makeEdgeLabel()
graph.commit()
// Wrap the Titan graph in a BatchGraph for bulk loading. Vertices are
// addressed by the script-assigned numeric id 'vid'; the 'movies' and
// 'genres' maps remember which id was handed to each movie / genre.
g = new BatchGraph(graph, VertexIDType.NUMBER, 100000)
vid = 1
genres = [:]
movies = [:]
new File('/tmp/movies.dat').eachLine { line ->
  // Line format: movieId::title::genre1|genre2|...
  components = line.split('::')
  movieId = components[0]
  mv = g.addVertex(vid)
  mv.setProperty('type','Movie')
  // 'movieId' is declared with dataType(Long.class), so store a Long
  // rather than an Integer to match the schema.
  mv.setProperty('movieId',movieId.toLong())
  mv.setProperty('title',components[1])
  movies.put(movieId, vid++)
  components[2].split('\\|').each { genre ->
    // Create each genre vertex once; the placeholder genre is never stored.
    if (!genres.containsKey(genre) && genre != '(no genres listed)') {
      gv = g.addVertex(vid)
      gv.setProperty('type','Genre')
      gv.setProperty('genre',genre)
      genres.put(genre, vid++)
    }
    // Only known genres get an edge, which skips the placeholder.
    if (genres.containsKey(genre)) {
      g.addEdge(null, g.getVertex(movies[movieId]), g.getVertex(genres[genre]), 'hasGenera')
    }
  }
}; g.commit()
// The genre lookup table is not used again after the movie load.
genres = null
tags = [:]
users = [:]
new File('/tmp/tags.dat').eachLine { line ->
  // Line format: userId::movieId::tag::timestamp
  components = line.split('::')
  userId = components[0]
  movieId = components[1]
  // Tag vertices are keyed by the lower-cased tag text.
  tag = components[2].toLowerCase()
  timestamp = components[3].toLong()
  if (!tags.containsKey(tag)) {
    tv = g.addVertex(vid)
    tv.setProperty('type','Tag')
    tv.setProperty('tag',tag)
    tags.put(tag, vid++)
  }
  if (!users.containsKey(userId)) {
    uv = g.addVertex(vid)
    uv.setProperty('type','User')
    // 'userId' is declared with dataType(Long.class), so store a Long
    // rather than an Integer to match the schema.
    uv.setProperty('userId',userId.toLong())
    users.put(userId, vid++)
  }
  // Edges are only created for movies that were loaded from movies.dat.
  if (movies.containsKey(movieId)) {
    g.addEdge(null, g.getVertex(movies[movieId]), g.getVertex(tags[tag]), 'hasTag')
    e = g.addEdge(null, g.getVertex(users[userId]), g.getVertex(movies[movieId]), 'tagged')
    e.setProperty('timestamp', timestamp)
    // The edge keeps the tag text exactly as it appears in the file.
    e.setProperty('tag', components[2])
  }
}; g.commit()
// The tag lookup table is not used again after the tag load.
tags = null
i = 1
new File('/tmp/ratings.dat').eachLine { line ->
  // Line format: userId::movieId::rating::timestamp
  components = line.split('::')
  userId = components[0]
  movieId = components[1]
  // Ratings are doubled and stored as integers (e.g. 4.5 becomes 9),
  // matching the Integer data type declared for 'stars'.
  rating = (components[2].toDouble() * 2).toInteger()
  timestamp = components[3].toLong()
  // Users that never tagged anything first show up here.
  if (!users.containsKey(userId)) {
    uv = g.addVertex(vid)
    uv.setProperty('type','User')
    // 'userId' is declared with dataType(Long.class), so store a Long
    // rather than an Integer to match the schema.
    uv.setProperty('userId',userId.toLong())
    users.put(userId, vid++)
  }
  // Edges are only created for movies that were loaded from movies.dat.
  if (movies.containsKey(movieId)) {
    e = g.addEdge(null, g.getVertex(users[userId]), g.getVertex(movies[movieId]), 'rated')
    e.setProperty('timestamp', timestamp)
    e.setProperty('stars', rating)
  }
  // Print progress and commit every 100000 input lines.
  if (i % 100000 == 0) {
    println i
    g.commit()
  }
  i++
}; g.commit()
g.shutdown()
graph.shutdown()
// Open the graph again (the loader shut it down) and bind it to 'g' for querying.
conf = new BaseConfiguration()
conf.setProperty("storage.backend", "embeddedcassandra")
g = TitanFactory.open(conf)

// likes(threshold): from a vertex, follow outgoing 'rated' edges whose
// 'stars' value is at least the threshold, to the vertex the edge points at.
Gremlin.defineStep('likes', [Vertex, Pipe]) { threshold ->
  _().outE('rated').has('stars', T.gte, threshold).inV()
}

// likedBy(threshold): the reverse direction - follow incoming 'rated' edges
// whose 'stars' value is at least the threshold, back to the edge's source.
Gremlin.defineStep('likedBy', [Vertex, Pipe]) { threshold ->
  _().inE('rated').has('stars', T.gte, threshold).outV()
}
// Recommendation query. Ratings are stored doubled, so 9 means 4.5 stars.
minRating = 9
// Side-effect collections filled in by the traversal below.
knownMovies = [] as Set
favTags = [] as Set
favGenera = [] as Set
/* pick a random user; keep it in 'user' because the filter step */
/* further down needs it to exclude the user from the peer group */
user = g.V('type','User').random(0.5).next()
user \
/* determine known movies */ \
.outE('rated','tagged').inV().aggregate(knownMovies).optional(3) \
/* determine favorite movies */ \
.likes(minRating) \
/* store favorite movie tags */ \
.out('hasTag').store(favTags).optional(2) \
/* store favorite movie genre */ \
.out('hasGenera').store(favGenera).optional(2) \
/* who else has these favorite movies (take 1%) */ \
.likedBy(minRating).dedup().filter{it!=user}.random(0.01) \
/* what are the others favorite movies that the user does not already know */ \
.likes(minRating).except(knownMovies) \
/* filter out movies that don't have any of the user's favorite tags */ \
.out('hasTag').retain(favTags).back(2) \
/* filter out movies that don't have any of the user's favorite genres */ \
.out('hasGenera').retain(favGenera).back(2) \
/* take movie titles, determine relevance and */ \
/* show the 5 most relevant recommendations */ \
.title.groupCount().cap().orderMap(T.decr)[0..4]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment