Skip to content

Instantly share code, notes, and snippets.

@larssono
Last active May 26, 2016 16:31
Show Gist options
  • Save larssono/9657a888f24e7a836806cda60f484048 to your computer and use it in GitHub Desktop.
Save larssono/9657a888f24e7a836806cda60f484048 to your computer and use it in GitHub Desktop.

##Load graph into memory g = new TinkerGraph() g.loadGraphSON('/Users/lom/Downloads/graph.json')

##Find all activities g.V('concreteType', 'activity').name nActivities = g.V('concreteType', 'activity').count()

##How many entities are there nEntities = g.V.filter{it.concreteType !='activity'}.count()

#What percentage of entities have provenance nActivities/nEntities

## Who is creating this provenance
g.V('concreteType', 'activity').property('createdBy').groupCount().cap()
g.V('concreteType', 'activity').property('createdBy').groupCount().cap().orderMap(T.incr)

##Get information about an existing entity g.V('synId', 'syn2246874').name #Find what it is using g.V('synId', 'syn2246874').out.map()
#Filter by specific user g.V('synId', 'syn2246874').out().has('createdBy', T.eq, '372127').out.name

##Find all downstream things derived from this g.V('synId', 'syn2246874').filter({it.out.hasNext()}).as('derived').out().loop('derived'){true}{true}.filter{it.concreteType !='activity'}.name

#A more dependent file g.V('synId', 'syn2534308').filter({it.out.hasNext()}).as('derived').out().loop('derived'){true}{true}.filter{it.concreteType !='activity'}.name

##Find most dependent g.V.as('start').filter({it.out.hasNext()}).as('derived').out().loop('derived'){true}{true}.filter{it.concreteType !='activity'}.property('synId').groupCount().cap()

##Find most used entity in Synapse g.V.as('start').filter({it.in.hasNext()}).as('derived').in().loop('derived'){true}{true}.filter{it.concreteType !='activity'}.property('synId').groupCount().cap().orderMap(T.incr)

## Exclude a project
g.V.as('start').filter({it.in.hasNext()}).as('derived').in().loop('derived'){true}{true}.filter{it.concreteType !='activity'}.filter({it.benefactorId !='syn2280661'}).property('synId').groupCount().cap().orderMap(T.incr)

#####################
## Recommendation engine
#####################

  1. Find all entities that I have used,
  2. find others who have used the same
  3. Determine what else they used
  4. Give me the top 5 most similar things they used that I didn't use.

g.V('createdBy', '372127').out().as('myActivities').in().in().except('myActivities').out().groupCount().cap().orderMap(T.decr)[0..<5].out.synId

#####################
## Compute primary eigenvector
#####################
m = [:]; c = 0;
g.V.as('x').out.groupCount(m).loop('x'){c++ < 1000}
m.sort{-it.value}

import synapseclient
import json
from collections import OrderedDict
import multiprocessing.dummy as mp
# Log in to Synapse once at import time; the resulting client `syn` is the
# module-level handle used by getEntities() and safeGetActivity().
syn = synapseclient.login()
# Numeric node-type code -> type name (presumably the values found in the
# `entity.nodeType` query column; getEntities() skips codes 2, 3 and 4).
NODETYPES = dict(enumerate([
    'dataset',         # 0
    'layer',           # 1
    'project',         # 2
    'preview',         # 3
    'folder',          # 4
    'analysis',        # 5
    'step',            # 6
    'code',            # 7
    'link',            # 8
    'phenotypedata',   # 9
    'genotypedata',    # 10
    'expressiondata',  # 11
    'robject',         # 12
    'summary',         # 13
    'genomicdata',     # 14
    'page',            # 15
    'file',            # 16
    'table',           # 17
    'community',       # 18
]))
def idGenerator(start=0):
    """Yield an endless run of consecutive integers beginning at *start*."""
    current = start
    while True:
        yield current
        current = current + 1
# Shared source of the unique `_id` values assigned to every vertex and edge
# that ends up in the exported graph.
newId = idGenerator()#29602)
# Independent counter used only to number the progress messages printed by
# safeGetActivity().
counter2 = idGenerator()
def getEntities(benefactorId):
    """Query Synapse for the entities of one project and turn them into vertices.

    Rows whose ``entity.nodeType`` is 2, 3 or 4 (project, preview and folder
    in NODETYPES) are skipped. For every remaining row the leading
    ``entity.`` component is stripped from each column name, single-element
    list values are unwrapped to their first element, and the graph
    bookkeeping fields ``_type``, ``_id`` and ``synId`` are added.

    :param benefactorId: Synapse id interpolated into the ``projectId``
        clause of the query.
    :returns: dict mapping ``"<synId>.<versionNumber>"`` to the vertex dict.
    :raises KeyError: if a row lacks ``entity.nodeType``, ``id`` or
        ``versionNumber``.
    """
    query = syn.chunkedQuery('select * from entity where projectId=="%s"' % benefactorId)
    entityDict = dict()
    for ent in query:
        if ent['entity.nodeType'] in [2, 3, 4]:  # Remove containers
            continue
        # Iterate over a snapshot of the keys: the loop pops and inserts
        # keys, which is an error on a live dict view under Python 3.
        for key in list(ent.keys()):
            # remove the "entity" portion of query
            new_key = '.'.join(key.split('.')[1:])
            item = ent.pop(key)
            if isinstance(item, list) and len(item) > 0:
                item = item[0]
            ent[new_key] = item
        ent['_type'] = 'vertex'
        ent['_id'] = next(newId)
        ent['synId'] = ent.pop('id')
        entityDict['%s.%s' % (ent['synId'], ent['versionNumber'])] = ent
        print('getting entity (%i): %s.%s' % (ent['_id'], ent['synId'],
                                              ent['versionNumber']))
    return entityDict
def safeGetActivity(entity):
    """Fetch the provenance record of one entity, tolerating HTTP errors.

    :param entity: ``(key, vertex)`` pair; the vertex must provide
        ``synId`` and ``versionNumber``.
    :returns: ``(key, provenance)``, where provenance is ``None`` when the
        lookup raised ``SynapseHTTPError``.
    """
    k, ent = entity
    # NOTE(review): counter2 is one generator shared by all pool threads;
    # concurrent next() calls on a generator can raise ValueError
    # ("generator already executing") -- confirm whether this needs a lock.
    print('Getting Provenance for: %s %s' % (k, next(counter2)))
    # Only the remote call is guarded, so unrelated errors are not hidden.
    try:
        prov = syn.getProvenance(ent['synId'], version=ent['versionNumber'])
    except synapseclient.exceptions.SynapseHTTPError:
        return (k, None)
    return (k, prov)
def cleanUpActivities(activities):
    """Drop failed lookups and tag the remaining activities as graph vertices.

    Each surviving activity is modified in place: its ``id`` is moved to
    ``synId``, ``concreteType`` is set to ``'activity'``, and the ``_id`` /
    ``_type`` bookkeeping fields are added.

    :param activities: iterable of ``(key, activity_or_None)`` pairs, as
        returned by mapping safeGetActivity over the entities.
    :returns: dict mapping each key to its activity, omitting the pairs
        whose activity was ``None``.
    :raises KeyError: if an activity has no ``id``.
    """
    returnDict = dict()
    for k, activity in activities:
        print('Cleaning up activity: %s' % k)
        if activity is None:
            continue
        activity['synId'] = activity.pop('id')
        activity['concreteType'] = 'activity'
        activity['_id'] = next(newId)
        activity['_type'] = 'vertex'
        returnDict[k] = activity
    return returnDict
def _auditFields(activity):
    """Return the creator/modifier stamps that every edge copies from *activity*."""
    return {'createdBy': activity['createdBy'],
            'createdOn': activity['createdOn'],
            'modifiedBy': activity['modifiedBy'],
            'modifiedOn': activity['modifiedOn']}


def buildEdgesfromActivities(nodes, activities):
    """Create the ``used`` and ``generatedBy`` edges linking entities and activities.

    ``nodes`` is modified in place: a stub vertex is added for every used
    entity or URL that is not already present, and every activity that was
    encountered is added under its ``synId``.

    :param nodes: dict of vertex dicts keyed by ``"<synId>.<versionNumber>"``
        (or by URL for URL vertices).
    :param activities: dict mapping an entity key to its cleaned-up activity.
    :returns: list of edge dicts.
    :raises KeyError: if an activity lacks one of the audit fields, or a
        used item lacks ``reference`` / ``url``.
    """
    usedEntityType = 'org.sagebionetworks.repo.model.provenance.UsedEntity'
    usedUrlType = 'org.sagebionetworks.repo.model.provenance.UsedURL'
    new_nodes = dict()
    edges = list()
    # Walk a snapshot: stub vertices are inserted into `nodes` inside the
    # loop, which is an error on a live dict view under Python 3.
    for k, entity in list(nodes.items()):
        print('processing entity: %s' % k)
        if k not in activities:
            continue
        activity = activities[k]
        # Determine if we have already seen this activity
        if activity['synId'] not in new_nodes:
            new_nodes[activity['synId']] = activity
            # Add input relationships; an activity may have no 'used' list.
            for used in activity.get('used', []):
                concreteType = used.get('concreteType')
                # add missing vertices to nodes
                if concreteType == usedEntityType:
                    reference = used['reference']
                    targetId = '%s.%s' % (reference['targetId'],
                                          reference.get('targetVersionNumber'))
                    if targetId not in nodes:
                        nodes[targetId] = {
                            '_id': next(newId),
                            '_type': 'vertex',
                            'synId': reference['targetId'],
                            'versionNumber': reference.get('targetVersionNumber'),
                        }
                elif concreteType == usedUrlType:
                    targetId = used['url']
                    if targetId not in nodes:
                        nodes[targetId] = {
                            '_id': next(newId),
                            '_type': 'vertex',
                            'name': used.get('name'),
                            'url': used['url'],
                            'concreteType': used['concreteType'],
                        }
                else:
                    # Unknown kind of input: there is no target vertex, so
                    # skip it rather than reuse the previous targetId.
                    continue
                # Create the incoming edges
                usedEdge = {'_id': next(newId),
                            '_inV': activity['_id'],
                            '_type': 'edge',
                            '_outV': nodes[targetId]['_id'],
                            '_label': 'used',
                            'wasExecuted': used.get('wasExecuted', False)}
                usedEdge.update(_auditFields(activity))
                edges.append(usedEdge)
        else:
            # Reuse the first-seen copy so all edges share one vertex `_id`.
            activity = new_nodes[activity['synId']]
        # Add generated relationship (i.e. out edge)
        generatedEdge = {'_id': next(newId),
                         '_inV': entity['_id'],
                         '_outV': activity['_id'],
                         '_type': 'edge',
                         '_label': 'generatedBy'}
        generatedEdge.update(_auditFields(activity))
        edges.append(generatedEdge)
    nodes.update(new_nodes)
    return edges
if __name__ == '__main__':
    # Export the provenance graph of one hard-coded project to graph.json.
    p = mp.Pool(7)
    try:
        nodes = getEntities(benefactorId='syn1773109')
        print('Fetched %i entities' % len(nodes))
        activities = p.map(safeGetActivity, list(nodes.items()))
    finally:
        # Release the worker threads even if a lookup raised.
        p.close()
        p.join()
    activities = cleanUpActivities(activities)
    # Share of entities that have an activity, as a real percentage. Guarded
    # so an empty project does not divide by zero.
    pctWithProvenance = 100.0 * len(activities) / len(nodes) if nodes else 0.0
    print('%i activities found i.e. %0.1f%% entities have provenance' % (
        len(activities), pctWithProvenance))
    edges = buildEdgesfromActivities(nodes, activities)
    print('I have %i nodes and %i edges' % (len(nodes), len(edges)))
    with open('graph.json', 'w') as fp:
        # list(...) because a dict view is not JSON-serialisable on Python 3.
        json.dump(OrderedDict([('vertices', list(nodes.values())),
                               ('edges', edges)]), fp, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment