Skip to content

Instantly share code, notes, and snippets.

@caodac
Created February 11, 2021 13:39
Show Gist options
  • Save caodac/e192546eaa8b40c8482ce4014f97e2e1 to your computer and use it in GitHub Desktop.
Save caodac/e192546eaa8b40c8482ce4014f97e2e1 to your computer and use it in GitHub Desktop.
Count the number of leaf nodes for each HPO category under Phenotypic Abnormality (HP:0000118)
from neo4j import GraphDatabase
from jinja2 import Template
import json
# Jinja2 template for the per-leaf Cypher query; `{{ leaf }}` is substituted
# with the leaf term's notation before the query is sent.
#
# The pattern starts at the DATA node whose notation is the leaf, follows its
# PAYLOAD edge to an S_HP node `n`, walks 0..11 R_subClassOf edges up to `m`,
# and requires one further R_subClassOf edge from `m` to the S_HP node whose
# payload has notation 'HP:0000118'. `m` is therefore a direct subclass of
# HP:0000118. Every traversed edge must carry a `source` equal to, or
# containing, `n.source`. The query returns the distinct notation/label of
# the payload node(s) attached to `m`.
QUERY="""
match p=(d:DATA)-[:PAYLOAD]->(n:S_HP)-[e:R_subClassOf*0..11]->(m:S_HP)-[e2:R_subClassOf]->(:S_HP)<-[:PAYLOAD]-(z)
where z.notation = 'HP:0000118'
and d.notation='{{ leaf }}'
and all(x in e where x.source=n.source or n.source in x.source)
and (e2.source=n.source or n.source in e2.source)
with d,m match (m)<-[:PAYLOAD]-(z)
return distinct z.notation as ID, z.label as LABEL
"""
# Bolt URI of the graph database queried by this script.
uri = "bolt://disease.ncats.io:80"
# Driver object is constructed at import time, using user "neo4j" and an
# empty password; it is shared by every session opened below.
driver = GraphDatabase.driver(uri, auth=("neo4j", ""))
# Parse the query template once; callers only render it per leaf.
template = Template(QUERY)
def run_cypher(tx, query):
    """Run *query* on *tx* and collect the ID/LABEL columns of every record.

    Args:
        tx: Object exposing ``run(query)`` that yields records which can be
            indexed by the column names ``'ID'`` and ``'LABEL'``.
        query: Query text passed unchanged to ``tx.run``.

    Returns:
        A list of ``(ID, LABEL)`` tuples, in the order the records were
        yielded. Empty when the query yields no records.
    """
    return [(record['ID'], record['LABEL']) for record in tx.run(query)]
def get_leafs():
    """Fetch every HP leaf term from the graph.

    The query selects payload nodes attached to S_HP nodes that have no
    incoming R_subClassOf edge and do not carry the TRANSIENT,
    AnnotationProperty or ObjectProperty labels, keeping only notations that
    match ``HP:.*``. Rows are ordered by notation, descending.

    Returns:
        A list of ``(ID, LABEL)`` tuples as produced by ``run_cypher``.
    """
    leaf_query = """
match p=(d)-[:PAYLOAD]->(n:S_HP)
where not (n)<-[:R_subClassOf]-()
and not n:TRANSIENT
and not n:AnnotationProperty
and not n:ObjectProperty
and d.notation =~ 'HP:.*'
return d.notation as ID, d.label as LABEL
order by d.notation desc
"""
    with driver.session() as session:
        leaf_rows = session.read_transaction(run_cypher, leaf_query)
        return leaf_rows
def get_categories(leaf):
    """Look up the categories a single leaf term falls under.

    Args:
        leaf: Notation of the leaf term; it is rendered into the module-level
            query template as the ``leaf`` variable.

    Returns:
        A list of ``(ID, LABEL)`` tuples as produced by ``run_cypher``.
    """
    with driver.session() as session:
        # Render inside the session scope, exactly where the original
        # evaluated it, so the order of effects is unchanged.
        cypher_text = template.render(leaf=leaf)
        category_rows = session.read_transaction(run_cypher, cypher_text)
        return category_rows
def count_categories(leafs, lookup):
    """Tally how many leaf terms fall under each category.

    Args:
        leafs: Iterable of ``(ID, LABEL)`` pairs, one per leaf term. Only the
            ID is used.
        lookup: Callable taking a leaf ID and returning an iterable of
            ``(ID, LABEL)`` pairs for the categories of that leaf.

    Returns:
        A dict mapping category ID to ``{'label': <label>, 'count': <int>}``,
        in first-seen order. The label is the one reported the first time
        the category is encountered. Empty when *leafs* is empty.
    """
    categories = {}
    for leaf_id, _leaf_label in leafs:
        for cat_id, cat_label in lookup(leaf_id):
            # setdefault creates the entry on first sight, so the counter
            # update below is the same for new and existing categories.
            entry = categories.setdefault(
                cat_id, {'label': cat_label, 'count': 0})
            entry['count'] += 1
    return categories


if __name__ == '__main__':
    try:
        categories = count_categories(get_leafs(), get_categories)
        print(json.dumps(categories, indent=2))
    finally:
        # Release the driver's connections even if a query fails.
        driver.close()
@caodac
Copy link
Author

caodac commented Feb 11, 2021

This script can be replaced with a single Cypher query:

// Single-query version of the script: the first MATCH selects the HP leaf
// terms (S_HP nodes with no incoming R_subClassOf edge), the second walks
// each leaf up to a direct subclass of HP:0000118, and the final RETURN
// counts the distinct leaves per category, largest count first.
match (d)-[:PAYLOAD]->(n:S_HP) 
where not (n)<-[:R_subClassOf]-() 
and not n:TRANSIENT 
and not n:AnnotationProperty 
and not n:ObjectProperty 
and d.notation =~ 'HP:.*' with d as leaf 
match (d:DATA)-[:PAYLOAD]->(n:S_HP)-[e:R_subClassOf*0..11]->(m:S_HP)-[e2:R_subClassOf]->(:S_HP)<-[:PAYLOAD]-(z) 
where z.notation = 'HP:0000118' 
and d.notation= leaf.notation
and all(x in e where x.source=n.source or n.source in x.source) 
and (e2.source=n.source or n.source in e2.source) 
with leaf,m match (m)<-[:PAYLOAD]-(z) 
return z.notation as ID, z.label as LABEL,count(distinct leaf) as COUNT order by COUNT desc

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment