Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
It's for counting biological_process terms which have is_a and part_of.
"""
Count the biological_process terms that have both is_a and part_of relationships.
Usage:
$ wget http://purl.obolibrary.org/obo/go.obo
$ python3 go_term_stat.py < go.obo
Main data structure:

    target_terms = {
        'GO:12345': {
            'is_a': ['GO:12344', 'GO:12333'],
            'part_of': ['GO:1111', 'GO:2222'],
            'name': 'term name',
        }
    }
"""
import sys
import pprint
# Maps a GO id to the data collected for that term: its 'is_a' parents, its
# 'part_of' targets and, when present, its 'name'.
target_terms = {}
def iter_term_lines(afile):
    """Yield the lines of each ``[Term]`` stanza found in an OBO stream.

    Everything before the first ``[Term]`` header (the file header) is
    skipped, and so is the content of any other stanza type such as
    ``[Typedef]``.

    Args:
        afile: Iterable of text lines, e.g. an open ``go.obo`` file or
            ``sys.stdin``.

    Yields:
        One list of whitespace-stripped lines per ``[Term]`` stanza.  The
        first element is the ``[Term]`` header itself, followed by the
        stanza's lines in file order (blank separator lines included).
    """
    # A plain ``for`` loop is used instead of bare ``next()`` calls: inside a
    # generator, a StopIteration escaping from ``next()`` is turned into a
    # RuntimeError, which made empty or header-only input crash.
    lines = None  # None while no [Term] stanza is being collected
    for line in afile:
        line = line.strip()
        if line.startswith('['):
            # Any stanza header closes the term collected so far.
            if lines is not None:
                yield lines
            # Only [Term] stanzas are collected; other stanzas are dropped so
            # they cannot leak into the preceding term's lines.
            lines = [line] if line.startswith('[Term]') else None
        elif lines is not None:
            lines.append(line)
    if lines is not None:
        yield lines
# Tally every [Term] stanza read from standard input and keep the
# biological_process terms that have at least one is_a parent AND at least
# one part_of relationship.
total_terms = 0
bp_terms = 0
for lines in iter_term_lines(sys.stdin):
    total_terms += 1

    # Tags are looked up by name rather than by position in the stanza, so a
    # short stanza or a different tag order cannot raise IndexError or make
    # the wrong line be read as the id / namespace.
    go_id = None
    namespace = None
    data = {
        'is_a': [],
        'part_of': [],
    }
    # lines[0] is the stanza header; the tag lines follow it.
    for line in lines[1:]:
        if line.startswith('['):
            # A different stanza header ended up in this chunk; nothing
            # after it belongs to the current term.
            break
        key, _, value = line.partition(':')
        tokens = value.split()
        if not tokens:
            # Blank line, or a tag without a value.
            continue
        if key == 'id' and go_id is None:
            go_id = tokens[0]
        elif key == 'namespace' and namespace is None:
            namespace = tokens[0]
        elif key == 'name':
            data['name'] = value.strip()
        elif key == 'is_a':
            # "is_a: GO:0000001 ! parent name" -> keep only the id.
            data['is_a'].append(tokens[0])
        elif key == 'relationship' and tokens[0] == 'part_of' and len(tokens) > 1:
            # "relationship: part_of GO:0000001 ! name" -> keep only the id.
            # The relationship type is compared exactly so that a type that
            # merely starts with "part_of" is not counted.
            data['part_of'].append(tokens[1])

    if go_id is None or namespace != 'biological_process':
        continue
    bp_terms += 1
    if data['is_a'] and data['part_of']:
        target_terms[go_id] = data

pprint.pprint(target_terms)
print(total_terms, bp_terms, len(target_terms))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.