Created
May 13, 2015 23:01
-
-
Save yong27/9bf643bc1327c2fecd63 to your computer and use it in GitHub Desktop.
It's for counting biological_process terms which have is_a and part_of.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Count biological_process terms that have both is_a and part_of relations.

Usage:
    $ wget http://purl.obolibrary.org/obo/go.obo
    $ python3 go_term_stat.py < go.obo

Main data structure:
    terms = {
        'GO:12345': {
            'is_a': ['GO:12344', 'GO:12333'],
            'part_of': ['GO:1111', 'GO:2222'],
        }
    }
""" | |
import sys | |
import pprint | |
# Selected terms, keyed by GO id. Each value is a dict with the lists
# 'is_a' and 'part_of' (parent GO ids) and, when present, the term 'name'.
target_terms = {}
def iter_term_lines(afile):
    """Yield the lines of each ``[Term]`` stanza found in an OBO-style file.

    Args:
        afile: Iterable of text lines (an open file, ``sys.stdin``, an
            iterator over strings, ...).

    Yields:
        list: The stripped, non-blank lines of one stanza. Element 0 is the
        ``[Term]`` header line; the tag lines follow in file order.

    Everything before the first ``[Term]`` header is skipped. A stanza ends
    at the next line starting with ``[``; stanzas of any other kind (for
    example ``[Typedef]``) are skipped instead of being appended to the
    preceding term. Empty input, or input without any ``[Term]`` header,
    yields nothing rather than raising.
    """
    stanza = None  # None while we are outside a [Term] stanza
    for raw_line in afile:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('['):
            # Any stanza header closes the stanza being collected.
            if stanza is not None:
                yield stanza
            stanza = [line] if line.startswith('[Term]') else None
            continue
        if stanza is not None:
            stanza.append(line)
    if stanza is not None:
        yield stanza
def _parse_term(lines):
    """Extract the id, namespace and relations from one stanza's lines.

    Args:
        lines: List of ``tag: value`` strings for one stanza; the first
            element is the stanza header.

    Returns:
        tuple: ``(go_id, namespace, data)``. ``go_id`` and ``namespace`` are
        ``None`` when the stanza has no such tag. ``data`` is a dict with
        the lists ``'is_a'`` and ``'part_of'`` and, when a name tag is
        present, ``'name'``.
    """
    go_id = None
    namespace = None
    data = {
        'is_a': [],
        'part_of': [],
    }
    for index, line in enumerate(lines):
        line = line.strip()
        # The stanza list may carry the lines of following non-Term stanzas;
        # stop at the first header after our own so they cannot leak in.
        if index and line.startswith('['):
            break
        key, sep, value = line.partition(':')
        if not sep:
            continue
        fields = value.split()
        if key == 'id':
            if go_id is None:
                go_id = value.strip()
        elif key == 'namespace':
            if namespace is None:
                namespace = value.strip()
        elif key == 'name':
            data['name'] = value.strip()
        elif key == 'is_a':
            # "is_a: GO:0000002 ! label" -> keep only the parent id.
            if fields:
                data['is_a'].append(fields[0])
        elif key == 'relationship':
            # "relationship: part_of GO:0000003 ! label" -> keep the target id.
            if len(fields) >= 2 and fields[0] == 'part_of':
                data['part_of'].append(fields[1])
    return go_id, namespace, data


total_terms = 0
bp_terms = 0
for lines in iter_term_lines(sys.stdin):
    total_terms += 1
    go_id, namespace, data = _parse_term(lines)
    if namespace != 'biological_process':
        continue
    bp_terms += 1
    # Keep only terms that have at least one is_a AND one part_of parent.
    if go_id is not None and data['is_a'] and data['part_of']:
        target_terms[go_id] = data

pprint.pprint(target_terms)
print(total_terms, bp_terms, len(target_terms))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment