Created
April 12, 2017 14:07
-
-
Save LLCampos/9aa406e349dfe5a1536271a3ed0c18ed to your computer and use it in GitHub Desktop.
Extract all names and synonyms from a .obo file into a .txt file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Build a plain-text lexicon of all names and synonyms found in a .obo file.

Run as a script, this reads ``input_file_name`` and writes one lowercased,
de-duplicated entity name per line to ``output_file_name``.
"""

import io
import re

input_file_name = 'hp.obo'
output_file_name = 'hpo.txt'

# Anchored to the start of a line so that "name: " occurring in the middle of
# another tag's value (e.g. inside a definition text) is not picked up.
_NAME_PATTERN = re.compile(r'^name: (.*)', re.MULTILINE)

# Captures only the first double-quoted string after "synonym: ". A greedy
# "(.*)" would run up to the LAST quote on the line, swallowing the scope and
# any later quoted text (such as a quoted xref description). Backslash-escaped
# characters (e.g. \") inside the synonym are kept verbatim.
_SYNONYM_PATTERN = re.compile(r'^synonym: "((?:[^"\\]|\\.)*)"', re.MULTILINE)


def extract_entity_names(obo_string):
    """Return the sorted, unique, normalized entity names in *obo_string*.

    Collects the value of every ``name:`` line and the quoted text of every
    ``synonym:`` line, lowercases and strips each one, drops entries that are
    empty after stripping, and removes duplicates.

    Args:
        obo_string: Full text of an .obo file.

    Returns:
        A list of strings in ascending order, so output is reproducible
        between runs.
    """
    raw_names = _NAME_PATTERN.findall(obo_string)
    raw_names += _SYNONYM_PATTERN.findall(obo_string)

    # Normalize: lowercase and remove surrounding whitespace
    # (ex: convert "Oral ulcer " to "oral ulcer").
    normalized = {name.lower().strip() for name in raw_names}

    # A blank "name: " line would otherwise produce an empty lexicon entry.
    normalized.discard('')

    return sorted(normalized)


def main(input_path=input_file_name, output_path=output_file_name):
    """Read the .obo file at *input_path* and write the lexicon to *output_path*.

    Args:
        input_path: Path of the .obo file to read.
        output_path: Path of the text file to create (overwritten if present).
    """
    # io.open accepts an explicit encoding on both Python 2 and Python 3, so
    # the result does not depend on the platform's default encoding.
    with io.open(input_path, encoding='utf-8') as f:
        obo_string = f.read()

    entity_names = extract_entity_names(obo_string)

    with io.open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(entity_name + '\n' for entity_name in entity_names)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment