# LLCampos/obo-to-words.py

## obo-to-words.py
import io
import re

input_file_name = 'hp.obo'
output_file_name = 'hpo.txt'

# "name: <text>" tag lines. Anchored to the start of a line so that the text
# "name: " occurring inside another tag's value (e.g. a definition or a
# comment) is not mistaken for an entity name.
_NAME_PATTERN = re.compile(r'^name: (.*)', re.MULTILINE)

# 'synonym: "<text>" ...' tag lines, plus the legacy "exact_synonym",
# "broad_synonym", "narrow_synonym" and "related_synonym" spellings.
# The quoted text ends at the first *unescaped* double quote: a greedy
# '"(.*)"' would run on to the last quote of the line and swallow the scope
# and any quoted description in the trailing cross-reference list.
_SYNONYM_PATTERN = re.compile(
    r'^(?:(?:exact|broad|narrow|related)_)?synonym: "((?:[^"\\\n]|\\.)*)"',
    re.MULTILINE)


def extract_entity_names(obo_string):
    """Return the lexicon entries found in the text of an .obo file.

    Collects the value of every "name:" line and the quoted text of every
    synonym line, lowercases each one and strips leading/trailing
    whitespace (ex: "Oral ulcer " becomes "oral ulcer").

    Args:
        obo_string: The full contents of an .obo file, as one string.

    Returns:
        A sorted list of unique, non-empty entity names. Backslash escape
        sequences inside synonyms are kept exactly as written in the file.
    """
    raw_names = _NAME_PATTERN.findall(obo_string)
    raw_names += _SYNONYM_PATTERN.findall(obo_string)

    # A set drops duplicates (main names often reappear as synonyms).
    unique_names = {raw_name.lower().strip() for raw_name in raw_names}
    # A blank name would only add an empty line to the lexicon.
    unique_names.discard('')

    # Sorted so that the lexicon file is identical from one run to the next.
    return sorted(unique_names)


def main():
    """Read `input_file_name` and write one entity name per line to
    `output_file_name`."""
    # io.open takes an explicit encoding, so the result does not depend on
    # the platform's default encoding.
    with io.open(input_file_name, encoding='utf-8') as f:
        obo_string = f.read()

    entity_names = extract_entity_names(obo_string)

    with io.open(output_file_name, 'w', encoding='utf-8') as f:
        f.writelines(entity_name + '\n' for entity_name in entity_names)


if __name__ == '__main__':
    main()
	import io
	import re

	input_file_name = 'hp.obo'
	output_file_name = 'hpo.txt'

	# Read the whole .obo file into a string. io.open takes an explicit
	# encoding, so the result does not depend on the platform's default.
	with io.open(input_file_name, encoding='utf-8') as f:
		obo_string = f.read()

	# Main names: the text after "name: ". Anchored to the start of a line so
	# that "name: " inside another tag's value is not picked up.
	entity_names = re.findall(r'^name: (.*)', obo_string, re.MULTILINE)

	# Synonyms: the text between the double quotes of a synonym line (the
	# legacy "exact_synonym"-style tags are accepted too). The match ends at
	# the first unescaped quote, so a quoted description in the trailing
	# cross-reference list is not swallowed into the synonym.
	entity_names += re.findall(
		r'^(?:(?:exact|broad|narrow|related)_)?synonym: "((?:[^"\\\n]|\\.)*)"',
		obo_string, re.MULTILINE)

	# Post-process the lexicon: lowercase everything, strip leading/trailing
	# whitespace (ex: convert "oral ulcer " to "oral ulcer"), drop duplicates
	# and blank entries, and sort so the output is the same on every run.
	entity_names = {entity.lower().strip() for entity in entity_names}
	entity_names.discard('')
	entity_names = sorted(entity_names)

	# Create lexicon file, one entity name per line
	with io.open(output_file_name, 'w', encoding='utf-8') as f:
		for entity_name in entity_names:
			f.write(entity_name + '\n')