Skip to content

Instantly share code, notes, and snippets.

@kburnham
Last active August 29, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kburnham/693e69b1d774cdbb7fc3 to your computer and use it in GitHub Desktop.
Save kburnham/693e69b1d774cdbb7fc3 to your computer and use it in GitHub Desktop.
def process_file(filename, fields):
    """Read a CSV file and return its rows as cleaned-up dictionaries.

    The first row of the file is used as the header (``csv.DictReader``),
    and the three rows that follow it are skipped before any data is read
    (presumably extra metadata/header rows -- confirm against the input
    format).

    For every remaining row, each column named in ``fields`` is processed:

    * the raw value is stripped of surrounding whitespace;
    * a ``'label'`` value has any parenthesised text removed;
    * a ``'name'`` value that is ``'NULL'`` or contains a non-word character
      or hyphen is replaced by the row's ``'rdf-schema#label'`` column;
    * a value equal to ``'NULL'`` becomes ``None``;
    * a non-``None`` ``'synonym'`` value is split on ``'|'`` into a list,
      each item stripped of ``{``, ``}``, ``*`` and spaces;
    * values whose output key is a taxonomic rank (family, class, phylum,
      order, kingdom, genus) are stored under the nested
      ``'classification'`` dict; everything else is stored at top level.

    Args:
        filename: Path of the CSV file to read.
        fields: Mapping of CSV column name -> output key name.

    Returns:
        A list with one dict per data row.  Each dict always has a
        ``'classification'`` key holding a (possibly empty) dict.  The list
        is empty when the file has no rows beyond the skipped ones.

    Raises:
        KeyError: If a column named in ``fields`` (or ``'rdf-schema#label'``
            when the name fallback is triggered) is missing from the file.

    Note:
        The name fallback reads the *current* value of the
        ``'rdf-schema#label'`` column.  If that column has already been
        processed for this row (it appears earlier in ``fields``), the
        cleaned label is used; otherwise the raw label is used.
    """
    classification_keys = ('family', 'class', 'phylum', 'order', 'kingdom', 'genus')
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the three rows after the header.  The built-in next() works on
        # both Python 2 and 3 (reader.next() is Python 2 only), and the
        # default keeps a short file from leaking StopIteration.
        for _ in range(3):
            next(reader, None)
        for line in reader:
            new_dict = {'classification': {}}
            for field in fields:
                target = fields[field]
                value = line[field].strip()
                # Write the stripped value back so later lookups into `line`
                # (the name fallback below) see processed values.
                line[field] = value
                if target == 'label':
                    # http://stackoverflow.com/questions/640001/how-can-i-remove-text-within-parentheses-with-a-regex
                    value = re.sub(r'\([^)]*\)', '', value).strip()
                if target == 'name':
                    if value == 'NULL' or re.search(r'[\W-]+', value) is not None:
                        value = line['rdf-schema#label']
                if value == 'NULL':
                    value = None
                if target == 'synonym' and value is not None:
                    value = [item.strip("{} *") for item in value.split('|')]
                line[field] = value
                if target in classification_keys:
                    new_dict['classification'][target] = value
                else:
                    new_dict[target] = value
            data.append(new_dict)
    return data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment