Skip to content

Instantly share code, notes, and snippets.

@kburnham
Last active August 29, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kburnham/81958309de1db42fe789 to your computer and use it in GitHub Desktop.
Save kburnham/81958309de1db42fe789 to your computer and use it in GitHub Desktop.
def process_line(line, fields):
    """Convert one CSV row into a cleaned record dictionary.

    Cleaning rules, applied per output key:

    * Every value is stripped of surrounding whitespace; the literal
      ``'NULL'`` becomes ``None``.
    * ``'label'``: any parenthesised text is removed.
    * ``'name'``: when the value is NULL or contains a non-word character
      (or a hyphen), the row's ``'rdf-schema#label'`` value is used instead.
    * ``'synonym'``: a non-NULL value is split on ``'|'`` into a list, and
      each item has braces, spaces and asterisks trimmed from its ends.

    Args:
        line: Mapping of source column name to raw string value. It must
            contain every key of ``fields``, and ``'rdf-schema#label'``
            whenever the ``'name'`` fallback is triggered.
        fields: Mapping of source column name to output key.

    Returns:
        A dict keyed by the output names found in ``fields``. Values whose
        output key is a taxonomic rank (family, class, phylum, order,
        kingdom, genus) are nested under ``'classification'``; everything
        else is stored at the top level.

    Raises:
        KeyError: If ``line`` lacks a column named in ``fields`` or the
            ``'rdf-schema#label'`` column needed for the name fallback.
    """
    classification_keys = ('family', 'class', 'phylum', 'order', 'kingdom', 'genus')
    record = {'classification': {}}

    for column, target in fields.items():
        raw = line[column]
        stripped = raw.strip()
        # Decide NULL-ness once, on the stripped text, so that every rule
        # below treats whitespace-padded 'NULL' the same way.
        is_null = stripped == 'NULL'
        value = None if is_null else stripped

        if target == 'label' and not is_null:
            # Drop parenthesised qualifiers, e.g. "Argiope (spider)".
            # http://stackoverflow.com/questions/640001/how-can-i-remove-text-within-parentheses-with-a-regex
            value = re.sub(r'\([^)]*\)', '', raw).strip()
        elif target == 'name' and (is_null or re.search(r'[\W-]+', raw) is not None):
            # Unusable name: fall back to the row's label column.
            value = line['rdf-schema#label'].strip()
        elif target == 'synonym' and not is_null:
            value = [item.strip("{} *") for item in raw.split('|')]

        if target in classification_keys:
            record['classification'][target] = value
        else:
            record[target] = value

    return record
def process_file(filename, fields):
    """Read a CSV file and return its rows as cleaned record dictionaries.

    The file is parsed with ``csv.DictReader``, so its first line supplies
    the column names. The first three rows after that header are discarded
    (presumably metadata rows in the source export -- confirm against the
    input file), and every remaining row is passed to ``process_line``.

    Args:
        filename: Path of the CSV file to read.
        fields: Mapping of source column name to output key, forwarded
            unchanged to ``process_line``.

    Returns:
        A list with one ``process_line`` result per remaining row; empty
        when the file has three or fewer rows after the header.
    """
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # next() with a default works on Python 2 and 3 alike and does not
        # raise StopIteration when the file is shorter than the skip count.
        for _ in range(3):
            next(reader, None)
        return [process_line(row, fields) for row in reader]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment