Skip to content

Instantly share code, notes, and snippets.

@alejio
Last active August 29, 2015 14:20
Show Gist options
  • Save alejio/a12ff20a4368ec5bc3c5 to your computer and use it in GitHub Desktop.
Save alejio/a12ff20a4368ec5bc3c5 to your computer and use it in GitHub Desktop.
Nanodegree > Project 2 > Lesson 4 > Problem 1
def res_clean(line):
    """Clean one CSV row (a dict) in place and return it.

    The steps run in this order:

    1. If ``name`` is ``None``, ``"NULL"`` or contains anything other than
       ASCII letters, digits and underscores, it is replaced by the row's
       raw (not yet trimmed) ``rdf-schema#label`` value.
    2. Every value equal to the string ``"NULL"`` becomes ``None``.
    3. Whitespace-separated tokens of ``rdf-schema#label`` that contain a
       parenthesised group such as ``(spider)`` are dropped and the rest
       is re-joined with single spaces.  A ``None`` label is left alone.
    4. A non-empty ``synonym`` string has ``{``, ``}`` and ``*`` removed,
       is stripped, and is split on ``|`` into a list.

    Args:
        line: dict holding at least the keys ``name``,
            ``rdf-schema#label`` and ``synonym``.

    Returns:
        The same dict object that was passed in, after modification.

    Raises:
        KeyError: if one of the three required keys is absent.
    """
    # Fall back to the label when the name is missing or malformed.
    name = line["name"]
    if name is None or name == "NULL" or not re.match('^[a-zA-Z0-9_]+$', name):
        line["name"] = line["rdf-schema#label"]

    # Normalise the "NULL" marker to None.  Iterate over a snapshot of the
    # keys so the dict is never modified while it is being iterated.
    for key in list(line):
        if line[key] == "NULL":
            line[key] = None

    # Trim parenthesised tokens from the label.  The label may have just
    # been turned into None above, in which case there is nothing to trim.
    label = line["rdf-schema#label"]
    if label is not None:
        paren_token = re.compile(r'\(+\S+\)')
        line["rdf-schema#label"] = ' '.join(
            word for word in label.split() if not paren_token.search(word)
        )

    # Turn the "{a|b}" style synonym string into a list of entries.
    if line["synonym"]:
        line["synonym"] = (
            line["synonym"]
            .replace("{", '')
            .replace("}", '')
            .replace("*", '')
            .strip()
            .split("|")
        )

    # NOTE: per-field whitespace stripping is intentionally not performed
    # here; list items produced from ``synonym`` keep their inner spacing.
    return line
def process_file(filename, fields):
    """Read a CSV file and return its rows as cleaned, reshaped dicts.

    Each data row is passed through ``res_clean``, reduced to the columns
    named in ``fields`` (renamed to the mapped output key), and has its
    taxonomy entries moved into a nested ``"classification"`` dict.

    Args:
        filename: path of the CSV file to read.
        fields: mapping of CSV column header -> output key.  Columns not
            present in this mapping are dropped.

    Returns:
        A list with one dict per data row.  Every dict carries a
        ``"classification"`` sub-dict with the keys ``kingdom``,
        ``family``, ``order``, ``phylum``, ``genus`` and ``class``.

    Raises:
        KeyError: if ``fields`` does not produce all six classification
            keys for a row.
    """
    classification_keys = ("kingdom", "family", "order", "phylum", "genus", "class")
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the first three rows after the header (presumably non-data
        # rows in the expected input -- confirm against the data file).
        # The default keeps a short file from raising StopIteration.
        for _ in range(3):
            next(reader, None)
        for row in reader:
            cleaned = res_clean(row)
            # Keep only the requested columns, renamed via the mapping
            # that was passed in (not a module-level global).
            record = {
                fields[header]: value
                for header, value in cleaned.items()
                if header in fields
            }
            # Move the taxonomy entries into their own nested dict.
            classification = {}
            for rank in classification_keys:
                classification[rank] = record.pop(rank)
            record["classification"] = classification
            data.append(record)
    return data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment