Skip to content

Instantly share code, notes, and snippets.

@Wavewash
Created June 11, 2014 13:09
Show Gist options
  • Save Wavewash/8597030884329dc3c448 to your computer and use it in GitHub Desktop.
Save Wavewash/8597030884329dc3c448 to your computer and use it in GitHub Desktop.
This was used to cleanse data input by various users for years in a record system. The years being input by different people with different methods all had different formats. Some were just plain incorrect. To reduce the effort by the data cleansing team to unify the year format. This high level parser was written to catch most obvious date form…
__author__ = 'mkakwan'
import sys
def clense(row, headers):
i=0
clensedkey = "year_cleansed"
newAnnualReportsClensedValue = None;
for key in row:
value = row[key]
if(key == "year_dirty") and (value != "" ):
#Value only has a single 4 digit year ex:1999
if(len(value) == 4) and str(value).isdigit():
newAnnualReportsClensedValue = [(int(value))]
else:
newAnnualReportsClensedValue = dateCases(value)
i=i+1
#Comment out data clense output
if False:
fo = open("clensedata.txt", "a")
if newAnnualReportsClensedValue == None:
fo.write("UNRECOGNIZED \t " + row["annual_reports_s"] + '\n')
else:
fo.write(str(newAnnualReportsClensedValue) + " \t " + row["annual_reports_s"] + '\n')
fo.close()
row[clensedkey] = newAnnualReportsClensedValue
return row;
def dateCases(value):
originalValue = value;
#clense the string of spaces
value = value.replace(" ","");
#clense the string of periods
value = value.replace(".",",");
#clense the data of dates ex: 1979/80,1982/83
value = value.replace("/",",");
#clense the data of dates ex: 1989;1992
value = value.replace(";",",");
#clense the data of parts ex: 2003:2 parts
value = value.split(":")[0]
#clense the date of (BOUND) ex: 1976/1977-1985/1986; 1986/1987, 1988/1989-1994/1995 (BOUND)
value = value.replace("(BOUND)","");
while(str(value).endswith(".") or str(value).endswith(",") or str(value).endswith(";")):
value = value[:-1]
commasplit = value.split(",")
valuelist = []
#assume the current century is in 19th century
currentCentury = "19";
for splitvalue in commasplit:
#Value only has a single 4 digit year ex:1999
if(len(splitvalue) == 4) and str(splitvalue).isdigit():
#get the current century to append onto dates
currentCentury = splitvalue[:2]
valuelist.append(int(splitvalue))
elif(len(splitvalue) == 2) and str(splitvalue).isdigit():
valuelist.append(int(currentCentury+splitvalue))
#Value is a dash to denote a range
elif "-" in splitvalue:
dashsplitvalues = splitvalue.split("-")
#make sure we only have two dates with the dash (ex: 1999-2000, 98-99, 1988-98)
rangedate = []
if len(dashsplitvalues) == 2 and dashsplitvalues[0].isdigit() and dashsplitvalues[1].isdigit():
for dashsplitvalue in dashsplitvalues:
if len(dashsplitvalue) == 2:
rangedate.append(int(currentCentury + dashsplitvalue))
else:
rangedate.append(int(dashsplitvalue))
rangedate = range(rangedate[0],rangedate[1] + 1)
valuelist = valuelist+ (list(rangedate))
#Value format is unrecognized
else:
print("UNRECOGNIZED ,\"" + originalValue + "\"")
return None
print(str(valuelist) + " ,\"" + originalValue + "\"")
return valuelist
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment