Created
July 11, 2012 00:50
-
-
Save mapmeld/3087203 to your computer and use it in GitHub Desktop.
PredictPrep
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Predict Cases
# Prepare housing cases CSV for Google Prediction API
# Now associates a case with where it is in three more years, not the ultimate result
# for example
#   "Some", 19960909, Cut and Cleared
#   "None", 19980101, Cut and Cleared
#   "None", 19980202, No Violations (not predicted by previous because there were no violations)
#   "Demolished", 20020808, Repaired (prediction should be demolished because demo comes in < 3 years)
#   "Demolished", 20030101, Demolished
#
# Row layout relied on below: every row starts with a double-quoted address
# and, after a naive split on ',', field 9 is the close date, field 10 the
# case id and field 11 the open date. Dates are compared as YYYYMMDD strings.
#
# NOTE: a few rows do not match this layout. An earlier cleanup step, kept
# here for reference only, was:
#   line.replace('F,"","",','').replace('"Macon","Macon",','').replace('F,T,F,F,F,','')

INPUT_CSV = 'PROP1.csv'
OUTPUT_CSV = 'cases-predict-plus.csv'

# Cases opened after this date are never written to the output; they are kept
# only so that earlier cases can look ahead at them.
CUTOFF_DATE = "20090620"

# Three years expressed as a difference between two YYYYMMDD integers.
THREE_YEARS = 30000

# Addresses seen before this one are skipped entirely (resume point).
SKIP_TO = "ELL ST, 0583"


def _strip_city(address):
    """Cut *address* at 'Macon' (or 'MACON') and drop the trailing comma part."""
    for city in ('Macon', 'MACON'):
        cut = address.find(city)
        if cut > -1:
            address = address[:cut]
            # rfind() may return -1, in which case the last character is
            # dropped; this mirrors the original slicing exactly.
            return address[:address.rfind(',')]
    return address


def _quoted_address(text):
    """Return the address key from *text*, which starts right after the opening quote."""
    return _strip_city(text[:text.find('"')])


def _collect_cases(address, in_path):
    """Re-read *in_path* and gather every distinct case filed at *address*.

    Returns ``(lines, opendates)``: the raw rows (possibly rewritten, see
    below) and their open dates, in file order. Reading stops at the first
    row that mentions a demolition.
    """
    lines = []
    opendates = []
    caseids = set()
    with open(in_path, 'r') as propreview:
        for propline in propreview:
            fields = propline.split(',')
            propclosedate = fields[9]
            propcaseid = fields[10]
            propopendate = fields[11]
            propaddress = _quoted_address(propline[propline.find('"') + 1:])
            if address != propaddress or propcaseid in caseids:
                continue
            caseids.add(propcaseid)
            if propclosedate > CUTOFF_DATE:
                # Case was closed after the cutoff - show the close date as
                # empty. The original assigned this to the outer row variable,
                # so the blanking never reached the output.
                fields[9] = " "
                propline = ','.join(fields)
            if propopendate > CUTOFF_DATE:
                # Opened after the cutoff - keep around for others' 3 year
                # predictions, but it cannot get a 3 year prediction itself.
                propline = "donotprint:" + propline
            lines.append(propline)
            opendates.append(propopendate)
            if 'emolish' in propline:
                # Mark demolished even if the demolition came after the cutoff.
                print(address + " demolished after " + str(len(lines)) + " reports in " + propcaseid)
                break
    return lines, opendates


def _label(lines, opendates, index):
    """Classify the case at *index* by what happens within three years of it.

    Returns "Demolished" if the case itself or any case opened within three
    years mentions a demolition, "Some" if a later case in that window is not
    a "No Violations" dismissal, and "None" otherwise.
    """
    horizon = int(opendates[index]) + THREE_YEARS
    label = "None"
    for future in range(index, len(lines)):
        if int(opendates[future]) > horizon:
            # More than three years ahead - stop looking.
            break
        if future != index and 'no violations' not in lines[future].lower():
            label = "Some"
        if 'emolish' in lines[future]:
            return "Demolished"
    return label


def main(in_path=INPUT_CSV, out_path=OUTPUT_CSV, skipto=SKIP_TO):
    """Write one labelled row to *out_path* per printable case in *in_path*.

    Addresses are handled in order of first appearance among rows opened on
    or before CUTOFF_DATE. When *skipto* is not None, every address seen
    before it is skipped. Each output row is the input row prefixed with
    '"Demolished",', '"Some",' or '"None",'.

    NOTE: the input is re-read once per address, so run time grows with
    (addresses x rows).
    """
    seen = set()
    with open(in_path, 'r') as prop, open(out_path, 'w') as thisyr:
        for line in prop:
            opendate = line.split(',')[11]
            if opendate > CUTOFF_DATE:
                # Ignore cases opened after the cutoff date.
                continue
            # The outer pass assumes the row begins with the opening quote.
            address = _quoted_address(line[1:])
            if address in seen:
                continue
            seen.add(address)
            if skipto is not None:
                if address != skipto:
                    continue
                skipto = None
            lines, opendates = _collect_cases(address, in_path)
            # enumerate() keeps the index aligned with the row even when a
            # "donotprint:" row is skipped.
            for index, case in enumerate(lines):
                if 'donotprint:' in case:
                    continue
                thisyr.write('"' + _label(lines, opendates, index) + '",' + case)


if __name__ == "__main__":
    main()
Note: fields that are sometimes blank should also be wrapped in quotes, so the CSV matches the format described at https://developers.google.com/prediction/docs/developer-guide
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
TODO: