Last active
May 2, 2021 03:34
-
-
Save isc-rsingh/4757f6d9c6de76c3ed4cd919cb5905bc to your computer and use it in GitHub Desktop.
Cleans up broken lines and unquoted strings in Boston crime incident data CSV files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get the data from https://data.boston.gov/dataset/crime-incident-reports-august-2015-to-date-source-new-system | |
badfile = open('bostonpolice.csv', 'r') | |
betterfile = open('tmp.csv', 'w') | |
while True: | |
line = badfile.readline().strip() | |
if not line: | |
break | |
# figure out if the line has been broken incorrectly | |
qs = line.count('"') | |
if qs == 1: | |
line += " " + badfile.readline().strip() | |
line += " " + badfile.readline().strip() | |
# quote upquoted strings | |
lparts = line.split(',') | |
for i in range(len(lparts)): | |
if len(lparts[i])>0 and lparts[i][0].isalpha(): | |
lparts[i] = '"' + lparts[i] + '"' | |
newstr = ",".join(lparts) | |
# omit records with no lat/long | |
hasgeo = False if newstr.find("(0, 0)") >= 0 else True | |
# write the better line to the new file | |
if hasgeo: | |
betterfile.writelines(newstr+'\n') | |
badfile.close() | |
betterfile.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment