Skip to content

Instantly share code, notes, and snippets.

@rolandcrosby
Last active December 12, 2018 21:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rolandcrosby/4e8eca924336103ac5abf04160c2052e to your computer and use it in GitHub Desktop.
Save rolandcrosby/4e8eca924336103ac5abf04160c2052e to your computer and use it in GitHub Desktop.
csv preprocessor
import sys
# CSV cleaner-upper to change CRLF terminations to LF, and to avoid quoting empty strings
# inspired by https://github.com/mhus/mhus-lib/blob/master/mhu-lib-core/src/main/java/de/mhus/lib/core/io/CSVReader.java
# usage: python csvfix.py < in.csv > out.csv
def parse_field(char, state="start", acc=""):
    """Advance the CSV field state machine by one character.

    Args:
        char: The next input character, or ``None`` to signal end of input.
        state: The current state. Non-terminal states are ``"start"`` (at the
            beginning of a field), ``"in_u"`` (inside an unquoted field),
            ``"in_q"`` (inside a quoted field), ``"after_q"`` (just saw a
            double quote while inside a quoted field) and ``"after_cr_u"`` /
            ``"after_cr_q"`` (saw ``\\r`` after an unquoted / quoted field).
        acc: The field content accumulated so far.

    Returns:
        A ``(new_state, new_acc)`` tuple. Besides the non-terminal states
        above, ``new_state`` may be ``"eof_u"`` / ``"eof_q"`` (an unquoted /
        quoted field ended at a comma) or ``"eol_u"`` / ``"eol_q"`` (the field
        ended at a line terminator or at end of input).

    Raises:
        ValueError: On a double quote inside an unquoted field, a ``\\r`` not
            followed by ``\\n``, end of input inside a quoted field, any other
            character directly after a closing quote, or an unknown state.
    """
    if state == "start":
        if char == ",":
            return ("eof_u", acc)
        elif char is None or char == "\n":
            return ("eol_u", acc)
        elif char == "\r":
            return ("after_cr_u", acc)
        elif char == '"':
            return ("in_q", acc)
        else:
            return ("in_u", acc + char)
    elif state == "in_u":
        if char == ",":
            return ("eof_u", acc)
        elif char is None or char == "\n":
            return ("eol_u", acc)
        elif char == "\r":
            return ("after_cr_u", acc)
        elif char == '"':
            raise ValueError(
                "stray double quote in unquoted string. so far: %s" % acc
            )
        else:
            return ("in_u", acc + char)
    elif state == "in_q":
        if char == '"':
            # Either the closing quote or the first half of an escaped quote;
            # the "after_q" state decides which.
            return ("after_q", acc)
        elif char is None:
            # Previously this fell into ``acc + None`` and raised TypeError.
            raise ValueError("unterminated quoted string. so far: %s" % acc)
        else:
            # Deliberately broad: commas, \r and \n are kept verbatim, which
            # is what permits multi-line quoted fields.
            return ("in_q", acc + char)
    elif state == "after_q":
        if char == '"':
            # Escaped quote: keep it doubled so the accumulated text can be
            # written back between quotes unchanged.
            return ("in_q", acc + '""')
        elif char == ",":
            return ("eof_q", acc)
        elif char is None or char == "\n":
            return ("eol_q", acc)
        elif char == "\r":
            return ("after_cr_q", acc)
        else:
            # Previously this fell off the end of the function and returned
            # None, which surfaced as an unpacking error in the caller.
            raise ValueError(
                "stray character after closing quote. so far: %s" % acc
            )
    elif state == "after_cr_u":
        if char == "\n":
            return ("eol_u", acc)
        else:
            raise ValueError("stray \\r character in input. so far: %s" % acc)
    elif state == "after_cr_q":
        if char == "\n":
            return ("eol_q", acc)
        else:
            raise ValueError("stray \\r character in input. so far: %s" % acc)
    raise ValueError("unknown parser state: %r" % (state,))
if __name__ == "__main__":
    # Read stdin as one continuous character stream rather than restarting the
    # parser on every physical line: a quoted field may contain newlines, and
    # a per-line restart would hit end-of-line in the middle of such a field.
    chars = (ch for line in sys.stdin for ch in line)
    for first in chars:
        # Each pass of this loop consumes exactly one CSV record.
        fields = []
        state, acc = "start", ""
        char = first
        while True:
            state, acc = parse_field(char, state, acc)
            if state in ("eof_u", "eol_u"):
                # Unquoted fields are emitted as-is.
                fields.append(acc)
            elif state in ("eof_q", "eol_q"):
                # Quoted fields keep their quotes unless they are empty.
                fields.append('"%s"' % acc if acc != "" else "")
            if state in ("eol_u", "eol_q"):
                break
            if state in ("eof_u", "eof_q"):
                # A comma ended the field; start the next one from scratch.
                state, acc = "start", ""
            char = next(chars, None)
            if char is None and state == "in_q":
                # Input ended inside a quoted field: report it instead of
                # passing None on to be appended to the field text.
                raise ValueError(
                    "unterminated quoted string. so far: %s" % acc
                )
        # Records are always terminated with a bare LF.
        sys.stdout.write(",".join(fields) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment