Skip to content

Instantly share code, notes, and snippets.

@mattiasostmar
Created December 8, 2013 19:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattiasostmar/7862242 to your computer and use it in GitHub Desktop.
Save mattiasostmar/7862242 to your computer and use it in GitHub Desktop.
import re
import sys
# Convert a tab-separated dump of   "column" <TAB> value <TAB> "text"   rows
# into comma-separated rows, flattening control characters and runs of
# whitespace inside the quoted text field.
#
# Usage: python <this script> INFILE OUTFILE

# One input row. Written in verbose mode, so whitespace and `#` comments
# inside the pattern are ignored by the regex engine.
# NOTE: group 2 is "anything but a tab", so it keeps trailing spaces and can
# span line breaks; the quoted third field may also span several lines.
s_pat_row = r'''
"([^"]+)" # match column; this is group 1
\s*\t\s* # match separating tab and any optional white space
([^\t]+) # match a string of non-tab chars; this is group 2
\s*\t\s* # match separating tab and any optional white space
"((?:\\"|[^"])*)" # match string data that can include escaped quotes
'''
pat_row = re.compile(s_pat_row, re.MULTILINE | re.VERBOSE)

# ASCII control characters (0x01-0x1f and DEL); this range includes tab, CR
# and LF, so embedded line breaks in the text field are flattened too.
s_pat_clean = r'''[\x01-\x1f\x7f]'''
pat_clean = re.compile(s_pat_clean)

# Output row layout: quoted column, bare value, quoted cleaned text.
row_template = '"{}",{},"{}"\n'


def clean_field(text):
    """Return *text* with control characters and whitespace runs collapsed.

    Every character matched by ``pat_clean`` becomes a space, then the
    result is split on whitespace and re-joined with single spaces, which
    also strips leading and trailing whitespace.
    """
    return ' '.join(pat_clean.sub(' ', text).split())


def iter_csv_rows(data):
    """Yield one formatted output row per ``pat_row`` match found in *data*.

    Groups 1 and 2 are copied through unchanged; group 3 is passed through
    ``clean_field``. Text that does not match ``pat_row`` is skipped
    silently, and escaped quotes (backslash + quote) are emitted as-is.
    """
    for match in pat_row.finditer(data):
        column, value, text = match.groups()
        yield row_template.format(column, value, clean_field(text))


def main(argv):
    """Run the conversion for a command line of the form [prog, infile, outfile].

    Returns 0 on success, or 2 after printing a usage message to stderr
    when the argument count is wrong. Errors raised by ``open`` propagate
    to the caller.
    """
    if len(argv) != 3:
        prog = argv[0] if argv else 'script'
        sys.stderr.write('usage: {} INFILE OUTFILE\n'.format(prog))
        return 2
    _, infile, outfile = argv
    # Files are opened with the platform default text encoding.
    with open(infile, "rt") as inf, open(outfile, "wt") as outf:
        # The whole input is read at once because a row may span lines.
        outf.writelines(iter_csv_rows(inf.read()))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment