Created
August 29, 2013 22:40
-
-
Save tboyce12/6384294 to your computer and use it in GitHub Desktop.
Format a CSV file to work better with the Unix `column` command: (1) replace commas with a custom separator (except commas between double quotes); (2) collapse extra whitespace, including newlines between double quotes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Print a CSV file in a form that works well with the Unix ``column`` command.

- Commas that separate fields are replaced with a custom separator
  (commas between double quotes are left alone).
- Runs of spaces are collapsed, and newlines between double quotes are
  turned into spaces so that each record stays on one line.
"""
import argparse
import re

# A field (any mix of plain characters and double-quoted sections) that is
# terminated by a comma.  Only the terminating comma lies outside the group.
_FIELD_RE = re.compile(r'((?:[^,"]|"[^"]*")*),')
# A double-quoted section.
_QUOTED_RE = re.compile(r'"[^"]*"')
# Two or more consecutive spaces.
_SPACES_RE = re.compile(r' {2,}')


def format_csv(text, separator='|'):
    """Return *text* with field commas replaced by *separator*.

    Every field that is followed by a comma is rewritten as
    ``' <field> <separator> '``; a trailing field that is not followed by a
    comma is left as it is.  Newlines inside double-quoted sections become
    spaces, and finally every run of two or more spaces is collapsed into a
    single space.
    """
    def _pad_field(match):
        # A replacement function (rather than a template string) keeps the
        # separator literal even if it is a backslash.
        return ' ' + match.group(1) + ' ' + separator + ' '

    def _join_lines(match):
        return match.group(0).replace('\n', ' ')

    text = _FIELD_RE.sub(_pad_field, text)
    text = _QUOTED_RE.sub(_join_lines, text)
    # Collapse spaces last so that spaces produced by the quoted-newline
    # replacement above are collapsed as well.
    return _SPACES_RE.sub(' ', text)


def main(argv=None):
    """Parse the command line, format the named file and print the result.

    *argv* is the argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description='Prints well-formatted version of supplied CSV file to STDOUT.')
    parser.add_argument('filename', metavar='FILE', type=str,
                        help='File to convert.')
    parser.add_argument('-s', '--separator', dest='separator', type=str, default='|',
                        help="Character to separate fields. Default: '|' (pipe).")
    args = parser.parse_args(argv)

    if not args.separator:
        # Indexing an empty string below would raise a bare IndexError.
        parser.error('separator must not be empty')
    # Only the first character of the supplied separator is used.
    separator = args.separator[0]

    # The context manager closes the file even if reading fails.
    with open(args.filename, 'r') as source:
        raw = source.read()

    # Parenthesised single-argument form works on both Python 2 and 3.
    print(format_csv(raw, separator))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment