Skip to content

Instantly share code, notes, and snippets.

@ctavan
Created October 12, 2012 11:46
Show Gist options
  • Save ctavan/3878845 to your computer and use it in GitHub Desktop.
Save ctavan/3878845 to your computer and use it in GitHub Desktop.
Decode CSV file with broken encodings.
#!/usr/bin/env python
"""
This script parses a tab-separated values (TSV) file whose fields may contain
strings with broken encodings. Each row is parsed into a dictionary and printed
as one JSON object per line.
"""
import csv
import json
import sys
import re
# from: http://stackoverflow.com/questions/3870084/how-to-decode-a-non-unicode-character-in-python
def decode_heuristically(string, enc=None, denc=sys.getdefaultencoding()):
    """
    Try to interpret 'string' using several possible encodings.

    @input : string - the byte string to decode; text that is already
                      unicode is returned unchanged.
             enc    - optional encoding to try before all others.
             denc   - default encoding, tried ahead of the built-in
                      candidates unless it is "ascii".
    @output: a tuple (decoded_string, flag_decoded, encoding).
             flag_decoded is 0 when the bytes decoded cleanly and 1 when
             undecodable bytes had to be dropped to produce a result.
    """
    # type(u"") is `unicode` on Python 2 and `str` on Python 3.
    text_type = type(u"")
    if isinstance(string, text_type):
        return string, 0, "utf-8"

    try:
        # Return the decoded text (not the raw input) so that every path
        # hands back the same type.
        return string.decode("ascii"), 0, "ascii"
    except UnicodeError:
        pass

    encodings = ["utf-8", "iso-8859-1", "cp1252", "iso-8859-15"]
    if denc != "ascii":
        encodings.insert(0, denc)
    if enc:
        encodings.insert(0, enc)

    # Neither scan depends on the candidate encoding, so run each once.
    # Bytes 0x80-0x9f: the ISO-8859 candidates are skipped when present.
    has_c1_bytes = re.search(br"[\x80-\x9f]", string) is not None
    # Byte values at which ISO-8859-15 differs from ISO-8859-1; when
    # present, iso-8859-1 and cp1252 are skipped.
    has_latin9_bytes = re.search(
        br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", string) is not None

    for candidate in encodings:
        if has_c1_bytes and candidate in ("iso-8859-15", "iso-8859-1"):
            continue
        if has_latin9_bytes and candidate in ("iso-8859-1", "cp1252"):
            continue
        try:
            decoded = string.decode(candidate)
        except UnicodeError:
            continue
        # Accept only a lossless round trip back to the original bytes.
        if decoded.encode(candidate) == string:
            return decoded, 0, candidate

    # Nothing decoded cleanly: force-decode with every candidate, dropping
    # undecodable bytes, and keep the longest result (ties are broken by
    # comparing the decoded text, then the encoding name).
    forced = [(string.decode(candidate, "ignore"), candidate)
              for candidate in encodings]
    best_text, best_enc = max(forced, key=lambda pair: (len(pair[0]), pair))
    return best_text, 1, best_enc
def decode_field(value):
    """Return the heuristically decoded text for one field.

    A value of None is passed through as None; anything else is handed to
    decode_heuristically and only the decoded string (the first element of
    its result) is returned.
    """
    if value is None:
        return None
    result = decode_heuristically(value)
    return result[0]
# from: http://stackoverflow.com/questions/5004687/python-csv-dictreader-with-utf-8-data
def UnicodeDictReader(utf8_data, **kwargs):
    """
    Iterate over CSV rows as dictionaries with decoded field values.

    @input : utf8_data - iterable of lines, handed to csv.DictReader.
             kwargs    - passed through unchanged to csv.DictReader.
    @output: yields one dict per row. Every value is run through
             decode_field; keys are kept exactly as csv.DictReader
             produced them.
    """
    csv_reader = csv.DictReader(utf8_data, **kwargs)
    for row in csv_reader:
        # decode_field already returns the decoded text (or None). Indexing
        # its result again would keep only the first character and fail on
        # empty or missing fields.
        yield dict((key, decode_field(value)) for key, value in row.items())
if __name__ == '__main__':
    # Command-line entry point: parse the file named by the first argument
    # and print every row as a JSON object on its own line.
    if len(sys.argv) < 2:
        sys.exit('Usage: %s FILE' % sys.argv[0])
    print('Parsing: %s' % sys.argv[1])
    # Python 2's csv module expects its input file to be opened in binary
    # mode, and the fields are raw bytes whose encoding is guessed later.
    with open(sys.argv[1], 'rb') as log:
        for row in UnicodeDictReader(log, delimiter='\t'):
            print(json.dumps(row))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment