Created
October 12, 2012 11:46
-
-
Save ctavan/3878845 to your computer and use it in GitHub Desktop.
Decode CSV file with broken encodings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
This script parses a tab-separated CSV-file where fields may contain strings | |
with broken encodings. Each line of the CSV-file is parsed into a dictionary. | |
""" | |
import csv | |
import json | |
import sys | |
import re | |
# from: http://stackoverflow.com/questions/3870084/how-to-decode-a-non-unicode-character-in-python
def decode_heuristically(string, enc=None, denc=sys.getdefaultencoding()):
    """
    Try to interpret 'string' using several possible encodings.

    @input : string - byte string to decode (text input is returned as is),
             enc    - optional encoding to try before all others,
             denc   - default encoding, tried early unless it is "ascii".
    @output: a tuple (decoded_string, flag_decoded, encoding) where
             flag_decoded is 0 when some encoding decoded the input
             losslessly and 1 when undecodable bytes had to be dropped.

    The decoded string is always text, including for pure-ASCII input.
    """
    # unicode on Python 2, str on Python 3.
    text_type = type(u"")

    if isinstance(string, text_type):
        return string, 0, "utf-8"

    try:
        # Return the decoded text (not the raw input) so that every branch
        # hands back the same type.
        return text_type(string, "ascii"), 0, "ascii"
    except UnicodeError:
        pass

    encodings = ["utf-8", "iso-8859-1", "cp1252", "iso-8859-15"]
    if denc != "ascii":
        encodings.insert(0, denc)
    if enc:
        encodings.insert(0, enc)

    # Both scans depend only on the input, so run them once up front.
    # 0x80-0x9f: when present, the ISO-8859 candidates are skipped.
    has_c1_bytes = re.search(br"[\x80-\x9f]", string) is not None
    # When any of these bytes is present, iso-8859-1 and cp1252 are skipped,
    # which leaves iso-8859-15 as the remaining single-byte candidate.
    has_latin9_bytes = (
        re.search(br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", string) is not None)

    for candidate in encodings:
        if has_c1_bytes and candidate in ("iso-8859-15", "iso-8859-1"):
            continue
        if has_latin9_bytes and candidate in ("iso-8859-1", "cp1252"):
            continue
        try:
            decoded = text_type(string, candidate)
        except UnicodeError:
            continue
        # Accept only a lossless round trip.
        if decoded.encode(candidate) == string:
            return decoded, 0, candidate

    # Nothing decoded cleanly: force-decode with every candidate, dropping
    # the offending bytes, and keep the result that preserved the most
    # characters (ties are broken by comparing the text, then the name).
    forced = [(text_type(string, candidate, "ignore"), candidate)
              for candidate in encodings]
    decoded, candidate = max(forced, key=lambda pair: (len(pair[0]), pair))
    return decoded, 1, candidate
def decode_field(value):
    """
    Decode a single CSV field.

    Returns None for a None value; otherwise returns only the decoded
    string from decode_heuristically(), discarding the flag and the
    encoding name.
    """
    if value is None:
        return None
    decoded, _flag, _encoding = decode_heuristically(value)
    return decoded
# from: http://stackoverflow.com/questions/5004687/python-csv-dictreader-with-utf-8-data
def UnicodeDictReader(utf8_data, **kwargs):
    """
    Iterate over a CSV source, yielding one dict of decoded fields per row.

    utf8_data and **kwargs are passed straight to csv.DictReader. Keys are
    yielded exactly as csv.DictReader produced them; each value is run
    through decode_field().
    """
    csv_reader = csv.DictReader(utf8_data, **kwargs)
    for row in csv_reader:
        decoded_row = {}
        for key, value in row.items():
            if isinstance(value, list):
                # Surplus columns are collected by DictReader into a list
                # stored under 'restkey'; decode each entry separately.
                decoded_row[key] = [decode_field(item) for item in value]
            else:
                # decode_field() already returns the decoded string (or
                # None for a missing column). Indexing it with [0] would
                # keep only the first character and fail on empty or
                # missing fields.
                decoded_row[key] = decode_field(value)
        yield decoded_row
if __name__ == '__main__':
    # Usage: script.py <tab-separated-file>
    # Prints each parsed row as one JSON object per line.
    print('Parsing: %s' % sys.argv[1])
    # Python 2's csv module expects files opened in binary mode; text mode
    # can translate or truncate raw bytes on some platforms (e.g. Windows),
    # which matters for input with broken encodings.
    with open(sys.argv[1], 'rb') as log:
        for row in UnicodeDictReader(log, delimiter='\t'):
            print(json.dumps(row))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment