Skip to content

Instantly share code, notes, and snippets.

@Shaunwei
Last active January 15, 2016 00:12
Show Gist options
  • Save Shaunwei/31d0ba35ed66a906c6db to your computer and use it in GitHub Desktop.
Save Shaunwei/31d0ba35ed66a906c6db to your computer and use it in GitHub Desktop.
File encoding issue with the chardet library: certain combinations of the characters '{', '}', and '~' cause chardet to fail to detect an encoding. This script finds out which combinations are the culprits. For a nicer-looking report, it uses colorclass and terminaltables.
#!/usr/bin/env python3
import chardet
from colorclass import Color
from terminaltables import AsciiTable
def generate_test(chars_list):
    """Run chardet on each string and print a table of the results.

    Each string in ``chars_list`` is encoded as UTF-8 and handed to
    ``chardet.detect``. A row is marked as failing (red ``x``) when the
    reported confidence or encoding is falsy, and as passing (green check
    mark) otherwise. Passing rows are listed before failing rows; within
    each group the input order is kept.

    Args:
        chars_list: Iterable of strings to test.
    """
    header = ['Chars', 'Confidence', 'Encoding', 'Issue']
    passed, failed = [], []
    for chars in chars_list:
        result = chardet.detect(chars.encode('utf-8'))
        conf, enco = result['confidence'], result['encoding']
        if not conf or not enco:
            failed.append(
                [chars, str(conf), 'None', Color('{red}x{/red}')]
            )
        else:
            passed.append(
                [chars, str(conf), enco, Color('{green}✔{/green}')]
            )
    # Keep the header out of any ordering step: sorting it together with the
    # data rows would make its position depend on how the header text happens
    # to compare with the marker strings.
    table = AsciiTable([header] + passed + failed)
    print(table.table)
def generate_chars_list(chars):
    """Return every distinct ordering of every subset of ``chars``.

    The result contains the empty string, each single character, and all
    permutations of every longer selection of characters, up to and
    including the full length of ``chars``. Duplicate strings (possible when
    ``chars`` repeats a character) appear only once.

    Args:
        chars: String (or other iterable of single-character strings) to
            draw characters from.

    Returns:
        A list of unique strings, ordered by length and then
        lexicographically so that repeated runs produce the same list.
    """
    from itertools import permutations

    pool = list(chars)
    unique = {
        ''.join(combo)
        for size in range(len(pool) + 1)
        for combo in permutations(pool, size)
    }
    # A plain list(set) would come back in an order that can differ between
    # interpreter runs; sort so the output is reproducible.
    return sorted(unique, key=lambda s: (len(s), s))
if __name__ == '__main__':
    # Try every ordering of every subset of the three suspect characters
    # and print the detection report.
    generate_test(generate_chars_list('{}~'))
"""
+-------+------------+----------+-------+
| Chars | Confidence | Encoding | Issue |
+-------+------------+----------+-------+
| { | 1.0 | ascii | ✔ |
| {~} | 1.0 | ascii | ✔ |
| {}~ | 1.0 | ascii | ✔ |
| } | 1.0 | ascii | ✔ |
| ~}{ | 1.0 | ascii | ✔ |
| {} | 1.0 | ascii | ✔ |
| ~} | 1.0 | ascii | ✔ |
| }{~ | 1.0 | ascii | ✔ |
| {~ | 1.0 | ascii | ✔ |
| }{ | 1.0 | ascii | ✔ |
| ~ | 1.0 | ascii | ✔ |
| }~ | 1.0 | ascii | ✔ |
| | 0.0 | None | x |
| }~{ | 0.0 | None | x |
| ~{} | 0.0 | None | x |
| ~{ | 0.0 | None | x |
+-------+------------+----------+-------+
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment