Latin-1 -> UTF-8 and double-encoded UTF-8 fixer I wrote to process a few munged WordPress databases
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fixes double-encoded UTF-8
# By Martin Falatic, www.Falatic.com 2021-12-03
# Mappings based on https://www.i18nqa.com/debug/utf8-debug.html
"""
Unmapped (bytes cp1252 leaves undefined; they are skipped in the table):
    0x81
    0x8D
    0x8F
    0x90
    0x9D
"""

# The table used to be hand-pasted from the i18nqa page, which made it
# fragile: the mojibake column contains characters that do not survive
# copy/paste or re-encoding (the script even had to patch six rows at
# runtime).  Deriving every row from the cp1252 byte value keeps the data
# correct by construction while preserving the original row layout.

# Bytes 0x80-0x9F with no cp1252 assignment; shown as the code point of the
# same value (latin-1 behavior), matching the original runtime patch-up.
_CP1252_UNMAPPED = frozenset({0x81, 0x8D, 0x8F, 0x90, 0x9D})


def _as_cp1252(raw):
    """Return *raw* bytes as the text a cp1252 viewer would display.

    Unmapped bytes fall back to chr(byte) — i.e. latin-1 behavior.
    """
    return ''.join(
        chr(b) if b in _CP1252_UNMAPPED else bytes([b]).decode('cp1252')
        for b in raw
    )


def _build_mappings():
    """Build the repair table.

    Each row keeps the original hand-written layout:
    [character, cp1252 byte hex, character, double-encoded (mojibake)
    form, UTF-8 byte hex].
    """
    table = []
    for b in range(0x80, 0x100):
        if b in _CP1252_UNMAPPED:
            continue  # no character exists for this byte
        char = bytes([b]).decode('cp1252')
        utf8 = char.encode('utf-8')
        table.append([
            char,                                 # the intended character
            f'{b:02X}',                           # its cp1252 byte value
            char,                                 # replacement text
            _as_cp1252(utf8),                     # what the mojibake looks like
            ' '.join(f'{x:02X}' for x in utf8),   # its UTF-8 bytes
        ])
    return table


mappings = _build_mappings()
import os, sys
from collections import defaultdict, Counter  # Counter kept for compatibility

# Rows whose pasted mojibake column (index 3) could not survive copy/paste
# because their UTF-8 bytes include cp1252-unmapped values; keyed by the
# UTF-8 hex column (index 4).
_BROKEN_ROWS = frozenset({'C3 81', 'C3 8D', 'C3 8F', 'C3 90', 'C3 9D', 'E2 80 9D'})


def adjust_mappings(table):
    """Rebuild the unreliable 'before' column (index 3) of *table* in place.

    Each byte of the row's UTF-8 sequence is mapped to the code point of
    the same value (latin-1), which is exactly what the original
    `unicode_escape` round-trip produced, but stated directly.
    Returns *table* for convenience.
    """
    for mapping in table:
        hexvals = mapping[4]
        if hexvals in _BROKEN_ROWS:
            print(f"Adjusting {hexvals}")
            # bytes.fromhex ignores the spaces between hex pairs
            mapping[3] = bytes.fromhex(hexvals).decode('latin-1')
    return table


def fix_line(line, table, fixes=None, num=None):
    """Return *line* with every double-encoded sequence repaired.

    line:  text read from the munged file.
    table: mapping rows shaped like `mappings`.
    fixes: optional defaultdict(int) bucket counter; incremented once per
           occurrence replaced (previously once per line, which
           under-reported lines containing several occurrences).
    num:   optional 1-based line number, used only for progress output.
    """
    for mapping in table:
        before, after = mapping[3], mapping[2]
        hits = line.count(before)
        if not hits:
            continue
        bucket = f'{mapping[1]} aka {mapping[4]} --> {after}'
        if fixes is not None:
            fixes[bucket] += hits
        if num is not None:
            print(f"{num} fixing {bucket}")
        line = line.replace(before, after)
    return line


def main():
    """Read sys.argv[1], repair it, write '<name>-fixed<ext>', print stats."""
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <infile>")  # was an IndexError traceback
    infile = sys.argv[1]
    outname, outext = os.path.splitext(infile)
    outfile = outname + '-fixed' + outext

    print(f"Reading {infile}")
    adjust_mappings(mappings)
    fixes = defaultdict(int)
    newlines = []
    with open(infile, 'r', encoding='utf-8') as f:
        for num, line in enumerate(f, start=1):
            newlines.append(fix_line(line, mappings, fixes, num))

    print()
    print(f"Writing {outfile}")
    with open(outfile, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(newlines)

    print()
    for fix in sorted(fixes):
        print(f'{fixes[fix]}\t{fix}')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment