Skip to content

Instantly share code, notes, and snippets.

@MartyMacGyver
Last active December 6, 2021 03:01
Show Gist options
  • Save MartyMacGyver/3c28aaa6ce042aed2b176b8b9c84dec9 to your computer and use it in GitHub Desktop.
Save MartyMacGyver/3c28aaa6ce042aed2b176b8b9c84dec9 to your computer and use it in GitHub Desktop.
Latin1 -> UTF-8 and double-encoded UTF-8 fixer I wrote to process a few munged Wordpress databases
# Fixes double-encoded UTF-8
# By Martin Falatic, www.Falatic.com 2021-12-03
# Mappings based on https://www.i18nqa.com/debug/utf8-debug.html
"""
Unmapped (byte values with no character assigned in Windows-1252, so they have no row in the table below):
0x81
0x8D
0x8F
0x90
0x9D
"""
mappings = [
['\u20AC', '80', '€', '€', 'E2 82 AC'],
['\u201A', '82', '‚', '‚', 'E2 80 9A'],
['\u0192', '83', 'ƒ', 'Æ’', 'C6 92'],
['\u201E', '84', '„', '„', 'E2 80 9E'],
['\u2026', '85', '…', '…', 'E2 80 A6'],
['\u2020', '86', '†', '†', 'E2 80 A0'],
['\u2021', '87', '‡', '‡', 'E2 80 A1'],
['\u02C6', '88', 'ˆ', 'ˆ', 'CB 86'],
['\u2030', '89', '‰', '‰', 'E2 80 B0'],
['\u0160', '8A', 'Š', 'Å ', 'C5 A0'],
['\u2039', '8B', '‹', '‹', 'E2 80 B9'],
['\u0152', '8C', 'Œ', 'Å’', 'C5 92'],
['\u017D', '8E', 'Ž', 'Ž', 'C5 BD'],
['\u2018', '91', '‘', '‘', 'E2 80 98'],
['\u2019', '92', '’', '’', 'E2 80 99'],
['\u201C', '93', '“', '“', 'E2 80 9C'],
['\u201D', '94', '”', 'â€', 'E2 80 9D'],
['\u2022', '95', '•', '•', 'E2 80 A2'],
['\u2013', '96', '–', '–', 'E2 80 93'],
['\u2014', '97', '—', '—', 'E2 80 94'],
['\u02DC', '98', '˜', 'Ëœ', 'CB 9C'],
['\u2122', '99', '™', 'â„¢', 'E2 84 A2'],
['\u0161', '9A', 'š', 'Å¡', 'C5 A1'],
['\u203A', '9B', '›', '›', 'E2 80 BA'],
['\u0153', '9C', 'œ', 'Å“', 'C5 93'],
['\u017E', '9E', 'ž', 'ž', 'C5 BE'],
['\u0178', '9F', 'Ÿ', 'Ÿ', 'C5 B8'],
['\u00A0', 'A0', ' ', ' ', 'C2 A0'],
['\u00A1', 'A1', '¡', '¡', 'C2 A1'],
['\u00A2', 'A2', '¢', '¢', 'C2 A2'],
['\u00A3', 'A3', '£', '£', 'C2 A3'],
['\u00A4', 'A4', '¤', '¤', 'C2 A4'],
['\u00A5', 'A5', '¥', 'Â¥', 'C2 A5'],
['\u00A6', 'A6', '¦', '¦', 'C2 A6'],
['\u00A7', 'A7', '§', '§', 'C2 A7'],
['\u00A8', 'A8', '¨', '¨', 'C2 A8'],
['\u00A9', 'A9', '©', '©', 'C2 A9'],
['\u00AA', 'AA', 'ª', 'ª', 'C2 AA'],
['\u00AB', 'AB', '«', '«', 'C2 AB'],
['\u00AC', 'AC', '¬', '¬', 'C2 AC'],
['\u00AD', 'AD', '­', '­', 'C2 AD'],
['\u00AE', 'AE', '®', '®', 'C2 AE'],
['\u00AF', 'AF', '¯', '¯', 'C2 AF'],
['\u00B0', 'B0', '°', '°', 'C2 B0'],
['\u00B1', 'B1', '±', '±', 'C2 B1'],
['\u00B2', 'B2', '²', '²', 'C2 B2'],
['\u00B3', 'B3', '³', '³', 'C2 B3'],
['\u00B4', 'B4', '´', '´', 'C2 B4'],
['\u00B5', 'B5', 'µ', 'µ', 'C2 B5'],
['\u00B6', 'B6', '¶', '¶', 'C2 B6'],
['\u00B7', 'B7', '·', '·', 'C2 B7'],
['\u00B8', 'B8', '¸', '¸', 'C2 B8'],
['\u00B9', 'B9', '¹', '¹', 'C2 B9'],
['\u00BA', 'BA', 'º', 'º', 'C2 BA'],
['\u00BB', 'BB', '»', '»', 'C2 BB'],
['\u00BC', 'BC', '¼', '¼', 'C2 BC'],
['\u00BD', 'BD', '½', '½', 'C2 BD'],
['\u00BE', 'BE', '¾', '¾', 'C2 BE'],
['\u00BF', 'BF', '¿', '¿', 'C2 BF'],
['\u00C0', 'C0', 'À', 'À', 'C3 80'],
['\u00C1', 'C1', 'Á', 'Ã', 'C3 81'],
['\u00C2', 'C2', 'Â', 'Â', 'C3 82'],
['\u00C3', 'C3', 'Ã', 'Ã', 'C3 83'],
['\u00C4', 'C4', 'Ä', 'Ä', 'C3 84'],
['\u00C5', 'C5', 'Å', 'Ã…', 'C3 85'],
['\u00C6', 'C6', 'Æ', 'Æ', 'C3 86'],
['\u00C7', 'C7', 'Ç', 'Ç', 'C3 87'],
['\u00C8', 'C8', 'È', 'È', 'C3 88'],
['\u00C9', 'C9', 'É', 'É', 'C3 89'],
['\u00CA', 'CA', 'Ê', 'Ê', 'C3 8A'],
['\u00CB', 'CB', 'Ë', 'Ë', 'C3 8B'],
['\u00CC', 'CC', 'Ì', 'ÃŒ', 'C3 8C'],
['\u00CD', 'CD', 'Í', 'Ã', 'C3 8D'],
['\u00CE', 'CE', 'Î', 'ÃŽ', 'C3 8E'],
['\u00CF', 'CF', 'Ï', 'Ã', 'C3 8F'],
['\u00D0', 'D0', 'Ð', 'Ã', 'C3 90'],
['\u00D1', 'D1', 'Ñ', 'Ñ', 'C3 91'],
['\u00D2', 'D2', 'Ò', 'Ã’', 'C3 92'],
['\u00D3', 'D3', 'Ó', 'Ó', 'C3 93'],
['\u00D4', 'D4', 'Ô', 'Ô', 'C3 94'],
['\u00D5', 'D5', 'Õ', 'Õ', 'C3 95'],
['\u00D6', 'D6', 'Ö', 'Ö', 'C3 96'],
['\u00D7', 'D7', '×', '×', 'C3 97'],
['\u00D8', 'D8', 'Ø', 'Ø', 'C3 98'],
['\u00D9', 'D9', 'Ù', 'Ù', 'C3 99'],
['\u00DA', 'DA', 'Ú', 'Ú', 'C3 9A'],
['\u00DB', 'DB', 'Û', 'Û', 'C3 9B'],
['\u00DC', 'DC', 'Ü', 'Ãœ', 'C3 9C'],
['\u00DD', 'DD', 'Ý', 'Ã', 'C3 9D'],
['\u00DE', 'DE', 'Þ', 'Þ', 'C3 9E'],
['\u00DF', 'DF', 'ß', 'ß', 'C3 9F'],
['\u00E0', 'E0', 'à', 'à', 'C3 A0'],
['\u00E1', 'E1', 'á', 'á', 'C3 A1'],
['\u00E2', 'E2', 'â', 'â', 'C3 A2'],
['\u00E3', 'E3', 'ã', 'ã', 'C3 A3'],
['\u00E4', 'E4', 'ä', 'ä', 'C3 A4'],
['\u00E5', 'E5', 'å', 'Ã¥', 'C3 A5'],
['\u00E6', 'E6', 'æ', 'æ', 'C3 A6'],
['\u00E7', 'E7', 'ç', 'ç', 'C3 A7'],
['\u00E8', 'E8', 'è', 'è', 'C3 A8'],
['\u00E9', 'E9', 'é', 'é', 'C3 A9'],
['\u00EA', 'EA', 'ê', 'ê', 'C3 AA'],
['\u00EB', 'EB', 'ë', 'ë', 'C3 AB'],
['\u00EC', 'EC', 'ì', 'ì', 'C3 AC'],
['\u00ED', 'ED', 'í', 'í', 'C3 AD'],
['\u00EE', 'EE', 'î', 'î', 'C3 AE'],
['\u00EF', 'EF', 'ï', 'ï', 'C3 AF'],
['\u00F0', 'F0', 'ð', 'ð', 'C3 B0'],
['\u00F1', 'F1', 'ñ', 'ñ', 'C3 B1'],
['\u00F2', 'F2', 'ò', 'ò', 'C3 B2'],
['\u00F3', 'F3', 'ó', 'ó', 'C3 B3'],
['\u00F4', 'F4', 'ô', 'ô', 'C3 B4'],
['\u00F5', 'F5', 'õ', 'õ', 'C3 B5'],
['\u00F6', 'F6', 'ö', 'ö', 'C3 B6'],
['\u00F7', 'F7', '÷', '÷', 'C3 B7'],
['\u00F8', 'F8', 'ø', 'ø', 'C3 B8'],
['\u00F9', 'F9', 'ù', 'ù', 'C3 B9'],
['\u00FA', 'FA', 'ú', 'ú', 'C3 BA'],
['\u00FB', 'FB', 'û', 'û', 'C3 BB'],
['\u00FC', 'FC', 'ü', 'ü', 'C3 BC'],
['\u00FD', 'FD', 'ý', 'ý', 'C3 BD'],
['\u00FE', 'FE', 'þ', 'þ', 'C3 BE'],
['\u00FF', 'FF', 'ÿ', 'ÿ', 'C3 BF'],
]
import os, sys
from collections import defaultdict, Counter
# Script setup: take the input path from the command line and derive the
# output path by inserting "-fixed" before the extension
# (e.g. dump.sql -> dump-fixed.sql).
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an IndexError traceback when the
    # input file argument is missing.
    sys.exit(f"Usage: {sys.argv[0] if sys.argv else 'script'} <infile>")
infile = sys.argv[1]
outname, outext = os.path.splitext(infile)
outfile = outname + '-fixed' + outext
newlines = []  # repaired lines, kept in input order until they are written out
fixes = defaultdict(int)  # summary label -> number of lines on which that fix fired
print(f"Reading {infile}")
#with open(infile, 'r', encoding='latin-1') as f:
# Column 3 (the garbled search string) gets rebuilt here for the rows whose
# UTF-8 byte sequence (column 4) contains a byte that Windows-1252 leaves
# undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D). Those bytes have no printable
# character, so they cannot be typed reliably into the table literal.
#
# Each byte is decoded the way the rest of the table spells it: as
# Windows-1252 where the byte is defined (so 0x80 becomes U+20AC, matching
# the other 'E2 80 xx' rows), and as the control character with the same
# code point where it is not.
# NOTE(review): the control-character fallback assumes the garbling step kept
# undefined bytes as U+0081/U+008D/... (as MySQL's latin1 does) - confirm
# against the data being repaired.
for mapping in mappings:
    hexvals = mapping[4]
    if hexvals in ['C3 81', 'C3 8D', 'C3 8F', 'C3 90', 'C3 9D', 'E2 80 9D']:
        pieces = []
        for raw in bytes.fromhex(hexvals):
            single = bytes([raw])
            try:
                pieces.append(single.decode('cp1252'))
            except UnicodeDecodeError:
                # Undefined in Windows-1252: keep the raw code point.
                pieces.append(single.decode('latin-1'))
        newmapping = "".join(pieces)
        print(f"Adjusting {hexvals}")
        mapping[3] = newmapping
# Read the input one line at a time. For every table row whose garbled form
# (column 3) occurs in the line, replace all occurrences with the intended
# character (column 2), log the fix, and count it once per line. Rows are
# applied in table order, each one seeing the result of the previous ones.
with open(infile, 'r', encoding='utf-8') as f:
    num = 0  # stays 0 when the file is empty
    for num, line in enumerate(f, start=1):
        for mapping in mappings:
            latin1_hex, repaired, garbled, utf8_hex = mapping[1:5]
            if garbled not in line:
                continue
            label = f'{latin1_hex} aka {utf8_hex} --> {repaired}'
            fixes[label] += 1
            print(f"{num} fixing {label}")
            line = line.replace(garbled, repaired)
        newlines.append(line)
# Write the repaired lines to the output file as UTF-8. newline='\n' turns
# off newline translation on write, so each '\n' is written as-is.
print()
print(f"Writing {outfile}")
with open(outfile, 'w', encoding='utf-8', newline='\n') as f:
    f.writelines(newlines)
# Summary: one "<count><TAB><label>" row per fix that fired, sorted by label.
print()
for label, count in sorted(fixes.items()):
    print(f'{count}\t{label}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment