Skip to content

Instantly share code, notes, and snippets.

@zhehaowang
Created May 22, 2020 19:26
Show Gist options
  • Save zhehaowang/42503c27ba43bc68f0408fd64a596312 to your computer and use it in GitHub Desktop.
Save zhehaowang/42503c27ba43bc68f0408fd64a596312 to your computer and use it in GitHub Desktop.
utf-8 recovery hack
import os
import glob
import csv
from xlsxwriter.workbook import Workbook
# Copy every cell of each listed CSV file into a fresh .xlsx workbook that
# sits next to it (same name, with the trailing 4-character ".csv" replaced).
for csvfile in ["out.csv"]:
    xlsx_name = csvfile[:-4] + '.xlsx'
    workbook = Workbook(xlsx_name)
    worksheet = workbook.add_worksheet()
    with open(csvfile, 'r', newline='', encoding='utf-8') as csv_handle:
        # Cells are written as-is, at the same (row, column) they occupy in
        # the CSV.
        for row_idx, cells in enumerate(csv.reader(csv_handle)):
            for col_idx, value in enumerate(cells):
                worksheet.write(row_idx, col_idx, value)
    # The workbook is closed once all rows have been written.
    workbook.close()
import re
# A stripped 3-byte sequence that happens to look like ordinary CSV text:
# a lead byte 0xE4-0xE8 with bit 7 cleared ('d'-'h') followed by two
# continuation bytes that, with bit 7 cleared, land on ',', '\n' or a digit.
_STRIPPED_TEXT_LIKE = re.compile(r"^[edfgh][,\n0-9][,\n0-9]$")

# Non-alphanumeric characters that are still treated as plain CSV content.
_PLAIN_CSV_CHARS = frozenset("\n,_")


def should_flip(ints):
    """Return True if ``ints`` looks like a UTF-8 sequence that lost bit 7.

    ``ints`` is a ``bytes`` object or any sliceable sequence of integer byte
    values, normally a 3-byte window taken from the damaged file.  The
    caller restores the sequence by setting bit 7 on every byte when this
    returns True.

    Decision order:

    1. A 3-character window matching ``_STRIPPED_TEXT_LIKE`` is always
       flipped.  NOTE: this is a heuristic; genuine ASCII text of the same
       shape (for example ``"e94"``) is flipped as well.
    2. A window made up only of alphanumerics, newline, comma or underscore
       is treated as plain CSV text and left alone.
    3. Otherwise the window is flipped when its first byte looks like a
       UTF-8 lead byte with bit 7 cleared (``0110xxxx`` for 3-byte
       sequences, ``01110xxx`` for 4-byte sequences) and every following
       byte looks like a continuation byte with bit 7 cleared
       (``00xxxxxx``).

    Windows of any other length (including empty or truncated windows at
    the end of the input) are never flipped.
    """
    characters = "".join(chr(c) for c in ints)
    if len(characters) == 3 and _STRIPPED_TEXT_LIKE.match(characters):
        return True

    if all(ch.isalnum() or ch in _PLAIN_CSV_CHARS for ch in characters):
        return False

    # A continuation byte is 10xxxxxx; with bit 7 cleared it is 00xxxxxx, so
    # BOTH top bits must be zero.  Testing bit 6 alone would also accept
    # bytes that still have bit 7 set, and setting bit 7 "again" on those
    # by adding 128 no longer fits in a single byte.
    later_bytes_match = all((0b11000000 & i) == 0 for i in ints[1:])

    # The sample data only contained 3-byte (Chinese) sequences; the 4-byte
    # case is kept for completeness.
    if len(ints) == 4:
        return (0b11111000 & ints[0]) == 0b01110000 and later_bytes_match
    if len(ints) == 3:
        return (0b11110000 & ints[0]) == 0b01100000 and later_bytes_match
    return False
# Rebuild the damaged CSV: every 3-byte window that should_flip() accepts
# gets bit 7 set on each of its bytes; every other byte is copied unchanged.
# The sample notes in this file show the damage as multi-byte UTF-8
# characters whose high bit was cleared on every byte.
with open('/Users/zwang/Downloads/Shanghai_adminLand_200sample_for_Weiping.csv', 'rb') as f, open('out.csv', 'wb') as outfile:
    content = f.read()
    # Collect the output in memory and write it once instead of issuing one
    # write() call per byte.
    repaired = bytearray()
    i = 0
    while i < len(content):
        # Candidate window widths; only 3-byte sequences are attempted.
        for j in [3]:
            seg = content[i:i + j]
            if should_flip(seg):
                # OR sets bit 7 and, unlike "+ 128", can never produce a
                # value that does not fit in one byte.
                repaired.extend(b | 0x80 for b in seg)
                i += j
                break
        else:
            # No window matched at this offset: copy a single byte verbatim
            # and slide the window forward by one.
            repaired.append(content[i])
            i += 1
    outfile.write(repaired)
# with open('/Users/zwang/Downloads/Shanghai_adminLand_200sample_for_Weiping.csv', 'rb') as f:
# # reader = csv.reader(f)
# content = f.read()
# # for c in content:
# # print(c),
# utf8str = content.decode("utf-8")
# line = utf8str.split('\n')[1:5]
# print("".join(line).encode("utf-8").hex())
# # part = line.split(',')[1]
# # partbin = part.encode("utf-8")
# # print(partbin.hex())
# # print(u"f\x17%".encode("utf-8"))
# # print(u"f\x17%".encode("utf-8").hex())
# # print(u"f\x17".encode("utf-8").decode(""))
# # print(u"年 月 日".encode("gb2312"))
# # print(u"年 月 日".encode("utf-8"))
# # 2016 e94 9 f\x1c\x08 13 f\x17%
# expected UTF-8 bytes (the year / month / day characters): \xe5\xb9\xb4 \xe6\x9c\x88 \xe6\x97\xa5
# bytes found in the damaged file (bit 7 cleared on each):  \x65\x39\x34 \x66\x1c\x08 \x66\x17\x25
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment