Last active
December 30, 2015 02:39
-
-
Save abg/7764054 to your computer and use it in GitHub Desktop.
Decode MySQL filenames to unicode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import binascii | |
import collections | |
import re | |
import struct | |
# Pair of code points (start, end) delimiting one Unicode block handled by an
# EncodingRule; only ``start`` is read when decoding.
CodeRange = collections.namedtuple('CodeRange', ['start', 'end'])
class EncodingRule(object):
    """Substitution rule that turns a three-character ``@xy`` escape into one character.

    Instances are callable with a regex match object, so they can be handed
    to ``re.sub`` as the replacement function for ``pattern``.

    Attributes:
        coderange: object with a ``start`` attribute, the code point that
            ``initial_code`` decodes to.
        pattern: regular expression matching the escapes this rule handles.
        name: human-readable label for the rule (not used when decoding).
        initial_code: the three-character escape that maps to
            ``coderange.start``.
    """

    def __init__(self, coderange, pattern, name, initial_code):
        self.coderange = coderange
        self.pattern = pattern
        self.name = name
        self.initial_code = initial_code

    @staticmethod
    def _pair_value(token):
        """Return the two characters after the leading '@' as a big-endian 16-bit int.

        Computed with ``ord`` rather than ``struct`` so it works on text
        strings under both Python 2 and Python 3.
        """
        return (ord(token[1]) << 8) | ord(token[2])

    def __call__(self, matchobj):
        """Return the character for the escape matched by *matchobj*.

        The result is ``coderange.start`` shifted by the numeric distance
        between the matched escape and ``initial_code``.
        """
        # NOTE(review): the offset is a plain byte-value difference; whether
        # that agrees with MySQL's own mapping for every escape is unverified.
        offset = (self._pair_value(matchobj.group())
                  - self._pair_value(self.initial_code))
        uni_ord = self.coderange.start + offset
        try:
            return unichr(uni_ord)  # noqa: F821 -- Python 2 builtin
        except NameError:
            # Python 3 has no unichr(); chr() covers the full Unicode range.
            return chr(uni_ord)
class Ucs2EncodingRule(object):
    """Substitution rule decoding ``@`` followed by four lowercase hex digits.

    Instances are callable with a regex match object, so they can be handed
    to ``re.sub`` as the replacement function for ``pattern``.
    """

    def __init__(self):
        self.pattern = r'@[0-9a-f]{4}'

    def __call__(self, matchobj):
        """Return the character whose code point is the matched hex number."""
        # Skip the leading '@'; the pattern guarantees four hex digits follow.
        uni_ord = int(matchobj.group()[1:], 16)
        try:
            return unichr(uni_ord)  # noqa: F821 -- Python 2 builtin
        except NameError:
            # Python 3 has no unichr(); chr() covers the full Unicode range.
            return chr(uni_ord)
class WindowsDeviceNameEncodingRule(object):
    """Substitution rule that removes each run of three '@' characters.

    Instances are callable with a regex match object, so they can be handed
    to ``re.sub`` as the replacement function for ``pattern``.
    """

    def __init__(self):
        # Exactly three consecutive '@' signs.
        self.pattern = r'[@]{3}'

    def __call__(self, matchobj):
        """Return the empty string, whatever was matched."""
        replacement = ''
        return replacement
# Substitution rules used by decode_filename(), which applies them in list
# order, so earlier rules see the string before later ones do. Each
# EncodingRule maps escapes matching ``pattern`` onto code points counted up
# from ``coderange.start``, with ``initial_code`` as the escape for that start.
encoding_rules = [
    EncodingRule(coderange=CodeRange(0x00C0, 0x017F),
                 pattern=r'[@][0-4][g-zG-Z]',
                 name='Latin-1 Supplement + Latin Extended-A',
                 initial_code='@0G'),
    Ucs2EncodingRule(),
    WindowsDeviceNameEncodingRule(),
    EncodingRule(coderange=CodeRange(0x0370, 0x03FF),
                 pattern=r'[@][5-9][g-zG-Z]',
                 name='Greek and Coptic',
                 initial_code='@5G'),
    EncodingRule(coderange=CodeRange(0x0400, 0x052F),
                 pattern=r'[@][g-zG-Z][0-6]',
                 name='Cyrillic + Cyrillic Supplement',
                 initial_code='@G0'),
    EncodingRule(coderange=CodeRange(0x0530, 0x058F),
                 pattern=r'[@][G-Zg-z][7-8]',
                 name='Armenian',
                 initial_code='@G7'),
    EncodingRule(coderange=CodeRange(0x2160, 0x217F),
                 pattern=r'[@][G-Zg-z][9]',
                 name='Number Forms',
                 initial_code='@G9'),
    EncodingRule(coderange=CodeRange(0x0180, 0x02AF),
                 pattern=r'[@][G-Zg-z][A-Ka-k]',
                 name='Latin Extended-B + IPA Extensions',
                 initial_code='@GA'),
    # The pattern previously ended in a stray ']', which made this rule match
    # only when a literal ']' followed the escape.
    EncodingRule(coderange=CodeRange(0x1E00, 0x1EFF),
                 pattern=r'[@][G-Zg-z][L-Rl-r]',
                 name='Latin Extended Additional',
                 initial_code='@GL'),
    EncodingRule(coderange=CodeRange(0x1F00, 0x1FFF),
                 pattern=r'[@][G-Zg-z][S-Zs-z]',
                 name='Greek Extended',
                 initial_code='@GS'),
    EncodingRule(coderange=CodeRange(0x24B6, 0x24E9),
                 pattern=r'[@][@][A-Za-z]',
                 name='Enclosed Alphanumerics',
                 initial_code='@@A'),
    EncodingRule(coderange=CodeRange(0xFF21, 0xFF5A),
                 pattern=r'[@][A-Za-z][@]',
                 name=r'Halfwidth and Fullwidth forms',
                 initial_code='@A@'),
]
def decode_filename(path):
    """Return *path* with every escape known to ``encoding_rules`` decoded.

    The rules are applied one after another in list order. For each rule,
    every match of ``rule.pattern`` in the current string is replaced by the
    value the rule returns for that match, and the result feeds the next rule.

    Args:
        path: the encoded file name.

    Returns:
        The decoded string; unchanged if no rule matches.
    """
    for rule in encoding_rules:
        # re.sub leaves the string untouched when nothing matches, so no
        # separate search is needed first.
        path = re.sub(rule.pattern, rule, path)
    return path
# Command-line entry point: decode each argument and print "<input> : <decoded>".
if __name__ == '__main__':
    import sys

    for name in sys.argv[1:]:
        # A single pre-formatted argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("%s : %s" % (name, decode_filename(name)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment