Skip to content

Instantly share code, notes, and snippets.

@abg
Last active December 30, 2015 02:39
Show Gist options
  • Save abg/7764054 to your computer and use it in GitHub Desktop.
Save abg/7764054 to your computer and use it in GitHub Desktop.
Decode MySQL filenames to Unicode
import binascii
import collections
import re
import struct
# (start, end) pair of Unicode code points describing the range a rule targets.
CodeRange = collections.namedtuple('CodeRange', ['start', 'end'])
class EncodingRule(object):
    """Regex substitution callback mapping ``@xy`` escapes into a Unicode range.

    An instance is callable with a match object, so it can be passed directly
    as the replacement argument of ``re.sub(rule.pattern, rule, text)``.

    Attributes:
        coderange: object exposing ``start``, the code point that
            ``initial_code`` decodes to.
        pattern: regular expression selecting the escapes this rule handles.
        name: human-readable label for the rule; not used while decoding.
        initial_code: escape (``@`` plus two characters) that corresponds to
            ``coderange.start``.
    """

    def __init__(self, coderange, pattern, name, initial_code):
        self.coderange = coderange
        self.pattern = pattern
        self.name = name
        self.initial_code = initial_code

    @staticmethod
    def _pair_value(escape):
        """Return the two characters after the leading ``@`` as one 16-bit int.

        The first character is the high byte and the second the low byte,
        i.e. the value ``struct`` would read with format ``'>H'`` at offset 1.
        Using ``ord`` keeps this working on text strings, which
        ``struct.unpack_from`` rejects on Python 3.
        """
        return (ord(escape[1]) << 8) | ord(escape[2])

    def __call__(self, matchobj):
        """Return the character that replaces the escape matched by *matchobj*.

        The result is ``coderange.start`` shifted by the numeric distance
        between the matched escape and ``initial_code``.

        NOTE(review): this is a purely linear offset; whether it agrees with
        MySQL's real mapping for every escape is not verified here.
        """
        offset = (self._pair_value(matchobj.group())
                  - self._pair_value(self.initial_code))
        code_point = self.coderange.start + offset
        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr  # Python 3: chr already returns text
        return to_char(code_point)
class Ucs2EncodingRule(object):
    """Regex substitution callback decoding ``@hhhh`` escapes.

    ``hhhh`` is four lowercase hexadecimal digits; they are read as a 16-bit
    code point and replaced by the corresponding character.  An instance can
    be passed as the replacement argument of
    ``re.sub(rule.pattern, rule, text)``.
    """

    def __init__(self):
        self.pattern = r'@[0-9a-f]{4}'

    def __call__(self, matchobj):
        """Return the character whose code point is the matched hex value."""
        # Drop the leading '@'; the remaining four hex digits are the
        # big-endian 16-bit code point.
        code_point = int(matchobj.group()[1:], 16)
        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr  # Python 3: chr already returns text
        return to_char(code_point)
class WindowsDeviceNameEncodingRule(object):
    """Regex substitution callback that removes every ``@@@`` run.

    NOTE(review): going by the class name, ``@@@`` is presumably the marker
    attached to reserved Windows device names -- confirm against MySQL docs.
    """

    def __init__(self):
        # Exactly three consecutive '@' characters.
        self.pattern = r'[@]{3}'

    def __call__(self, matchobj):
        """Return an empty string so ``re.sub`` drops the matched text."""
        return str()
# Substitution rules applied by decode_filename, in list order.  Each entry
# exposes a ``pattern`` attribute and is callable with a regex match object.
encoding_rules = [
    EncodingRule(coderange=CodeRange(0x00C0, 0x017F),
                 pattern=r'[@][0-4][g-zG-Z]',
                 name='Latin-1 Supplement + Latin Extended-A',
                 initial_code='@0G'),
    Ucs2EncodingRule(),
    WindowsDeviceNameEncodingRule(),
    EncodingRule(coderange=CodeRange(0x0370, 0x03FF),
                 pattern=r'[@][5-9][g-zG-Z]',
                 name='Greek and Coptic',
                 initial_code='@5G'),
    EncodingRule(coderange=CodeRange(0x0400, 0x052F),
                 pattern=r'[@][g-zG-Z][0-6]',
                 name='Cyrillic + Cyrillic Supplement',
                 initial_code='@G0'),
    EncodingRule(coderange=CodeRange(0x0530, 0x058F),
                 pattern=r'[@][G-Zg-z][7-8]',
                 name='Armenian',
                 initial_code='@G7'),
    EncodingRule(coderange=CodeRange(0x2160, 0x217F),
                 pattern=r'[@][G-Zg-z][9]',
                 name='Number Forms',
                 initial_code='@G9'),
    EncodingRule(coderange=CodeRange(0x0180, 0x02AF),
                 pattern=r'[@][G-Zg-z][A-Ka-k]',
                 name='Latin Extended-B + IPA Extensions',
                 initial_code='@GA'),
    # The pattern previously ended in a stray ']', which made the rule require
    # (and swallow) a literal ']' after the escape.
    EncodingRule(coderange=CodeRange(0x1E00, 0x1EFF),
                 pattern=r'[@][G-Zg-z][L-Rl-r]',
                 name='Latin Extended Additional',
                 initial_code='@GL'),
    EncodingRule(coderange=CodeRange(0x1F00, 0x1FFF),
                 pattern=r'[@][G-Zg-z][S-Zs-z]',
                 name='Greek Extended',
                 initial_code='@GS'),
    EncodingRule(coderange=CodeRange(0x24B6, 0x24E9),
                 pattern=r'[@][@][A-Za-z]',
                 name='Enclosed Alphanumerics',
                 initial_code='@@A'),
    EncodingRule(coderange=CodeRange(0xFF21, 0xFF5A),
                 pattern=r'[@][A-Za-z][@]',
                 name=r'Halfwidth and Fullwidth forms',
                 initial_code='@A@'),
]
def decode_filename(path):
    """Return *path* with every rule in ``encoding_rules`` applied.

    Rules run in list order; each one rewrites all of its matches through
    ``re.sub`` before the next rule sees the (already partly decoded) string.

    Args:
        path: the encoded name to decode.

    Returns:
        The string produced after the last rule has been applied.
    """
    for rule in encoding_rules:
        # The rule object itself is the replacement callback.
        path = re.sub(rule.pattern, rule, path)
    return path
if __name__ == '__main__':
    import sys

    # Decode each command-line argument and print "<encoded> : <decoded>".
    for name in sys.argv[1:]:
        # A single pre-formatted argument parses both as the Python 2 print
        # statement and as the Python 3 print function.
        print("%s : %s" % (name, decode_filename(name)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment