Skip to content

Instantly share code, notes, and snippets.

@justecorruptio
Last active February 21, 2020 09:51
Show Gist options
  • Save justecorruptio/9b0fd15860eee90493e2 to your computer and use it in GitHub Desktop.
Save justecorruptio/9b0fd15860eee90493e2 to your computer and use it in GitHub Desktop.
Generate a regex to detect Unicode blocks in MySQL.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Inclusive endpoints of the character range to match: the two-character
# literal is unpacked into its first and last character.
(start, end) = u'가힝'  # Korean
# Alternative range; swap it in for the line above to use it:
#start, end = u'あゟ' # Japanese
def esc(c):
    """Backslash-escape the special characters in ``c``.

    The escaped characters are the square brackets, the backslash, and the
    single and double quote.  Every other character is passed through
    unchanged.

    Each character is tested on its own, so an empty string comes back empty
    and a longer string has every special character escaped (a substring
    test against the special set would get both of those cases wrong).
    """
    special = '[]\\\'"'
    return ''.join('\\' + ch if ch in special else ch for ch in c)
def inc(c):
    """Return the character whose ordinal is one greater than that of ``c``.

    No bounds check is made here; ``chr`` raises ``ValueError`` when the
    incremented ordinal is outside the range it accepts.
    """
    code = ord(c)
    return chr(code + 1)
def dec(c):
    """Return the character whose ordinal is one less than that of ``c``.

    No bounds check is made here; ``chr`` raises ``ValueError`` when the
    decremented ordinal is negative.
    """
    code = ord(c)
    return chr(code - 1)
def rr(u, v):
    """Build a regex fragment for the strings lying between ``u`` and ``v``.

    The bounds are compared position by position, so they are assumed to
    have the same length with ``u`` not greater than ``v``; neither
    assumption is checked here.  The fragment is assembled recursively on
    the first character of each bound:

    * equal first characters become a literal followed by the fragment for
      the remaining characters;
    * single-character bounds become a bracket expression (or ``.`` for the
      full ``\\x00``-``\\xFF`` span);
    * otherwise the range is split into a parenthesised alternation: the
      strings sharing the lower first character, the strings whose first
      character lies strictly between the two (any tail), and the strings
      sharing the upper first character.
    """
    if u == '':
        return ''

    lo_head, hi_head = u[0], v[0]
    lo_tail, hi_tail = u[1:], v[1:]

    # Shared leading character: emit it literally and recurse on the tails.
    if lo_head == hi_head:
        return esc(lo_head) + rr(lo_tail, hi_tail)

    size = len(u)
    if size == 1:
        if u != '\x00':
            return '[%s-%s]' % (esc(u), esc(v))
        if v == '\xFF':
            return '.'
        # A range starting at \x00 is written as the negation of everything
        # above the upper bound (presumably to keep a NUL character out of
        # the pattern).
        return '[^%s-\xFF]' % (esc(inc(hi_head)),)

    rest = size - 1
    lowest_tail = '\x00' * rest
    highest_tail = '\xFF' * rest
    starts_at_bottom = u == '\x00' * size
    ends_at_top = v == '\xFF' * size

    # The bounds span every string of this length.
    if starts_at_bottom and ends_at_top:
        return '.' * size

    if starts_at_bottom:
        branches = [
            # Every first character below the upper one, with any tail.
            rr('\x00', dec(hi_head)) + rr(lo_tail, highest_tail),
            # The upper first character, with the tail capped by ``v``.
            esc(hi_head) + rr(lowest_tail, hi_tail),
        ]
    elif ends_at_top:
        branches = [
            # The lower first character, with the tail starting at ``u``.
            esc(lo_head) + rr(lo_tail, highest_tail),
            # Every first character above the lower one, with any tail.
            rr(inc(lo_head), '\xFF') + rr(lowest_tail, hi_tail),
        ]
    else:
        branches = [esc(lo_head) + rr(lo_tail, highest_tail)]
        # Only add the middle band when a first character lies strictly
        # between the two bounds.
        if ord(hi_head) - ord(lo_head) > 1:
            branches.append(rr(inc(lo_head), dec(hi_head)) + '.' * rest)
        branches.append(esc(hi_head) + rr(lowest_tail, hi_tail))

    return '(' + '|'.join(branches) + ')'
def to_mysql_x(regex):
    """Return ``regex`` hex-encoded, two uppercase digits per character.

    The name suggests the result is meant for a MySQL ``X'...'`` hex
    literal (not verifiable from this file).

    When the pattern is wrapped in an outer pair of parentheses, that pair
    is dropped before encoding.  A pattern that is not wrapped (for example
    a literal lead character followed by a group, which is what ``rr``
    returns when both bounds share their first byte) is encoded in full.
    Slicing off the first and last character unconditionally would cut the
    lead character and the closing parenthesis from such a pattern.

    The check only looks at the first and last character, so it assumes a
    leading ``(`` and trailing ``)`` belong to the same group.
    """
    if regex.startswith('(') and regex.endswith(')'):
        regex = regex[1:-1]
    return ''.join('%02X' % (ord(c),) for c in regex)
# Work on the UTF-8 encodings of the two endpoints so the pattern is built
# over their encoded form rather than over the characters themselves.
start_bytes, end_bytes = (point.encode('utf-8') for point in (start, end))
regex = rr(start_bytes, end_bytes)
# Write the hex-encoded pattern to stdout.
print(to_mysql_x(regex))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment