Skip to content

Instantly share code, notes, and snippets.

@waylan
Created May 12, 2011 18:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save waylan/969171 to your computer and use it in GitHub Desktop.
Save waylan/969171 to your computer and use it in GitHub Desktop.
The shlex module does not support Unicode input. Which workaround is faster?
import shlex
import re
# Testing shlex with Unicode.
#
# The shlex module does not support Unicode input. Which workaround is faster?
# Sample string. Using an Attribute List from Maruku's syntax:
# http://maruku.rubyforge.org/proposal.html#attribute_lists
# Shared input for every test below. The \xc3 escape inside the unicode
# literal is the non-ASCII code point U+00C3, so each approach is exercised
# with non-ASCII text.
t = u'.foo #bar class=foo ref title="Foo \xc3 bar."'
# Helper function for test1 & test2
def parse(part):
    ''' Parse one already-tokenized part into a key and value.

    Returns:
        (u'id', name)  for '#name'
        (u'.', name)   for '.name'
        [key, value]   for 'key=value'
        part           unchanged, for a bare word
    '''
    if part.startswith('#'):
        return u'id', part[1:]
    if part.startswith('.'):
        return u'.', part[1:]
    if '=' in part:
        # Split on the first '=' only, so a value that itself contains '='
        # stays in one piece and the result is always [key, value].
        return part.split('=', 1)
    return part
def test1():
    ''' Workaround 1: feed the unicode sample straight to shlex, then drop
    the stray NUL characters from each token before parsing it. Hacky. '''
    parsed = []
    for token in shlex.split(t):
        cleaned = token.replace('\x00', '')
        parsed.append(parse(cleaned))
    return parsed
def test2():
    ''' Workaround 2: encode the sample to UTF-8 before shlex sees it and
    decode every token it returns. Should work in all edge cases??? '''
    encoded = t.encode('utf-8')
    parsed = []
    for raw in shlex.split(encoded):
        parsed.append(parse(raw.decode('utf-8')))
    return parsed
# Helper functions for test3
def r_id(s, t):
    ''' Scanner action for '#name': drop the leading character of the token
    and tag the rest as an id. The scanner argument is unused. '''
    name = t[1:]
    return (u'id', name)
def r_class(s, t):
    ''' Scanner action for '.name': drop the leading character of the token
    and tag the rest with the u'.' key. The scanner argument is unused. '''
    name = t[1:]
    return (u'.', name)
def r_kv(s, t):
    ''' Scanner action for an unquoted 'key=value' token.

    Returns [key, value]. The token is split on the first '=' only, so a
    value that contains '=' is kept whole instead of producing extra items.
    The scanner argument is unused.
    '''
    return t.split('=', 1)
def r_dq(s, t):
    ''' Scanner action for a double-quoted 'key="value"' token.

    Returns (key, value) with the surrounding double quotes stripped from
    the value. The scanner argument is unused.
    '''
    # Split on the first '=' only: a quoted value may itself contain '=',
    # and an unbounded split would then break the two-name unpacking.
    k, v = t.split('=', 1)
    return k, v.strip('"')
def r_sq(s, t):
    ''' Scanner action for a single-quoted "key='value'" token.

    Returns (key, value) with the surrounding single quotes stripped from
    the value. The scanner argument is unused.
    '''
    # Split on the first '=' only: a quoted value may itself contain '=',
    # and an unbounded split would then break the two-name unpacking.
    k, v = t.split('=', 1)
    return k, v.strip("'")
def r_ref(s, t):
    ''' Scanner action for a bare word: the token is passed through as-is.
    The scanner argument is unused. '''
    return t
# Lexicon used by test3. Rules run from most to least specific; presumably
# the first rule that matches wins (re.Scanner is undocumented -- confirm),
# so the quoted key=value forms must stay ahead of the unquoted one and of
# the bare-word catch-all.
scanner1 = re.Scanner([
# '.name' -> class shorthand
(r'\.[^ ]+', r_class),
# '#name' -> id shorthand
(r'\#[^ ]+', r_id),
# key="value": the value may contain spaces, ends at the next double quote
(r'[^ ]+=".*?"', r_dq),
# key='value': same, with single quotes
(r"[^ ]+='.*?'", r_sq),
# key=value with a non-empty, unquoted value
(r'[^ ]+=[^ ]+', r_kv),
# any other run of non-space characters
(r'[^ ]+', r_ref),
# a single separating space; None as the action means no token is emitted
(r' ', None)
])
def test3():
    ''' Workaround 3: skip shlex and tokenize the sample with scanner1, our
    own parser. There may be edge cases this parser does not handle. '''
    tokens, _unscanned = scanner1.scan(t)
    return tokens
# Helper functions for test 4
def kv(s, t):
    ''' Scanner action for any 'key=value' token, quoted or not.

    Returns [key, value]. When the token ends in a quote and contains the
    matching '="' or "='" opener, the value is the text between the opener
    and that closing quote. Otherwise the token is split on its first '='.
    The scanner argument is unused.
    '''
    for quote in ('"', "'"):
        if t.endswith(quote):
            # Cut at the first opener only, so a value that contains the
            # opener sequence again cannot yield more than two items.
            key, opener, value = t[:-1].partition('=' + quote)
            if opener:
                return [key, value]
    # Unquoted value, or a trailing quote with no matching opener: the quote
    # characters are part of the value. Split once so a value containing
    # '=' stays whole.
    return t.split('=', 1)
def word(s, t):
    ''' Scanner action for a token without '=': '.name' becomes
    (u'.', name), '#name' becomes (u'id', name), and anything else is
    returned unchanged. The scanner argument is unused. '''
    shorthand_keys = {'.': u'.', '#': u'id'}
    marker = t[:1]
    if marker in shorthand_keys:
        return shorthand_keys[marker], t[1:]
    return t
# Lexicon used by test4: one combined key=value rule plus a bare-word rule.
# The key=value rule is listed first; presumably the first matching rule
# wins (re.Scanner is undocumented -- confirm).
scanner2 = re.Scanner([
# key=value, where the value is either quoted (the named group q captures
# the opening quote and (?P=q) requires the same character to close it) or
# a possibly empty run of non-space characters
(r"""[^ ]+=((?P<q>['"]).*?(?P=q)|[^ ]*)""", kv),
# any other run of non-space characters ('.name', '#name' or a bare word)
(r'[^ ]+', word),
# a single separating space; None as the action means no token is emitted
(r' ', None)
])
def test4():
    ''' Workaround 4: tokenize the sample with scanner2, a second
    hand-built parser that uses less regex. '''
    tokens, _unscanned = scanner2.scan(t)
    return tokens
# Lexicon used by test5: the separate quoted/unquoted key=value rules of
# scanner1 combined with the single word() action of scanner2. Rules run
# from most to least specific; presumably the first matching rule wins
# (re.Scanner is undocumented -- confirm).
scanner3 = re.Scanner([
# key="value": the value may contain spaces, ends at the next double quote
(r'[^ ]+=".*?"', r_dq),
# key='value': same, with single quotes
(r"[^ ]+='.*?'", r_sq),
# key=value with an unquoted value, which may be empty here
(r'[^ ]+=[^ ]*', r_kv),
# any other run of non-space characters ('.name', '#name' or a bare word)
(r'[^ ]+', word),
# a single separating space; None as the action means no token is emitted
(r' ', None)
])
def test5():
    ''' Workaround 5: tokenize the sample with scanner3, a third hand-built
    parser that sits between test3 and test4. '''
    tokens, _unscanned = scanner3.scan(t)
    return tokens
if __name__ == '__main__':
    # Time each workaround and print its timings followed by its parsed
    # output, with a blank line between reports.
    from timeit import repeat

    n = 10000  # executions per timing run
    # (label, function) pairs, in the order the results are reported.
    benchmarks = [
        ("Replace:", test1),
        ("Encode:", test2),
        ("Scanner1:", test3),
        ("Scanner2:", test4),
        ("Scanner3:", test5),
    ]
    for index, (label, func) in enumerate(benchmarks):
        if index:
            print("")  # blank line between reports, none after the last
        name = func.__name__
        timings = repeat("%s()" % name, "from __main__ import %s" % name,
                         number=n)
        # Every print call takes exactly one argument, so the output is the
        # same under the Python 2 print statement and the Python 3 print
        # function (a multi-argument call would print a tuple on Python 2).
        print("%s %s" % (label, timings))
        print(func())
# Output (the winner is test3: build our own scanner, with more regex and less code):
#
# Replace: [3.970919823098155, 3.89778243211634, 3.8928384251926413]
# [(u'.', 'foo'), (u'id', 'bar'), ['class', 'foo'], 'ref', ['title', 'Foo \xc3 bar.']]
#
# Encode: [2.610697129939652, 2.598209946468314, 2.6193825514097995]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', [u'title', u'Foo \xc3 bar.']]
#
# Scanner1: [0.8332152618775517, 0.8287351160521546, 0.8228314527069926]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', (u'title', u'Foo \xc3 bar.')]
#
# Scanner2: [0.9497916123664218, 0.9404912909089695, 0.9393206237343854]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', [u'title', u'Foo \xc3 bar.']]
#
# Scanner3: [0.9443332347575044, 0.9392774174469203, 0.9396791981608104]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', (u'title', u'Foo \xc3 bar.')]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment