Created
May 12, 2011 18:42
-
-
Save waylan/969171 to your computer and use it in GitHub Desktop.
The shlex module does not support Unicode input. Which workaround is faster?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import shlex | |
import re | |
# Testing shlex with Unicode.
#
# The shlex module does not support Unicode input. Which workaround is faster?
# The sample string is an attribute list in Maruku's syntax:
# http://maruku.rubyforge.org/proposal.html#attribute_lists
# Benchmark input. It contains a class token, an id token, an unquoted
# key=value pair, a bare word, and a double-quoted value holding spaces and
# a non-ASCII character (U+00C3).
t = u'.foo #bar class=foo ref title="Foo \xc3 bar."'
# Helper function for test1 & test2 | |
def parse(part):
    ''' Parse one already-split token into a key and value.

    Returns ``(u'id', name)`` for ``#name``, ``(u'.', name)`` for ``.name``,
    ``[key, value]`` for ``key=value``, and the token itself when none of
    those forms apply.
    '''
    if part.startswith('#'):
        return u'id', part[1:]
    if part.startswith('.'):
        return u'.', part[1:]
    if '=' in part:
        # Split on the first '=' only, so a value that itself contains '='
        # still produces exactly [key, value].
        return part.split('=', 1)
    return part
def test1():
    ''' Workaround 1: scrub the stray NUL characters from shlex's output.

    Every token produced by ``shlex.split(t)`` has its ``\\x00`` characters
    removed before being handed to ``parse``. Feels hacky.
    '''
    parsed = []
    for token in shlex.split(t):
        cleaned = token.replace('\x00', '')
        parsed.append(parse(cleaned))
    return parsed
def test2():
    ''' Workaround 2: encode the input to UTF-8, decode each output token.

    ``shlex.split`` only ever sees a byte string; each resulting token is
    decoded back from UTF-8 before being handed to ``parse``.
    '''
    encoded = t.encode('utf-8')
    parsed = []
    for token in shlex.split(encoded):
        parsed.append(parse(token.decode('utf-8')))
    return parsed
# Helper functions for test3 | |
def r_id(s, t):
    ''' Scanner action for ``#name`` tokens: drop the marker, tag as id.

    ``s`` (the scanner) is unused; ``t`` is the matched text.
    '''
    name = t[1:]
    return (u'id', name)
def r_class(s, t):
    ''' Scanner action for ``.name`` tokens: drop the marker, tag as class.

    ``s`` (the scanner) is unused; ``t`` is the matched text.
    '''
    name = t[1:]
    return (u'.', name)
def r_kv(s, t):
    ''' Scanner action for unquoted ``key=value`` tokens.

    ``s`` (the scanner) is unused; ``t`` is the matched text. Returns the
    list ``[key, value]``.
    '''
    # Split on the first '=' only: the matching rule allows further '='
    # characters in the token, and they belong to the value.
    return t.split('=', 1)
def r_dq(s, t):
    ''' Scanner action for ``key="value"`` tokens.

    ``s`` (the scanner) is unused; ``t`` is the matched text. Returns
    ``(key, value)`` with the double quotes stripped from the value.
    Raises ``ValueError`` if ``t`` contains no ``=``.
    '''
    # Split on the first '=' only; a quoted value such as "a=b" would
    # otherwise yield three parts and break the two-name unpacking.
    k, v = t.split('=', 1)
    return k, v.strip('"')
def r_sq(s, t):
    ''' Scanner action for ``key='value'`` tokens.

    ``s`` (the scanner) is unused; ``t`` is the matched text. Returns
    ``(key, value)`` with the single quotes stripped from the value.
    Raises ``ValueError`` if ``t`` contains no ``=``.
    '''
    # Split on the first '=' only; a quoted value such as 'a=b' would
    # otherwise yield three parts and break the two-name unpacking.
    k, v = t.split('=', 1)
    return k, v.strip("'")
def r_ref(s, t):
    ''' Scanner action for bare words: hand the matched text back as-is.

    ``s`` (the scanner) is unused.
    '''
    return t
# Lexicon for scanner1: one rule per token kind. Order matters, because the
# last two word-like rules would also match the more specific forms above.
_scanner1_rules = [
    (r'\.[^ ]+', r_class),      # .class
    (r'\#[^ ]+', r_id),         # #id
    (r'[^ ]+=".*?"', r_dq),     # key="double quoted"
    (r"[^ ]+='.*?'", r_sq),     # key='single quoted'
    (r'[^ ]+=[^ ]+', r_kv),     # key=value
    (r'[^ ]+', r_ref),          # bare word
    (r' ', None),               # separator; None means emit nothing
]
scanner1 = re.Scanner(_scanner1_rules)
def test3():
    ''' Workaround 3: skip shlex and tokenize with our own scanner.

    Returns only the token list from ``scanner1``; any unscanned remainder
    is discarded. There may be edge cases this parser doesn't handle.
    '''
    tokens, _remainder = scanner1.scan(t)
    return tokens
# Helper functions for test 4 | |
def kv(s, t):
    ''' Scanner action for any ``key=value`` token, quoted or not.

    ``s`` (the scanner) is unused; ``t`` is the matched text. Returns the
    list ``[key, value]``. A value wrapped in a matching pair of single or
    double quotes has that one pair removed. If ``t`` contains no ``=``
    the result is ``[t]``.
    '''
    # Split at the first '=' so any later '=' stays inside the value.
    key, sep, value = t.partition('=')
    if not sep:
        return [t]
    # Unquote only when the value both starts and ends with the same quote
    # character; a lone trailing quote (e.g. a=b") is left untouched.
    if len(value) >= 2 and value[0] in '"\'' and value[-1] == value[0]:
        value = value[1:-1]
    return [key, value]
def word(s, t):
    ''' Scanner action for tokens without ``=``.

    ``s`` (the scanner) is unused; ``t`` is the matched text. A leading
    ``.`` gives ``(u'.', rest)``, a leading ``#`` gives ``(u'id', rest)``,
    and anything else is returned unchanged.
    '''
    # Map the one-character prefix to the key it stands for.
    prefix_keys = {u'.': u'.', u'#': u'id'}
    key = prefix_keys.get(t[:1])
    if key is None:
        return t
    return key, t[1:]
# Lexicon for scanner2: a single rule covers every key=value form (double
# quoted, single quoted or bare), and `word` handles everything else.
_scanner2_rules = [
    (r"""[^ ]+=((?P<q>['"]).*?(?P=q)|[^ ]*)""", kv),    # key=value, any quoting
    (r'[^ ]+', word),                                   # .class, #id or bare word
    (r' ', None),                                       # separator; emit nothing
]
scanner2 = re.Scanner(_scanner2_rules)
def test4():
    ''' Workaround 4: a second hand-rolled scanner with fewer regex rules.

    Returns only the token list from ``scanner2``; any unscanned remainder
    is discarded.
    '''
    tokens, _remainder = scanner2.scan(t)
    return tokens
# Lexicon for scanner3: the per-quote-style key=value rules of scanner1
# combined with the single `word` action of scanner2.
_scanner3_rules = [
    (r'[^ ]+=".*?"', r_dq),     # key="double quoted"
    (r"[^ ]+='.*?'", r_sq),     # key='single quoted'
    (r'[^ ]+=[^ ]*', r_kv),     # key=value (value may be empty)
    (r'[^ ]+', word),           # .class, #id or bare word
    (r' ', None),               # separator; emit nothing
]
scanner3 = re.Scanner(_scanner3_rules)
def test5():
    ''' Workaround 5: a third scanner, halfway between test3 and test4.

    Returns only the token list from ``scanner3``; any unscanned remainder
    is discarded.
    '''
    tokens, _remainder = scanner3.scan(t)
    return tokens
if __name__ == '__main__':
    from timeit import repeat

    n = 10000  # calls per timing run

    # (label, function) pairs, in the order they are reported.
    benchmarks = [
        ("Replace:", test1),
        ("Encode:", test2),
        ("Scanner1:", test3),
        ("Scanner2:", test4),
        ("Scanner3:", test5),
    ]
    for label, func in benchmarks:
        name = func.__name__
        timings = repeat("%s()" % name, "from __main__ import %s" % name,
                         number=n)
        # Each print gets a single pre-formatted argument, so the output
        # is the same as the two-item print statement it replaces.
        print("%s %s" % (label, timings))
        # Show what the approach actually returns, for comparison.
        print(func())
# Output (the winner is test3 - build our own (with more regex and less code)):
#
# Replace: [3.970919823098155, 3.89778243211634, 3.8928384251926413]
# [(u'.', 'foo'), (u'id', 'bar'), ['class', 'foo'], 'ref', ['title', 'Foo \xc3 bar.']]
#
# Encode: [2.610697129939652, 2.598209946468314, 2.6193825514097995]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', [u'title', u'Foo \xc3 bar.']]
#
# Scanner1: [0.8332152618775517, 0.8287351160521546, 0.8228314527069926]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', (u'title', u'Foo \xc3 bar.')]
#
# Scanner2: [0.9497916123664218, 0.9404912909089695, 0.9393206237343854]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', [u'title', u'Foo \xc3 bar.']]
#
# Scanner3: [0.9443332347575044, 0.9392774174469203, 0.9396791981608104]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', (u'title', u'Foo \xc3 bar.')]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment