# waylan/shlex_with_unicode.py

## shlex_with_unicode.py
import shlex
import re

# Testing shlex with Unicode.
#
# The shlex module does not support Unicode input. Which workaround is faster?

# Sample string. Using an Attribute List from Maruku's syntax:
# http://maruku.rubyforge.org/proposal.html#attribute_lists
# Module-level input read by every test function in this file. The \xc3
# escape in this unicode literal is its only non-ASCII code point (U+00C3).
t = u'.foo #bar class=foo ref title="Foo \xc3 bar."'

# Helper function for test1 & test2
def parse(part):
    ''' Classify one token produced by shlex.split.

    Returns (u'id', name) for '#name', (u'.', name) for '.name', a
    [key, value] list for 'key=value', and the token itself otherwise.
    '''
    if part.startswith('#'):
        return u'id', part[1:]
    if part.startswith('.'):
        return u'.', part[1:]
    if '=' in part:
        # Cut at the first '=' only, so a value that itself contains '='
        # still yields exactly two items.
        return part.split('=', 1)
    return part

def test1():
    ''' Workaround 1: hand the unicode string straight to shlex.

    Every token shlex returns has its NUL characters removed before it is
    passed to parse(). Feels hacky.
    '''
    results = []
    for token in shlex.split(t):
        cleaned = token.replace('\x00', '')
        results.append(parse(cleaned))
    return results

def test2():
    ''' Workaround 2: encode the input to UTF-8 before shlex sees it.

    Each token shlex returns is decoded from UTF-8 again and then passed to
    parse(). Should work in all edge cases???
    '''
    encoded = t.encode('utf-8')
    results = []
    for token in shlex.split(encoded):
        results.append(parse(token.decode('utf-8')))
    return results

# Helper functions for test3
def r_id(s, t):
    ''' Scanner action for a '#name' token: return (u'id', name).

    `s` is the scanner (unused); `t` is the matched text.
    '''
    name = t[1:]
    return u'id', name
def r_class(s, t):
    ''' Scanner action for a '.name' token: return (u'.', name).

    `s` is the scanner (unused); `t` is the matched text.
    '''
    name = t[1:]
    return u'.', name
def r_kv(s, t):
    ''' Scanner action for an unquoted 'key=value' token.

    Returns a [key, value] list. `s` is the scanner (unused); `t` is the
    matched text.
    '''
    # Cut at the first '=' only: the pattern feeding this action lets the
    # value contain further '=' characters.
    return t.split('=', 1)
def r_dq(s, t):
    ''' Scanner action for a 'key="value"' token.

    Returns (key, value) with the double quotes stripped from the value.
    `s` is the scanner (unused); `t` is the matched text.
    '''
    # Cut at the first '=' only; an unbounded split raised ValueError on
    # unpacking whenever the quoted value contained '='.
    k, v = t.split('=', 1)
    return k, v.strip('"')
def r_sq(s, t):
    ''' Scanner action for a "key='value'" token.

    Returns (key, value) with the single quotes stripped from the value.
    `s` is the scanner (unused); `t` is the matched text.
    '''
    # Cut at the first '=' only; an unbounded split raised ValueError on
    # unpacking whenever the quoted value contained '='.
    k, v = t.split('=', 1)
    return k, v.strip("'")
def r_ref(s, t):
    ''' Scanner action for a bare word: the matched text is the result.

    `s` is the scanner (unused); `t` is the matched text.
    '''
    return t

# Lexicon used by test3: (pattern, action) pairs handed to re.Scanner.
# Rules are tried in the order listed, so the specific forms must stay ahead
# of the catch-all `[^ ]+`; do not reorder.
scanner1 = re.Scanner([
    # .name
    (r'\.[^ ]+', r_class),
    # #name
    (r'\#[^ ]+', r_id),
    # key="value" (the value may contain spaces)
    (r'[^ ]+=".*?"', r_dq),
    # key='value' (the value may contain spaces)
    (r"[^ ]+='.*?'", r_sq),
    # key=value, unquoted; requires a non-empty value
    (r'[^ ]+=[^ ]+', r_kv),
    # any other run of non-space characters
    (r'[^ ]+', r_ref),
    # a single space separates tokens; a None action discards the match
    (r' ', None)
])

def test3():
    ''' Workaround 3: skip shlex and tokenise `t` with scanner1.

    Returns the list of action results. May be edge cases this parser
    doesn't parse.
    '''
    tokens, _unscanned = scanner1.scan(t)
    return tokens

# Helper functions for test 4
def kv(s, t):
    ''' Scanner action for test4: split a 'key=value' token.

    Returns a [key, value] list. A value wrapped in double or single quotes
    has the quotes dropped. `s` is the scanner (unused); `t` is the matched
    text.
    '''
    for quote in ('"', "'"):
        if t.endswith(quote):
            # Drop the closing quote, then cut at the first `=<quote>`.
            key, sep, value = t[:-1].partition('=' + quote)
            if sep:
                return [key, value]
    # Unquoted value, or a stray trailing quote with no opening `=<quote>`:
    # cut at the first '=' only so the result never has more than two items.
    return t.split('=', 1)

def word(s, t):
    ''' Scanner action for a token that is not a key=value pair.

    Returns (u'.', name) for '.name', (u'id', name) for '#name', and the
    token itself otherwise. `s` is the scanner (unused); `t` is the matched
    text.
    '''
    marker, rest = t[:1], t[1:]
    if marker == '.':
        return u'.', rest
    if marker == '#':
        return u'id', rest
    return t

# Lexicon used by test4: (pattern, action) pairs handed to re.Scanner.
# Rules are tried in the order listed, so the key=value rule must stay ahead
# of the catch-all `[^ ]+`.
scanner2 = re.Scanner([
    # key=value, where the value is "double quoted", 'single quoted' or a
    # bare run of non-space characters. The key excludes '=' so it ends at
    # the first '='; a greedy `[^ ]+` key could slide past the opening quote
    # and cut a value such as "a=b c" short at the space.
    (r"""[^ =]+=((?P<q>['"]).*?(?P=q)|[^ ]*)""", kv),
    # any other run of non-space characters
    (r'[^ ]+', word),
    # a single space separates tokens; a None action discards the match
    (r' ', None)
])

def test4():
    ''' Workaround 4: tokenise `t` with scanner2.

    Another parser implementation. Uses less regex. Returns the list of
    action results.
    '''
    tokens, _unscanned = scanner2.scan(t)
    return tokens

# Lexicon used by test5: the quoted/unquoted key=value rules of scanner1
# combined with the `word` action of scanner2. Rules are tried in the order
# listed, so the specific forms must stay ahead of the catch-all `[^ ]+`.
scanner3 = re.Scanner([
    # key="value" (the value may contain spaces)
    (r'[^ ]+=".*?"', r_dq),
    # key='value' (the value may contain spaces)
    (r"[^ ]+='.*?'", r_sq),
    # key=value, unquoted; the value may be empty
    (r'[^ ]+=[^ ]*', r_kv),
    # .name, #name or a bare word
    (r'[^ ]+', word),
    # a single space separates tokens; a None action discards the match
    (r' ', None)
])

def test5():
    ''' Workaround 5: tokenise `t` with scanner3.

    A third parser implementation. Between test3 and test4. Returns the
    list of action results.
    '''
    tokens, _unscanned = scanner3.scan(t)
    return tokens

if __name__ == '__main__':
    # Time every workaround with timeit.repeat and show what each returns.
    from timeit import repeat
    n = 10000  # calls per timing run
    # (label, statement to time, timeit setup, function called for its output)
    runs = (
        ("Replace:", "test1()", "from __main__ import test1", test1),
        ("Encode:", "test2()", "from __main__ import test2", test2),
        ("Scanner1:", "test3()", "from __main__ import test3", test3),
        ("Scanner2:", "test4()", "from __main__ import test4", test4),
        ("Scanner3:", "test5()", "from __main__ import test5", test5),
    )
    for index, (label, stmt, setup, func) in enumerate(runs):
        if index:
            # Blank line between consecutive reports.
            print("")
        # A parenthesised single argument prints exactly what the statement
        # form `print label, timings` prints.
        print("%s %s" % (label, repeat(stmt, setup, number=n)))
        print(func())

# Output (the winner is test3 - build our own (with more regex  and less code)):
#
# Replace: [3.970919823098155, 3.89778243211634, 3.8928384251926413]
# [(u'.', 'foo'), (u'id', 'bar'), ['class', 'foo'], 'ref', ['title', 'Foo \xc3 bar.']]
#
# Encode: [2.610697129939652, 2.598209946468314, 2.6193825514097995]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', [u'title', u'Foo \xc3 bar.']]
#
# Scanner1: [0.8332152618775517, 0.8287351160521546, 0.8228314527069926]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', (u'title', u'Foo \xc3 bar.')]
#
# Scanner2: [0.9497916123664218, 0.9404912909089695, 0.9393206237343854]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', [u'title', u'Foo \xc3 bar.']]
#
# Scanner3: [0.9443332347575044, 0.9392774174469203, 0.9396791981608104]
# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', (u'title', u'Foo \xc3 bar.')]
	import shlex
	import re

	# Testing shlex with Unicode.
	#
	# The shlex module does not support Unicode input. Which workaround is faster?

	# Sample string. Using an Attribute List from Maruku's syntax:
	# http://maruku.rubyforge.org/proposal.html#attribute_lists
	# Input read by every test function in this copy. The \xc3 escape in this
	# unicode literal is its only non-ASCII code point (U+00C3).
	t = u'.foo #bar class=foo ref title="Foo \xc3 bar."'

	# Helper function for test1 & test2
	def parse(part):
	    ''' Classify one token produced by shlex.split.

	    Returns (u'id', name) for '#name', (u'.', name) for '.name', a
	    [key, value] list for 'key=value', and the token itself otherwise.
	    '''
	    if part.startswith('#'):
	        return u'id', part[1:]
	    if part.startswith('.'):
	        return u'.', part[1:]
	    if '=' in part:
	        # Cut at the first '=' only, so a value that itself contains '='
	        # still yields exactly two items.
	        return part.split('=', 1)
	    return part

	def test1():
	    ''' Workaround 1: hand the unicode string straight to shlex.

	    Every token shlex returns has its NUL characters removed before it is
	    passed to parse(). Feels hacky.
	    '''
	    return [parse(x.replace('\x00', '')) for x in shlex.split(t)]

	def test2():
	    ''' Workaround 2: encode the input to UTF-8 before shlex sees it.

	    Each token shlex returns is decoded from UTF-8 again and then passed
	    to parse(). Should work in all edge cases???
	    '''
	    return [parse(x.decode('utf-8')) for x in shlex.split(t.encode('utf-8'))]

	# Helper functions for test3
	def r_id(s, t):
	    ''' Scanner action for a '#name' token: return (u'id', name). '''
	    name = t[1:]
	    return u'id', name
	def r_class(s, t):
	    ''' Scanner action for a '.name' token: return (u'.', name). '''
	    name = t[1:]
	    return u'.', name
	def r_kv(s, t):
	    ''' Scanner action for an unquoted 'key=value' token.

	    Returns a [key, value] list. `s` is the scanner (unused); `t` is the
	    matched text.
	    '''
	    # Cut at the first '=' only: the pattern feeding this action lets the
	    # value contain further '=' characters.
	    return t.split('=', 1)
	def r_dq(s, t):
	    ''' Scanner action for a 'key="value"' token.

	    Returns (key, value) with the double quotes stripped from the value.
	    `s` is the scanner (unused); `t` is the matched text.
	    '''
	    # Cut at the first '=' only; an unbounded split raised ValueError on
	    # unpacking whenever the quoted value contained '='.
	    k, v = t.split('=', 1)
	    return k, v.strip('"')
	def r_sq(s, t):
	    ''' Scanner action for a "key='value'" token.

	    Returns (key, value) with the single quotes stripped from the value.
	    `s` is the scanner (unused); `t` is the matched text.
	    '''
	    # Cut at the first '=' only; an unbounded split raised ValueError on
	    # unpacking whenever the quoted value contained '='.
	    k, v = t.split('=', 1)
	    return k, v.strip("'")
	def r_ref(s, t):
	    ''' Scanner action for a bare word: the matched text is the result. '''
	    return t

	# Lexicon used by test3: (pattern, action) pairs handed to re.Scanner.
	# Rules are tried in the order listed, so the specific forms must stay
	# ahead of the catch-all `[^ ]+`; do not reorder.
	scanner1 = re.Scanner([
	# .name
	(r'\.[^ ]+', r_class),
	# #name
	(r'\#[^ ]+', r_id),
	# key="value" (the value may contain spaces)
	(r'[^ ]+=".*?"', r_dq),
	# key='value' (the value may contain spaces)
	(r"[^ ]+='.*?'", r_sq),
	# key=value, unquoted; requires a non-empty value
	(r'[^ ]+=[^ ]+', r_kv),
	# any other run of non-space characters
	(r'[^ ]+', r_ref),
	# a single space separates tokens; a None action discards the match
	(r' ', None)
	])

	def test3():
	    ''' Workaround 3: skip shlex and tokenise `t` with scanner1.

	    Returns the list of action results. May be edge cases this parser
	    doesn't parse.
	    '''
	    return scanner1.scan(t)[0]

	# Helper functions for test 4
	def kv(s, t):
	    ''' Scanner action for test4: split a 'key=value' token.

	    Returns a [key, value] list. A value wrapped in double or single
	    quotes has the quotes dropped. `s` is the scanner (unused); `t` is
	    the matched text.
	    '''
	    for quote in ('"', "'"):
	        if t.endswith(quote):
	            # Drop the closing quote, then cut at the first `=<quote>`.
	            key, sep, value = t[:-1].partition('=' + quote)
	            if sep:
	                return [key, value]
	    # Unquoted value, or a stray trailing quote with no opening
	    # `=<quote>`: cut at the first '=' only so the result never has more
	    # than two items.
	    return t.split('=', 1)

	def word(s, t):
	    ''' Scanner action for a token that is not a key=value pair.

	    Returns (u'.', name) for '.name', (u'id', name) for '#name', and the
	    token itself otherwise. `s` is the scanner (unused); `t` is the
	    matched text.
	    '''
	    if t.startswith('.'):
	        return u'.', t[1:]
	    if t.startswith('#'):
	        return u'id', t[1:]
	    return t

	# Lexicon used by test4: (pattern, action) pairs handed to re.Scanner.
	# Rules are tried in the order listed, so the key=value rule must stay
	# ahead of the catch-all `[^ ]+`.
	scanner2 = re.Scanner([
	    # key=value, where the value is "double quoted", 'single quoted' or a
	    # bare run of non-space characters. The `*` quantifiers were missing
	    # from this pattern and `|` was escaped, so quoted values of more than
	    # one character never matched. The key excludes '=' so it ends at the
	    # first '='.
	    (r"""[^ =]+=((?P<q>['"]).*?(?P=q)|[^ ]*)""", kv),
	    # any other run of non-space characters
	    (r'[^ ]+', word),
	    # a single space separates tokens; a None action discards the match
	    (r' ', None)
	])

	def test4():
	    ''' Workaround 4: tokenise `t` with scanner2.

	    Another parser implementation. Uses less regex. Returns the list of
	    action results.
	    '''
	    return scanner2.scan(t)[0]

	# Lexicon used by test5: the quoted/unquoted key=value rules of scanner1
	# combined with the `word` action of scanner2. Rules are tried in the
	# order listed, so the specific forms must stay ahead of the catch-all.
	scanner3 = re.Scanner([
	# key="value" (the value may contain spaces)
	(r'[^ ]+=".*?"', r_dq),
	# key='value' (the value may contain spaces)
	(r"[^ ]+='.*?'", r_sq),
	# key=value, unquoted; the value may be empty
	(r'[^ ]+=[^ ]*', r_kv),
	# .name, #name or a bare word
	(r'[^ ]+', word),
	# a single space separates tokens; a None action discards the match
	(r' ', None)
	])

	def test5():
	    ''' Workaround 5: tokenise `t` with scanner3.

	    A third parser implementation. Between test3 and test4. Returns the
	    list of action results.
	    '''
	    return scanner3.scan(t)[0]

	if __name__ == '__main__':
	    # Time every workaround with timeit.repeat and show what each returns.
	    from timeit import repeat
	    n = 10000  # calls per timing run
	    # (label, statement to time, timeit setup, function called for output)
	    runs = (
	        ("Replace:", "test1()", "from __main__ import test1", test1),
	        ("Encode:", "test2()", "from __main__ import test2", test2),
	        ("Scanner1:", "test3()", "from __main__ import test3", test3),
	        ("Scanner2:", "test4()", "from __main__ import test4", test4),
	        ("Scanner3:", "test5()", "from __main__ import test5", test5),
	    )
	    for index, (label, stmt, setup, func) in enumerate(runs):
	        if index:
	            # Blank line between consecutive reports.
	            print("")
	        # A parenthesised single argument prints exactly what the
	        # statement form `print label, timings` prints.
	        print("%s %s" % (label, repeat(stmt, setup, number=n)))
	        print(func())

	# Output (the winner is test3 - build our own (with more regex and less code)):
	#
	# Replace: [3.970919823098155, 3.89778243211634, 3.8928384251926413]
	# [(u'.', 'foo'), (u'id', 'bar'), ['class', 'foo'], 'ref', ['title', 'Foo \xc3 bar.']]
	#
	# Encode: [2.610697129939652, 2.598209946468314, 2.6193825514097995]
	# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', [u'title', u'Foo \xc3 bar.']]
	#
	# Scanner1: [0.8332152618775517, 0.8287351160521546, 0.8228314527069926]
	# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', (u'title', u'Foo \xc3 bar.')]
	#
	# Scanner2: [0.9497916123664218, 0.9404912909089695, 0.9393206237343854]
	# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', [u'title', u'Foo \xc3 bar.']]
	#
	# Scanner3: [0.9443332347575044, 0.9392774174469203, 0.9396791981608104]
	# [(u'.', u'foo'), (u'id', u'bar'), [u'class', u'foo'], u'ref', (u'title', u'Foo \xc3 bar.')]