Skip to content

Instantly share code, notes, and snippets.

@leth
Created August 9, 2014 22:42
Show Gist options
  • Save leth/1f2ba2d6b961fabe5d60 to your computer and use it in GitHub Desktop.
Save leth/1f2ba2d6b961fabe5d60 to your computer and use it in GitHub Desktop.
_clean_data benchmarking
import os
import re
import StringIO
import cStringIO
import contextlib
# Compiled once at module level so the cleaning functions below can reuse
# the same pattern object on every call. It matches the text "lookup",
# any amount of whitespace, then an opening parenthesis, and is used to
# detect a lookup call anywhere within the data being cleaned.
LOOKUP_REGEX=re.compile(r'lookup\s*\(')
def _clean_data(orig_data, from_remote=False, from_inventory=False):
    ''' Neutralise jinja2 tags in a string by rewriting them as comments.

    The input is copied one character at a time into a StringIO buffer.
    Whenever a closing delimiter ("%}" or "}}") is found and an opening
    delimiter of the same kind is still pending, the opening is overwritten
    with "{#" and the closing with "#}". Delimiters without a partner are
    copied through unchanged.

    :param orig_data: value to clean; anything that is not a basestring is
        returned untouched.
    :param from_remote: when true, print blocks ("{{ ... }}") are always
        candidates for rewriting.
    :param from_inventory: when true, print blocks are candidates only if
        the string contains "{{" and LOOKUP_REGEX matches somewhere in it.
    :returns: the cleaned string, or orig_data itself for non-strings.

    Block tags ("{% ... %}") are tracked regardless of both flags.
    '''
    if not isinstance(orig_data, basestring):
        return orig_data

    # Note that the inventory check looks at the string as a whole: once it
    # passes, every matched print block in the string is rewritten.
    replace_prints = from_remote or (
        from_inventory
        and '{{' in orig_data
        and LOOKUP_REGEX.search(orig_data) is not None
    )

    buf = StringIO.StringIO("")

    # stacks of buffer offsets of openings that have not been closed yet;
    # offsets in the buffer equal indexes into orig_data because every
    # rewrite replaces two characters with two characters
    open_prints = []
    open_blocks = []

    for pos, char in enumerate(orig_data):
        if char == '{' and pos < len(orig_data) - 1:
            # possible opening delimiter: look one character ahead
            pair = orig_data[pos:pos+2]
            if pair == '{%':
                open_blocks.append(pos)
            elif pair == '{{' and replace_prints:
                open_prints.append(pos)
            # the brace itself is always copied; it is only overwritten
            # later if a matching closing delimiter turns up
            buf.seek(0, os.SEEK_END)
            buf.write(char)
            continue

        if char == '}' and pos > 0:
            # possible closing delimiter: look one character back
            pair = orig_data[pos-1:pos+1]
            opened_at = None
            if pair == '%}' and open_blocks:
                opened_at = open_blocks.pop()
            elif pair == '}}' and open_prints:
                opened_at = open_prints.pop()

            if opened_at is not None:
                # rewrite the opening delimiter in place
                buf.seek(opened_at, os.SEEK_SET)
                buf.write('{#')
                # step back over the already-copied first character of the
                # closing delimiter and rewrite both characters
                buf.seek(-1, os.SEEK_END)
                buf.write('#}')
                continue

        # ordinary character, or a brace that is not part of a matched pair
        buf.seek(0, os.SEEK_END)
        buf.write(char)

    cleaned = buf.getvalue()
    buf.close()
    return cleaned
def _clean_data_cstringio(orig_data, from_remote=False, from_inventory=False):
    ''' Neutralise jinja2 tags in a string, buffering through cStringIO.

    Same character-by-character algorithm as _clean_data, but the output
    buffer is a cStringIO.StringIO instead of a StringIO.StringIO. Matched
    "{{ ... }}" / "{% ... %}" pairs have their opening overwritten with
    "{#" and their closing with "#}"; unmatched delimiters are copied
    through unchanged.

    :param orig_data: value to clean; anything that is not a basestring is
        returned untouched.
    :param from_remote: when true, print blocks ("{{ ... }}") are always
        candidates for rewriting.
    :param from_inventory: when true, print blocks are candidates only if
        the string contains "{{" and LOOKUP_REGEX matches somewhere in it.
    :returns: the cleaned string, or orig_data itself for non-strings.

    NOTE(review): the Python 2 documentation says cStringIO cannot accept
    unicode strings that are not encodable as plain ASCII - confirm that
    callers only pass data that satisfies this.
    '''
    if not isinstance(orig_data, basestring):
        return orig_data

    # The inventory check looks at the string as a whole: once it passes,
    # every matched print block in the string is rewritten.
    replace_prints = from_remote or (
        from_inventory
        and '{{' in orig_data
        and LOOKUP_REGEX.search(orig_data) is not None
    )

    buf = cStringIO.StringIO()

    # stacks of buffer offsets of openings that have not been closed yet;
    # offsets in the buffer equal indexes into orig_data because every
    # rewrite replaces two characters with two characters
    open_prints = []
    open_blocks = []

    for pos, char in enumerate(orig_data):
        if char == '{' and pos < len(orig_data) - 1:
            # possible opening delimiter: look one character ahead
            pair = orig_data[pos:pos+2]
            if pair == '{%':
                open_blocks.append(pos)
            elif pair == '{{' and replace_prints:
                open_prints.append(pos)
            # the brace itself is always copied; it is only overwritten
            # later if a matching closing delimiter turns up
            buf.seek(0, os.SEEK_END)
            buf.write(char)
            continue

        if char == '}' and pos > 0:
            # possible closing delimiter: look one character back
            pair = orig_data[pos-1:pos+1]
            opened_at = None
            if pair == '%}' and open_blocks:
                opened_at = open_blocks.pop()
            elif pair == '}}' and open_prints:
                opened_at = open_prints.pop()

            if opened_at is not None:
                # rewrite the opening delimiter in place
                buf.seek(opened_at, os.SEEK_SET)
                buf.write('{#')
                # step back over the already-copied first character of the
                # closing delimiter and rewrite both characters
                buf.seek(-1, os.SEEK_END)
                buf.write('#}')
                continue

        # ordinary character, or a brace that is not part of a matched pair
        buf.seek(0, os.SEEK_END)
        buf.write(char)

    cleaned = buf.getvalue()
    buf.close()
    return cleaned
# Matches any of the four two-character jinja2 print/block delimiters:
# "{{", "{%", "%}" or "}}".
PRINT_CODE_REGEX = re.compile(r'(?:{[{%]|[%}]})')
# Matches only the block delimiters "{%" and "%}"; print delimiters
# ("{{" and "}}") are not matched by this pattern.
ONLY_CODE_REGEX = re.compile(r'(?:{%|%})')
def _regex(orig_data, from_remote=False, from_inventory=False):
    ''' Neutralise jinja2 tags in a string, locating delimiters by regex.

    Instead of visiting every character, the delimiters are found with
    finditer() and the text between them is copied to the output buffer in
    slices. A closing delimiter ("%}" or "}}") that has a pending opening
    of the same kind causes the opening to be overwritten with "{#" and
    the closing to be emitted as "#}"; unmatched delimiters are copied
    through unchanged.

    :param orig_data: value to clean; anything that is not a basestring is
        returned untouched.
    :param from_remote: when true, print blocks ("{{ ... }}") are always
        candidates for rewriting.
    :param from_inventory: when true, print blocks are candidates only if
        the string contains "{{" and LOOKUP_REGEX matches somewhere in it.
    :returns: the cleaned string, or orig_data itself for non-strings.

    NOTE(review): the Python 2 documentation says cStringIO cannot accept
    unicode strings that are not encodable as plain ASCII - confirm that
    callers only pass data that satisfies this.
    '''
    if not isinstance(orig_data, basestring):
        return orig_data

    # The inventory check looks at the string as a whole: once it passes,
    # every matched print block in the string is rewritten.
    replace_prints = from_remote or (
        from_inventory
        and '{{' in orig_data
        and LOOKUP_REGEX.search(orig_data) is not None
    )

    # when print blocks are left alone, use the pattern that never reports
    # "{{" or "}}" at all
    if replace_prints:
        tag_pattern = PRINT_CODE_REGEX
    else:
        tag_pattern = ONLY_CODE_REGEX

    with contextlib.closing(cStringIO.StringIO()) as out:
        # stacks of buffer offsets of openings that have not been closed
        # yet; buffer offsets equal indexes into orig_data because every
        # rewrite replaces two characters with two characters
        open_prints = []
        open_blocks = []
        # index into orig_data up to which text has already been emitted
        copied_upto = 0

        for found in tag_pattern.finditer(orig_data):
            tag = found.group(0)
            tag_start, tag_end = found.span(0)

            if tag[0] == '{':
                # opening delimiter: remember where it sits and copy
                # everything up to and including it
                if tag == '{{':
                    open_prints.append(tag_start)
                elif tag == '{%':
                    open_blocks.append(tag_start)
                out.write(orig_data[copied_upto:tag_end])
            elif tag[1] == '}':
                # closing delimiter: copy the text in front of it first,
                # then decide how the delimiter itself is emitted
                out.write(orig_data[copied_upto:tag_start])
                opened_at = None
                if tag == '%}' and open_blocks:
                    opened_at = open_blocks.pop()
                elif tag == '}}' and open_prints:
                    opened_at = open_prints.pop()

                if opened_at is None:
                    out.write(tag)
                else:
                    # rewrite the opening delimiter in place
                    out.seek(opened_at, os.SEEK_SET)
                    out.write('{#')
                    # and append the comment terminator at the end
                    out.seek(0, os.SEEK_END)
                    out.write('#}')
            else:
                assert False, 'Unhandled regex match'

            copied_upto = tag_end

        # whatever follows the last delimiter
        out.write(orig_data[copied_upto:])
        return out.getvalue()
def generate_test_strings():
    ''' Yield (fraction, string) pairs used as benchmark inputs.

    Five base patterns of four slots each are built from the print tag
    "{{ }}" and an equally wide run of spaces, ordered from "all four
    slots are tags" to "all four slots are whitespace". Each pattern is
    then repeated 1, 10, 100 and 1000 times.

    :returns: a generator of 2-tuples. The first element is the pattern's
        position in that ordering divided by the last position, i.e. 0.0
        for the all-tag pattern rising in equal steps to 1.0 for the
        all-whitespace pattern. The second element is the test string.
    '''
    tag = '{{ }}'
    blank = ' ' * len(tag)
    patterns = (
        tag * 4,
        (tag * 3) + blank,
        (tag + blank) * 2,
        tag + (blank * 3),
        blank * 4,
    )
    # derive the divisor from the tuple so the fractions still span
    # 0.0 .. 1.0 if a pattern is ever added or removed
    last_position = float(len(patterns) - 1)
    for exponent in range(4):
        repeat = 10 ** exponent
        # a distinct loop name: the original reused the name of the base
        # tag here, shadowing it for the rest of the function
        for position, pattern in enumerate(patterns):
            yield position / last_position, pattern * repeat
import timeit
to_bench = {
'orig': _clean_data,
'orig cs': _clean_data_cstringio,
'regex': _regex,
}
tests = {
'none ': dict(from_remote=False, from_inventory=False),
'remote ': dict(from_remote=False, from_inventory=False),
'inventory': dict(from_remote=False, from_inventory=False),
'both ': dict(from_remote=False, from_inventory=False)
}
name_padding = max(len(s) for s in to_bench.keys())
for t in generate_test_strings():
test_string = intern(t[1])
print 'length:', len(t[1]), 'density:', t[0]
for test, kwargs in tests.iteritems():
for name, fn in to_bench.iteritems():
fn(t, **kwargs)
print name.ljust(name_padding), test, '{:.8f}'.format(min(timeit.repeat(
stmt=lambda: fn(test_string, **kwargs),
number=1000,
repeat=20)))
print
length: 20 density: 0.0
regex none 0.00233603
orig cs none 0.01116395
orig none 0.03076696
regex inventory 0.00235701
orig cs inventory 0.01127720
orig inventory 0.03086901
regex both 0.00232005
orig cs both 0.01120687
orig both 0.03084397
regex remote 0.00232410
orig cs remote 0.01129794
orig remote 0.03086996
length: 20 density: 0.25
regex none 0.00231910
orig cs none 0.01046991
orig none 0.03019309
regex inventory 0.00233412
orig cs inventory 0.01054406
orig inventory 0.03022504
regex both 0.00233984
orig cs both 0.01054120
orig both 0.03017092
regex remote 0.00230885
orig cs remote 0.01054597
orig remote 0.03020096
length: 20 density: 0.5
regex none 0.00226808
orig cs none 0.00974989
orig none 0.02947187
regex inventory 0.00223994
orig cs inventory 0.00973392
orig inventory 0.02951097
regex both 0.00225091
orig cs both 0.00973988
orig both 0.02943206
regex remote 0.00227809
orig cs remote 0.00979209
orig remote 0.02976108
length: 20 density: 0.75
regex none 0.00228810
orig cs none 0.00916791
orig none 0.02876782
regex inventory 0.00223589
orig cs inventory 0.00901389
orig inventory 0.02870798
regex both 0.00226593
orig cs both 0.00899220
orig both 0.02879286
regex remote 0.00224280
orig cs remote 0.00903201
orig remote 0.02879405
length: 20 density: 1.0
regex none 0.00212598
orig cs none 0.00824499
orig none 0.02791786
regex inventory 0.00211096
orig cs inventory 0.00822306
orig inventory 0.02799916
regex both 0.00214314
orig cs both 0.00822902
orig both 0.02803493
regex remote 0.00210404
orig cs remote 0.00870681
orig remote 0.02804685
length: 200 density: 0.0
regex none 0.00437117
orig cs none 0.10380411
orig none 0.29348207
regex inventory 0.00436783
orig cs inventory 0.10435104
orig inventory 0.29324698
regex both 0.00440097
orig cs both 0.10405898
orig both 0.29357386
regex remote 0.00437903
orig cs remote 0.10408211
orig remote 0.29358387
length: 200 density: 0.25
regex none 0.00419116
orig cs none 0.09632802
orig none 0.28626513
regex inventory 0.00419402
orig cs inventory 0.09633589
orig inventory 0.28691602
regex both 0.00421095
orig cs both 0.09631491
orig both 0.28645802
regex remote 0.00419688
orig cs remote 0.09635997
orig remote 0.28712702
length: 200 density: 0.5
regex none 0.00368094
orig cs none 0.08877921
orig none 0.27943301
regex inventory 0.00369215
orig cs inventory 0.08880091
orig inventory 0.27882290
regex both 0.00374818
orig cs both 0.08862686
orig both 0.30531192
regex remote 0.00372386
orig cs remote 0.10351205
orig remote 0.32544017
length: 200 density: 0.75
regex none 0.00362611
orig cs none 0.09516382
orig none 0.32432103
regex inventory 0.00345397
orig cs inventory 0.09414315
orig inventory 0.27665305
regex both 0.00341392
orig cs both 0.08206987
orig both 0.27330804
regex remote 0.00338101
orig cs remote 0.08216286
orig remote 0.27378011
length: 200 density: 1.0
regex none 0.00282884
orig cs none 0.07388496
orig none 0.26605392
regex inventory 0.00289392
orig cs inventory 0.07474899
orig inventory 0.26591420
regex both 0.00286889
orig cs both 0.07398915
orig both 0.26416993
regex remote 0.00287604
orig cs remote 0.07397103
orig remote 0.26444697
length: 2000 density: 0.0
regex none 0.02651596
orig cs none 1.04446483
orig none 3.16794777
regex inventory 0.02648711
orig cs inventory 1.04291296
orig inventory 3.17617607
regex both 0.02652907
orig cs both 1.04543400
orig both 3.17354608
regex remote 0.02650285
orig cs remote 1.04383111
orig remote 3.17507601
length: 2000 density: 0.25
regex none 0.02319193
orig cs none 0.97022009
orig none 3.08056402
regex inventory 0.02326107
orig cs inventory 0.97225213
orig inventory 3.09071088
regex both 0.02322698
orig cs both 0.97343111
orig both 3.08821392
regex remote 0.02319694
orig cs remote 0.97419095
orig remote 3.09205699
length: 2000 density: 0.5
regex none 0.01878405
orig cs none 0.89367700
orig none 3.02064180
regex inventory 0.01838994
orig cs inventory 0.89543295
orig inventory 3.01873088
regex both 0.01818085
orig cs both 0.89289713
orig both 3.01739812
regex remote 0.01843214
orig cs remote 0.89358997
orig remote 3.01982999
length: 2000 density: 0.75
regex none 0.01398683
orig cs none 0.81412983
orig none 2.92859387
regex inventory 0.01400113
orig cs inventory 0.81458783
orig inventory 2.93060899
regex both 0.01399398
orig cs both 0.81417394
orig both 2.92725301
regex remote 0.01393986
orig cs remote 0.81492209
orig remote 2.92646503
length: 2000 density: 1.0
regex none 0.00927281
orig cs none 0.73526096
orig none 2.84585190
regex inventory 0.00925303
orig cs inventory 0.73610306
orig inventory 2.85367703
regex both 0.00924015
orig cs both 0.73535395
orig both 2.84879208
regex remote 0.00921488
orig cs remote 0.73591018
orig remote 2.84457397
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment