Skip to content

Instantly share code, notes, and snippets.

@pganssle
Created August 2, 2017 16:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pganssle/fe45b08092f2ba56e29295f56377d1f6 to your computer and use it in GitHub Desktop.
Save pganssle/fe45b08092f2ba56e29295f56377d1f6 to your computer and use it in GitHub Desktop.
import random
import string
from timeit import timeit
def _recombine_skipped_queue(tokens, skipped_idxs):
    """
    Join runs of consecutive skipped tokens, buffering each run's indexes.

    The indexes are sorted before use, so ``skipped_idxs`` may be a ``set``
    or an unordered list.  Indexes are collected in a queue for as long as
    each one directly follows the previous one; when a gap is found, the
    queued tokens are joined into a single string and a new run is started.

    :param tokens:
        Sequence of string tokens, indexable by the values in
        ``skipped_idxs``.

    :param skipped_idxs:
        Iterable of integer indexes into ``tokens``.

    :return:
        A list containing one concatenated string per run of consecutive
        indexes, in ascending index order.

    >>> tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
    >>> skipped_idxs = set([0, 1, 2, 5])
    >>> _recombine_skipped_queue(tokens, skipped_idxs)
    ['foo bar', 'baz']
    """
    skipped_tokens = []
    idx_queue = []

    # Grouping consecutive values only works on ascending input, so do not
    # rely on the iteration order of whatever container was passed in.
    for idx in sorted(skipped_idxs):
        # A gap between this index and the last queued one closes the run.
        if idx_queue and idx - 1 != idx_queue[-1]:
            skipped_tokens.append(''.join(tokens[i] for i in idx_queue))
            idx_queue = []

        idx_queue.append(idx)

    # Flush the final run, which has no following gap to trigger the join.
    if idx_queue:
        skipped_tokens.append(''.join(tokens[i] for i in idx_queue))

    return skipped_tokens
def _recombine_skipped(tokens, skipped_idxs):
    """
    Join runs of consecutive skipped tokens by extending the last result.

    The indexes are sorted once, then walked in ascending order.  When an
    index directly follows the previous sorted index, its token is appended
    to the most recent output string; otherwise it starts a new string.

    :param tokens:
        Sequence of string tokens, indexable by the values in
        ``skipped_idxs``.

    :param skipped_idxs:
        Iterable of integer indexes into ``tokens``.  Any iterable works,
        including a ``set`` or an unordered list.

    :return:
        A list containing one concatenated string per run of consecutive
        indexes, in ascending index order.

    >>> tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
    >>> skipped_idxs = set([0, 1, 2, 5])
    >>> _recombine_skipped(tokens, skipped_idxs)
    ['foo bar', 'baz']
    """
    # The neighbour comparison must look at the *sorted* sequence; the input
    # may be unordered, or a set that cannot be indexed at all.
    sorted_idxs = sorted(skipped_idxs)

    skipped_tokens = []
    for i, idx in enumerate(sorted_idxs):
        if i > 0 and idx - 1 == sorted_idxs[i - 1]:
            skipped_tokens[-1] = skipped_tokens[-1] + tokens[idx]
        else:
            skipped_tokens.append(tokens[idx])

    return skipped_tokens
def _recombine_skipped_set(tokens, skipped_idxs):
    """
    Join runs of consecutive skipped tokens using membership tests.

    Indexes are visited in ascending order.  An index whose predecessor
    (``idx - 1``) is also contained in ``skipped_idxs`` continues the current
    run; any other index starts a new output string.

    >>> tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
    >>> skipped_idxs = set([0, 1, 2, 5])
    >>> _recombine_skipped_set(tokens, skipped_idxs)
    ['foo bar', 'baz']
    """
    merged = []

    for position in sorted(skipped_idxs):
        # No skipped predecessor: this token opens a new run.
        if position - 1 not in skipped_idxs:
            merged.append(tokens[position])
            continue

        # Otherwise glue the token onto the run that is currently open.
        merged[-1] = merged[-1] + tokens[position]

    return merged
def get_rand_tokens(n_token_sets=500, token_population=300):
    """
    Build random ``(tokens, skipped_idxs)`` pairs for the timing tests.

    A vocabulary of ``token_population`` random ASCII-letter strings (1 to 20
    characters each) is generated first.  Each of the ``n_token_sets`` pairs
    then consists of 1 to 15 tokens drawn from that vocabulary and a sorted
    list of distinct indexes into those tokens (possibly empty).

    :return:
        A list of ``(tokens, skipped_idxs)`` tuples, both elements being
        lists.
    """
    # Vocabulary: pick the length first, then that many random letters.
    vocabulary = []
    for _ in range(token_population):
        length = random.randint(1, 20)
        letters = [random.choice(string.ascii_letters) for _ in range(length)]
        vocabulary.append(''.join(letters))

    pairs = []
    for _ in range(n_token_sets):
        count = random.randint(1, 15)
        n_skipped = random.randint(0, count)

        candidate_idxs = list(range(count))
        chosen_tokens = [random.choice(vocabulary) for _ in range(count)]

        # Sample without replacement, then put the indexes in order.
        chosen_idxs = random.sample(candidate_idxs, k=n_skipped)
        chosen_idxs.sort()

        pairs.append((chosen_tokens, chosen_idxs))

    return pairs
def assert_same_result(token_set):
    """
    Check that the list-based and set-based recombiners agree.

    ``token_set`` is iterated as ``(tokens, skipped_idxs)`` pairs.  Each pair
    is run through ``_recombine_skipped`` with the indexes as given and
    through ``_recombine_skipped_set`` with the indexes converted to a
    ``set``.  An ``AssertionError`` reporting the inputs and both outputs is
    raised on the first mismatch.
    """
    report = ('Failure with:\n'
              ' tokens == {}\n'
              ' skipped_idxs == {}\n'
              ' lversion == {}\n'
              ' sversion == {}')

    for tokens, skipped_idxs in token_set:
        idx_set = set(skipped_idxs)

        from_list = _recombine_skipped(tokens, skipped_idxs)
        from_set = _recombine_skipped_set(tokens, idx_set)

        msg = report.format(tokens, skipped_idxs, from_list, from_set)
        assert from_list == from_set, msg
def test_token_recombination(token_sets):
    """
    Timing workload: call ``_recombine_skipped`` once per entry.

    Every element of ``token_sets`` is unpacked as the positional arguments
    of the call; the results are discarded.
    """
    for call_args in token_sets:
        _recombine_skipped(*call_args)
def test_token_recombination_queue(token_sets):
    """
    Timing workload: call ``_recombine_skipped_queue`` once per entry.

    Every element of ``token_sets`` is unpacked as the positional arguments
    of the call; the results are discarded.
    """
    for call_args in token_sets:
        _recombine_skipped_queue(*call_args)
def test_token_recombination_set(token_sets_set):
    """
    Timing workload: call ``_recombine_skipped_set`` once per entry.

    Every element of ``token_sets_set`` is unpacked as the positional
    arguments of the call; the results are discarded.
    """
    for call_args in token_sets_set:
        _recombine_skipped_set(*call_args)
def print_results(loop_sets, N_loops, N_sets):
    """
    Print the average time per loop and per token set.

    :param loop_sets:
        Iterable of per-loop durations.  The values are summed and scaled by
        1000 to report milliseconds, so they are expected to be in seconds.

    :param N_loops:
        Number of timed loops; the total is divided by this to get the mean
        time per loop.

    :param N_sets:
        Number of token sets processed in each loop; the per-loop mean is
        divided by this to get the time per set.
    """
    # Use the argument that was passed in rather than a same-purpose global,
    # so the function works wherever it is called from.
    ms_per_loop = sum(loop_sets) * 1000 / N_loops
    us_per_set = (ms_per_loop / N_sets) * 1000

    print('{} loops, {} sets: {:0.3f} ms per loop, {:0.3f} us per set'.format(
        N_loops, N_sets, ms_per_loop, us_per_set))
if __name__ == "__main__":
    from timeit import default_timer as timer

    # Make sure it actually works: every implementation must reproduce the
    # result given in the docstring example, not merely agree with another
    # implementation.
    tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
    skipped_idxs = [0, 1, 2, 5]
    skipped_idxs_set = set(skipped_idxs)
    expected = ["foo bar", "baz"]

    assert _recombine_skipped(tokens, skipped_idxs) == expected
    assert _recombine_skipped_queue(tokens, skipped_idxs) == expected
    assert _recombine_skipped_set(tokens, skipped_idxs_set) == expected

    # Test with random sets
    N_sets = 500
    N_loops = 1000

    token_sets = get_rand_tokens(N_sets)

    # The set-based implementation is timed on pre-built sets so that the
    # list -> set conversion is not counted against it.
    token_sets_set = [(toks, set(idxs)) for toks, idxs in token_sets]

    assert_same_result(token_sets)

    runs = [('extend', test_token_recombination, (token_sets,)),
            ('queue', test_token_recombination_queue, (token_sets,)),
            ('set', test_token_recombination_set, (token_sets_set,))]

    for name, func, args in runs:
        print('Running {} timing test'.format(name))

        # One wall-clock measurement per pass over all token sets.
        loop_times = []
        for i in range(N_loops):
            t_start = timer()
            func(*args)
            t_end = timer()

            loop_times.append(t_end - t_start)

        print_results(loop_times, N_loops, N_sets)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment