Last active
May 28, 2020 06:14
-
-
Save rajatscode/f6c3db2234911b7e5dce0020d953e2d6 to your computer and use it in GitHub Desktop.
Generating random lowercase ASCII strings in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import cProfile | |
import itertools | |
import random | |
import string | |
ALL_FUNCTION_NAMES=[ | |
'using_choice', 'using_sample', 'using_randint', 'using_randint_with_offset', 'using_randbits', | |
'using_randbits_ascii_lowercase', 'using_one_long_randbits' | |
] | |
parser = argparse.ArgumentParser( | |
description='Compare performance of methods to generate random ASCII lowercase strings') | |
parser.add_argument('--num_strings', type=int, required=False, default=500, help='number of strings to generate') | |
parser.add_argument('--string_len', type=int, required=False, default=500, | |
help='length of each string to generate' | |
) | |
parser.add_argument( | |
'--functions', nargs='*', type=str, required=False, choices=ALL_FUNCTION_NAMES, default=ALL_FUNCTION_NAMES, | |
help='names of functions to test' | |
) | |
def using_choice(num_strings, string_len): | |
return list( | |
''.join(random.choice(string.ascii_lowercase) for j in range(string_len)) | |
for i in range(num_strings) | |
) | |
def using_sample(num_strings, string_len): | |
return list( | |
''.join(random.sample(string.ascii_lowercase*(string_len//26+1), string_len)) for i in range(num_strings) | |
) | |
def using_randint(num_strings, string_len): | |
return list( | |
''.join(chr(random.randint(97, 122)) for j in range(string_len)) | |
for i in range(num_strings) | |
) | |
def using_randint_with_offset(num_strings, string_len): | |
return list( | |
''.join(chr(97+random.randint(0, 25)) for j in range(string_len)) | |
for i in range(num_strings) | |
) | |
# This method isn't restricted to ASCII lowercase | |
def using_randbits(num_strings, string_len): | |
def bits_to_string(bits, length): | |
return ''.join(chr((bits >> (8*j)) & 0xff) for j in range(length)) | |
return list(bits_to_string(random.getrandbits(8*string_len), string_len) for i in range(num_strings)) | |
# Guarantees ASCII lowercase string with uniform distribution of random letters | |
def using_randbits_ascii_lowercase(num_strings, string_len): | |
letters = itertools.cycle(string.ascii_lowercase) | |
def bits_to_string(bits, length): | |
def bits_to_char(five_bits): | |
return next(letters) if five_bits > 25 else chr(97+five_bits) | |
return ''.join(bits_to_char((bits >> (5*j)) & 0b11111) for j in range(length)) | |
return list(bits_to_string(random.getrandbits(5*string_len), string_len) for i in range(num_strings)) | |
def using_one_long_randbits(num_strings, string_len): | |
letters = itertools.cycle(string.ascii_lowercase) | |
randbits = random.getrandbits(5*string_len*num_strings) | |
def bits_to_string(bits, length, offset): | |
def bits_to_char(five_bits): | |
return next(letters) if five_bits > 25 else chr(97+five_bits) | |
return ''.join(bits_to_char((bits >> (offset+(5*j))) & 0b11111) for j in range(length)) | |
return list(bits_to_string(randbits, string_len, 5*string_len*i) for i in range(num_strings)) | |
if __name__ == "__main__": | |
args = parser.parse_args() | |
print( | |
'Invoking functions {%s} to generate %d strings of length %d:\n' % | |
(', '.join(args.functions), args.num_strings, args.string_len) | |
) | |
for function in args.functions: | |
print('%s:' % function) | |
cProfile.run('%s(%d,%d)' % (function, args.num_strings, args.string_len)) | |
def naive_ascii(num_strings, string_len): | |
return list(''.join(random.choice(string.ascii_lowercase) for j in range(string_len)) for i in range(num_strings)) | |
if __name__ == "__main__": | |
args = parser.parse_args() | |
functions_to_call = args.functions if args.functions else ALL_FUNCTION_NAMES | |
print( | |
'Invoking functions {%s} to generate %d strings of length %d' % | |
(', '.join(functions_to_call), args.num_strings, args.string_len) | |
) | |
for function in functions_to_call: | |
cProfile.run('%s(%d,%d)' % (function, args.num_strings, args.string_len)) |
StackOverflow question relating to why the using_randbits
performance varies unpredictably: https://stackoverflow.com/questions/62057891/same-python-code-appears-to-have-different-performance-characteristics
I think cProfile
was occasionally _under_reporting the runtime of these methods, as evidenced by wrapping cProfile.run()
inside another cProfile.run()
(but who profiles the profiler?) and noticing that the "fast" results had a big discrepancy with how long cProfile.run()
takes: https://pastebin.com/C4W4FEjJ
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The functions using random.getrandbits() have behaved a little oddly on my system. Not sure why but the generator expression inside bits_to_string() gets invoked either num_strings+1 or (num_strings+1)*string_len times, drastically affecting the performance of the using_randbits...() functions.
See https://pastebin.com/1u1j1ZUt for a local Python (2.7) session where it appears the exact same code has drastically different performance characteristics when I redefine and re-run it. I even "redefined" it by simply using the up arrow and Enter keys in the Python REPL, so it should be exactly the same code.
Absolutely no clue what's happening there.