Skip to content

Instantly share code, notes, and snippets.

@pablito56
Created May 17, 2013 09:41
Show Gist options
  • Save pablito56/5598063 to your computer and use it in GitHub Desktop.
Save pablito56/5598063 to your computer and use it in GitHub Desktop.
Performance comparisson of different options to check existence of forbidden symbols in incoming strings (standard 'char in string' vs. RegExp).
#!/usr/bin/env python
#-*- coding: utf-8 -*-
u"""
Created on May 15, 2013
@author: pablito56
@license: MIT
@contact: pablito56@gmail.com
Performance comparisson of different options to check existence of forbidden
symbols in incoming strings (standard 'char in string' vs. RegExp).
It checks 1, 3, 5 or 8 forbidden characters existence in 1k, 10k or 100k words
of 15, 20 or 30 random characters length.
It tests worst case; forbidden chars are never found.
Results at the end of the file
"""
from string import ascii_letters, punctuation, digits
from random import seed, choice
import timeit
import re
# We check worse case; symbols will never be found
chars = ascii_letters + digits # + punctuation
seed()
forbidden_8 = (":", "/", "?", "#", "~", "=", "&", "!")
forbidden_1 = forbidden_8[0]
forbidden_1_re = re.compile(forbidden_1)
def check_forbidden():
num_times = 1000
symbols_amount = (3, 5, 8)
words_amount = (1000, 10000, 100000)
wrods_sizes = (15, 20, 30)
header = "\t| {:^10} | {:^10} | {:^10} | {:^10} |".format("# SYMBOLS", "TIME NO RE", "TIME RE", "RE SpeedUp")
log_trace = "\t| {:^10} | {:^10.5f} | {:^10.5f} | {:^10.5f} |"
for num_words in words_amount:
for size in wrods_sizes:
print "CHECKING {} WORDS OF {} CHARS LENGTH:".format(num_words, size)
words = ["".join([choice(chars) for i in xrange(size)]) for j in xrange(num_words)]
print header
def find_forbidden_1():
return map(lambda w: forbidden_1 in w, words)
def find_forbidden_1_re():
return map(lambda w: forbidden_1_re.search(w) is not None, words)
t_std = timeit.timeit(find_forbidden_1, number=num_times)
t_res = timeit.timeit(find_forbidden_1_re, number=num_times)
print log_trace.format(1, t_std, t_res, t_std / t_res)
for num_symbols in symbols_amount:
forbidden = forbidden_8[:num_symbols]
forbidden_re = re.compile("[" + "".join(forbidden) + "]")
def find_forbidden_N():
return map(lambda w: any((f in w for f in forbidden)), words)
# return map(lambda w: any((f == c for f in forbidden for c in w)), words) # Several times slower
def find_forbidden_N_re():
return map(lambda w: forbidden_re.search(w) is not None, words)
t_std = timeit.timeit(find_forbidden_N, number=num_times)
t_res = timeit.timeit(find_forbidden_N_re, number=num_times)
print log_trace.format(num_symbols, t_std, t_res, t_std / t_res)
print
if __name__ == '__main__':
check_forbidden()
u"""
Results on MacBook Pro 13" 2.4 Ghz Intel Core i5 8Gb
CHECKING 1000 WORDS OF 15 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 0.22396 | 0.51894 | 0.43157 |
| 3 | 1.23259 | 0.66840 | 1.84410 |
| 5 | 1.91099 | 0.69866 | 2.73521 |
| 8 | 2.21286 | 0.73737 | 3.00100 |
CHECKING 1000 WORDS OF 20 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 0.24142 | 0.56059 | 0.43064 |
| 3 | 1.35343 | 0.80293 | 1.68561 |
| 5 | 1.55931 | 0.69956 | 2.22900 |
| 8 | 2.27682 | 0.70398 | 3.23423 |
CHECKING 1000 WORDS OF 30 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 0.25355 | 0.56270 | 0.45060 |
| 3 | 1.21860 | 0.78617 | 1.55003 |
| 5 | 1.54455 | 0.78203 | 1.97506 |
| 8 | 2.04291 | 1.13801 | 1.79516 |
CHECKING 10000 WORDS OF 15 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 2.33030 | 5.25763 | 0.44322 |
| 3 | 12.56244 | 6.80868 | 1.84506 |
| 5 | 15.09999 | 6.68055 | 2.26029 |
| 8 | 19.07921 | 7.10648 | 2.68476 |
CHECKING 10000 WORDS OF 20 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 2.36260 | 5.80443 | 0.40703 |
| 3 | 12.29937 | 7.57868 | 1.62289 |
| 5 | 15.11851 | 7.56908 | 1.99740 |
| 8 | 19.59392 | 7.70403 | 2.54333 |
CHECKING 10000 WORDS OF 30 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 2.47049 | 5.85040 | 0.42228 |
| 3 | 12.75136 | 8.43134 | 1.51238 |
| 5 | 15.77369 | 8.77852 | 1.79685 |
| 8 | 20.65556 | 8.53470 | 2.42019 |
CHECKING 100000 WORDS OF 15 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 24.20128 | 55.73680 | 0.43421 |
| 3 | 122.62828 | 69.88992 | 1.75459 |
| 5 | 148.97942 | 68.43717 | 2.17688 |
| 8 | 188.52356 | 69.15813 | 2.72598 |
CHECKING 100000 WORDS OF 20 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 24.71635 | 55.20613 | 0.44771 |
| 3 | 122.12375 | 73.54637 | 1.66050 |
| 5 | 150.11232 | 72.87738 | 2.05979 |
| 8 | 191.79193 | 73.47913 | 2.61016 |
CHECKING 100000 WORDS OF 30 CHARS LENGTH:
| # SYMBOLS | TIME NO RE | TIME RE | RE SpeedUp |
| 1 | 25.36807 | 56.23131 | 0.45114 |
| 3 | 125.01627 | 82.79500 | 1.50995 |
| 5 | 155.50607 | 82.55599 | 1.88364 |
| 8 | 204.95154 | 84.17276 | 2.43489 |
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment