Created
February 11, 2017 00:25
-
-
Save jedmitten/15574dca8aeb378518e700e8a4f1afbc to your computer and use it in GitHub Desktop.
A demonstration of the speed differences between three different styles of regular expression building / execution in Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit
import logging
import re

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
__author__ = 'jed.mitten'
# urls from a shuffle of https://github.eng.fireeye.com/gist/jed-mitten/04ed7a75f428589199e0887a919ab9f9
# Sample inputs that the benchmark functions search.
URLS = [
    'http://www.anncol.eu',
    'http://www.sudaneseonline.com/cgi-bin/sdb/2bb.cgi?seq=msg&board=85&msg=1150001892&rn=1',
    'http://www.wegames.net',
    'http://seattlest.com/2010/04/27/seattle_cartoonists_everybody_draw.php',
    'http://www.almktaba.com',
    'http://mslamh.jeeran.com',
    'http://hidemyass.com',
    'http://www.duke.org',
    'http://www.agapehouseghana.org/',
    'http://wupload.com',
    'http://torrent82.com',
    'http://www.mapuche-nation.org/espanol/indice.htm/',
    'http://beatleofdoom.livejournal.com',
    'http://abdulemam.blogspot.com',
    'http://www.arfd.am',
    'http://www.download.ru',
    'http://gayswithoutborders.wordpress.com',
    'http://blog.tribunadonorte.com.br/heitorgregorio/fatima-bezerra-comanda-manifestacao-contra-michel-temer-no-pingo-da-mei-dia/',
    'http://loveplanet.ru',
    'http://www.atimes.com/se-asia/se-asia.html',
]
# Individual domain patterns; searched one by one by the "iter" benchmarks
# and joined into the single alternations P1 / P2 below.
DOMAIN_REGEXES = [
    r'google\.com',
    r'^www\.yahoo\.com$',
    r'bing\.com',
    r'digicert\.com',
    r'microsoft\.com',
    r'msftncsi\.com',
    r'dropbox\.com',
    r'crl\.verisign\.com',
    r'winxpsp3',
    r'win7sp1',
    r'download\.windowsupdate\.com',
    r'ns1\.3322\.net',
    r'ns1\.oray\.net',
    r'ns1\.china\.com',
]

# P1: every pattern OR-ed together inside one capturing group, preceded by an
# optional literal dot.
P1 = re.compile(r'\.?({})'.format(r'|'.join(DOMAIN_REGEXES)))
# P2: the same alternation with no prefix and no enclosing group.
P2 = re.compile(r'|'.join(DOMAIN_REGEXES))
def do_search(re_obj, inputs, num=None):
    """Call ``re_obj.search`` on the first ``num`` items of ``inputs``.

    The match results are discarded and nothing is returned; the function
    only performs the searches so that the loop can be timed.

    Args:
        re_obj: Object exposing ``search(string)``, e.g. a compiled pattern.
        inputs: Sliceable sequence of strings to search.
        num: Number of leading items to search. ``None`` means all of them.
            A value larger than ``len(inputs)`` is capped by the slice, so
            no item is searched more than once.
    """
    if num is None:
        num = len(inputs)
    for line in inputs[:num]:
        re_obj.search(line)
def do_search_iter(re_list, inputs, num=None):
    """Search the first ``num`` inputs with each pattern, compiling as it goes.

    Every pattern string is compiled inside the outer loop, immediately
    before it is used, so the compile call is part of what gets timed.
    Match results are discarded and nothing is returned.

    Args:
        re_list: Iterable of regular-expression strings.
        inputs: Sliceable sequence of strings to search.
        num: Number of leading items to search. ``None`` means all of them;
            a value larger than ``len(inputs)`` is capped by the slice.

    Raises:
        re.error: If a pattern string is not a valid regular expression.
    """
    if num is None:
        num = len(inputs)
    for re_str in re_list:
        # Deliberately compiled per pattern inside the loop: this is the
        # "compile on the fly" variant of the benchmark.
        re_obj = re.compile(re_str)
        for line in inputs[:num]:
            re_obj.search(line)
def do_search_iter_compiled(re_list, inputs, num=None):
    """Compile every pattern up front, then search the first ``num`` inputs.

    All pattern strings are compiled before any searching starts. The
    compilation still happens inside this function, so it is included in
    any timing of a call. Match results are discarded and nothing is
    returned.

    Args:
        re_list: Iterable of regular-expression strings.
        inputs: Sliceable sequence of strings to search.
        num: Number of leading items to search. ``None`` means all of them;
            a value larger than ``len(inputs)`` is capped by the slice.

    Raises:
        re.error: If a pattern string is not a valid regular expression.
            Raised before any input is searched.
    """
    re_objs = [re.compile(re_str) for re_str in re_list]
    if num is None:
        num = len(inputs)
    for re_obj in re_objs:
        for line in inputs[:num]:
            re_obj.search(line)
def do_search_p1_1():
    """Benchmark target: search the first entry of URLS with pattern P1."""
    do_search(P1, URLS, 1)
def do_search_p2_1():
    """Benchmark target: search the first entry of URLS with pattern P2."""
    do_search(P2, URLS, 1)
def do_search_iter_1():
    """Benchmark target: search the first entry of URLS with each pattern in
    DOMAIN_REGEXES, via do_search_iter."""
    do_search_iter(DOMAIN_REGEXES, URLS, 1)
def do_search_iter_compile_1():
    """Benchmark target: search the first entry of URLS with each pattern in
    DOMAIN_REGEXES, via do_search_iter_compiled."""
    do_search_iter_compiled(DOMAIN_REGEXES, URLS, 1)
def do_search_p1_10():
    """Benchmark target: search the first 10 entries of URLS with pattern P1."""
    do_search(P1, URLS, 10)
def do_search_p2_10():
    """Benchmark target: search the first 10 entries of URLS with pattern P2."""
    do_search(P2, URLS, 10)
def do_search_p1_100():
    """Benchmark target: search up to the first 100 entries of URLS with P1.

    NOTE(review): the request is passed to do_search as num=100; if URLS
    holds fewer than 100 entries, only the entries that exist are searched.
    """
    do_search(P1, URLS, 100)
def do_search_p2_100():
    """Benchmark target: search up to the first 100 entries of URLS with P2.

    NOTE(review): the request is passed to do_search as num=100; if URLS
    holds fewer than 100 entries, only the entries that exist are searched.
    """
    do_search(P2, URLS, 100)
def do_search_p1_1000():
    """Benchmark target: search up to the first 1000 entries of URLS with P1.

    NOTE(review): the request is passed to do_search as num=1000; if URLS
    holds fewer than 1000 entries, only the entries that exist are searched.
    """
    do_search(P1, URLS, 1000)
def do_search_p2_1000():
    """Benchmark target: search up to the first 1000 entries of URLS with P2.

    NOTE(review): the request is passed to do_search as num=1000; if URLS
    holds fewer than 1000 entries, only the entries that exist are searched.
    """
    do_search(P2, URLS, 1000)
# Recorded timings: each call is followed by the value it returned, in
# seconds. No result was recorded for the _100 and _1000 variants.
# timeit.timeit(do_search_p1_1)
# 2.934359607175311
# timeit.timeit(do_search_p2_1)
# 2.2964450297287033
# timeit.timeit(do_search_iter_1)
# 23.59554752368704
# timeit.timeit(do_search_iter_compile_1)
# 22.47923530400965
# timeit.timeit(do_search_p1_10)
# 73.94403021898421
# timeit.timeit(do_search_p2_10)
# 38.9072504719249
# timeit.timeit(do_search_p1_100)
# timeit.timeit(do_search_p2_100)
# timeit.timeit(do_search_p1_1000)
# timeit.timeit(do_search_p2_1000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment