Skip to content

Instantly share code, notes, and snippets.

@jedmitten
Created February 11, 2017 00:25
Show Gist options
  • Save jedmitten/15574dca8aeb378518e700e8a4f1afbc to your computer and use it in GitHub Desktop.
Save jedmitten/15574dca8aeb378518e700e8a4f1afbc to your computer and use it in GitHub Desktop.
A demonstration of the speed differences between 3 different styles of regular expression building / execution in Python
import timeit
import logging
import re
# Module-level logger named after this module; nothing in the visible part of
# this file writes to it.
log = logging.getLogger(__name__)
__author__ = 'jed.mitten'
# urls from a shuffle of https://github.eng.fireeye.com/gist/jed-mitten/04ed7a75f428589199e0887a919ab9f9
# Sample input strings (20 full URLs) that the benchmark functions below
# search with the domain patterns.
URLS = ['http://www.anncol.eu',
'http://www.sudaneseonline.com/cgi-bin/sdb/2bb.cgi?seq=msg&board=85&msg=1150001892&rn=1',
'http://www.wegames.net',
'http://seattlest.com/2010/04/27/seattle_cartoonists_everybody_draw.php',
'http://www.almktaba.com',
'http://mslamh.jeeran.com',
'http://hidemyass.com',
'http://www.duke.org',
'http://www.agapehouseghana.org/',
'http://wupload.com',
'http://torrent82.com',
'http://www.mapuche-nation.org/espanol/indice.htm/',
'http://beatleofdoom.livejournal.com',
'http://abdulemam.blogspot.com',
'http://www.arfd.am',
'http://www.download.ru',
'http://gayswithoutborders.wordpress.com',
'http://blog.tribunadonorte.com.br/heitorgregorio/fatima-bezerra-comanda-manifestacao-contra-michel-temer-no-pingo-da-mei-dia/',
'http://loveplanet.ru',
'http://www.atimes.com/se-asia/se-asia.html',
]
# Regex fragments, one per domain/host name, with literal dots escaped.
# Only the www.yahoo.com entry is anchored with ^...$; every other fragment
# can match anywhere inside the searched string.
DOMAIN_REGEXES = [r'google\.com',
r'^www\.yahoo\.com$',
r'bing\.com',
r'digicert\.com',
r'microsoft\.com',
r'msftncsi\.com',
r'dropbox\.com',
r'crl\.verisign\.com',
r'winxpsp3',
r'win7sp1',
r'download\.windowsupdate\.com',
r'ns1\.3322\.net',
r'ns1\.oray\.net',
r'ns1\.china\.com',
]
# P1: every fragment joined with '|' inside a single capturing group, preceded
# by an optional literal dot.
P1 = re.compile(r'\.?({})'.format(r'|'.join(DOMAIN_REGEXES)))
# P2: the same '|' alternation, but with no surrounding group and no
# optional-dot prefix.
P2 = re.compile(r'|'.join(DOMAIN_REGEXES))
def do_search(re_obj, inputs, num=None):
    """Call ``re_obj.search`` on the first ``num`` items of ``inputs``.

    Match results are discarded and nothing is returned.

    Args:
        re_obj: Object exposing a ``search(string)`` method, such as a
            compiled pattern.
        inputs: Sliceable sequence of strings to search.
        num: Maximum number of leading items to search. ``None`` (the
            default) searches every item; a value larger than
            ``len(inputs)`` is capped by the slice.
    """
    # A ``None`` stop bound slices the whole sequence, so no explicit
    # ``len(inputs)`` fallback is required.
    for line in inputs[:num]:
        re_obj.search(line)
def do_search_iter(re_list, inputs, num=None):
    """Compile each pattern string in turn and search the inputs with it.

    Each pattern is compiled immediately before it is used, so a timing of
    this function includes the ``re.compile`` call for every pattern on every
    invocation. Match results are discarded and nothing is returned.

    Args:
        re_list: Iterable of regular-expression strings.
        inputs: Sliceable sequence of strings to search.
        num: Maximum number of leading items of ``inputs`` to search with
            each pattern. ``None`` (the default) searches every item.

    Raises:
        re.error: If one of the pattern strings is not a valid regex.
    """
    # The slice does not depend on the pattern, so take it once up front;
    # a ``None`` bound selects the whole sequence.
    subset = inputs[:num]
    for re_str in re_list:
        re_obj = re.compile(re_str)
        for line in subset:
            re_obj.search(line)
def do_search_iter_compiled(re_list, inputs, num=None):
    """Compile every pattern first, then search the inputs with each one.

    Unlike compiling inside the search loop, all patterns are compiled before
    the first search happens, so an invalid pattern raises before any input is
    searched. Match results are discarded and nothing is returned.

    Args:
        re_list: Iterable of regular-expression strings.
        inputs: Sliceable sequence of strings to search.
        num: Maximum number of leading items of ``inputs`` to search with
            each pattern. ``None`` (the default) searches every item.

    Raises:
        re.error: If one of the pattern strings is not a valid regex.
    """
    re_objs = [re.compile(re_str) for re_str in re_list]
    # The slice is the same for every pattern; a ``None`` bound selects the
    # whole sequence.
    subset = inputs[:num]
    for re_obj in re_objs:
        for line in subset:
            re_obj.search(line)
def do_search_p1_1():
    """Search the first entry of ``URLS`` with the combined pattern ``P1``.

    Takes no arguments so it can be handed directly to ``timeit.timeit``.
    """
    do_search(P1, URLS, 1)
def do_search_p2_1():
    """Search the first entry of ``URLS`` with the combined pattern ``P2``.

    Takes no arguments so it can be handed directly to ``timeit.timeit``.
    """
    do_search(P2, URLS, 1)
def do_search_iter_1():
    """Search the first entry of ``URLS`` with each of ``DOMAIN_REGEXES``.

    Delegates to ``do_search_iter``, which compiles each pattern string just
    before using it. Takes no arguments so it can be handed directly to
    ``timeit.timeit``.
    """
    do_search_iter(DOMAIN_REGEXES, URLS, 1)
def do_search_iter_compile_1():
    """Search the first entry of ``URLS`` with each of ``DOMAIN_REGEXES``.

    Delegates to ``do_search_iter_compiled``, which compiles all pattern
    strings before searching. Takes no arguments so it can be handed directly
    to ``timeit.timeit``.
    """
    do_search_iter_compiled(DOMAIN_REGEXES, URLS, 1)
def do_search_p1_10():
    """Search the first 10 entries of ``URLS`` with the combined pattern ``P1``.

    Takes no arguments so it can be handed directly to ``timeit.timeit``.
    """
    do_search(P1, URLS, 10)
def do_search_p2_10():
    """Search the first 10 entries of ``URLS`` with the combined pattern ``P2``.

    Takes no arguments so it can be handed directly to ``timeit.timeit``.
    """
    do_search(P2, URLS, 10)
def do_search_p1_100():
    """Search up to the first 100 entries of ``URLS`` with pattern ``P1``.

    NOTE(review): the limit is applied as a slice, so it is capped at
    ``len(URLS)``. The ``URLS`` list visible in this file has 20 entries, so
    with that list this searches 20 URLs, not 100.
    """
    do_search(P1, URLS, 100)
def do_search_p2_100():
    """Search up to the first 100 entries of ``URLS`` with pattern ``P2``.

    NOTE(review): the limit is applied as a slice, so it is capped at
    ``len(URLS)``. The ``URLS`` list visible in this file has 20 entries, so
    with that list this searches 20 URLs, not 100.
    """
    do_search(P2, URLS, 100)
def do_search_p1_1000():
    """Search up to the first 1000 entries of ``URLS`` with pattern ``P1``.

    NOTE(review): the limit is applied as a slice, so it is capped at
    ``len(URLS)``. The ``URLS`` list visible in this file has 20 entries, so
    with that list this searches 20 URLs, not 1000.
    """
    do_search(P1, URLS, 1000)
def do_search_p2_1000():
    """Search up to the first 1000 entries of ``URLS`` with pattern ``P2``.

    NOTE(review): the limit is applied as a slice, so it is capped at
    ``len(URLS)``. The ``URLS`` list visible in this file has 20 entries, so
    with that list this searches 20 URLs, not 1000.
    """
    do_search(P2, URLS, 1000)
# timeit.timeit(do_search_p1_1)
# 2.934359607175311
# timeit.timeit(do_search_p2_1)
# 2.2964450297287033
# timeit.timeit(do_search_iter_1)
# 23.59554752368704
# timeit.timeit(do_search_iter_compile_1)
# 22.47923530400965
# timeit.timeit(do_search_p1_10)
# 73.94403021898421
# timeit.timeit(do_search_p2_10)
# 38.9072504719249
# timeit.timeit(do_search_p1_100)
# timeit.timeit(do_search_p2_100)
# timeit.timeit(do_search_p1_1000)
# timeit.timeit(do_search_p2_1000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment