Skip to content

Instantly share code, notes, and snippets.

@jedmitten
Created February 11, 2017 00:25

Revisions

  1. jedmitten created this gist Feb 11, 2017.
    132 changes: 132 additions & 0 deletions regex_speedtest.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,132 @@
    import timeit
    import logging
    import re

    # Module-level logger named after this module; none of the code visible in
    # this file writes to it.
    log = logging.getLogger(__name__)
    # Author tag for the script.
    __author__ = 'jed.mitten'

    # Sample URL strings passed as the search inputs by the do_search_* wrapper
    # functions in this file.
    # urls from a shuffle of https://github.eng.fireeye.com/gist/jed-mitten/04ed7a75f428589199e0887a919ab9f9
    URLS = ['http://www.anncol.eu',
    'http://www.sudaneseonline.com/cgi-bin/sdb/2bb.cgi?seq=msg&board=85&msg=1150001892&rn=1',
    'http://www.wegames.net',
    'http://seattlest.com/2010/04/27/seattle_cartoonists_everybody_draw.php',
    'http://www.almktaba.com',
    'http://mslamh.jeeran.com',
    'http://hidemyass.com',
    'http://www.duke.org',
    'http://www.agapehouseghana.org/',
    'http://wupload.com',
    'http://torrent82.com',
    'http://www.mapuche-nation.org/espanol/indice.htm/',
    'http://beatleofdoom.livejournal.com',
    'http://abdulemam.blogspot.com',
    'http://www.arfd.am',
    'http://www.download.ru',
    'http://gayswithoutborders.wordpress.com',
    'http://blog.tribunadonorte.com.br/heitorgregorio/fatima-bezerra-comanda-manifestacao-contra-michel-temer-no-pingo-da-mei-dia/',
    'http://loveplanet.ru',
    'http://www.atimes.com/se-asia/se-asia.html',
    ]

    # Regular-expression fragments, one per domain/host name. They are joined
    # with '|' to build the combined patterns P1 and P2, and are compiled one by
    # one by do_search_iter / do_search_iter_compiled.
    # Every entry is an unanchored fragment except the yahoo entry, which is
    # anchored with ^...$.
    DOMAIN_REGEXES = [r'google\.com',
    r'^www\.yahoo\.com$',
    r'bing\.com',
    r'digicert\.com',
    r'microsoft\.com',
    r'msftncsi\.com',
    r'dropbox\.com',
    r'crl\.verisign\.com',
    r'winxpsp3',
    r'win7sp1',
    r'download\.windowsupdate\.com',
    r'ns1\.3322\.net',
    r'ns1\.oray\.net',
    r'ns1\.china\.com',
    ]

    # P1: every DOMAIN_REGEXES fragment as one alternation inside a capturing
    # group, preceded by an optional literal dot.
    P1 = re.compile(r'\.?({})'.format(r'|'.join(DOMAIN_REGEXES)))
    # P2: the same alternation with no prefix and no surrounding group.
    P2 = re.compile(r'|'.join(DOMAIN_REGEXES))


    def do_search(re_obj, inputs, num=None):
        """Call ``re_obj.search`` on the leading entries of ``inputs``.

        Args:
            re_obj: Object exposing a ``search(string)`` method, such as a
                compiled pattern.
            inputs: Sliceable sequence of strings to search.
            num: Number of leading entries to search. ``None`` means
                ``len(inputs)``. The entries are taken with ``inputs[:num]``,
                so a value larger than ``len(inputs)`` searches every entry once.

        Returns:
            None. The result of each search is discarded.
        """
        limit = len(inputs) if num is None else num
        for candidate in inputs[:limit]:
            re_obj.search(candidate)


    def do_search_iter(re_list, inputs, num=None):
        """Compile each pattern in turn and search the leading inputs with it.

        Each pattern string is compiled immediately before it is used, so
        compilation is interleaved with searching.

        Args:
            re_list: Iterable of regular-expression strings.
            inputs: Sliceable sequence of strings to search.
            num: Number of leading entries to search per pattern. ``None`` means
                ``len(inputs)``. The entries are taken with ``inputs[:num]``.

        Returns:
            None. The result of each search is discarded.
        """
        limit = len(inputs) if num is None else num
        for pattern_text in re_list:
            compiled = re.compile(pattern_text)
            for candidate in inputs[:limit]:
                compiled.search(candidate)


    def do_search_iter_compiled(re_list, inputs, num=None):
        """Compile every pattern up front, then search the leading inputs.

        All pattern strings are compiled before any searching starts (and
        before ``len(inputs)`` is consulted). Each compiled pattern is then run
        over ``inputs[:num]``.

        Args:
            re_list: Iterable of regular-expression strings.
            inputs: Sliceable sequence of strings to search.
            num: Number of leading entries to search per pattern. ``None`` means
                ``len(inputs)``.

        Returns:
            None. The result of each search is discarded.
        """
        compiled_patterns = [re.compile(pattern_text) for pattern_text in re_list]
        limit = len(inputs) if num is None else num
        for compiled in compiled_patterns:
            for candidate in inputs[:limit]:
                compiled.search(candidate)


    def do_search_p1_1():
        """Benchmark case: search the first entry of URLS with pattern P1."""
        do_search(re_obj=P1, inputs=URLS, num=1)


    def do_search_p2_1():
        """Benchmark case: search the first entry of URLS with pattern P2."""
        do_search(re_obj=P2, inputs=URLS, num=1)


    def do_search_iter_1():
        """Benchmark case: run each DOMAIN_REGEXES entry, compiled one at a
        time, against the first entry of URLS."""
        do_search_iter(re_list=DOMAIN_REGEXES, inputs=URLS, num=1)


    def do_search_iter_compile_1():
        """Benchmark case: compile every DOMAIN_REGEXES entry up front, then
        run each against the first entry of URLS."""
        do_search_iter_compiled(re_list=DOMAIN_REGEXES, inputs=URLS, num=1)


    def do_search_p1_10():
        """Benchmark case: search the first 10 entries of URLS with pattern P1."""
        do_search(re_obj=P1, inputs=URLS, num=10)


    def do_search_p2_10():
        """Benchmark case: search the first 10 entries of URLS with pattern P2."""
        do_search(re_obj=P2, inputs=URLS, num=10)


    def do_search_p1_100():
        """Benchmark case: search up to the first 100 entries of URLS with P1.

        NOTE(review): do_search takes ``inputs[:num]``, and URLS as listed in
        this file has 20 entries, so this currently searches 20 URLs, not 100.
        """
        do_search(re_obj=P1, inputs=URLS, num=100)


    def do_search_p2_100():
        """Benchmark case: search up to the first 100 entries of URLS with P2.

        NOTE(review): do_search takes ``inputs[:num]``, and URLS as listed in
        this file has 20 entries, so this currently searches 20 URLs, not 100.
        """
        do_search(re_obj=P2, inputs=URLS, num=100)


    def do_search_p1_1000():
        """Benchmark case: search up to the first 1000 entries of URLS with P1.

        NOTE(review): do_search takes ``inputs[:num]``, and URLS as listed in
        this file has 20 entries, so this currently searches 20 URLs, not 1000.
        """
        do_search(re_obj=P1, inputs=URLS, num=1000)


    def do_search_p2_1000():
        """Benchmark case: search up to the first 1000 entries of URLS with P2.

        NOTE(review): do_search takes ``inputs[:num]``, and URLS as listed in
        this file has 20 entries, so this currently searches 20 URLs, not 1000.
        """
        do_search(re_obj=P2, inputs=URLS, num=1000)

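    # Recorded timings. Each number below is the value returned by the
    # timeit.timeit call on the line above it: total seconds for timeit's
    # default of 1,000,000 executions of the callable. Calls with no number
    # after them have no recorded result.
    # timeit.timeit(do_search_p1_1)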
    # 2.934359607175311
    # timeit.timeit(do_search_p2_1)
    # 2.2964450297287033
    # timeit.timeit(do_search_iter_1)
    # 23.59554752368704
    # timeit.timeit(do_search_iter_compile_1)
    # 22.47923530400965
    # timeit.timeit(do_search_p1_10)
    # 73.94403021898421
    # timeit.timeit(do_search_p2_10)
    # 38.9072504719249
    # timeit.timeit(do_search_p1_100)
    # timeit.timeit(do_search_p2_100)
    # timeit.timeit(do_search_p1_1000)
    # timeit.timeit(do_search_p2_1000)