Created
February 11, 2017 00:25
Revisions
-
jedmitten created this gist
Feb 11, 2017. There are no files selected for viewing
"""Micro-benchmarks comparing ways of searching URLs against several regexes.

Compared strategies:

* ``P1`` / ``P2`` -- every domain pattern joined into one alternation and
  compiled once at import time.
* ``do_search_iter`` -- each pattern compiled inside the benchmarked call.
* ``do_search_iter_compiled`` -- all patterns compiled up front inside the
  benchmarked call, then applied one after another.
"""
import itertools
import logging
import re
import timeit

log = logging.getLogger(__name__)

__author__ = 'jed.mitten'

# urls from a shuffle of https://github.eng.fireeye.com/gist/jed-mitten/04ed7a75f428589199e0887a919ab9f9
URLS = [
    'http://www.anncol.eu',
    'http://www.sudaneseonline.com/cgi-bin/sdb/2bb.cgi?seq=msg&board=85&msg=1150001892&rn=1',
    'http://www.wegames.net',
    'http://seattlest.com/2010/04/27/seattle_cartoonists_everybody_draw.php',
    'http://www.almktaba.com',
    'http://mslamh.jeeran.com',
    'http://hidemyass.com',
    'http://www.duke.org',
    'http://www.agapehouseghana.org/',
    'http://wupload.com',
    'http://torrent82.com',
    'http://www.mapuche-nation.org/espanol/indice.htm/',
    'http://beatleofdoom.livejournal.com',
    'http://abdulemam.blogspot.com',
    'http://www.arfd.am',
    'http://www.download.ru',
    'http://gayswithoutborders.wordpress.com',
    'http://blog.tribunadonorte.com.br/heitorgregorio/fatima-bezerra-comanda-manifestacao-contra-michel-temer-no-pingo-da-mei-dia/',
    'http://loveplanet.ru',
    'http://www.atimes.com/se-asia/se-asia.html',
]

DOMAIN_REGEXES = [
    r'google\.com',
    r'^www\.yahoo\.com$',
    r'bing\.com',
    r'digicert\.com',
    r'microsoft\.com',
    r'msftncsi\.com',
    r'dropbox\.com',
    r'crl\.verisign\.com',
    r'winxpsp3',
    r'win7sp1',
    r'download\.windowsupdate\.com',
    r'ns1\.3322\.net',
    r'ns1\.oray\.net',
    r'ns1\.china\.com',
]

# P1: the alternation wrapped in a group and preceded by an optional dot.
P1 = re.compile(r'\.?({})'.format(r'|'.join(DOMAIN_REGEXES)))
# P2: the bare alternation.
P2 = re.compile(r'|'.join(DOMAIN_REGEXES))


def _select_inputs(inputs, num=None):
    """Return the sequence of lines a benchmark run should search.

    Args:
        inputs: Sequence of strings to draw from.
        num: Number of lines wanted. ``None`` means all of ``inputs``.

    Returns:
        ``inputs`` itself when ``num`` is ``None``; ``inputs[:num]`` when
        ``num`` does not exceed ``len(inputs)``; otherwise a list of exactly
        ``num`` lines built by repeating ``inputs`` in order (an empty list
        if ``inputs`` is empty).
    """
    if num is None:
        return inputs
    if num <= len(inputs):
        return inputs[:num]
    # A plain slice would silently stop at len(inputs), which made the
    # "_100" and "_1000" benchmarks identical to a 20-line run. Cycle the
    # inputs so the requested number of searches is really performed.
    return list(itertools.islice(itertools.cycle(inputs), num))


def do_search(re_obj, inputs, num=None):
    """Search ``num`` lines from ``inputs`` with one compiled pattern.

    Args:
        re_obj: Object exposing ``search(line)`` (e.g. a compiled regex).
        inputs: Sequence of strings to search.
        num: Number of lines to search; ``None`` searches every input once.
            Values larger than ``len(inputs)`` wrap around and reuse inputs.

    Returns:
        None. Match results are discarded; only the work matters.
    """
    for line in _select_inputs(inputs, num):
        re_obj.search(line)


def do_search_iter(re_list, inputs, num=None):
    """Compile each pattern string in turn and search the inputs with it.

    Compilation deliberately happens inside the call so its cost is part of
    what gets timed.

    Args:
        re_list: Iterable of regex pattern strings.
        inputs: Sequence of strings to search.
        num: Number of lines to search per pattern; see ``do_search``.

    Returns:
        None.
    """
    # The selection does not depend on the pattern, so build it once.
    lines = _select_inputs(inputs, num)
    for re_str in re_list:
        re_obj = re.compile(re_str)
        for line in lines:
            re_obj.search(line)


def do_search_iter_compiled(re_list, inputs, num=None):
    """Compile every pattern first, then search the inputs with each one.

    Args:
        re_list: Iterable of regex pattern strings.
        inputs: Sequence of strings to search.
        num: Number of lines to search per pattern; see ``do_search``.

    Returns:
        None.
    """
    re_objs = [re.compile(re_str) for re_str in re_list]
    lines = _select_inputs(inputs, num)
    for re_obj in re_objs:
        for line in lines:
            re_obj.search(line)


# Zero-argument wrappers so each case can be handed straight to timeit.

def do_search_p1_1():
    """Search 1 URL with P1."""
    do_search(P1, URLS, 1)


def do_search_p2_1():
    """Search 1 URL with P2."""
    do_search(P2, URLS, 1)


def do_search_iter_1():
    """Search 1 URL with every pattern, compiling each pattern in the call."""
    do_search_iter(DOMAIN_REGEXES, URLS, 1)


def do_search_iter_compile_1():
    """Search 1 URL with every pattern, compiling all patterns up front."""
    do_search_iter_compiled(DOMAIN_REGEXES, URLS, 1)


def do_search_p1_10():
    """Search 10 URLs with P1."""
    do_search(P1, URLS, 10)


def do_search_p2_10():
    """Search 10 URLs with P2."""
    do_search(P2, URLS, 10)


def do_search_p1_100():
    """Run 100 searches with P1 (URLS is reused cyclically)."""
    do_search(P1, URLS, 100)


def do_search_p2_100():
    """Run 100 searches with P2 (URLS is reused cyclically)."""
    do_search(P2, URLS, 100)


def do_search_p1_1000():
    """Run 1000 searches with P1 (URLS is reused cyclically)."""
    do_search(P1, URLS, 1000)


def do_search_p2_1000():
    """Run 1000 searches with P2 (URLS is reused cyclically)."""
    do_search(P2, URLS, 1000)


# Figures noted in the original gist next to each call (timeit.timeit called
# with its default arguments). No figure was noted for the 100/1000 cases.
# timeit.timeit(do_search_p1_1)
# 2.934359607175311
# timeit.timeit(do_search_p2_1)
# 2.2964450297287033
# timeit.timeit(do_search_iter_1)
# 23.59554752368704
# timeit.timeit(do_search_iter_compile_1)
# 22.47923530400965
# timeit.timeit(do_search_p1_10)
# 73.94403021898421
# timeit.timeit(do_search_p2_10)
# 38.9072504719249
# timeit.timeit(do_search_p1_100)
# timeit.timeit(do_search_p2_100)
# timeit.timeit(do_search_p1_1000)
# timeit.timeit(do_search_p2_1000)