Created
April 8, 2014 18:48
-
-
Save liori/10170227 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import timeit\n", | |
"import re\n", | |
"import random" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# build a test data set:\n", | |
"strings_to_match = ['%04dmagicword' % i for i in xrange(3000)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print strings_to_match[:10]\n", | |
"print len(strings_to_match)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['0000magicword', '0001magicword', '0002magicword', '0003magicword', '0004magicword', '0005magicword', '0006magicword', '0007magicword', '0008magicword', '0009magicword']\n", | |
"3000\n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# build a test \"file\"\n", | |
"my_file = []\n", | |
"with open('/usr/share/dict/words') as fh:\n", | |
" random.seed(42)\n", | |
" for line in fh:\n", | |
" split_point = random.randint(0, len(line))\n", | |
" my_file.append(line[:split_point] + random.choice(strings_to_match) + line[split_point:])\n", | |
"\n", | |
"my_file = my_file[:1000000]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print my_file[:10]\n", | |
"print len(my_file)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['a0075magicword\\n', '0669magicwordA\\n', 'aa2030magicword\\n', 'AA\\n0260magicword', 'aa0089magicworda\\n', 'A1516magicwordachen\\n', '0596magicwordAachenem\\n', 'Aachen1634magicwordie\\n', 'Aa1767magicwordchenowi\\n', 'Aachenu0019magicword\\n']\n", | |
"1000000\n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def naive_match():\n", | |
" for line in my_file:\n", | |
" if not any(s in line for s in strings_to_match):\n", | |
" return False\n", | |
" return True" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%timeit\n", | |
"naive_match()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 1min 46s per loop\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def re_match():\n", | |
" # building regular expression to match\n", | |
" expression = re.compile(\n", | |
" '(' + \n", | |
" '|'.join(re.escape(item) for item in strings_to_match) +\n", | |
" ')')\n", | |
"\n", | |
" # perform matching\n", | |
" for line in my_file:\n", | |
" if not expression.search(line):\n", | |
" return False\n", | |
" return True" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%timeit\n", | |
"re_match()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 9.97 s per loop\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment