Skip to content

Instantly share code, notes, and snippets.

@liori
Created April 8, 2014 18:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save liori/10170227 to your computer and use it in GitHub Desktop.
Save liori/10170227 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import timeit\n",
"import re\n",
"import random"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# build a test data set:\n",
"strings_to_match = ['%04dmagicword' % i for i in xrange(3000)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print strings_to_match[:10]\n",
"print len(strings_to_match)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"['0000magicword', '0001magicword', '0002magicword', '0003magicword', '0004magicword', '0005magicword', '0006magicword', '0007magicword', '0008magicword', '0009magicword']\n",
"3000\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Build a test \"file\": each dictionary word gets one randomly chosen\n",
"# search term spliced in at a random offset.\n",
"my_file = []\n",
"with open('/usr/share/dict/words') as words_file:\n",
"    random.seed(42)  # fixed seed so a re-run generates the same lines\n",
"    for word in words_file:\n",
"        # randint is inclusive at both ends and `word` normally still ends\n",
"        # with its newline, so the term may land after that newline\n",
"        cut = random.randint(0, len(word))\n",
"        needle = random.choice(strings_to_match)\n",
"        my_file.append(''.join((word[:cut], needle, word[cut:])))\n",
"\n",
"# keep at most the first million generated lines\n",
"my_file = my_file[:1000000]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print my_file[:10]\n",
"print len(my_file)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"['a0075magicword\\n', '0669magicwordA\\n', 'aa2030magicword\\n', 'AA\\n0260magicword', 'aa0089magicworda\\n', 'A1516magicwordachen\\n', '0596magicwordAachenem\\n', 'Aachen1634magicwordie\\n', 'Aa1767magicwordchenowi\\n', 'Aachenu0019magicword\\n']\n",
"1000000\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def naive_match():\n",
"    \"\"\"Return True if every line of my_file contains a search term.\n",
"\n",
"    Each line is checked against strings_to_match one term at a time with\n",
"    a plain substring test; the scan stops at the first line that holds\n",
"    none of the terms.\n",
"    \"\"\"\n",
"    for text in my_file:\n",
"        for needle in strings_to_match:\n",
"            if needle in text:\n",
"                break  # this line is fine, move on to the next one\n",
"        else:\n",
"            # inner loop ran out of terms without a hit\n",
"            return False\n",
"    return True"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%timeit\n",
"naive_match()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 loops, best of 3: 1min 46s per loop\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def re_match():\n",
"    \"\"\"Return True if every line of my_file contains a search term.\n",
"\n",
"    All entries of strings_to_match are escaped and OR-ed into a single\n",
"    alternation. The pattern is compiled inside the function, so that\n",
"    work is repeated on each call.\n",
"    \"\"\"\n",
"    # one big pattern: (term1|term2|...)\n",
"    alternatives = '|'.join([re.escape(term) for term in strings_to_match])\n",
"    matcher = re.compile('(' + alternatives + ')')\n",
"\n",
"    # all() stops at the first line without a match\n",
"    return all(matcher.search(text) for text in my_file)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%timeit\n",
"re_match()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 loops, best of 3: 9.97 s per loop\n"
]
}
],
"prompt_number": 9
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment