Skip to content

Instantly share code, notes, and snippets.

@FrancescAlted
Created April 13, 2014 18:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FrancescAlted/10595974 to your computer and use it in GitHub Desktop.
Save FrancescAlted/10595974 to your computer and use it in GitHub Desktop.
Example of how to find substrings in large datasets new 'contains' in numexpr
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Example of how to find substrings in large datasets new 'contains' in numexpr\n",
"-----------------------------------------------------------------------------\n",
"\n",
"The code is exercising the recently introduced 'contains' function in\n",
"numexpr so it is shown how fast these queries can be.\n",
"\n",
"The data is based in Google Books Ngrams, whose description can be found\n",
"[here](http://aws.amazon.com/datasets/8172056142375670) and the files can be downloaded from [here](http://storage.googleapis.com/books/ngrams/books/datasetsv2.html).\n",
"\n",
"In particular, the dataset used is the \"English One Million\", and you can download it from a bash shell with a command like:\n",
"\n",
"```\n",
"for i in `seq 0 199`; do wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-1M-3gram-20090715-$i.csv.zip; done\n",
"```\n",
"\n",
"However, in order to keep the time for running the script reasonable, only the 2 first files will be downloaded, so the next command has been used here:\n",
"\n",
"```\n",
"for i in `seq 0 1`; do wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-1M-3gram-20090715-$i.csv.zip; done\n",
"```\n",
"\n",
"which will be stored in the 'eng-1M-3gram' directory."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import glob\n",
"import os.path\n",
"import zipfile\n",
"import blz\n",
"import numpy as np"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"MAXLINES=100*1000*1000 # a protection against loading too many lines\n",
"rawdir = 'eng-1M-3gram' # the directory where all the .zip files are\n",
"def create_blz(blzname):\n",
" # Read just the the -0 and -1 files\n",
" zfiles = glob.glob(os.path.join(rawdir, '*-[0-1].csv.zip'))\n",
" # Create the container\n",
" dt = np.dtype([('ngram', 'S64'), ('year', 'i2'), ('occur', 'i2'),\n",
" ('pages', 'i2'), ('books', 'i2')])\n",
" bt = blz.btable(np.empty(0, dtype=dt), mode='w',\n",
" expectedlen=MAXLINES,\n",
" rootdir=blzname)\n",
" lines = 0\n",
" errs = 0\n",
" for zfname in zfiles:\n",
" print(\"zfname:\", zfname)\n",
" with zipfile.ZipFile(zfname, 'r') as myzip:\n",
" fname = zfname[len(rawdir)+1:-4] # remove the .zip extension\n",
" csvfile = myzip.open(fname, 'rU')\n",
" try:\n",
" for row in csvfile:\n",
" row = row[:-1].split('\\t')\n",
" if len(row) == len(dt):\n",
" bt.append(row)\n",
" lines += 1\n",
" if lines >= MAXLINES:\n",
" break\n",
" except:\n",
" errs += 1\n",
" # Ignore all the errors\n",
" print \"Offending line:\", lines\n",
" bt.flush()\n",
" pass\n",
"\n",
" bt.flush()\n",
" print \"bt:\", repr(bt)\n",
" # Some statistics\n",
" print \"lines: %d, errs: %d\" % (lines, errs)\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create the BLZ database if necessary\n",
"blzname = rawdir+'.blz'\n",
"if not os.path.exists(blzname):\n",
" bt = create_blz(blzname)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"('zfname:', 'eng-1M-3gram/googlebooks-eng-1M-3gram-20090715-0.csv.zip')\n",
"('zfname:', 'eng-1M-3gram/googlebooks-eng-1M-3gram-20090715-1.csv.zip')"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"bt:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" btable((42561686,), [('ngram', 'S64'), ('year', '<i2'), ('occur', '<i2'), ('pages', '<i2'), ('books', '<i2')])\n",
" nbytes: 2.85 GB; cbytes: 143.26 MB; ratio: 20.40\n",
" bparams := bparams(clevel=5, shuffle=True, cname=blosclz)\n",
" rootdir := 'eng-1M-3gram.blz'\n",
"[('! ! That', 1731, 1, 1, 1) ('! ! That', 1791, 2, 2, 2)\n",
" ('! ! That', 1802, 1, 1, 1) ..., ('\"canal ', 1956, 11, 11, 11)\n",
" ('\"canal ', 1957, 6, 6, 5) ('\"canal ', 1958, 9, 9, 9)]\n",
"lines: 42561688, errs: 0\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Open the file\n",
"bt = blz.open(blzname)\n",
"print \"bt:\", bt"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"bt: [('! ! That', 1731, 1, 1, 1) ('! ! That', 1791, 2, 2, 2)\n",
" ('! ! That', 1802, 1, 1, 1) ..., ('\"canal ', 1956, 11, 11, 11)\n",
" ('\"canal ', 1957, 6, 6, 5) ('\"canal ', 1958, 9, 9, 9)]\n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Do some queries\n",
"print \"results:\", list(bt.where(\"contains(ngram, 'Eldest')\"))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"results: "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[row(ngram='The Eldest Daughter', year=1811, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1856, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1860, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1861, occur=3, pages=3, books=3), row(ngram='The Eldest Daughter', year=1863, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1867, occur=2, pages=2, books=1), row(ngram='The Eldest Daughter', year=1873, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1881, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1883, occur=4, pages=4, books=2), row(ngram='The Eldest Daughter', year=1884, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1885, occur=4, pages=4, books=4), row(ngram='The Eldest Daughter', year=1887, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1889, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1891, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1900, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1901, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1902, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1908, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1909, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1910, occur=2, pages=2, books=1), row(ngram='The Eldest Daughter', year=1914, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1916, occur=2, pages=2, books=1), row(ngram='The Eldest Daughter', year=1917, occur=14, pages=12, books=2), row(ngram='The Eldest Daughter', year=1918, occur=7, pages=7, books=4), row(ngram='The Eldest Daughter', year=1923, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1942, occur=4, pages=4, books=4), row(ngram='The Eldest Daughter', year=1944, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1954, occur=13, pages=8, books=1), row(ngram='The Eldest Daughter', year=1955, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1957, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1983, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1986, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=2006, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1824, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1829, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1839, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1840, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1842, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1846, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1847, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1848, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1850, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1852, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1855, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1863, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1864, occur=3, pages=3, books=3), row(ngram='\"\"\" Eldest of\"', year=1865, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1866, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1869, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1872, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1875, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1876, occur=3, pages=3, books=3), row(ngram='\"\"\" Eldest of\"', year=1880, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1883, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1884, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1885, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1892, occur=3, pages=3, books=3), row(ngram='\"\"\" Eldest of\"', year=1908, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1912, occur=4, pages=4, books=4), row(ngram='\"\"\" Eldest of\"', year=1916, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1920, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1921, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1922, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1923, occur=3, pages=3, books=3), row(ngram='\"\"\" Eldest of\"', year=1927, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1928, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1929, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1932, occur=4, pages=4, books=4), row(ngram='\"\"\" Eldest of\"', year=1933, occur=6, pages=6, books=1), row(ngram='\"\"\" Eldest of\"', year=1934, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1936, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1940, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1944, occur=3, pages=3, books=3), row(ngram='\"\"\" Eldest of\"', year=1948, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1954, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1965, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1967, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1976, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1977, occur=2, pages=2, books=2), row(ngram='\"\"\" Eldest of\"', year=1991, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1998, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=1999, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=2002, occur=5, pages=5, books=2), row(ngram='\"\"\" Eldest of\"', year=2003, occur=1, pages=1, books=1), row(ngram='\"\"\" Eldest of\"', year=2007, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1851, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1861, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1863, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1868, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1871, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1872, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1873, occur=8, pages=8, books=8), row(ngram='as the Eldest', year=1874, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1875, occur=6, pages=6, books=6), row(ngram='as the Eldest', year=1877, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1880, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1882, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1889, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1891, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1892, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1893, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1897, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1899, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1900, occur=5, pages=5, books=4), row(ngram='as the Eldest', year=1901, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1902, occur=4, pages=4, books=3), row(ngram='as the Eldest', year=1903, occur=8, pages=8, books=8), row(ngram='as the Eldest', year=1905, occur=2, pages=1, books=1), row(ngram='as the Eldest', year=1906, occur=4, pages=4, books=4), row(ngram='as the Eldest', year=1907, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1909, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1910, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1911, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1912, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1913, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1919, occur=4, pages=4, books=4), row(ngram='as the Eldest', year=1921, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1924, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1925, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1929, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1936, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1937, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1938, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1940, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1945, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1949, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1950, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1951, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1953, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1954, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1955, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1958, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1960, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1961, occur=2, pages=2, books=1), row(ngram='as the Eldest', year=1979, occur=3, pages=3, books=1), row(ngram='as the Eldest', year=1980, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1987, occur=3, pages=3, books=1), row(ngram='as the Eldest', year=1989, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1990, occur=4, pages=3, books=2), row(ngram='as the Eldest', year=1991, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1995, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1997, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1998, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1999, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=2002, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=2005, occur=2, pages=1, books=1), row(ngram='as the Eldest', year=2008, occur=1, pages=1, books=1)]\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Do a timing\n",
"%time len(list(bt.where(\"contains(ngram, 'Eldest')\")))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"CPU times: user 7.88 s, sys: 248 ms, total: 8.13 s\n",
"Wall time: 3.39 s\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"147"
]
}
],
"prompt_number": 6
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"Okay, doing a query in a dataset with more than 42 Mrows in less than 4 seconds is impressing. How much would it take using pure Python 'in' operator?"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%time len([i for i in bt if 'Eldest' in i[0]])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"CPU times: user 43.3 s, sys: 443 ms, total: 43.8 s\n",
"Wall time: 42.6 s\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"147"
]
}
],
"prompt_number": 7
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"As can be seen, the new 'contains' can be more than 10 times faster than a regular 'in' operator."
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"Of course, more complex queries can be done. For example, let's see how many mentions to Einstein before year 1920 there are in our subset of the Google books:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%time list(bt.where(\"(contains(ngram, 'Einstein')) & (year < 1920)\"))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"CPU times: user 9.01 s, sys: 250 ms, total: 9.26 s\n",
"Wall time: 4.22 s\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"[row(ngram=\". Einstein's general\", year=1905, occur=6, pages=6, books=5),\n",
" row(ngram=\". Einstein's general\", year=1916, occur=1, pages=1, books=1),\n",
" row(ngram=\". Einstein's general\", year=1919, occur=2, pages=2, books=2),\n",
" row(ngram='Einstein equation to', year=1899, occur=1, pages=1, books=1),\n",
" row(ngram='Einstein published his', year=1905, occur=8, pages=8, books=6),\n",
" row(ngram='Einstein published his', year=1914, occur=2, pages=2, books=2),\n",
" row(ngram=\"Einstein's equation was\", year=1905, occur=2, pages=2, books=2),\n",
" row(ngram='and Einstein is', year=1911, occur=3, pages=3, books=3),\n",
" row(ngram='and Einstein is', year=1912, occur=1, pages=1, books=1),\n",
" row(ngram='Einstein puts it', year=1911, occur=4, pages=4, books=4)]"
]
}
],
"prompt_number": 9
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"Ok, a few mentions to Einstein in Google books, including 5 books as soon as 1905. Supposedly some of these refers to his seminals articles from 1905. And using 4 seconds (at ~10 Mrow/s) for doing the new, more complex query is pretty good too."
]
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment