leotop/wikipedia-infoboxes-in-pandas.ipynb

## wikipedia-infoboxes-in-pandas.ipynb
{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from pandas.io.html import read_html\n",
      "page = 'https://en.wikipedia.org/wiki/University_of_California,_Berkeley'\n",
      "infoboxes = read_html(page, index_col=0, infer_types=False, attrs={\"class\":\"infobox\"})\n",
      "wikitables = read_html(page, index_col=0, infer_types=False, attrs={\"class\":\"wikitable\"})\n",
      "\n",
      "print \"Extracted {num} infoboxes\".format(num=len(infoboxes))\n",
      "print \"Extracted {num} wikitables\".format(num=len(wikitables))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Extracted 2 infoboxes\n",
        "Extracted 2 wikitables\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "infoboxes[0]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>1</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>University of California, Berkeley</th>\n",
        "      <td>                                               nan</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Unofficial seal of the University of California [1]</th>\n",
        "      <td>                                               nan</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Motto</th>\n",
        "      <td>                                  Fiat lux (Latin)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Motto in English</th>\n",
        "      <td>                                Let there be light</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Established</th>\n",
        "      <td>                                    March 23, 1868</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Type</th>\n",
        "      <td>                                   Flagship Public</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Endowment</th>\n",
        "      <td>                           $3.33 billion (2013)[2]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Chancellor</th>\n",
        "      <td>                                    Nicholas Dirks</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Students</th>\n",
        "      <td>                             36,204 (Fall 2013)[3]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Undergraduates</th>\n",
        "      <td>                             25,951 (Fall 2013)[4]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Postgraduates</th>\n",
        "      <td>                             10,125 (Fall 2012)[5]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Location</th>\n",
        "      <td>               Berkeley, California, United States</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Campus</th>\n",
        "      <td> Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Nobel Laureates</th>\n",
        "      <td>                                             72[8]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Colors</th>\n",
        "      <td>             Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Athletics</th>\n",
        "      <td> 27 Varsity Teams NCAA Division I California Go...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Nickname</th>\n",
        "      <td>                                      Golden Bears</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Mascot</th>\n",
        "      <td>                                     Oski the Bear</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Affiliations</th>\n",
        "      <td>      AAU IARU Pacific-12 University of California</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Website</th>\n",
        "      <td>                                      Berkeley.edu</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>NaN</th>\n",
        "      <td>                                               nan</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>21 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "                                                                                                     1\n",
        "0                                                                                                     \n",
        "University of California, Berkeley                                                                 nan\n",
        "Unofficial seal of the University of California [1]                                                nan\n",
        "Motto                                                                                 Fiat lux (Latin)\n",
        "Motto in English                                                                    Let there be light\n",
        "Established                                                                             March 23, 1868\n",
        "Type                                                                                   Flagship Public\n",
        "Endowment                                                                      $3.33 billion (2013)[2]\n",
        "Chancellor                                                                              Nicholas Dirks\n",
        "Students                                                                         36,204 (Fall 2013)[3]\n",
        "Undergraduates                                                                   25,951 (Fall 2013)[4]\n",
        "Postgraduates                                                                    10,125 (Fall 2012)[5]\n",
        "Location                                                           Berkeley, California, United States\n",
        "Campus                                               Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...\n",
        "Nobel Laureates                                                                                  72[8]\n",
        "Colors                                                           Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]\n",
        "Athletics                                            27 Varsity Teams NCAA Division I California Go...\n",
        "Nickname                                                                                  Golden Bears\n",
        "Mascot                                                                                   Oski the Bear\n",
        "Affiliations                                              AAU IARU Pacific-12 University of California\n",
        "Website                                                                                   Berkeley.edu\n",
        "NaN                                                                                                nan\n",
        "\n",
        "[21 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "infoboxes[1]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>1</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>University rankings</th>\n",
        "      <td>  nan</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>National</th>\n",
        "      <td>  nan</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>ARWU[58]</th>\n",
        "      <td>  3.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Forbes[59]</th>\n",
        "      <td> 22.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>U.S. News &amp; World Report[60]</th>\n",
        "      <td> 20.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Washington Monthly[61]</th>\n",
        "      <td>  5.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Global</th>\n",
        "      <td>  nan</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>ARWU[62]</th>\n",
        "      <td>  3.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>QS[63]</th>\n",
        "      <td> 25.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Times[64]</th>\n",
        "      <td>  8.0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>10 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "                                 1\n",
        "0                                 \n",
        "University rankings            nan\n",
        "National                       nan\n",
        "ARWU[58]                       3.0\n",
        "Forbes[59]                    22.0\n",
        "U.S. News & World Report[60]  20.0\n",
        "Washington Monthly[61]         5.0\n",
        "Global                         nan\n",
        "ARWU[62]                       3.0\n",
        "QS[63]                        25.0\n",
        "Times[64]                      8.0\n",
        "\n",
        "[10 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "wikitables[0]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>1</th>\n",
        "      <th>2</th>\n",
        "      <th>3</th>\n",
        "      <th>4</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>NaN</th>\n",
        "      <td> Undergraduate</td>\n",
        "      <td> Graduate</td>\n",
        "      <td> California</td>\n",
        "      <td> U.S. Census</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>African American</th>\n",
        "      <td>          3.4%</td>\n",
        "      <td>     3.4%</td>\n",
        "      <td>       6.2%</td>\n",
        "      <td>       12.0%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Asian American</th>\n",
        "      <td>         39.2%</td>\n",
        "      <td>    17.8%</td>\n",
        "      <td>        13%</td>\n",
        "      <td>        4.7%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>White American</th>\n",
        "      <td>         29.0%</td>\n",
        "      <td>    38.0%</td>\n",
        "      <td>      40.1%</td>\n",
        "      <td>       63.7%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Hispanic American</th>\n",
        "      <td>         12.6%</td>\n",
        "      <td>     7.1%</td>\n",
        "      <td>      37.6%</td>\n",
        "      <td>       16.3%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Native American</th>\n",
        "      <td>          0.8%</td>\n",
        "      <td>     1.1%</td>\n",
        "      <td>         1%</td>\n",
        "      <td>        0.7%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>International student</th>\n",
        "      <td>         10.4%</td>\n",
        "      <td>    20.3%</td>\n",
        "      <td>        nan</td>\n",
        "      <td>         nan</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Other/Unknown</th>\n",
        "      <td>          4.7%</td>\n",
        "      <td>    11.9%</td>\n",
        "      <td>       2.1%</td>\n",
        "      <td>        2.4%</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>8 rows \u00d7 4 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "                                   1         2           3            4\n",
        "0                                                                      \n",
        "NaN                    Undergraduate  Graduate  California  U.S. Census\n",
        "African American                3.4%      3.4%        6.2%        12.0%\n",
        "Asian American                 39.2%     17.8%         13%         4.7%\n",
        "White American                 29.0%     38.0%       40.1%        63.7%\n",
        "Hispanic American              12.6%      7.1%       37.6%        16.3%\n",
        "Native American                 0.8%      1.1%          1%         0.7%\n",
        "International student          10.4%     20.3%         nan          nan\n",
        "Other/Unknown                   4.7%     11.9%        2.1%         2.4%\n",
        "\n",
        "[8 rows x 4 columns]"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "wikitables[1]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>1</th>\n",
        "      <th>2</th>\n",
        "      <th>3</th>\n",
        "      <th>4</th>\n",
        "      <th>5</th>\n",
        "      <th>6</th>\n",
        "      <th>7</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>NaN</th>\n",
        "      <td> 2014[99]</td>\n",
        "      <td> 2013[100]</td>\n",
        "      <td>  2012.0</td>\n",
        "      <td>  2011.0</td>\n",
        "      <td>  2010.0</td>\n",
        "      <td>  2009.0</td>\n",
        "      <td>  2008.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Applicants</th>\n",
        "      <td>    73700</td>\n",
        "      <td>     67665</td>\n",
        "      <td> 61731.0</td>\n",
        "      <td> 52966.0</td>\n",
        "      <td> 50393.0</td>\n",
        "      <td> 48650.0</td>\n",
        "      <td> 48461.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Admits</th>\n",
        "      <td>      nan</td>\n",
        "      <td>     14103</td>\n",
        "      <td> 11130.0</td>\n",
        "      <td> 11441.0</td>\n",
        "      <td> 10844.0</td>\n",
        "      <td> 10528.0</td>\n",
        "      <td> 10474.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>% Admitted</th>\n",
        "      <td>      nan</td>\n",
        "      <td>      20.8</td>\n",
        "      <td>    18.0</td>\n",
        "      <td>    21.6</td>\n",
        "      <td>    21.5</td>\n",
        "      <td>    21.6</td>\n",
        "      <td>    21.6</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Enrolled</th>\n",
        "      <td>      nan</td>\n",
        "      <td>       nan</td>\n",
        "      <td>  4162.0</td>\n",
        "      <td>  4443.0</td>\n",
        "      <td>  4109.0</td>\n",
        "      <td>  4356.0</td>\n",
        "      <td>  4261.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Average GPA</th>\n",
        "      <td>      nan</td>\n",
        "      <td>       nan</td>\n",
        "      <td>    3.84</td>\n",
        "      <td>    3.83</td>\n",
        "      <td>    3.84</td>\n",
        "      <td>     3.8</td>\n",
        "      <td>     3.9</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>6 rows \u00d7 7 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "                    1          2        3        4        5        6        7\n",
        "0                                                                            \n",
        "NaN          2014[99]  2013[100]   2012.0   2011.0   2010.0   2009.0   2008.0\n",
        "Applicants      73700      67665  61731.0  52966.0  50393.0  48650.0  48461.0\n",
        "Admits            nan      14103  11130.0  11441.0  10844.0  10528.0  10474.0\n",
        "% Admitted        nan       20.8     18.0     21.6     21.5     21.6     21.6\n",
        "Enrolled          nan        nan   4162.0   4443.0   4109.0   4356.0   4261.0\n",
        "Average GPA       nan        nan     3.84     3.83     3.84      3.8      3.9\n",
        "\n",
        "[6 rows x 7 columns]"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The `infer_types=False` argument is needed to turn off automatic type inference for Pandas <0.14, otherwise if date-like text appears in the table the data type will automatically be inferred as a date for the whole column, not just the particular entry, resulting in a table full of `NaT`s for non-date entries. In version >=0.14 the `infer_types` argument will be removed so it will no longer cause this kind of problem."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "malformed = read_html(page, index_col=0, attrs={\"class\":\"infobox\"})\n",
      "malformed[0]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>1</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>University of California, Berkeley</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Unofficial seal of the University of California [1]</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Motto</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Motto in English</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Established</th>\n",
        "      <td>1868-03-23</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Type</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Endowment</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Chancellor</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Students</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Undergraduates</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Postgraduates</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Location</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Campus</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Nobel Laureates</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Colors</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Athletics</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Nickname</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Mascot</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Affiliations</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Website</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>NaN</th>\n",
        "      <td>       NaT</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>21 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "                                                             1\n",
        "0                                                             \n",
        "University of California, Berkeley                         NaT\n",
        "Unofficial seal of the University of California [1]        NaT\n",
        "Motto                                                      NaT\n",
        "Motto in English                                           NaT\n",
        "Established                                         1868-03-23\n",
        "Type                                                       NaT\n",
        "Endowment                                                  NaT\n",
        "Chancellor                                                 NaT\n",
        "Students                                                   NaT\n",
        "Undergraduates                                             NaT\n",
        "Postgraduates                                              NaT\n",
        "Location                                                   NaT\n",
        "Campus                                                     NaT\n",
        "Nobel Laureates                                            NaT\n",
        "Colors                                                     NaT\n",
        "Athletics                                                  NaT\n",
        "Nickname                                                   NaT\n",
        "Mascot                                                     NaT\n",
        "Affiliations                                               NaT\n",
        "Website                                                    NaT\n",
        "NaN                                                        NaT\n",
        "\n",
        "[21 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The `index_col=0` argument will turn the zeroth-column into a set of labels for the table rows which is what we want for infoboxes and some (but not all) wikitables.\n",
      "\n",
      "Using `index_col` means that we can then refer directly to the the entries by their label, e.g.:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "infoboxes[0].xs(u'Motto').values[0]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 33,
       "text": [
        "u'Fiat lux (Latin)'"
       ]
      }
     ],
     "prompt_number": 33
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Leaving out the argument the labels will instead be a numeric index and the zeroth-column will be part of the data."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "no_lefthand_labels = read_html(page, infer_types=False, attrs={\"class\":\"infobox\"})\n",
      "no_lefthand_labels[0]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>0</th>\n",
        "      <th>1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0 </th>\n",
        "      <td>                University of California, Berkeley</td>\n",
        "      <td>                                               nan</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1 </th>\n",
        "      <td> Unofficial seal of the University of Californi...</td>\n",
        "      <td>                                               nan</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2 </th>\n",
        "      <td>                                             Motto</td>\n",
        "      <td>                                  Fiat lux (Latin)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3 </th>\n",
        "      <td>                                  Motto in English</td>\n",
        "      <td>                                Let there be light</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4 </th>\n",
        "      <td>                                       Established</td>\n",
        "      <td>                                    March 23, 1868</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5 </th>\n",
        "      <td>                                              Type</td>\n",
        "      <td>                                   Flagship Public</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6 </th>\n",
        "      <td>                                         Endowment</td>\n",
        "      <td>                           $3.33 billion (2013)[2]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7 </th>\n",
        "      <td>                                        Chancellor</td>\n",
        "      <td>                                    Nicholas Dirks</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8 </th>\n",
        "      <td>                                          Students</td>\n",
        "      <td>                             36,204 (Fall 2013)[3]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9 </th>\n",
        "      <td>                                    Undergraduates</td>\n",
        "      <td>                             25,951 (Fall 2013)[4]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
        "      <td>                                     Postgraduates</td>\n",
        "      <td>                             10,125 (Fall 2012)[5]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>11</th>\n",
        "      <td>                                          Location</td>\n",
        "      <td>               Berkeley, California, United States</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>12</th>\n",
        "      <td>                                            Campus</td>\n",
        "      <td> Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>13</th>\n",
        "      <td>                                   Nobel Laureates</td>\n",
        "      <td>                                             72[8]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>14</th>\n",
        "      <td>                                            Colors</td>\n",
        "      <td>             Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>15</th>\n",
        "      <td>                                         Athletics</td>\n",
        "      <td> 27 Varsity Teams NCAA Division I California Go...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>16</th>\n",
        "      <td>                                          Nickname</td>\n",
        "      <td>                                      Golden Bears</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>17</th>\n",
        "      <td>                                            Mascot</td>\n",
        "      <td>                                     Oski the Bear</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>18</th>\n",
        "      <td>                                      Affiliations</td>\n",
        "      <td>      AAU IARU Pacific-12 University of California</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>19</th>\n",
        "      <td>                                           Website</td>\n",
        "      <td>                                      Berkeley.edu</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>20</th>\n",
        "      <td>                                               nan</td>\n",
        "      <td>                                               nan</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>21 rows \u00d7 2 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 11,
       "text": [
        "                                                    0  \\\n",
        "0                  University of California, Berkeley   \n",
        "1   Unofficial seal of the University of Californi...   \n",
        "2                                               Motto   \n",
        "3                                    Motto in English   \n",
        "4                                         Established   \n",
        "5                                                Type   \n",
        "6                                           Endowment   \n",
        "7                                          Chancellor   \n",
        "8                                            Students   \n",
        "9                                      Undergraduates   \n",
        "10                                      Postgraduates   \n",
        "11                                           Location   \n",
        "12                                             Campus   \n",
        "13                                    Nobel Laureates   \n",
        "14                                             Colors   \n",
        "15                                          Athletics   \n",
        "16                                           Nickname   \n",
        "17                                             Mascot   \n",
        "18                                       Affiliations   \n",
        "19                                            Website   \n",
        "20                                                nan   \n",
        "\n",
        "                                                    1  \n",
        "0                                                 nan  \n",
        "1                                                 nan  \n",
        "2                                    Fiat lux (Latin)  \n",
        "3                                  Let there be light  \n",
        "4                                      March 23, 1868  \n",
        "5                                     Flagship Public  \n",
        "6                             $3.33 billion (2013)[2]  \n",
        "7                                      Nicholas Dirks  \n",
        "8                               36,204 (Fall 2013)[3]  \n",
        "9                               25,951 (Fall 2013)[4]  \n",
        "10                              10,125 (Fall 2012)[5]  \n",
        "11                Berkeley, California, United States  \n",
        "12  Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...  \n",
        "13                                              72[8]  \n",
        "14              Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]  \n",
        "15  27 Varsity Teams NCAA Division I California Go...  \n",
        "16                                       Golden Bears  \n",
        "17                                      Oski the Bear  \n",
        "18       AAU IARU Pacific-12 University of California  \n",
        "19                                       Berkeley.edu  \n",
        "20                                                nan  \n",
        "\n",
        "[21 rows x 2 columns]"
       ]
      }
     ],
     "prompt_number": 11
    }
   ],
   "metadata": {}
  }
 ]
}

## wikipedia-infoboxes-in-pandas.py
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>

# <codecell>

from pandas.io.html import read_html
page = 'https://en.wikipedia.org/wiki/University_of_California,_Berkeley'
infoboxes = read_html(page, index_col=0, infer_types=False, attrs={"class":"infobox"})
wikitables = read_html(page, index_col=0, infer_types=False, attrs={"class":"wikitable"})

print "Extracted {num} infoboxes".format(num=len(infoboxes))
print "Extracted {num} wikitables".format(num=len(wikitables))

# <codecell>

infoboxes[0]

# <codecell>

infoboxes[1]

# <codecell>

wikitables[0]

# <codecell>

wikitables[1]

# <markdowncell>

# The `infer_types=False` argument is needed to turn off automatic type inference for Pandas <0.14, otherwise if date-like text appears in the table the data type will automatically be inferred as a date for the whole column, not just the particular entry, resulting in a table full of `NaT`s for non-date entries. In version >=0.14 the `infer_types` argument will be removed so it will no longer cause this kind of problem.

# <codecell>

malformed = read_html(page, index_col=0, attrs={"class":"infobox"})
malformed[0]

# <markdowncell>

# The `index_col=0` argument will turn the zeroth-column into a set of labels for the table rows which is what we want for infoboxes and some (but not all) wikitables.
#
# Using `index_col` means that we can then refer directly to the the entries by their label, e.g.:

# <codecell>

infoboxes[0].xs(u'Motto').values[0]

# <markdowncell>

# Leaving out the argument the labels will instead be a numeric index and the zeroth-column will be part of the data.

# <codecell>

no_lefthand_labels = read_html(page, infer_types=False, attrs={"class":"infobox"})
no_lefthand_labels[0]
	{
	"metadata": {
	"name": ""
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from pandas.io.html import read_html\n",
	"page = 'https://en.wikipedia.org/wiki/University_of_California,_Berkeley'\n",
	"infoboxes = read_html(page, index_col=0, infer_types=False, attrs={\"class\":\"infobox\"})\n",
	"wikitables = read_html(page, index_col=0, infer_types=False, attrs={\"class\":\"wikitable\"})\n",
	"\n",
	"print \"Extracted {num} infoboxes\".format(num=len(infoboxes))\n",
	"print \"Extracted {num} wikitables\".format(num=len(wikitables))"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"Extracted 2 infoboxes\n",
	"Extracted 2 wikitables\n"
	]
	}
	],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"infoboxes[0]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>1</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>University of California, Berkeley</th>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Unofficial seal of the University of California [1]</th>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Motto</th>\n",
	" <td> Fiat lux (Latin)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Motto in English</th>\n",
	" <td> Let there be light</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Established</th>\n",
	" <td> March 23, 1868</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Type</th>\n",
	" <td> Flagship Public</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Endowment</th>\n",
	" <td> $3.33 billion (2013)[2]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Chancellor</th>\n",
	" <td> Nicholas Dirks</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Students</th>\n",
	" <td> 36,204 (Fall 2013)[3]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Undergraduates</th>\n",
	" <td> 25,951 (Fall 2013)[4]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Postgraduates</th>\n",
	" <td> 10,125 (Fall 2012)[5]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Location</th>\n",
	" <td> Berkeley, California, United States</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Campus</th>\n",
	" <td> Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Nobel Laureates</th>\n",
	" <td> 72[8]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Colors</th>\n",
	" <td> Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Athletics</th>\n",
	" <td> 27 Varsity Teams NCAA Division I California Go...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Nickname</th>\n",
	" <td> Golden Bears</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Mascot</th>\n",
	" <td> Oski the Bear</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Affiliations</th>\n",
	" <td> AAU IARU Pacific-12 University of California</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Website</th>\n",
	" <td> Berkeley.edu</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>NaN</th>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>21 rows \u00d7 1 columns</p>\n",
	"</div>"
	],
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 3,
	"text": [
	" 1\n",
	"0 \n",
	"University of California, Berkeley nan\n",
	"Unofficial seal of the University of California [1] nan\n",
	"Motto Fiat lux (Latin)\n",
	"Motto in English Let there be light\n",
	"Established March 23, 1868\n",
	"Type Flagship Public\n",
	"Endowment $3.33 billion (2013)[2]\n",
	"Chancellor Nicholas Dirks\n",
	"Students 36,204 (Fall 2013)[3]\n",
	"Undergraduates 25,951 (Fall 2013)[4]\n",
	"Postgraduates 10,125 (Fall 2012)[5]\n",
	"Location Berkeley, California, United States\n",
	"Campus Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...\n",
	"Nobel Laureates 72[8]\n",
	"Colors Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]\n",
	"Athletics 27 Varsity Teams NCAA Division I California Go...\n",
	"Nickname Golden Bears\n",
	"Mascot Oski the Bear\n",
	"Affiliations AAU IARU Pacific-12 University of California\n",
	"Website Berkeley.edu\n",
	"NaN nan\n",
	"\n",
	"[21 rows x 1 columns]"
	]
	}
	],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"infoboxes[1]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>1</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>University rankings</th>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>National</th>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>ARWU[58]</th>\n",
	" <td> 3.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Forbes[59]</th>\n",
	" <td> 22.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>U.S. News & World Report[60]</th>\n",
	" <td> 20.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Washington Monthly[61]</th>\n",
	" <td> 5.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Global</th>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>ARWU[62]</th>\n",
	" <td> 3.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>QS[63]</th>\n",
	" <td> 25.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Times[64]</th>\n",
	" <td> 8.0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>10 rows \u00d7 1 columns</p>\n",
	"</div>"
	],
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 4,
	"text": [
	" 1\n",
	"0 \n",
	"University rankings nan\n",
	"National nan\n",
	"ARWU[58] 3.0\n",
	"Forbes[59] 22.0\n",
	"U.S. News & World Report[60] 20.0\n",
	"Washington Monthly[61] 5.0\n",
	"Global nan\n",
	"ARWU[62] 3.0\n",
	"QS[63] 25.0\n",
	"Times[64] 8.0\n",
	"\n",
	"[10 rows x 1 columns]"
	]
	}
	],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"wikitables[0]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" <th>3</th>\n",
	" <th>4</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>NaN</th>\n",
	" <td> Undergraduate</td>\n",
	" <td> Graduate</td>\n",
	" <td> California</td>\n",
	" <td> U.S. Census</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>African American</th>\n",
	" <td> 3.4%</td>\n",
	" <td> 3.4%</td>\n",
	" <td> 6.2%</td>\n",
	" <td> 12.0%</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Asian American</th>\n",
	" <td> 39.2%</td>\n",
	" <td> 17.8%</td>\n",
	" <td> 13%</td>\n",
	" <td> 4.7%</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>White American</th>\n",
	" <td> 29.0%</td>\n",
	" <td> 38.0%</td>\n",
	" <td> 40.1%</td>\n",
	" <td> 63.7%</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Hispanic American</th>\n",
	" <td> 12.6%</td>\n",
	" <td> 7.1%</td>\n",
	" <td> 37.6%</td>\n",
	" <td> 16.3%</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Native American</th>\n",
	" <td> 0.8%</td>\n",
	" <td> 1.1%</td>\n",
	" <td> 1%</td>\n",
	" <td> 0.7%</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>International student</th>\n",
	" <td> 10.4%</td>\n",
	" <td> 20.3%</td>\n",
	" <td> nan</td>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Other/Unknown</th>\n",
	" <td> 4.7%</td>\n",
	" <td> 11.9%</td>\n",
	" <td> 2.1%</td>\n",
	" <td> 2.4%</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>8 rows \u00d7 4 columns</p>\n",
	"</div>"
	],
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 5,
	"text": [
	" 1 2 3 4\n",
	"0 \n",
	"NaN Undergraduate Graduate California U.S. Census\n",
	"African American 3.4% 3.4% 6.2% 12.0%\n",
	"Asian American 39.2% 17.8% 13% 4.7%\n",
	"White American 29.0% 38.0% 40.1% 63.7%\n",
	"Hispanic American 12.6% 7.1% 37.6% 16.3%\n",
	"Native American 0.8% 1.1% 1% 0.7%\n",
	"International student 10.4% 20.3% nan nan\n",
	"Other/Unknown 4.7% 11.9% 2.1% 2.4%\n",
	"\n",
	"[8 rows x 4 columns]"
	]
	}
	],
	"prompt_number": 5
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"wikitables[1]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" <th>3</th>\n",
	" <th>4</th>\n",
	" <th>5</th>\n",
	" <th>6</th>\n",
	" <th>7</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>NaN</th>\n",
	" <td> 2014[99]</td>\n",
	" <td> 2013[100]</td>\n",
	" <td> 2012.0</td>\n",
	" <td> 2011.0</td>\n",
	" <td> 2010.0</td>\n",
	" <td> 2009.0</td>\n",
	" <td> 2008.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Applicants</th>\n",
	" <td> 73700</td>\n",
	" <td> 67665</td>\n",
	" <td> 61731.0</td>\n",
	" <td> 52966.0</td>\n",
	" <td> 50393.0</td>\n",
	" <td> 48650.0</td>\n",
	" <td> 48461.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Admits</th>\n",
	" <td> nan</td>\n",
	" <td> 14103</td>\n",
	" <td> 11130.0</td>\n",
	" <td> 11441.0</td>\n",
	" <td> 10844.0</td>\n",
	" <td> 10528.0</td>\n",
	" <td> 10474.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>% Admitted</th>\n",
	" <td> nan</td>\n",
	" <td> 20.8</td>\n",
	" <td> 18.0</td>\n",
	" <td> 21.6</td>\n",
	" <td> 21.5</td>\n",
	" <td> 21.6</td>\n",
	" <td> 21.6</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Enrolled</th>\n",
	" <td> nan</td>\n",
	" <td> nan</td>\n",
	" <td> 4162.0</td>\n",
	" <td> 4443.0</td>\n",
	" <td> 4109.0</td>\n",
	" <td> 4356.0</td>\n",
	" <td> 4261.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Average GPA</th>\n",
	" <td> nan</td>\n",
	" <td> nan</td>\n",
	" <td> 3.84</td>\n",
	" <td> 3.83</td>\n",
	" <td> 3.84</td>\n",
	" <td> 3.8</td>\n",
	" <td> 3.9</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>6 rows \u00d7 7 columns</p>\n",
	"</div>"
	],
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 6,
	"text": [
	" 1 2 3 4 5 6 7\n",
	"0 \n",
	"NaN 2014[99] 2013[100] 2012.0 2011.0 2010.0 2009.0 2008.0\n",
	"Applicants 73700 67665 61731.0 52966.0 50393.0 48650.0 48461.0\n",
	"Admits nan 14103 11130.0 11441.0 10844.0 10528.0 10474.0\n",
	"% Admitted nan 20.8 18.0 21.6 21.5 21.6 21.6\n",
	"Enrolled nan nan 4162.0 4443.0 4109.0 4356.0 4261.0\n",
	"Average GPA nan nan 3.84 3.83 3.84 3.8 3.9\n",
	"\n",
	"[6 rows x 7 columns]"
	]
	}
	],
	"prompt_number": 6
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The `infer_types=False` argument is needed to turn off automatic type inference for Pandas <0.14, otherwise if date-like text appears in the table the data type will automatically be inferred as a date for the whole column, not just the particular entry, resulting in a table full of `NaT`s for non-date entries. In version >=0.14 the `infer_types` argument will be removed so it will no longer cause this kind of problem."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"malformed = read_html(page, index_col=0, attrs={\"class\":\"infobox\"})\n",
	"malformed[0]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>1</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>University of California, Berkeley</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Unofficial seal of the University of California [1]</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Motto</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Motto in English</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Established</th>\n",
	" <td>1868-03-23</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Type</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Endowment</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Chancellor</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Students</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Undergraduates</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Postgraduates</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Location</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Campus</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Nobel Laureates</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Colors</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Athletics</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Nickname</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Mascot</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Affiliations</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Website</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>NaN</th>\n",
	" <td> NaT</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>21 rows \u00d7 1 columns</p>\n",
	"</div>"
	],
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 8,
	"text": [
	" 1\n",
	"0 \n",
	"University of California, Berkeley NaT\n",
	"Unofficial seal of the University of California [1] NaT\n",
	"Motto NaT\n",
	"Motto in English NaT\n",
	"Established 1868-03-23\n",
	"Type NaT\n",
	"Endowment NaT\n",
	"Chancellor NaT\n",
	"Students NaT\n",
	"Undergraduates NaT\n",
	"Postgraduates NaT\n",
	"Location NaT\n",
	"Campus NaT\n",
	"Nobel Laureates NaT\n",
	"Colors NaT\n",
	"Athletics NaT\n",
	"Nickname NaT\n",
	"Mascot NaT\n",
	"Affiliations NaT\n",
	"Website NaT\n",
	"NaN NaT\n",
	"\n",
	"[21 rows x 1 columns]"
	]
	}
	],
	"prompt_number": 8
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The `index_col=0` argument will turn the zeroth-column into a set of labels for the table rows which is what we want for infoboxes and some (but not all) wikitables.\n",
	"\n",
	"Using `index_col` means that we can then refer directly to the the entries by their label, e.g.:"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"infoboxes[0].xs(u'Motto').values[0]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 33,
	"text": [
	"u'Fiat lux (Latin)'"
	]
	}
	],
	"prompt_number": 33
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Leaving out the argument the labels will instead be a numeric index and the zeroth-column will be part of the data."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"no_lefthand_labels = read_html(page, infer_types=False, attrs={\"class\":\"infobox\"})\n",
	"no_lefthand_labels[0]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0 </th>\n",
	" <td> University of California, Berkeley</td>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1 </th>\n",
	" <td> Unofficial seal of the University of Californi...</td>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2 </th>\n",
	" <td> Motto</td>\n",
	" <td> Fiat lux (Latin)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3 </th>\n",
	" <td> Motto in English</td>\n",
	" <td> Let there be light</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4 </th>\n",
	" <td> Established</td>\n",
	" <td> March 23, 1868</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5 </th>\n",
	" <td> Type</td>\n",
	" <td> Flagship Public</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6 </th>\n",
	" <td> Endowment</td>\n",
	" <td> $3.33 billion (2013)[2]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7 </th>\n",
	" <td> Chancellor</td>\n",
	" <td> Nicholas Dirks</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8 </th>\n",
	" <td> Students</td>\n",
	" <td> 36,204 (Fall 2013)[3]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9 </th>\n",
	" <td> Undergraduates</td>\n",
	" <td> 25,951 (Fall 2013)[4]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>10</th>\n",
	" <td> Postgraduates</td>\n",
	" <td> 10,125 (Fall 2012)[5]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11</th>\n",
	" <td> Location</td>\n",
	" <td> Berkeley, California, United States</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>12</th>\n",
	" <td> Campus</td>\n",
	" <td> Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>13</th>\n",
	" <td> Nobel Laureates</td>\n",
	" <td> 72[8]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>14</th>\n",
	" <td> Colors</td>\n",
	" <td> Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>15</th>\n",
	" <td> Athletics</td>\n",
	" <td> 27 Varsity Teams NCAA Division I California Go...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>16</th>\n",
	" <td> Nickname</td>\n",
	" <td> Golden Bears</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>17</th>\n",
	" <td> Mascot</td>\n",
	" <td> Oski the Bear</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>18</th>\n",
	" <td> Affiliations</td>\n",
	" <td> AAU IARU Pacific-12 University of California</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>19</th>\n",
	" <td> Website</td>\n",
	" <td> Berkeley.edu</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>20</th>\n",
	" <td> nan</td>\n",
	" <td> nan</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>21 rows \u00d7 2 columns</p>\n",
	"</div>"
	],
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 11,
	"text": [
	" 0 \\\n",
	"0 University of California, Berkeley \n",
	"1 Unofficial seal of the University of Californi... \n",
	"2 Motto \n",
	"3 Motto in English \n",
	"4 Established \n",
	"5 Type \n",
	"6 Endowment \n",
	"7 Chancellor \n",
	"8 Students \n",
	"9 Undergraduates \n",
	"10 Postgraduates \n",
	"11 Location \n",
	"12 Campus \n",
	"13 Nobel Laureates \n",
	"14 Colors \n",
	"15 Athletics \n",
	"16 Nickname \n",
	"17 Mascot \n",
	"18 Affiliations \n",
	"19 Website \n",
	"20 nan \n",
	"\n",
	" 1 \n",
	"0 nan \n",
	"1 nan \n",
	"2 Fiat lux (Latin) \n",
	"3 Let there be light \n",
	"4 March 23, 1868 \n",
	"5 Flagship Public \n",
	"6 $3.33 billion (2013)[2] \n",
	"7 Nicholas Dirks \n",
	"8 36,204 (Fall 2013)[3] \n",
	"9 25,951 (Fall 2013)[4] \n",
	"10 10,125 (Fall 2012)[5] \n",
	"11 Berkeley, California, United States \n",
	"12 Urban Total 1,232 acres (499\u00a0ha) Core Campus 1... \n",
	"13 72[8] \n",
	"14 Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9] \n",
	"15 27 Varsity Teams NCAA Division I California Go... \n",
	"16 Golden Bears \n",
	"17 Oski the Bear \n",
	"18 AAU IARU Pacific-12 University of California \n",
	"19 Berkeley.edu \n",
	"20 nan \n",
	"\n",
	"[21 rows x 2 columns]"
	]
	}
	],
	"prompt_number": 11
	}
	],
	"metadata": {}
	}
	]
	}
	# -- coding: utf-8 --
	# <nbformat>3.0</nbformat>

	# <codecell>

	from pandas.io.html import read_html
	page = 'https://en.wikipedia.org/wiki/University_of_California,_Berkeley'
	infoboxes = read_html(page, index_col=0, infer_types=False, attrs={"class":"infobox"})
	wikitables = read_html(page, index_col=0, infer_types=False, attrs={"class":"wikitable"})

	print "Extracted {num} infoboxes".format(num=len(infoboxes))
	print "Extracted {num} wikitables".format(num=len(wikitables))

	# <codecell>

	infoboxes[0]

	# <codecell>

	infoboxes[1]

	# <codecell>

	wikitables[0]

	# <codecell>

	wikitables[1]

	# <markdowncell>

	# The `infer_types=False` argument is needed to turn off automatic type inference for Pandas <0.14, otherwise if date-like text appears in the table the data type will automatically be inferred as a date for the whole column, not just the particular entry, resulting in a table full of `NaT`s for non-date entries. In version >=0.14 the `infer_types` argument will be removed so it will no longer cause this kind of problem.

	# <codecell>

	malformed = read_html(page, index_col=0, attrs={"class":"infobox"})
	malformed[0]

	# <markdowncell>

	# The `index_col=0` argument will turn the zeroth-column into a set of labels for the table rows which is what we want for infoboxes and some (but not all) wikitables.
	#
	# Using `index_col` means that we can then refer directly to the the entries by their label, e.g.:

	# <codecell>

	infoboxes[0].xs(u'Motto').values[0]

	# <markdowncell>

	# Leaving out the argument the labels will instead be a numeric index and the zeroth-column will be part of the data.

	# <codecell>

	no_lefthand_labels = read_html(page, infer_types=False, attrs={"class":"infobox"})
	no_lefthand_labels[0]