Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save leotop/40af49b1a4196358f8192147612e1c20 to your computer and use it in GitHub Desktop.
Save leotop/40af49b1a4196358f8192147612e1c20 to your computer and use it in GitHub Desktop.
How to extract Wikipedia infoboxes and wikitables using Pandas
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pandas.io.html import read_html\n",
"page = 'https://en.wikipedia.org/wiki/University_of_California,_Berkeley'\n",
"infoboxes = read_html(page, index_col=0, infer_types=False, attrs={\"class\":\"infobox\"})\n",
"wikitables = read_html(page, index_col=0, infer_types=False, attrs={\"class\":\"wikitable\"})\n",
"\n",
"print \"Extracted {num} infoboxes\".format(num=len(infoboxes))\n",
"print \"Extracted {num} wikitables\".format(num=len(wikitables))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Extracted 2 infoboxes\n",
"Extracted 2 wikitables\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"infoboxes[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>University of California, Berkeley</th>\n",
" <td> nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Unofficial seal of the University of California [1]</th>\n",
" <td> nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Motto</th>\n",
" <td> Fiat lux (Latin)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Motto in English</th>\n",
" <td> Let there be light</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Established</th>\n",
" <td> March 23, 1868</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Type</th>\n",
" <td> Flagship Public</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Endowment</th>\n",
" <td> $3.33 billion (2013)[2]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Chancellor</th>\n",
" <td> Nicholas Dirks</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Students</th>\n",
" <td> 36,204 (Fall 2013)[3]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Undergraduates</th>\n",
" <td> 25,951 (Fall 2013)[4]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Postgraduates</th>\n",
" <td> 10,125 (Fall 2012)[5]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Location</th>\n",
" <td> Berkeley, California, United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Campus</th>\n",
" <td> Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Nobel Laureates</th>\n",
" <td> 72[8]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Colors</th>\n",
" <td> Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Athletics</th>\n",
" <td> 27 Varsity Teams NCAA Division I California Go...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Nickname</th>\n",
" <td> Golden Bears</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mascot</th>\n",
" <td> Oski the Bear</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Affiliations</th>\n",
" <td> AAU IARU Pacific-12 University of California</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Website</th>\n",
" <td> Berkeley.edu</td>\n",
" </tr>\n",
" <tr>\n",
" <th>NaN</th>\n",
" <td> nan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>21 rows \u00d7 1 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
" 1\n",
"0 \n",
"University of California, Berkeley nan\n",
"Unofficial seal of the University of California [1] nan\n",
"Motto Fiat lux (Latin)\n",
"Motto in English Let there be light\n",
"Established March 23, 1868\n",
"Type Flagship Public\n",
"Endowment $3.33 billion (2013)[2]\n",
"Chancellor Nicholas Dirks\n",
"Students 36,204 (Fall 2013)[3]\n",
"Undergraduates 25,951 (Fall 2013)[4]\n",
"Postgraduates 10,125 (Fall 2012)[5]\n",
"Location Berkeley, California, United States\n",
"Campus Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...\n",
"Nobel Laureates 72[8]\n",
"Colors Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]\n",
"Athletics 27 Varsity Teams NCAA Division I California Go...\n",
"Nickname Golden Bears\n",
"Mascot Oski the Bear\n",
"Affiliations AAU IARU Pacific-12 University of California\n",
"Website Berkeley.edu\n",
"NaN nan\n",
"\n",
"[21 rows x 1 columns]"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"infoboxes[1]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>University rankings</th>\n",
" <td> nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>National</th>\n",
" <td> nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ARWU[58]</th>\n",
" <td> 3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Forbes[59]</th>\n",
" <td> 22.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>U.S. News &amp; World Report[60]</th>\n",
" <td> 20.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Washington Monthly[61]</th>\n",
" <td> 5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Global</th>\n",
" <td> nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ARWU[62]</th>\n",
" <td> 3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>QS[63]</th>\n",
" <td> 25.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Times[64]</th>\n",
" <td> 8.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows \u00d7 1 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
" 1\n",
"0 \n",
"University rankings nan\n",
"National nan\n",
"ARWU[58] 3.0\n",
"Forbes[59] 22.0\n",
"U.S. News & World Report[60] 20.0\n",
"Washington Monthly[61] 5.0\n",
"Global nan\n",
"ARWU[62] 3.0\n",
"QS[63] 25.0\n",
"Times[64] 8.0\n",
"\n",
"[10 rows x 1 columns]"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"wikitables[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>NaN</th>\n",
" <td> Undergraduate</td>\n",
" <td> Graduate</td>\n",
" <td> California</td>\n",
" <td> U.S. Census</td>\n",
" </tr>\n",
" <tr>\n",
" <th>African American</th>\n",
" <td> 3.4%</td>\n",
" <td> 3.4%</td>\n",
" <td> 6.2%</td>\n",
" <td> 12.0%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Asian American</th>\n",
" <td> 39.2%</td>\n",
" <td> 17.8%</td>\n",
" <td> 13%</td>\n",
" <td> 4.7%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>White American</th>\n",
" <td> 29.0%</td>\n",
" <td> 38.0%</td>\n",
" <td> 40.1%</td>\n",
" <td> 63.7%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Hispanic American</th>\n",
" <td> 12.6%</td>\n",
" <td> 7.1%</td>\n",
" <td> 37.6%</td>\n",
" <td> 16.3%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Native American</th>\n",
" <td> 0.8%</td>\n",
" <td> 1.1%</td>\n",
" <td> 1%</td>\n",
" <td> 0.7%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>International student</th>\n",
" <td> 10.4%</td>\n",
" <td> 20.3%</td>\n",
" <td> nan</td>\n",
" <td> nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Other/Unknown</th>\n",
" <td> 4.7%</td>\n",
" <td> 11.9%</td>\n",
" <td> 2.1%</td>\n",
" <td> 2.4%</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows \u00d7 4 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
" 1 2 3 4\n",
"0 \n",
"NaN Undergraduate Graduate California U.S. Census\n",
"African American 3.4% 3.4% 6.2% 12.0%\n",
"Asian American 39.2% 17.8% 13% 4.7%\n",
"White American 29.0% 38.0% 40.1% 63.7%\n",
"Hispanic American 12.6% 7.1% 37.6% 16.3%\n",
"Native American 0.8% 1.1% 1% 0.7%\n",
"International student 10.4% 20.3% nan nan\n",
"Other/Unknown 4.7% 11.9% 2.1% 2.4%\n",
"\n",
"[8 rows x 4 columns]"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"wikitables[1]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>NaN</th>\n",
" <td> 2014[99]</td>\n",
" <td> 2013[100]</td>\n",
" <td> 2012.0</td>\n",
" <td> 2011.0</td>\n",
" <td> 2010.0</td>\n",
" <td> 2009.0</td>\n",
" <td> 2008.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Applicants</th>\n",
" <td> 73700</td>\n",
" <td> 67665</td>\n",
" <td> 61731.0</td>\n",
" <td> 52966.0</td>\n",
" <td> 50393.0</td>\n",
" <td> 48650.0</td>\n",
" <td> 48461.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Admits</th>\n",
" <td> nan</td>\n",
" <td> 14103</td>\n",
" <td> 11130.0</td>\n",
" <td> 11441.0</td>\n",
" <td> 10844.0</td>\n",
" <td> 10528.0</td>\n",
" <td> 10474.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>% Admitted</th>\n",
" <td> nan</td>\n",
" <td> 20.8</td>\n",
" <td> 18.0</td>\n",
" <td> 21.6</td>\n",
" <td> 21.5</td>\n",
" <td> 21.6</td>\n",
" <td> 21.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Enrolled</th>\n",
" <td> nan</td>\n",
" <td> nan</td>\n",
" <td> 4162.0</td>\n",
" <td> 4443.0</td>\n",
" <td> 4109.0</td>\n",
" <td> 4356.0</td>\n",
" <td> 4261.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Average GPA</th>\n",
" <td> nan</td>\n",
" <td> nan</td>\n",
" <td> 3.84</td>\n",
" <td> 3.83</td>\n",
" <td> 3.84</td>\n",
" <td> 3.8</td>\n",
" <td> 3.9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6 rows \u00d7 7 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
" 1 2 3 4 5 6 7\n",
"0 \n",
"NaN 2014[99] 2013[100] 2012.0 2011.0 2010.0 2009.0 2008.0\n",
"Applicants 73700 67665 61731.0 52966.0 50393.0 48650.0 48461.0\n",
"Admits nan 14103 11130.0 11441.0 10844.0 10528.0 10474.0\n",
"% Admitted nan 20.8 18.0 21.6 21.5 21.6 21.6\n",
"Enrolled nan nan 4162.0 4443.0 4109.0 4356.0 4261.0\n",
"Average GPA nan nan 3.84 3.83 3.84 3.8 3.9\n",
"\n",
"[6 rows x 7 columns]"
]
}
],
"prompt_number": 6
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `infer_types=False` argument is needed to turn off automatic type inference for Pandas <0.14, otherwise if date-like text appears in the table the data type will automatically be inferred as a date for the whole column, not just the particular entry, resulting in a table full of `NaT`s for non-date entries. In version >=0.14 the `infer_types` argument will be removed so it will no longer cause this kind of problem."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"malformed = read_html(page, index_col=0, attrs={\"class\":\"infobox\"})\n",
"malformed[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>University of California, Berkeley</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Unofficial seal of the University of California [1]</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Motto</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Motto in English</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Established</th>\n",
" <td>1868-03-23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Type</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Endowment</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Chancellor</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Students</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Undergraduates</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Postgraduates</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Location</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Campus</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Nobel Laureates</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Colors</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Athletics</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Nickname</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mascot</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Affiliations</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Website</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>NaN</th>\n",
" <td> NaT</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>21 rows \u00d7 1 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
" 1\n",
"0 \n",
"University of California, Berkeley NaT\n",
"Unofficial seal of the University of California [1] NaT\n",
"Motto NaT\n",
"Motto in English NaT\n",
"Established 1868-03-23\n",
"Type NaT\n",
"Endowment NaT\n",
"Chancellor NaT\n",
"Students NaT\n",
"Undergraduates NaT\n",
"Postgraduates NaT\n",
"Location NaT\n",
"Campus NaT\n",
"Nobel Laureates NaT\n",
"Colors NaT\n",
"Athletics NaT\n",
"Nickname NaT\n",
"Mascot NaT\n",
"Affiliations NaT\n",
"Website NaT\n",
"NaN NaT\n",
"\n",
"[21 rows x 1 columns]"
]
}
],
"prompt_number": 8
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `index_col=0` argument will turn the zeroth-column into a set of labels for the table rows which is what we want for infoboxes and some (but not all) wikitables.\n",
"\n",
"Using `index_col` means that we can then refer directly to the the entries by their label, e.g.:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"infoboxes[0].xs(u'Motto').values[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 33,
"text": [
"u'Fiat lux (Latin)'"
]
}
],
"prompt_number": 33
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Leaving out the argument the labels will instead be a numeric index and the zeroth-column will be part of the data."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"no_lefthand_labels = read_html(page, infer_types=False, attrs={\"class\":\"infobox\"})\n",
"no_lefthand_labels[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0 </th>\n",
" <td> University of California, Berkeley</td>\n",
" <td> nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1 </th>\n",
" <td> Unofficial seal of the University of Californi...</td>\n",
" <td> nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2 </th>\n",
" <td> Motto</td>\n",
" <td> Fiat lux (Latin)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3 </th>\n",
" <td> Motto in English</td>\n",
" <td> Let there be light</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4 </th>\n",
" <td> Established</td>\n",
" <td> March 23, 1868</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5 </th>\n",
" <td> Type</td>\n",
" <td> Flagship Public</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6 </th>\n",
" <td> Endowment</td>\n",
" <td> $3.33 billion (2013)[2]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7 </th>\n",
" <td> Chancellor</td>\n",
" <td> Nicholas Dirks</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8 </th>\n",
" <td> Students</td>\n",
" <td> 36,204 (Fall 2013)[3]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9 </th>\n",
" <td> Undergraduates</td>\n",
" <td> 25,951 (Fall 2013)[4]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td> Postgraduates</td>\n",
" <td> 10,125 (Fall 2012)[5]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td> Location</td>\n",
" <td> Berkeley, California, United States</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td> Campus</td>\n",
" <td> Urban Total 1,232 acres (499\u00a0ha) Core Campus 1...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td> Nobel Laureates</td>\n",
" <td> 72[8]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td> Colors</td>\n",
" <td> Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td> Athletics</td>\n",
" <td> 27 Varsity Teams NCAA Division I California Go...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td> Nickname</td>\n",
" <td> Golden Bears</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td> Mascot</td>\n",
" <td> Oski the Bear</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td> Affiliations</td>\n",
" <td> AAU IARU Pacific-12 University of California</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td> Website</td>\n",
" <td> Berkeley.edu</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td> nan</td>\n",
" <td> nan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>21 rows \u00d7 2 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
" 0 \\\n",
"0 University of California, Berkeley \n",
"1 Unofficial seal of the University of Californi... \n",
"2 Motto \n",
"3 Motto in English \n",
"4 Established \n",
"5 Type \n",
"6 Endowment \n",
"7 Chancellor \n",
"8 Students \n",
"9 Undergraduates \n",
"10 Postgraduates \n",
"11 Location \n",
"12 Campus \n",
"13 Nobel Laureates \n",
"14 Colors \n",
"15 Athletics \n",
"16 Nickname \n",
"17 Mascot \n",
"18 Affiliations \n",
"19 Website \n",
"20 nan \n",
"\n",
" 1 \n",
"0 nan \n",
"1 nan \n",
"2 Fiat lux (Latin) \n",
"3 Let there be light \n",
"4 March 23, 1868 \n",
"5 Flagship Public \n",
"6 $3.33 billion (2013)[2] \n",
"7 Nicholas Dirks \n",
"8 36,204 (Fall 2013)[3] \n",
"9 25,951 (Fall 2013)[4] \n",
"10 10,125 (Fall 2012)[5] \n",
"11 Berkeley, California, United States \n",
"12 Urban Total 1,232 acres (499\u00a0ha) Core Campus 1... \n",
"13 72[8] \n",
"14 Berkeley Blue \u00a0\u00a0\u00a0\u00a0 California Gold[9] \n",
"15 27 Varsity Teams NCAA Division I California Go... \n",
"16 Golden Bears \n",
"17 Oski the Bear \n",
"18 AAU IARU Pacific-12 University of California \n",
"19 Berkeley.edu \n",
"20 nan \n",
"\n",
"[21 rows x 2 columns]"
]
}
],
"prompt_number": 11
}
],
"metadata": {}
}
]
}
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <codecell>
from pandas.io.html import read_html
page = 'https://en.wikipedia.org/wiki/University_of_California,_Berkeley'
infoboxes = read_html(page, index_col=0, infer_types=False, attrs={"class":"infobox"})
wikitables = read_html(page, index_col=0, infer_types=False, attrs={"class":"wikitable"})
print "Extracted {num} infoboxes".format(num=len(infoboxes))
print "Extracted {num} wikitables".format(num=len(wikitables))
# <codecell>
infoboxes[0]
# <codecell>
infoboxes[1]
# <codecell>
wikitables[0]
# <codecell>
wikitables[1]
# <markdowncell>
# The `infer_types=False` argument is needed to turn off automatic type inference for Pandas <0.14, otherwise if date-like text appears in the table the data type will automatically be inferred as a date for the whole column, not just the particular entry, resulting in a table full of `NaT`s for non-date entries. In version >=0.14 the `infer_types` argument will be removed so it will no longer cause this kind of problem.
# <codecell>
malformed = read_html(page, index_col=0, attrs={"class":"infobox"})
malformed[0]
# <markdowncell>
# The `index_col=0` argument will turn the zeroth-column into a set of labels for the table rows which is what we want for infoboxes and some (but not all) wikitables.
#
# Using `index_col` means that we can then refer directly to the the entries by their label, e.g.:
# <codecell>
infoboxes[0].xs(u'Motto').values[0]
# <markdowncell>
# Leaving out the argument the labels will instead be a numeric index and the zeroth-column will be part of the data.
# <codecell>
no_lefthand_labels = read_html(page, infer_types=False, attrs={"class":"infobox"})
no_lefthand_labels[0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment