Skip to content

Instantly share code, notes, and snippets.

@rjpower
Created March 1, 2017 18:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjpower/7b11a3197f1bff0d09bc8b984592ba76 to your computer and use it in GitHub Desktop.
Save rjpower/7b11a3197f1bff0d09bc8b984592ba76 to your computer and use it in GitHub Desktop.
wiki parsing example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Wikipedia Paragraph Conversion"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import tqdm\n",
"import subprocess\n",
"import elasticsearch\n",
"\n",
"%config ZMQInteractiveShell.cache_size = 0\n",
"WIKI_BZ2 = '/mnt/spin6TB/enwiki-20161001-pages-meta-current.xml.bz2'\n",
"ES = elasticsearch.Elasticsearch('localhost:9200')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import re\n",
"\n",
"TAG_RE = re.compile('{{[^}]+}}')\n",
"\n",
"FILE_LINK = re.compile('\\\\[\\\\[File:[^]]+\\\\]\\\\]')\n",
"\n",
"EXTERNAL_LINK = re.compile('\\\\[http[^]]+\\\\]') \n",
"EXTERNAL_REF = re.compile('<ref name[^/]+/>|<ref name[^<]*</ref>|<ref>[^<]*</ref>')\n",
"COMMENT = re.compile('<!--[^-]+-->|<comment>[^<]+</comment>')\n",
"HTML_REF = re.compile('[&][a-z0-9]{2,4};')\n",
"REFS = {\n",
" 'quot': '\"',\n",
" 'lt': '<',\n",
" 'gt': '>',\n",
" 'amp': '&',\n",
"}\n",
"\n",
"LINK_RES = [\n",
" re.compile(r'[[]{2}([^]]+)[]]{2}'),\n",
" re.compile(r'{{Main article[|]([^}]+)}}'),\n",
"]\n",
"\n",
"def replace_ref(ref):\n",
" name = ref.group(0)[1:-1]\n",
" if name in REFS:\n",
" return REFS[name]\n",
" return ''\n",
" \n",
"cleanups = [\n",
" lambda l: TAG_RE.sub('', l),\n",
" lambda l: EXTERNAL_LINK.sub('', l),\n",
" lambda l: HTML_REF.sub(replace_ref, l),\n",
" lambda l: FILE_LINK.sub('', l),\n",
" lambda l: EXTERNAL_REF.sub('', l),\n",
" lambda l: COMMENT.sub('', l),\n",
"]\n",
"\n",
"def outlinks(text):\n",
" refs = []\n",
" for regex in LINK_RES:\n",
" for ref in regex.findall(text): \n",
" ref = ref.split('|')[0]\n",
" refs.append(ref)\n",
" return refs\n",
" \n",
"\n",
"MIN_PARAGRAPH_LEN = 150\n",
"MIN_LINKS_PER_PARAGRAPH = 1\n",
"MAX_LINKS_PER_PARAGRAPH = 10\n",
"MAX_LINKS_PER_ARTICLE = 25\n",
"MIN_SENTENCES = 3\n",
"MIN_PARAGRAPHS_PER_ARTICLE = 3\n",
"MAX_PARAGRAPHS_PER_ARTICLE = 10"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import collections\n",
"\n",
"def bad_link(l):\n",
" if not l.strip():\n",
" return True\n",
" if ':' in l or 'List of' in l or 'Category' in l:\n",
" return True\n",
" if l[0] in '0123456789':\n",
" return True \n",
" return False\n",
" \n",
"def finish_article(title, lines):\n",
" if bad_link(title):\n",
" return\n",
"\n",
" links = collections.Counter()\n",
"\n",
" text = ''.join(lines)\n",
" for cleanup in cleanups:\n",
" text = cleanup(text)\n",
" lines = text.split('\\n')\n",
" links.update([link for link in outlinks(text) if not bad_link(link)])\n",
"\n",
" paragraph = []\n",
" paragraphs = []\n",
"\n",
" for line in lines:\n",
" if len(line.strip()) < 5 or '==' in line:\n",
" ptext = '\\n'.join(paragraph)\n",
" paragraph = []\n",
" \n",
" plinks = outlinks(ptext)\n",
" link_count = len(plinks)\n",
"\n",
" if ptext.count('.') >= MIN_SENTENCES\\\n",
" and len(ptext) >= MIN_PARAGRAPH_LEN\\\n",
" and link_count >= MIN_LINKS_PER_PARAGRAPH\\\n",
" and link_count <= MAX_LINKS_PER_PARAGRAPH:\n",
" paragraphs.append(ptext)\n",
" else:\n",
" paragraph.append(line)\n",
"\n",
" if len(paragraphs) < MIN_PARAGRAPHS_PER_ARTICLE:\n",
" return\n",
"\n",
" first_line = paragraphs[0:1]\n",
" paragraphs.sort(key=lambda l: len(l))\n",
" rest = paragraphs[-(MAX_PARAGRAPHS_PER_ARTICLE - 1):]\n",
"\n",
" all_links = list(links.items())\n",
" all_links.sort(key=lambda kv: kv[1])\n",
" top_links = [k for (k,v) in all_links[-MAX_LINKS_PER_ARTICLE:]]\n",
"\n",
" abstract = '\\n'.join(first_line + rest)\n",
"\n",
" return {'title': title, \n",
" 'abstract': abstract,\n",
" 'out_citations': top_links,\n",
" 'in_citations': [],\n",
" 'key_phrases': [],\n",
" 'authors': []\n",
" }\n",
" \n",
"LINES = None\n",
"def read_articles(limit=-1):\n",
" title = None\n",
"\n",
" lines = []\n",
" in_content = False\n",
"\n",
" with subprocess.Popen('bzcat %s' % WIKI_BZ2, shell=True, bufsize=32768, \n",
" stdout=subprocess.PIPE, close_fds=True) as p:\n",
" for i, line in enumerate(p.stdout):\n",
" line = line.decode('utf8')\n",
" if '<title>' in line:\n",
" title = line.replace('<title>', '').replace('</title>', '').strip()\n",
" \n",
" if '<text xml:space' in line:\n",
" line = line.replace('<text xml:space=\"preserve\">', '')\n",
" in_content = True\n",
" \n",
" if '</text' in line:\n",
" article = finish_article(title, lines)\n",
" if article:\n",
" yield article\n",
" \n",
" lines = []\n",
" in_content = False\n",
" if in_content:\n",
" lines.append(line)\n",
" else:\n",
" pass\n",
" \n",
"def write_records(count=-1):\n",
" with open('/data/citeomatic/wiki-paragraphs.json', 'w', encoding='utf8') as out:\n",
" for (i, page) in enumerate(tqdm.tqdm(read_articles())):\n",
" if i > count and count > 0:\n",
" break\n",
" \n",
" json.dump(page, out)\n",
" out.write('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"201845it [15:00, 222.11it/s], 42.91it/s]"
]
}
],
"source": [
"write_records()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"0it [00:00, ?it/s]101it [00:00, 11204.34it/s]\n",
"101it [00:00, 3793.81it/s]\n",
"101it [00:00, 7086.40it/s]\n",
"101it [00:00, 5760.86it/s]\n"
]
}
],
"source": [
"records = []\n",
"\n",
"with open('/data/citeomatic/wiki-paragraphs.json', 'r', encoding='utf8') as input:\n",
" for i, line in enumerate(tqdm.tqdm(input)):\n",
" records.append(json.loads(line))\n",
" \n",
"df = pd.DataFrame.from_records(records)\n",
"del records\n",
"\n",
"df.title = df.title.str.lower()\n",
"\n",
"title_to_idx = { k:i for (i,k) in enumerate(df.title) }\n",
"idx_to_title = { i:t for (t,i) in title_to_idx.items() }\n",
"\n",
"titles = set(df.title)\n",
"link_map = {}\n",
"\n",
"import collections\n",
"reverse_links = collections.defaultdict(list)\n",
"\n",
"for i, row in tqdm.tqdm(df.iterrows()):\n",
" out_citations = [o.lower() for o in row.out_citations]\n",
" out_citations = [o for o in row.out_citations if o in titles]\n",
" links = np.asarray([title_to_idx[r] for r in out_citations])\n",
" row.out_citations = links\n",
"\n",
"# invert the link map\n",
"for i, row in tqdm.tqdm(df.iterrows()):\n",
" for j in row.out_citations:\n",
" reverse_links[j].append(i)\n",
" \n",
"for i, row in tqdm.tqdm(df.iterrows()):\n",
" row.in_citations = reverse_links[title_to_idx[row.title]]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>abstract</th>\n",
" <th>authors</th>\n",
" <th>in_citations</th>\n",
" <th>key_phrases</th>\n",
" <th>out_citations</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'''Anarchism''' is a [[political philosophy]] ...</td>\n",
" <td>[]</td>\n",
" <td>[15]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>anarchism</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>'''Autism''' is a [[neurodevelopmental disorde...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>autism</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>'''Albedo''' () is the \"whiteness\" of a surfac...</td>\n",
" <td>[]</td>\n",
" <td>[100]</td>\n",
" <td>[]</td>\n",
" <td>[100]</td>\n",
" <td>albedo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>'''A''' ([[English alphabet#Letter names|named...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[53]</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>From the [[American Civil War]] until [[World ...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>alabama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Achilles’ most notable feat during the Trojan ...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>achilles</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>'''Abraham Lincoln''' (; February 12, 1809&amp;nbs...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>abraham lincoln</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Teaching Alexander the Great gave Aristotle ma...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>aristotle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Gershwin composed ''An American in Paris'' on ...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>an american in paris</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>The '''Academy Award for Best Production Desig...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>academy award for best production design</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>The '''Academy Awards''', or \"'''Oscars'''\", i...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>academy awards</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>'''''Actresses''''' ([[Catalan language|Catala...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>actrius</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>&lt;!----&gt;\\n'''''Animalia''''' is an illustrated ...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[53]</td>\n",
" <td>animalia (book)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>'''International Atomic Time''' ('''TAI'...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>international atomic time</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>to the poor is often considered an altruistic...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>altruism</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>'''Ayn Rand''' (; born '''Alisa Zinov'yevna Ro...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[0]</td>\n",
" <td>ayn rand</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Born '''Joseph Aloysius Dwan''' in [[Toronto|T...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>allan dwan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>}}}}&lt;br&gt;}}}}''\\n|common_name = Algeria\\n|i...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>algeria</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>'''Anthropology''' is the study of various asp...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>anthropology</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>With the exception of [[theoretical production...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>agricultural science</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>The word alchemy was borrowed from [[Old Frenc...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>alchemy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>[[Image:Galileo.arp.300pix.jpg|left|thumb|upri...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[83]</td>\n",
" <td>astronomer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>'''ASCII''' ( ), abbreviated from '''American ...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>ascii</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>'''Animation''' is the process of making the [...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>animation</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>The etymology of the name is uncertain. The sp...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>apollo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>'''Andre Kirk Agassi''' (; born April 29, 1970...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>andre agassi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>''[[Ethnologue]]'' identifies 168 Austroasiati...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>austroasiatic languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>'''Afroasiatic''' ('''Afro-Asiatic'''), also k...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>afroasiatic languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>Andorra is the [[European microstates|sixth-sm...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>andorra</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>In [[mathematics]] and [[statistics]], the '''...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>arithmetic mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>The '''Economy of Angola''' is one of the fast...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[34]</td>\n",
" <td>economy of angola</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>The FAA succeeded to the previous [[Armed Forc...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>angolan armed forces</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>The '''foreign relations of Angola''' are base...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>foreign relations of angola</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>'''Albert Sidney Johnston''' (February 2, 1803...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>albert sidney johnston</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>An '''android''' is a [[humanoid robot]] or [[...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>android (robot)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>'''Alberta''' () is a western [[provinces and ...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>alberta</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>The ray-finned [[fish]]es are so called becaus...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>actinopterygii</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>'''Albert Einstein''' (; ; 14 March 1879&amp;nbsp;...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>albert einstein</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>The political history of the modern state of A...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>afghanistan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80</th>\n",
" <td>'''Albania''' (, ; ; &lt;/ref&gt;}}), officially the...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>albania</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81</th>\n",
" <td>&lt;br /&gt;2. [[Hamza#Hamzat waṣl|hamzat waṣl]] (هم...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>allah</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82</th>\n",
" <td>'''Azerbaijan''' ( ; ), officially the '''Repu...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>azerbaijan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
" <td>Most amateur astronomers work at [[visible spe...</td>\n",
" <td>[]</td>\n",
" <td>[21]</td>\n",
" <td>[]</td>\n",
" <td>[100]</td>\n",
" <td>amateur astronomy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>is a [[gendai budō|modern]] [[Japanese marti...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>aikido</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>The oldest documented forms of art are [[visua...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>art</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>'''Agnostida''' is an [[order (biology)|order]...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>agnostida</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>'''Abortion''' is the ending of [[pregnancy]] ...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>abortion</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>|commander1= [[George Washington]]&lt;br /&gt;\\n [[N...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>american revolutionary war</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>The ampere is equivalent to one [[coulomb]] (r...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>ampere</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>[[Image:Euclid flowchart.svg|thumb|lright| [[F...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>algorithm</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>[[Image:Doperwt rijserwt peulen Pisum sativum....</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>annual plant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>Usually mouthwashes are an [[antiseptic]] solu...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>mouthwash</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>Seeking to reach the \"ends of the world and th...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>alexander the great</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>'''Alfred Habdank Skarbek Korzybski''' (; July...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>alfred korzybski</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>'''''Asteroids''''' is an arcade space shooter...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>asteroids (video game)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>Thus although most species in the order are [[...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>asparagales</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>'') of [[Araceae]] family in [[Crete]], [[Gree...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>alismatales</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>The '''Apiales''' are an [[Order (biology)|ord...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>apiales</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>Asterales are organisms that seem to have evol...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>asterales</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>'''Asteroids''' are [[minor planet]]s, especia...</td>\n",
" <td>[]</td>\n",
" <td>[2, 83]</td>\n",
" <td>[]</td>\n",
" <td>[2]</td>\n",
" <td>asteroid</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>101 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" abstract authors in_citations \\\n",
"0 '''Anarchism''' is a [[political philosophy]] ... [] [15] \n",
"1 '''Autism''' is a [[neurodevelopmental disorde... [] [] \n",
"2 '''Albedo''' () is the \"whiteness\" of a surfac... [] [100] \n",
"3 '''A''' ([[English alphabet#Letter names|named... [] [] \n",
"4 From the [[American Civil War]] until [[World ... [] [] \n",
"5 Achilles’ most notable feat during the Trojan ... [] [] \n",
"6 '''Abraham Lincoln''' (; February 12, 1809&nbs... [] [] \n",
"7 Teaching Alexander the Great gave Aristotle ma... [] [] \n",
"8 Gershwin composed ''An American in Paris'' on ... [] [] \n",
"9 The '''Academy Award for Best Production Desig... [] [] \n",
"10 The '''Academy Awards''', or \"'''Oscars'''\", i... [] [] \n",
"11 '''''Actresses''''' ([[Catalan language|Catala... [] [] \n",
"12 <!---->\\n'''''Animalia''''' is an illustrated ... [] [] \n",
"13 '''International Atomic Time''' ('''TAI'... [] [] \n",
"14 to the poor is often considered an altruistic... [] [] \n",
"15 '''Ayn Rand''' (; born '''Alisa Zinov'yevna Ro... [] [] \n",
"16 Born '''Joseph Aloysius Dwan''' in [[Toronto|T... [] [] \n",
"17 }}}}<br>}}}}''\\n|common_name = Algeria\\n|i... [] [] \n",
"18 '''Anthropology''' is the study of various asp... [] [] \n",
"19 With the exception of [[theoretical production... [] [] \n",
"20 The word alchemy was borrowed from [[Old Frenc... [] [] \n",
"21 [[Image:Galileo.arp.300pix.jpg|left|thumb|upri... [] [] \n",
"22 '''ASCII''' ( ), abbreviated from '''American ... [] [] \n",
"23 '''Animation''' is the process of making the [... [] [] \n",
"24 The etymology of the name is uncertain. The sp... [] [] \n",
"25 '''Andre Kirk Agassi''' (; born April 29, 1970... [] [] \n",
"26 ''[[Ethnologue]]'' identifies 168 Austroasiati... [] [] \n",
"27 '''Afroasiatic''' ('''Afro-Asiatic'''), also k... [] [] \n",
"28 Andorra is the [[European microstates|sixth-sm... [] [] \n",
"29 In [[mathematics]] and [[statistics]], the '''... [] [] \n",
".. ... ... ... \n",
"71 The '''Economy of Angola''' is one of the fast... [] [] \n",
"72 The FAA succeeded to the previous [[Armed Forc... [] [] \n",
"73 The '''foreign relations of Angola''' are base... [] [] \n",
"74 '''Albert Sidney Johnston''' (February 2, 1803... [] [] \n",
"75 An '''android''' is a [[humanoid robot]] or [[... [] [] \n",
"76 '''Alberta''' () is a western [[provinces and ... [] [] \n",
"77 The ray-finned [[fish]]es are so called becaus... [] [] \n",
"78 '''Albert Einstein''' (; ; 14 March 1879&nbsp;... [] [] \n",
"79 The political history of the modern state of A... [] [] \n",
"80 '''Albania''' (, ; ; </ref>}}), officially the... [] [] \n",
"81 <br />2. [[Hamza#Hamzat waṣl|hamzat waṣl]] (هم... [] [] \n",
"82 '''Azerbaijan''' ( ; ), officially the '''Repu... [] [] \n",
"83 Most amateur astronomers work at [[visible spe... [] [21] \n",
"84 is a [[gendai budō|modern]] [[Japanese marti... [] [] \n",
"85 The oldest documented forms of art are [[visua... [] [] \n",
"86 '''Agnostida''' is an [[order (biology)|order]... [] [] \n",
"87 '''Abortion''' is the ending of [[pregnancy]] ... [] [] \n",
"88 |commander1= [[George Washington]]<br />\\n [[N... [] [] \n",
"89 The ampere is equivalent to one [[coulomb]] (r... [] [] \n",
"90 [[Image:Euclid flowchart.svg|thumb|lright| [[F... [] [] \n",
"91 [[Image:Doperwt rijserwt peulen Pisum sativum.... [] [] \n",
"92 Usually mouthwashes are an [[antiseptic]] solu... [] [] \n",
"93 Seeking to reach the \"ends of the world and th... [] [] \n",
"94 '''Alfred Habdank Skarbek Korzybski''' (; July... [] [] \n",
"95 '''''Asteroids''''' is an arcade space shooter... [] [] \n",
"96 Thus although most species in the order are [[... [] [] \n",
"97 '') of [[Araceae]] family in [[Crete]], [[Gree... [] [] \n",
"98 The '''Apiales''' are an [[Order (biology)|ord... [] [] \n",
"99 Asterales are organisms that seem to have evol... [] [] \n",
"100 '''Asteroids''' are [[minor planet]]s, especia... [] [2, 83] \n",
"\n",
" key_phrases out_citations title \n",
"0 [] [] anarchism \n",
"1 [] [] autism \n",
"2 [] [100] albedo \n",
"3 [] [53] a \n",
"4 [] [] alabama \n",
"5 [] [] achilles \n",
"6 [] [] abraham lincoln \n",
"7 [] [] aristotle \n",
"8 [] [] an american in paris \n",
"9 [] [] academy award for best production design \n",
"10 [] [] academy awards \n",
"11 [] [] actrius \n",
"12 [] [53] animalia (book) \n",
"13 [] [] international atomic time \n",
"14 [] [] altruism \n",
"15 [] [0] ayn rand \n",
"16 [] [] allan dwan \n",
"17 [] [] algeria \n",
"18 [] [] anthropology \n",
"19 [] [] agricultural science \n",
"20 [] [] alchemy \n",
"21 [] [83] astronomer \n",
"22 [] [] ascii \n",
"23 [] [] animation \n",
"24 [] [] apollo \n",
"25 [] [] andre agassi \n",
"26 [] [] austroasiatic languages \n",
"27 [] [] afroasiatic languages \n",
"28 [] [] andorra \n",
"29 [] [] arithmetic mean \n",
".. ... ... ... \n",
"71 [] [34] economy of angola \n",
"72 [] [] angolan armed forces \n",
"73 [] [] foreign relations of angola \n",
"74 [] [] albert sidney johnston \n",
"75 [] [] android (robot) \n",
"76 [] [] alberta \n",
"77 [] [] actinopterygii \n",
"78 [] [] albert einstein \n",
"79 [] [] afghanistan \n",
"80 [] [] albania \n",
"81 [] [] allah \n",
"82 [] [] azerbaijan \n",
"83 [] [100] amateur astronomy \n",
"84 [] [] aikido \n",
"85 [] [] art \n",
"86 [] [] agnostida \n",
"87 [] [] abortion \n",
"88 [] [] american revolutionary war \n",
"89 [] [] ampere \n",
"90 [] [] algorithm \n",
"91 [] [] annual plant \n",
"92 [] [] mouthwash \n",
"93 [] [] alexander the great \n",
"94 [] [] alfred korzybski \n",
"95 [] [] asteroids (video game) \n",
"96 [] [] asparagales \n",
"97 [] [] alismatales \n",
"98 [] [] apiales \n",
"99 [] [] asterales \n",
"100 [] [2] asteroid \n",
"\n",
"[101 rows x 6 columns]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "SSH s2-server1 Python 3.5",
"language": "",
"name": "rik_ssh_s2_server1_python35"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment