Created
March 1, 2017 18:45
-
-
Save rjpower/7b11a3197f1bff0d09bc8b984592ba76 to your computer and use it in GitHub Desktop.
wiki parsing example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Wikipedia Paragraph Conversion" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import os\n", | |
"import tqdm\n", | |
"import subprocess\n", | |
"import elasticsearch\n", | |
"\n", | |
"%config ZMQInteractiveShell.cache_size = 0\n", | |
"WIKI_BZ2 = '/mnt/spin6TB/enwiki-20161001-pages-meta-current.xml.bz2'\n", | |
"ES = elasticsearch.Elasticsearch('localhost:9200')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"\n", | |
"# Patterns used to strip wiki markup from article text (see `cleanups`) and to\n", | |
"# extract link targets (see `outlinks`).\n", | |
"\n", | |
"# {{template}} calls. A match runs from '{{' to the first '}}', so nested\n", | |
"# templates leave residue behind.\n", | |
"TAG_RE = re.compile('{{[^}]+}}')\n", | |
"\n", | |
"# [[File:...]] and [[Image:...]] embeds ('Image' is an alias of the 'File'\n", | |
"# namespace). A caption that itself contains a [[link]] is only removed up to\n", | |
"# the first ']]'.\n", | |
"FILE_LINK = re.compile(r'\\[\\[(?:File|Image):[^]]+\\]\\]')\n", | |
"\n", | |
"# [http... label] external links.\n", | |
"EXTERNAL_LINK = re.compile(r'\\[http[^]]+\\]')\n", | |
"# <ref name=.../>, <ref name=...>...</ref> and <ref>...</ref> footnotes.\n", | |
"EXTERNAL_REF = re.compile('<ref name[^/]+/>|<ref name[^<]*</ref>|<ref>[^<]*</ref>')\n", | |
"# HTML comments and <comment>...</comment> elements. '.*?' with DOTALL also\n", | |
"# matches empty comments ('<!---->') and comments containing '-', which the\n", | |
"# previous '[^-]+' body could not.\n", | |
"COMMENT = re.compile(r'<!--.*?-->|<comment>[^<]+</comment>', re.DOTALL)\n", | |
"# Character entities such as &quot; or &amp;.\n", | |
"HTML_REF = re.compile('[&][a-z0-9]{2,4};')\n", | |
"# Entities that replace_ref expands; every other matched entity is removed.\n", | |
"REFS = {\n", | |
"    'quot': '\"',\n", | |
"    'lt': '<',\n", | |
"    'gt': '>',\n", | |
"    'amp': '&',\n", | |
"}\n", | |
"\n", | |
"# Patterns whose single group captures a link target, optionally followed by\n", | |
"# '|label'.\n", | |
"LINK_RES = [\n", | |
"    re.compile(r'[[]{2}([^]]+)[]]{2}'),\n", | |
"    re.compile(r'{{Main article[|]([^}]+)}}'),\n", | |
"]\n", | |
"\n", | |
"def replace_ref(ref):\n", | |
"    \"\"\"Substitution callback for HTML_REF: expand a known entity, drop any other.\n", | |
"\n", | |
"    `ref` is the regex match of an entity such as '&amp;'. Its leading '&' and\n", | |
"    trailing ';' are cut off to form the REFS lookup key; entities that are not\n", | |
"    listed in REFS are replaced by the empty string.\n", | |
"    \"\"\"\n", | |
"    entity = ref.group(0)[1:-1]\n", | |
"    return REFS.get(entity, '')\n", | |
" \n", | |
"def _strip_templates(l):\n", | |
"    \"\"\"Delete every TAG_RE match.\"\"\"\n", | |
"    return TAG_RE.sub('', l)\n", | |
"\n", | |
"def _strip_external_links(l):\n", | |
"    \"\"\"Delete every EXTERNAL_LINK match.\"\"\"\n", | |
"    return EXTERNAL_LINK.sub('', l)\n", | |
"\n", | |
"def _expand_entities(l):\n", | |
"    \"\"\"Rewrite every HTML_REF match through replace_ref.\"\"\"\n", | |
"    return HTML_REF.sub(replace_ref, l)\n", | |
"\n", | |
"def _strip_file_links(l):\n", | |
"    \"\"\"Delete every FILE_LINK match.\"\"\"\n", | |
"    return FILE_LINK.sub('', l)\n", | |
"\n", | |
"def _strip_refs(l):\n", | |
"    \"\"\"Delete every EXTERNAL_REF match.\"\"\"\n", | |
"    return EXTERNAL_REF.sub('', l)\n", | |
"\n", | |
"def _strip_comments(l):\n", | |
"    \"\"\"Delete every COMMENT match.\"\"\"\n", | |
"    return COMMENT.sub('', l)\n", | |
"\n", | |
"# Clean-up passes; finish_article applies them to the article text in this order.\n", | |
"cleanups = [\n", | |
"    _strip_templates,\n", | |
"    _strip_external_links,\n", | |
"    _expand_entities,\n", | |
"    _strip_file_links,\n", | |
"    _strip_refs,\n", | |
"    _strip_comments,\n", | |
"]\n", | |
"\n", | |
"def outlinks(text):\n", | |
"    \"\"\"Return the link targets found in `text`, one entry per link occurrence.\n", | |
"\n", | |
"    Every pattern in LINK_RES is applied in turn. From each captured link the\n", | |
"    display label after '|' and the section anchor after '#' are removed,\n", | |
"    because neither belongs to the target page title, and surrounding\n", | |
"    whitespace is trimmed. A link that only names a section ('[[#History]]')\n", | |
"    therefore yields an empty string; it is kept so that the length of the\n", | |
"    result still equals the number of links in `text`.\n", | |
"    \"\"\"\n", | |
"    refs = []\n", | |
"    for regex in LINK_RES:\n", | |
"        for ref in regex.findall(text):\n", | |
"            target = ref.split('|', 1)[0].split('#', 1)[0].strip()\n", | |
"            refs.append(target)\n", | |
"    return refs\n", | |
" \n", | |
"\n", | |
"# Thresholds read by finish_article.\n", | |
"# Minimum paragraph length, in characters.\n", | |
"MIN_PARAGRAPH_LEN = 150\n", | |
"# Inclusive bounds on the number of links a paragraph may contain.\n", | |
"MIN_LINKS_PER_PARAGRAPH = 1\n", | |
"MAX_LINKS_PER_PARAGRAPH = 10\n", | |
"# Number of most frequent link targets kept as an article's out_citations.\n", | |
"MAX_LINKS_PER_ARTICLE = 25\n", | |
"# Minimum number of '.' characters in a paragraph (a proxy for sentences).\n", | |
"MIN_SENTENCES = 3\n", | |
"# Articles with fewer qualifying paragraphs than this are skipped.\n", | |
"MIN_PARAGRAPHS_PER_ARTICLE = 3\n", | |
"# Upper bound on the number of paragraphs joined into an abstract.\n", | |
"MAX_PARAGRAPHS_PER_ARTICLE = 10" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import collections\n", | |
"\n", | |
"def bad_link(l):\n", | |
"    \"\"\"Return True when the title or link target `l` should be ignored.\n", | |
"\n", | |
"    Rejected are blank strings, strings containing ':', 'List of' or\n", | |
"    'Category', and strings whose first character is an ASCII digit.\n", | |
"    \"\"\"\n", | |
"    if l.strip() == '':\n", | |
"        return True\n", | |
"    unwanted_markers = (':', 'List of', 'Category')\n", | |
"    if any(marker in l for marker in unwanted_markers):\n", | |
"        return True\n", | |
"    return l[0] in '0123456789'\n", | |
" \n", | |
"def _is_good_paragraph(ptext):\n", | |
"    \"\"\"Return True when `ptext` meets the sentence, length and link-count thresholds.\"\"\"\n", | |
"    link_count = len(outlinks(ptext))\n", | |
"    return (ptext.count('.') >= MIN_SENTENCES\n", | |
"            and len(ptext) >= MIN_PARAGRAPH_LEN\n", | |
"            and MIN_LINKS_PER_PARAGRAPH <= link_count <= MAX_LINKS_PER_PARAGRAPH)\n", | |
"\n", | |
"def finish_article(title, lines):\n", | |
"    \"\"\"Build the output record for one article, or return None to skip it.\n", | |
"\n", | |
"    Args:\n", | |
"        title: article title; the article is skipped when bad_link(title) is true.\n", | |
"        lines: raw text lines of the article. They are joined without a\n", | |
"            separator, so each is expected to keep its trailing newline.\n", | |
"\n", | |
"    Returns:\n", | |
"        A dict with 'title', 'abstract' (the first qualifying paragraph followed\n", | |
"        by the longest other qualifying paragraphs, shortest first, at most\n", | |
"        MAX_PARAGRAPHS_PER_ARTICLE in total), 'out_citations' (the\n", | |
"        MAX_LINKS_PER_ARTICLE most frequent acceptable link targets, least\n", | |
"        frequent first) and empty 'in_citations', 'key_phrases' and 'authors'\n", | |
"        lists. None when the title is rejected or fewer than\n", | |
"        MIN_PARAGRAPHS_PER_ARTICLE paragraphs qualify.\n", | |
"    \"\"\"\n", | |
"    if bad_link(title):\n", | |
"        return None\n", | |
"\n", | |
"    text = ''.join(lines)\n", | |
"    for cleanup in cleanups:\n", | |
"        text = cleanup(text)\n", | |
"\n", | |
"    # Link frequencies are counted over the whole cleaned article.\n", | |
"    links = collections.Counter(\n", | |
"        link for link in outlinks(text) if not bad_link(link))\n", | |
"\n", | |
"    paragraph = []\n", | |
"    paragraphs = []\n", | |
"\n", | |
"    # Short lines and '==' heading lines end the current paragraph. The extra ''\n", | |
"    # sentinel flushes a last paragraph that is not followed by such a line.\n", | |
"    for line in text.split('\\n') + ['']:\n", | |
"        if len(line.strip()) < 5 or '==' in line:\n", | |
"            ptext = '\\n'.join(paragraph)\n", | |
"            paragraph = []\n", | |
"            if _is_good_paragraph(ptext):\n", | |
"                paragraphs.append(ptext)\n", | |
"        else:\n", | |
"            paragraph.append(line)\n", | |
"\n", | |
"    if len(paragraphs) < MIN_PARAGRAPHS_PER_ARTICLE:\n", | |
"        return None\n", | |
"\n", | |
"    # The lead paragraph always comes first; it is taken out of the pool that\n", | |
"    # is ranked by length so it cannot appear in the abstract a second time.\n", | |
"    lead = paragraphs[0]\n", | |
"    others = sorted(paragraphs[1:], key=len)\n", | |
"    if MAX_PARAGRAPHS_PER_ARTICLE > 1:\n", | |
"        rest = others[-(MAX_PARAGRAPHS_PER_ARTICLE - 1):]\n", | |
"    else:\n", | |
"        # A slice starting at -0 would return every paragraph.\n", | |
"        rest = []\n", | |
"\n", | |
"    all_links = sorted(links.items(), key=lambda kv: kv[1])\n", | |
"    top_links = [k for (k, v) in all_links[-MAX_LINKS_PER_ARTICLE:]]\n", | |
"\n", | |
"    abstract = '\\n'.join([lead] + rest)\n", | |
"\n", | |
"    return {'title': title,\n", | |
"            'abstract': abstract,\n", | |
"            'out_citations': top_links,\n", | |
"            'in_citations': [],\n", | |
"            'key_phrases': [],\n", | |
"            'authors': []\n", | |
"           }\n", | |
" \n", | |
"LINES = None\n", | |
"def read_articles(limit=-1):\n", | |
"    \"\"\"Stream article records from the bzip2-compressed dump at WIKI_BZ2.\n", | |
"\n", | |
"    The dump is decompressed by a `bzcat` child process and scanned line by\n", | |
"    line: the most recent <title> is remembered, the lines between\n", | |
"    '<text xml:space' and '</text' are collected, and finish_article is called\n", | |
"    at every '</text'.\n", | |
"\n", | |
"    Args:\n", | |
"        limit: stop after this many records have been yielded; values <= 0\n", | |
"            (the default) mean no limit.\n", | |
"\n", | |
"    Yields:\n", | |
"        The dicts returned by finish_article; skipped articles yield nothing.\n", | |
"    \"\"\"\n", | |
"    title = None\n", | |
"\n", | |
"    lines = []\n", | |
"    in_content = False\n", | |
"    emitted = 0\n", | |
"\n", | |
"    # An argument list (no shell) hands the path to bzcat verbatim.\n", | |
"    with subprocess.Popen(['bzcat', WIKI_BZ2], bufsize=32768,\n", | |
"                          stdout=subprocess.PIPE, close_fds=True) as p:\n", | |
"        for raw_line in p.stdout:\n", | |
"            line = raw_line.decode('utf8')\n", | |
"            if '<title>' in line:\n", | |
"                title = line.replace('<title>', '').replace('</title>', '').strip()\n", | |
"\n", | |
"            tag_start = line.find('<text xml:space')\n", | |
"            if tag_start >= 0:\n", | |
"                # A self-closing <text ... /> tag has no matching </text>;\n", | |
"                # entering content mode for it would swallow the XML of the\n", | |
"                # following page up to the next </text>.\n", | |
"                tag_end = line.find('>', tag_start)\n", | |
"                self_closing = tag_end > 0 and line[tag_end - 1] == '/'\n", | |
"                line = line.replace('<text xml:space=\"preserve\">', '')\n", | |
"                in_content = not self_closing\n", | |
"\n", | |
"            if '</text' in line:\n", | |
"                if in_content:\n", | |
"                    # Keep the text that precedes the closing tag on this line;\n", | |
"                    # for one-line articles it is the whole body.\n", | |
"                    last = line.split('</text', 1)[0]\n", | |
"                    if last:\n", | |
"                        lines.append(last)\n", | |
"                article = finish_article(title, lines)\n", | |
"                lines = []\n", | |
"                in_content = False\n", | |
"                if article:\n", | |
"                    yield article\n", | |
"                    emitted += 1\n", | |
"                    if 0 < limit <= emitted:\n", | |
"                        return\n", | |
"\n", | |
"            if in_content:\n", | |
"                lines.append(line)\n", | |
" \n", | |
"def write_records(count=-1, path='/data/citeomatic/wiki-paragraphs.json'):\n", | |
"    \"\"\"Write the records produced by read_articles() to `path`, one JSON object per line.\n", | |
"\n", | |
"    Args:\n", | |
"        count: maximum number of records to write; values <= 0 (the default)\n", | |
"            write every record.\n", | |
"        path: output file; it is created or truncated.\n", | |
"    \"\"\"\n", | |
"    with open(path, 'w', encoding='utf8') as out:\n", | |
"        for (i, page) in enumerate(tqdm.tqdm(read_articles())):\n", | |
"            # i is zero-based: i == count means `count` records are already written.\n", | |
"            if count > 0 and i >= count:\n", | |
"                break\n", | |
"\n", | |
"            json.dump(page, out)\n", | |
"            out.write('\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"201845it [15:00, 222.11it/s], 42.91it/s]" | |
] | |
} | |
], | |
"source": [ | |
"write_records()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"0it [00:00, ?it/s]101it [00:00, 11204.34it/s]\n", | |
"101it [00:00, 3793.81it/s]\n", | |
"101it [00:00, 7086.40it/s]\n", | |
"101it [00:00, 5760.86it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"records = []\n", | |
"\n", | |
"# One JSON record per line, as written by write_records().\n", | |
"with open('/data/citeomatic/wiki-paragraphs.json', 'r', encoding='utf8') as records_file:\n", | |
"    for line in tqdm.tqdm(records_file):\n", | |
"        records.append(json.loads(line))\n", | |
"\n", | |
"df = pd.DataFrame.from_records(records)\n", | |
"del records\n", | |
"\n", | |
"# Titles are matched case-insensitively from here on.\n", | |
"df.title = df.title.str.lower()\n", | |
"\n", | |
"title_to_idx = { k:i for (i,k) in enumerate(df.title) }\n", | |
"idx_to_title = { i:t for (t,i) in title_to_idx.items() }\n", | |
"\n", | |
"titles = set(df.title)\n", | |
"link_map = {}\n", | |
"\n", | |
"import collections\n", | |
"reverse_links = collections.defaultdict(list)\n", | |
"\n", | |
"# Replace each article's outgoing link titles by the row numbers of the linked\n", | |
"# articles. The links are lower-cased before the lookup because the titles\n", | |
"# were; links to articles that are not in the frame are dropped. The result is\n", | |
"# assigned as a whole column: rows handed out by iterrows() are not guaranteed\n", | |
"# to write back into the frame.\n", | |
"out_citation_idx = np.empty(len(df), dtype=object)\n", | |
"for i, cited_titles in enumerate(tqdm.tqdm(df.out_citations)):\n", | |
"    lowered = [o.lower() for o in cited_titles]\n", | |
"    out_citation_idx[i] = np.asarray(\n", | |
"        [title_to_idx[o] for o in lowered if o in titles], dtype=np.int64)\n", | |
"df['out_citations'] = out_citation_idx\n", | |
"\n", | |
"# invert the link map\n", | |
"for i, cited_rows in enumerate(tqdm.tqdm(out_citation_idx)):\n", | |
"    for j in cited_rows:\n", | |
"        reverse_links[int(j)].append(i)\n", | |
"\n", | |
"in_citation_idx = np.empty(len(df), dtype=object)\n", | |
"for i, title in enumerate(tqdm.tqdm(df.title)):\n", | |
"    in_citation_idx[i] = reverse_links[title_to_idx[title]]\n", | |
"df['in_citations'] = in_citation_idx" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>abstract</th>\n", | |
" <th>authors</th>\n", | |
" <th>in_citations</th>\n", | |
" <th>key_phrases</th>\n", | |
" <th>out_citations</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>'''Anarchism''' is a [[political philosophy]] ...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[15]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>anarchism</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>'''Autism''' is a [[neurodevelopmental disorde...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>autism</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>'''Albedo''' () is the \"whiteness\" of a surfac...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[100]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[100]</td>\n", | |
" <td>albedo</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>'''A''' ([[English alphabet#Letter names|named...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[53]</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>From the [[American Civil War]] until [[World ...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>alabama</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Achilles’ most notable feat during the Trojan ...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>achilles</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>'''Abraham Lincoln''' (; February 12, 1809&nbs...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>abraham lincoln</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>Teaching Alexander the Great gave Aristotle ma...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>aristotle</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>Gershwin composed ''An American in Paris'' on ...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>an american in paris</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>The '''Academy Award for Best Production Desig...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>academy award for best production design</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>The '''Academy Awards''', or \"'''Oscars'''\", i...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>academy awards</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>'''''Actresses''''' ([[Catalan language|Catala...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>actrius</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td><!---->\\n'''''Animalia''''' is an illustrated ...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[53]</td>\n", | |
" <td>animalia (book)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>'''International Atomic Time''' ('''TAI'...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>international atomic time</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>to the poor is often considered an altruistic...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>altruism</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>'''Ayn Rand''' (; born '''Alisa Zinov'yevna Ro...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[0]</td>\n", | |
" <td>ayn rand</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>Born '''Joseph Aloysius Dwan''' in [[Toronto|T...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>allan dwan</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>}}}}<br>}}}}''\\n|common_name = Algeria\\n|i...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>algeria</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>'''Anthropology''' is the study of various asp...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>anthropology</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>With the exception of [[theoretical production...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>agricultural science</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>The word alchemy was borrowed from [[Old Frenc...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>alchemy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>[[Image:Galileo.arp.300pix.jpg|left|thumb|upri...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[83]</td>\n", | |
" <td>astronomer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>'''ASCII''' ( ), abbreviated from '''American ...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>ascii</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>'''Animation''' is the process of making the [...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>animation</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>The etymology of the name is uncertain. The sp...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>apollo</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>'''Andre Kirk Agassi''' (; born April 29, 1970...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>andre agassi</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>''[[Ethnologue]]'' identifies 168 Austroasiati...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>austroasiatic languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>'''Afroasiatic''' ('''Afro-Asiatic'''), also k...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>afroasiatic languages</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>Andorra is the [[European microstates|sixth-sm...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>andorra</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29</th>\n", | |
" <td>In [[mathematics]] and [[statistics]], the '''...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>arithmetic mean</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>71</th>\n", | |
" <td>The '''Economy of Angola''' is one of the fast...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[34]</td>\n", | |
" <td>economy of angola</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>72</th>\n", | |
" <td>The FAA succeeded to the previous [[Armed Forc...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>angolan armed forces</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>73</th>\n", | |
" <td>The '''foreign relations of Angola''' are base...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>foreign relations of angola</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>74</th>\n", | |
" <td>'''Albert Sidney Johnston''' (February 2, 1803...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>albert sidney johnston</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75</th>\n", | |
" <td>An '''android''' is a [[humanoid robot]] or [[...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>android (robot)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>76</th>\n", | |
" <td>'''Alberta''' () is a western [[provinces and ...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>alberta</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>77</th>\n", | |
" <td>The ray-finned [[fish]]es are so called becaus...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>actinopterygii</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>78</th>\n", | |
" <td>'''Albert Einstein''' (; ; 14 March 1879&nbsp;...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>albert einstein</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>79</th>\n", | |
" <td>The political history of the modern state of A...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>afghanistan</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80</th>\n", | |
" <td>'''Albania''' (, ; ; </ref>}}), officially the...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>albania</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>81</th>\n", | |
" <td><br />2. [[Hamza#Hamzat waṣl|hamzat waṣl]] (هم...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>allah</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>82</th>\n", | |
" <td>'''Azerbaijan''' ( ; ), officially the '''Repu...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>azerbaijan</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>83</th>\n", | |
" <td>Most amateur astronomers work at [[visible spe...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[21]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[100]</td>\n", | |
" <td>amateur astronomy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>84</th>\n", | |
" <td>is a [[gendai budō|modern]] [[Japanese marti...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>aikido</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>85</th>\n", | |
" <td>The oldest documented forms of art are [[visua...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>art</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>86</th>\n", | |
" <td>'''Agnostida''' is an [[order (biology)|order]...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>agnostida</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87</th>\n", | |
" <td>'''Abortion''' is the ending of [[pregnancy]] ...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>abortion</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>88</th>\n", | |
" <td>|commander1= [[George Washington]]<br />\\n [[N...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>american revolutionary war</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>89</th>\n", | |
" <td>The ampere is equivalent to one [[coulomb]] (r...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>ampere</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>90</th>\n", | |
" <td>[[Image:Euclid flowchart.svg|thumb|lright| [[F...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>algorithm</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>91</th>\n", | |
" <td>[[Image:Doperwt rijserwt peulen Pisum sativum....</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>annual plant</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>92</th>\n", | |
" <td>Usually mouthwashes are an [[antiseptic]] solu...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>mouthwash</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>93</th>\n", | |
" <td>Seeking to reach the \"ends of the world and th...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>alexander the great</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94</th>\n", | |
" <td>'''Alfred Habdank Skarbek Korzybski''' (; July...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>alfred korzybski</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>95</th>\n", | |
" <td>'''''Asteroids''''' is an arcade space shooter...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>asteroids (video game)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>96</th>\n", | |
" <td>Thus although most species in the order are [[...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>asparagales</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>97</th>\n", | |
" <td>'') of [[Araceae]] family in [[Crete]], [[Gree...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>alismatales</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>98</th>\n", | |
" <td>The '''Apiales''' are an [[Order (biology)|ord...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>apiales</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99</th>\n", | |
" <td>Asterales are organisms that seem to have evol...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[]</td>\n", | |
" <td>asterales</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>100</th>\n", | |
" <td>'''Asteroids''' are [[minor planet]]s, especia...</td>\n", | |
" <td>[]</td>\n", | |
" <td>[2, 83]</td>\n", | |
" <td>[]</td>\n", | |
" <td>[2]</td>\n", | |
" <td>asteroid</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>101 rows × 6 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" abstract authors in_citations \\\n", | |
"0 '''Anarchism''' is a [[political philosophy]] ... [] [15] \n", | |
"1 '''Autism''' is a [[neurodevelopmental disorde... [] [] \n", | |
"2 '''Albedo''' () is the \"whiteness\" of a surfac... [] [100] \n", | |
"3 '''A''' ([[English alphabet#Letter names|named... [] [] \n", | |
"4 From the [[American Civil War]] until [[World ... [] [] \n", | |
"5 Achilles’ most notable feat during the Trojan ... [] [] \n", | |
"6 '''Abraham Lincoln''' (; February 12, 1809&nbs... [] [] \n", | |
"7 Teaching Alexander the Great gave Aristotle ma... [] [] \n", | |
"8 Gershwin composed ''An American in Paris'' on ... [] [] \n", | |
"9 The '''Academy Award for Best Production Desig... [] [] \n", | |
"10 The '''Academy Awards''', or \"'''Oscars'''\", i... [] [] \n", | |
"11 '''''Actresses''''' ([[Catalan language|Catala... [] [] \n", | |
"12 <!---->\\n'''''Animalia''''' is an illustrated ... [] [] \n", | |
"13 '''International Atomic Time''' ('''TAI'... [] [] \n", | |
"14 to the poor is often considered an altruistic... [] [] \n", | |
"15 '''Ayn Rand''' (; born '''Alisa Zinov'yevna Ro... [] [] \n", | |
"16 Born '''Joseph Aloysius Dwan''' in [[Toronto|T... [] [] \n", | |
"17 }}}}<br>}}}}''\\n|common_name = Algeria\\n|i... [] [] \n", | |
"18 '''Anthropology''' is the study of various asp... [] [] \n", | |
"19 With the exception of [[theoretical production... [] [] \n", | |
"20 The word alchemy was borrowed from [[Old Frenc... [] [] \n", | |
"21 [[Image:Galileo.arp.300pix.jpg|left|thumb|upri... [] [] \n", | |
"22 '''ASCII''' ( ), abbreviated from '''American ... [] [] \n", | |
"23 '''Animation''' is the process of making the [... [] [] \n", | |
"24 The etymology of the name is uncertain. The sp... [] [] \n", | |
"25 '''Andre Kirk Agassi''' (; born April 29, 1970... [] [] \n", | |
"26 ''[[Ethnologue]]'' identifies 168 Austroasiati... [] [] \n", | |
"27 '''Afroasiatic''' ('''Afro-Asiatic'''), also k... [] [] \n", | |
"28 Andorra is the [[European microstates|sixth-sm... [] [] \n", | |
"29 In [[mathematics]] and [[statistics]], the '''... [] [] \n", | |
".. ... ... ... \n", | |
"71 The '''Economy of Angola''' is one of the fast... [] [] \n", | |
"72 The FAA succeeded to the previous [[Armed Forc... [] [] \n", | |
"73 The '''foreign relations of Angola''' are base... [] [] \n", | |
"74 '''Albert Sidney Johnston''' (February 2, 1803... [] [] \n", | |
"75 An '''android''' is a [[humanoid robot]] or [[... [] [] \n", | |
"76 '''Alberta''' () is a western [[provinces and ... [] [] \n", | |
"77 The ray-finned [[fish]]es are so called becaus... [] [] \n", | |
"78 '''Albert Einstein''' (; ; 14 March 1879 ... [] [] \n", | |
"79 The political history of the modern state of A... [] [] \n", | |
"80 '''Albania''' (, ; ; </ref>}}), officially the... [] [] \n", | |
"81 <br />2. [[Hamza#Hamzat waṣl|hamzat waṣl]] (هم... [] [] \n", | |
"82 '''Azerbaijan''' ( ; ), officially the '''Repu... [] [] \n", | |
"83 Most amateur astronomers work at [[visible spe... [] [21] \n", | |
"84 is a [[gendai budō|modern]] [[Japanese marti... [] [] \n", | |
"85 The oldest documented forms of art are [[visua... [] [] \n", | |
"86 '''Agnostida''' is an [[order (biology)|order]... [] [] \n", | |
"87 '''Abortion''' is the ending of [[pregnancy]] ... [] [] \n", | |
"88 |commander1= [[George Washington]]<br />\\n [[N... [] [] \n", | |
"89 The ampere is equivalent to one [[coulomb]] (r... [] [] \n", | |
"90 [[Image:Euclid flowchart.svg|thumb|lright| [[F... [] [] \n", | |
"91 [[Image:Doperwt rijserwt peulen Pisum sativum.... [] [] \n", | |
"92 Usually mouthwashes are an [[antiseptic]] solu... [] [] \n", | |
"93 Seeking to reach the \"ends of the world and th... [] [] \n", | |
"94 '''Alfred Habdank Skarbek Korzybski''' (; July... [] [] \n", | |
"95 '''''Asteroids''''' is an arcade space shooter... [] [] \n", | |
"96 Thus although most species in the order are [[... [] [] \n", | |
"97 '') of [[Araceae]] family in [[Crete]], [[Gree... [] [] \n", | |
"98 The '''Apiales''' are an [[Order (biology)|ord... [] [] \n", | |
"99 Asterales are organisms that seem to have evol... [] [] \n", | |
"100 '''Asteroids''' are [[minor planet]]s, especia... [] [2, 83] \n", | |
"\n", | |
" key_phrases out_citations title \n", | |
"0 [] [] anarchism \n", | |
"1 [] [] autism \n", | |
"2 [] [100] albedo \n", | |
"3 [] [53] a \n", | |
"4 [] [] alabama \n", | |
"5 [] [] achilles \n", | |
"6 [] [] abraham lincoln \n", | |
"7 [] [] aristotle \n", | |
"8 [] [] an american in paris \n", | |
"9 [] [] academy award for best production design \n", | |
"10 [] [] academy awards \n", | |
"11 [] [] actrius \n", | |
"12 [] [53] animalia (book) \n", | |
"13 [] [] international atomic time \n", | |
"14 [] [] altruism \n", | |
"15 [] [0] ayn rand \n", | |
"16 [] [] allan dwan \n", | |
"17 [] [] algeria \n", | |
"18 [] [] anthropology \n", | |
"19 [] [] agricultural science \n", | |
"20 [] [] alchemy \n", | |
"21 [] [83] astronomer \n", | |
"22 [] [] ascii \n", | |
"23 [] [] animation \n", | |
"24 [] [] apollo \n", | |
"25 [] [] andre agassi \n", | |
"26 [] [] austroasiatic languages \n", | |
"27 [] [] afroasiatic languages \n", | |
"28 [] [] andorra \n", | |
"29 [] [] arithmetic mean \n", | |
".. ... ... ... \n", | |
"71 [] [34] economy of angola \n", | |
"72 [] [] angolan armed forces \n", | |
"73 [] [] foreign relations of angola \n", | |
"74 [] [] albert sidney johnston \n", | |
"75 [] [] android (robot) \n", | |
"76 [] [] alberta \n", | |
"77 [] [] actinopterygii \n", | |
"78 [] [] albert einstein \n", | |
"79 [] [] afghanistan \n", | |
"80 [] [] albania \n", | |
"81 [] [] allah \n", | |
"82 [] [] azerbaijan \n", | |
"83 [] [100] amateur astronomy \n", | |
"84 [] [] aikido \n", | |
"85 [] [] art \n", | |
"86 [] [] agnostida \n", | |
"87 [] [] abortion \n", | |
"88 [] [] american revolutionary war \n", | |
"89 [] [] ampere \n", | |
"90 [] [] algorithm \n", | |
"91 [] [] annual plant \n", | |
"92 [] [] mouthwash \n", | |
"93 [] [] alexander the great \n", | |
"94 [] [] alfred korzybski \n", | |
"95 [] [] asteroids (video game) \n", | |
"96 [] [] asparagales \n", | |
"97 [] [] alismatales \n", | |
"98 [] [] apiales \n", | |
"99 [] [] asterales \n", | |
"100 [] [2] asteroid \n", | |
"\n", | |
"[101 rows x 6 columns]" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "SSH s2-server1 Python 3.5", | |
"language": "", | |
"name": "rik_ssh_s2_server1_python35" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment