Skip to content

Instantly share code, notes, and snippets.

@peeter-t2
Last active May 20, 2019 20:43
Show Gist options
  • Save peeter-t2/9545a25f2129f9be1150a2b7d1e70b89 to your computer and use it in GitHub Desktop.
Save peeter-t2/9545a25f2129f9be1150a2b7d1e70b89 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hello world\n"
]
}
],
"source": [
"# Kernel smoke test: confirms the notebook can execute code.\n",
"greeting = \"hello world\"\n",
"print(greeting)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('data/filename.json', <http.client.HTTPMessage at 0x7f2d04040550>)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import urllib.request\n",
"\n",
"# Search request against the Octavo ECCO endpoint; the query selects documentID 0071400200.\n",
"# NOTE(review): the query string starts with `??`, which looks like a typo for `?` -- confirm\n",
"# with the API before changing it; it is kept byte-identical here.\n",
"url = \"https://vm0824.kaj.pouta.csc.fi/octavo/ecco/search??pretty&fieldEnricher=&offsetDataConverter=&query=%3CDOCUMENT%C2%A7documentID%3A0071400200%0A%C2%A7DOCUMENT%3E&field=content&offset=0&limit=20&snippetLimit=20&contextLevel=Sentence&contextExpandLeft=0&contextExpandRight=0&level=&endpoint=https%3A%2F%2Fvm0824.kaj.pouta.csc.fi%2Foctavo%2Fecco%2F\"\n",
"\n",
"# urlretrieve does not create missing directories, so make sure the download\n",
"# target exists when the notebook runs in a fresh checkout.\n",
"os.makedirs(\"data\", exist_ok=True)\n",
"\n",
"# Returns (local_path, headers); kept as the last expression so it is displayed.\n",
"urllib.request.urlretrieve(url, \"data/filename.json\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Parse the downloaded search response into `d`.\n",
"# The encoding is stated explicitly so the result does not depend on the\n",
"# machine's locale, matching the utf8 reads and writes elsewhere in this notebook.\n",
"with open('data/filename.json', encoding=\"utf8\") as response_file:\n",
"    d = json.load(response_file)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"IOPub data rate exceeded.\n",
"The notebook server will temporarily stop sending output\n",
"to the client in order to avoid crashing it.\n",
"To change this limit, set the config variable\n",
"`--NotebookApp.iopub_data_rate_limit`.\n",
"\n",
"Current values:\n",
"NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
"NotebookApp.rate_limit_window=3.0 (secs)\n",
"\n"
]
}
],
"source": [
"# Printing the whole document exceeded the notebook's IOPub data rate limit,\n",
"# so show only a bounded preview (first 200 characters) of each field.\n",
"first_doc = d[\"results\"][\"docs\"][0]\n",
"{key: str(value)[:200] for key, value in first_doc.items()}"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting stanfordcorenlp\n",
" Downloading https://files.pythonhosted.org/packages/35/cb/0a271890bbe3a77fc1aca2bc3a58b14e11799ea77cb5f7d6fb0a8b4c46fa/stanfordcorenlp-3.9.1.1-py2.py3-none-any.whl\n",
"Collecting psutil (from stanfordcorenlp)\n",
" Downloading https://files.pythonhosted.org/packages/c6/c1/beed5e4eaa1345901b595048fab1c85aee647ea0fc02d9e8bf9aceb81078/psutil-5.6.2.tar.gz (432kB)\n",
"\u001b[K 100% |████████████████████████████████| 440kB 1.9MB/s ta 0:00:011\n",
"\u001b[?25hCollecting requests (from stanfordcorenlp)\n",
" Using cached https://files.pythonhosted.org/packages/51/bd/23c926cd341ea6b7dd0b2a00aba99ae0f828be89d72b2190f27c11d4b7fb/requests-2.22.0-py2.py3-none-any.whl\n",
"Collecting certifi>=2017.4.17 (from requests->stanfordcorenlp)\n",
" Using cached https://files.pythonhosted.org/packages/60/75/f692a584e85b7eaba0e03827b3d51f45f571c2e793dd731e598828d380aa/certifi-2019.3.9-py2.py3-none-any.whl\n",
"Collecting chardet<3.1.0,>=3.0.2 (from requests->stanfordcorenlp)\n",
" Using cached https://files.pythonhosted.org/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl\n",
"Collecting idna<2.9,>=2.5 (from requests->stanfordcorenlp)\n",
" Using cached https://files.pythonhosted.org/packages/14/2c/cd551d81dbe15200be1cf41cd03869a46fe7226e7450af7a6545bfc474c9/idna-2.8-py2.py3-none-any.whl\n",
"Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 (from requests->stanfordcorenlp)\n",
" Using cached https://files.pythonhosted.org/packages/39/ec/d93dfc69617a028915df914339ef66936ea976ef24fa62940fd86ba0326e/urllib3-1.25.2-py2.py3-none-any.whl\n",
"Building wheels for collected packages: psutil\n",
" Running setup.py bdist_wheel for psutil ... \u001b[?25ldone\n",
"\u001b[?25h Stored in directory: /home/ubuntu/.cache/pip/wheels/17/08/ec/22b464874958c3fc91e1a75748fae2220eb704a8b1035f9a03\n",
"Successfully built psutil\n",
"Installing collected packages: psutil, certifi, chardet, idna, urllib3, requests, stanfordcorenlp\n",
"Successfully installed certifi-2019.3.9 chardet-3.0.4 idna-2.8 psutil-5.6.2 requests-2.22.0 stanfordcorenlp-3.9.1.1 urllib3-1.25.2\n"
]
}
],
"source": [
"# %pip installs into the environment of the running kernel; the version is\n",
"# pinned to the release this notebook was originally run with.\n",
"%pip install stanfordcorenlp==3.9.1.1"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0538301300.json\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n",
"9\n",
"10\n",
"11\n",
"12\n",
"13\n",
"14\n",
"15\n",
"16\n",
"17\n",
"18\n",
"19\n",
"20\n",
"21\n",
"22\n",
"0218402600.json\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n",
"9\n",
"10\n",
"11\n",
"12\n",
"13\n",
"14\n",
"15\n",
"16\n",
"17\n",
"18\n",
"19\n",
"0275300200.json\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n",
"9\n",
"10\n",
"11\n",
"1043000300.json\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m--------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0;31m# Python 2.7, use buffering of HTTP responses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 379\u001b[0;31m \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 380\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: getresponse() got an unexpected keyword argument 'buffering'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-641b8fe85acb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 29\u001b[0m jsontext = nlp.annotate(\n\u001b[1;32m 30\u001b[0m \u001b[0mcontenti\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m properties=props)\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mcodecs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/med_NERs_ville/'\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'.persons'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"utf8\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjsontext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/stanfordcorenlp/corenlp.py\u001b[0m in \u001b[0;36mannotate\u001b[0;34m(self, text, properties)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m r = requests.post(self.url, params={'properties': str(properties)}, data=text,\n\u001b[0;32m--> 154\u001b[0;31m headers={'Connection': 'close'})\n\u001b[0m\u001b[1;32m 155\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m 114\u001b[0m \"\"\"\n\u001b[1;32m 115\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'post'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 117\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 60\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 61\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 531\u001b[0m }\n\u001b[1;32m 532\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 533\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 646\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m )\n\u001b[1;32m 451\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 601\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 603\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 604\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 605\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/ubuntu/.local/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;31m# Python 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 382\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 383\u001b[0;31m \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 384\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 385\u001b[0m \u001b[0;31m# Remove the TypeError from the exception chain in Python 3;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3.6/http/client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1329\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1330\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1331\u001b[0;31m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1332\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1333\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3.6/http/client.py\u001b[0m in \u001b[0;36mbegin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;31m# read until we get a non-100 response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 297\u001b[0;31m \u001b[0mversion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 298\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3.6/http/client.py\u001b[0m in \u001b[0;36m_read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 258\u001b[0;31m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_MAXLINE\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"iso-8859-1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 259\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0m_MAXLINE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mLineTooLong\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"status line\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3.6/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"import stanfordcorenlp\n",
"import codecs\n",
"import textwrap\n",
"import json\n",
"import os\n",
"from stanfordcorenlp import StanfordCoreNLP\n",
"\n",
"# Locations and tunables for this run, gathered in one place.\n",
"CORENLP_DIR = \"/home/ubuntu/genre/data/stanford/stanford-corenlp-full-2018-10-05/\"\n",
"INPUT_DIR = 'data/med_texts/'\n",
"OUTPUT_DIR = 'data/med_NERs_ville/'\n",
"CHUNK_WIDTH = 100000  # maximum characters sent to CoreNLP per request\n",
"# Number of directory entries to skip before processing starts.\n",
"# NOTE(review): os.listdir() order is arbitrary, so this offset does not pin\n",
"# down specific files -- confirm it is still wanted before a full re-run.\n",
"FIRST_FILE_INDEX = 3\n",
"\n",
"# Properties sent along with every annotate() request.\n",
"props = {\n",
"    'annotators': 'ner',\n",
"    'pipelineLanguage': 'en',\n",
"    'timeout': '500000',\n",
"    'outputFormat': 'json',\n",
"    'ner.model': '/home/ubuntu/genre/data/stanford/ner-model.ser.gz'\n",
"}\n",
"\n",
"# Create the output directory up front so the first write cannot fail on a fresh checkout.\n",
"os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
"\n",
"files = os.listdir(INPUT_DIR)\n",
"nlp = StanfordCoreNLP(CORENLP_DIR, memory='8g')\n",
"try:\n",
"    for file in files[FIRST_FILE_INDEX:]:\n",
"        print(file)\n",
"        # Load the document and close the input file before the slow annotation starts.\n",
"        with codecs.open(INPUT_DIR + file, encoding=\"utf8\") as f:\n",
"            d = json.load(f)\n",
"        content = d[\"results\"][\"docs\"][0][\"content\"]\n",
"        # Annotate the text in pieces of at most CHUNK_WIDTH characters.\n",
"        content_split = textwrap.wrap(content, CHUNK_WIDTH)\n",
"        # Chunk numbers start at 1 and become part of the output file name.\n",
"        for i, contenti in enumerate(content_split, start=1):\n",
"            # To try the pipeline on a short sample instead, annotate e.g.\n",
"            # 'Rome is the capital of Italy. This is one more sentence. Aristotle went for a walk.'\n",
"            jsontext = nlp.annotate(contenti, properties=props)\n",
"            # A separate handle name avoids shadowing the input file handle above.\n",
"            with codecs.open(OUTPUT_DIR + file + str(i) + '.persons', 'w', encoding=\"utf8\") as out_file:\n",
"                out_file.write(jsontext)\n",
"            print(i)\n",
"finally:\n",
"    # Close the CoreNLP wrapper even when the loop fails or is interrupted;\n",
"    # the recorded run ended in a KeyboardInterrupt before reaching close().\n",
"    nlp.close()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'{\\n \"sentences\": [\\n {\\n \"index\": 0,\\n \"entitymentions\": [\\n {\\n \"docTokenBegin\": 5,\\n \"docTokenEnd\": 6,\\n \"tokenBegin\": 5,\\n \"tokenEnd\": 6,\\n \"text\": \"Italy\",\\n \"characterOffsetBegin\": 23,\\n \"characterOffsetEnd\": 28,\\n \"ner\": \"COUNTRY\"\\n }\\n ],\\n \"tokens\": [\\n {\\n \"index\": 1,\\n \"word\": \"Rome\",\\n \"originalText\": \"Rome\",\\n \"lemma\": \"Rome\",\\n \"characterOffsetBegin\": 0,\\n \"characterOffsetEnd\": 4,\\n \"pos\": \"NNP\",\\n \"ner\": \"O\",\\n \"before\": \"\",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 2,\\n \"word\": \"is\",\\n \"originalText\": \"is\",\\n \"lemma\": \"be\",\\n \"characterOffsetBegin\": 5,\\n \"characterOffsetEnd\": 7,\\n \"pos\": \"VBZ\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 3,\\n \"word\": \"the\",\\n \"originalText\": \"the\",\\n \"lemma\": \"the\",\\n \"characterOffsetBegin\": 8,\\n \"characterOffsetEnd\": 11,\\n \"pos\": \"DT\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 4,\\n \"word\": \"capital\",\\n \"originalText\": \"capital\",\\n \"lemma\": \"capital\",\\n \"characterOffsetBegin\": 12,\\n \"characterOffsetEnd\": 19,\\n \"pos\": \"NN\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 5,\\n \"word\": \"of\",\\n \"originalText\": \"of\",\\n \"lemma\": \"of\",\\n \"characterOffsetBegin\": 20,\\n \"characterOffsetEnd\": 22,\\n \"pos\": \"IN\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 6,\\n \"word\": \"Italy\",\\n \"originalText\": \"Italy\",\\n \"lemma\": \"Italy\",\\n \"characterOffsetBegin\": 23,\\n \"characterOffsetEnd\": 28,\\n \"pos\": \"NNP\",\\n \"ner\": \"COUNTRY\",\\n \"before\": \" \",\\n \"after\": \"\"\\n },\\n {\\n \"index\": 7,\\n \"word\": \".\",\\n \"originalText\": \".\",\\n \"lemma\": \".\",\\n \"characterOffsetBegin\": 28,\\n \"characterOffsetEnd\": 29,\\n \"pos\": \".\",\\n \"ner\": 
\"O\",\\n \"before\": \"\",\\n \"after\": \" \"\\n }\\n ]\\n },\\n {\\n \"index\": 1,\\n \"entitymentions\": [\\n {\\n \"docTokenBegin\": 9,\\n \"docTokenEnd\": 10,\\n \"tokenBegin\": 2,\\n \"tokenEnd\": 3,\\n \"text\": \"one\",\\n \"characterOffsetBegin\": 38,\\n \"characterOffsetEnd\": 41,\\n \"ner\": \"NUMBER\",\\n \"normalizedNER\": \"1.0\"\\n }\\n ],\\n \"tokens\": [\\n {\\n \"index\": 1,\\n \"word\": \"This\",\\n \"originalText\": \"This\",\\n \"lemma\": \"this\",\\n \"characterOffsetBegin\": 30,\\n \"characterOffsetEnd\": 34,\\n \"pos\": \"DT\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 2,\\n \"word\": \"is\",\\n \"originalText\": \"is\",\\n \"lemma\": \"be\",\\n \"characterOffsetBegin\": 35,\\n \"characterOffsetEnd\": 37,\\n \"pos\": \"VBZ\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 3,\\n \"word\": \"one\",\\n \"originalText\": \"one\",\\n \"lemma\": \"one\",\\n \"characterOffsetBegin\": 38,\\n \"characterOffsetEnd\": 41,\\n \"pos\": \"CD\",\\n \"ner\": \"NUMBER\",\\n \"normalizedNER\": \"1.0\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 4,\\n \"word\": \"more\",\\n \"originalText\": \"more\",\\n \"lemma\": \"more\",\\n \"characterOffsetBegin\": 42,\\n \"characterOffsetEnd\": 46,\\n \"pos\": \"JJR\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 5,\\n \"word\": \"sentence\",\\n \"originalText\": \"sentence\",\\n \"lemma\": \"sentence\",\\n \"characterOffsetBegin\": 47,\\n \"characterOffsetEnd\": 55,\\n \"pos\": \"NN\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \"\"\\n },\\n {\\n \"index\": 6,\\n \"word\": \".\",\\n \"originalText\": \".\",\\n \"lemma\": \".\",\\n \"characterOffsetBegin\": 55,\\n \"characterOffsetEnd\": 56,\\n \"pos\": \".\",\\n \"ner\": \"O\",\\n \"before\": \"\",\\n \"after\": \" \"\\n }\\n ]\\n },\\n {\\n \"index\": 2,\\n \"entitymentions\": [\\n ],\\n \"tokens\": [\\n {\\n 
\"index\": 1,\\n \"word\": \"Aristotle\",\\n \"originalText\": \"Aristotle\",\\n \"lemma\": \"Aristotle\",\\n \"characterOffsetBegin\": 57,\\n \"characterOffsetEnd\": 66,\\n \"pos\": \"NNP\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 2,\\n \"word\": \"went\",\\n \"originalText\": \"went\",\\n \"lemma\": \"go\",\\n \"characterOffsetBegin\": 67,\\n \"characterOffsetEnd\": 71,\\n \"pos\": \"VBD\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 3,\\n \"word\": \"for\",\\n \"originalText\": \"for\",\\n \"lemma\": \"for\",\\n \"characterOffsetBegin\": 72,\\n \"characterOffsetEnd\": 75,\\n \"pos\": \"IN\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 4,\\n \"word\": \"a\",\\n \"originalText\": \"a\",\\n \"lemma\": \"a\",\\n \"characterOffsetBegin\": 76,\\n \"characterOffsetEnd\": 77,\\n \"pos\": \"DT\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \" \"\\n },\\n {\\n \"index\": 5,\\n \"word\": \"walk\",\\n \"originalText\": \"walk\",\\n \"lemma\": \"walk\",\\n \"characterOffsetBegin\": 78,\\n \"characterOffsetEnd\": 82,\\n \"pos\": \"NN\",\\n \"ner\": \"O\",\\n \"before\": \" \",\\n \"after\": \"\"\\n },\\n {\\n \"index\": 6,\\n \"word\": \".\",\\n \"originalText\": \".\",\\n \"lemma\": \".\",\\n \"characterOffsetBegin\": 82,\\n \"characterOffsetEnd\": 83,\\n \"pos\": \".\",\\n \"ner\": \"O\",\\n \"before\": \"\",\\n \"after\": \"\"\\n }\\n ]\\n }\\n ]\\n}\\n'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A CoreNLP response for a full chunk is very large; display only the first\n",
"# 1000 characters to keep the notebook output small.\n",
"jsontext[:1000]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"import codecs\n",
"\n",
"# Persist the most recent CoreNLP response so it can be read back from disk.\n",
"sample_path = 'jsontest1.json'\n",
"with codecs.open(sample_path, mode='w', encoding=\"utf8\") as sample_out:\n",
"    sample_out.write(jsontext)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"import codecs\n",
"import json\n",
"\n",
"# Read the saved CoreNLP response back into `d`.\n",
"# codecs is imported here so the cell does not rely on another cell having run.\n",
"with codecs.open('jsontest1.json', encoding=\"utf8\") as sample_in:\n",
"    d = json.load(sample_in)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentences</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>{'index': 0, 'entitymentions': [{'docTokenBegi...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentences\n",
"0 {'index': 0, 'entitymentions': [{'docTokenBegi..."
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas\n",
"\n",
"# Let pandas parse the saved response file; the resulting frame is displayed.\n",
"sample_json_path = \"jsontest1.json\"\n",
"pandas.read_json(sample_json_path)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'one'"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Only the last expression of a cell is displayed, so return both lookups\n",
"# together instead of silently discarding the first one.\n",
"first_mention = d[\"sentences\"][0][\"entitymentions\"][0]\n",
"second_mention = d[\"sentences\"][1][\"entitymentions\"][0]\n",
"first_mention[\"ner\"], second_mention[\"text\"]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'entitymentions': [{'characterOffsetBegin': 23,\n",
" 'characterOffsetEnd': 28,\n",
" 'docTokenBegin': 5,\n",
" 'docTokenEnd': 6,\n",
" 'ner': 'COUNTRY',\n",
" 'text': 'Italy',\n",
" 'tokenBegin': 5,\n",
" 'tokenEnd': 6}],\n",
" 'index': 0,\n",
" 'tokens': [{'after': ' ',\n",
" 'before': '',\n",
" 'characterOffsetBegin': 0,\n",
" 'characterOffsetEnd': 4,\n",
" 'index': 1,\n",
" 'lemma': 'Rome',\n",
" 'ner': 'O',\n",
" 'originalText': 'Rome',\n",
" 'pos': 'NNP',\n",
" 'word': 'Rome'},\n",
" {'after': ' ',\n",
" 'before': ' ',\n",
" 'characterOffsetBegin': 5,\n",
" 'characterOffsetEnd': 7,\n",
" 'index': 2,\n",
" 'lemma': 'be',\n",
" 'ner': 'O',\n",
" 'originalText': 'is',\n",
" 'pos': 'VBZ',\n",
" 'word': 'is'},\n",
" {'after': ' ',\n",
" 'before': ' ',\n",
" 'characterOffsetBegin': 8,\n",
" 'characterOffsetEnd': 11,\n",
" 'index': 3,\n",
" 'lemma': 'the',\n",
" 'ner': 'O',\n",
" 'originalText': 'the',\n",
" 'pos': 'DT',\n",
" 'word': 'the'},\n",
" {'after': ' ',\n",
" 'before': ' ',\n",
" 'characterOffsetBegin': 12,\n",
" 'characterOffsetEnd': 19,\n",
" 'index': 4,\n",
" 'lemma': 'capital',\n",
" 'ner': 'O',\n",
" 'originalText': 'capital',\n",
" 'pos': 'NN',\n",
" 'word': 'capital'},\n",
" {'after': ' ',\n",
" 'before': ' ',\n",
" 'characterOffsetBegin': 20,\n",
" 'characterOffsetEnd': 22,\n",
" 'index': 5,\n",
" 'lemma': 'of',\n",
" 'ner': 'O',\n",
" 'originalText': 'of',\n",
" 'pos': 'IN',\n",
" 'word': 'of'},\n",
" {'after': '',\n",
" 'before': ' ',\n",
" 'characterOffsetBegin': 23,\n",
" 'characterOffsetEnd': 28,\n",
" 'index': 6,\n",
" 'lemma': 'Italy',\n",
" 'ner': 'COUNTRY',\n",
" 'originalText': 'Italy',\n",
" 'pos': 'NNP',\n",
" 'word': 'Italy'},\n",
" {'after': '',\n",
" 'before': '',\n",
" 'characterOffsetBegin': 28,\n",
" 'characterOffsetEnd': 29,\n",
" 'index': 7,\n",
" 'lemma': '.',\n",
" 'ner': 'O',\n",
" 'originalText': '.',\n",
" 'pos': '.',\n",
" 'word': '.'}]}]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the parsed per-sentence annotations.\n",
"sentences = d[\"sentences\"]\n",
"sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment