Skip to content

Instantly share code, notes, and snippets.

@jermnelson
Created August 5, 2021 16:07
Show Gist options
  • Save jermnelson/116b8923bd148e4e509852b199df34c5 to your computer and use it in GitHub Desktop.
Save jermnelson/116b8923bd148e4e509852b199df34c5 to your computer and use it in GitHub Desktop.
Using the GNRD API to send PDFs or full text and return a list of matches.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "c63e4345-190d-420f-b8d4-c6d5335f1371",
"metadata": {},
"source": [
"# Using Global Names Recognition and Discovery API\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b7873594-fd7e-46e6-8657-f77dced0484a",
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline\n",
"\n",
"import csv\n",
"import datetime\n",
"import pathlib\n",
"import requests\n",
"\n",
"import lxml.etree as etree\n",
"\n",
"gnrd_api_url = \"http://gnrd.globalnames.org/name_finder.json\"\n",
"papers_pdf = pathlib.Path(\"/Volumes/GoogleDrive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/papers_pdf\")\n",
"papers_tei = pathlib.Path(\"/Volumes/GoogleDrive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/papers_tei\")\n",
"TEI = {\"tei\": \"http://www.tei-c.org/ns/1.0\"}"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "97b4b723-66c8-4ecb-8724-61934f3467f4",
"metadata": {},
"outputs": [],
"source": [
"result1 = requests.post(gnrd_api_url, \n",
" files={ \"file\": first_paper.read_bytes()})"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "aa763b2a-ec55-49ff-b4d6-725720d20dda",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result1.status_code"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "a6113365-8218-4376-b63c-a6d4edb69897",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[{'verbatim': '(Myoxocephalus polyacanthocephalus)',\n",
" 'scientificName': 'Myoxocephalus polyacanthocephalus',\n",
" 'offsetStart': 93,\n",
" 'offsetEnd': 128},\n",
" {'verbatim': 'Myoxocephalus',\n",
" 'scientificName': 'Myoxocephalus',\n",
" 'offsetStart': 519,\n",
" 'offsetEnd': 532},\n",
" {'verbatim': '(M.\\npolyacanthocephalus)',\n",
" 'scientificName': 'M. polyacanthocephalus',\n",
" 'offsetStart': 1347,\n",
" 'offsetEnd': 1371},\n",
" {'verbatim': 'Leonardo',\n",
" 'scientificName': 'Leonardo',\n",
" 'offsetStart': 1590,\n",
" 'offsetEnd': 1598},\n",
" {'verbatim': 'Myoxocephalus polyacanthocephalus.',\n",
" 'scientificName': 'Myoxocephalus polyacanthocephalus',\n",
" 'offsetStart': 5667,\n",
" 'offsetEnd': 5701},\n",
" {'verbatim': '(Myoxocephalus polyacanthocephalus',\n",
" 'scientificName': 'Myoxocephalus polyacanthocephalus',\n",
" 'offsetStart': 5757,\n",
" 'offsetEnd': 5791},\n",
" {'verbatim': 'Leonardo',\n",
" 'scientificName': 'Leonardo',\n",
" 'offsetStart': 6354,\n",
" 'offsetEnd': 6362},\n",
" {'verbatim': 'M. polyacanthocephalus',\n",
" 'scientificName': 'M. polyacanthocephalus',\n",
" 'offsetStart': 7410,\n",
" 'offsetEnd': 7432},\n",
" {'verbatim': 'Leonardo',\n",
" 'scientificName': 'Leonardo',\n",
" 'offsetStart': 7491,\n",
" 'offsetEnd': 7499},\n",
" {'verbatim': '(Ammodytes',\n",
" 'scientificName': 'Ammodytes',\n",
" 'offsetStart': 8639,\n",
" 'offsetEnd': 8649},\n",
" {'verbatim': 'M.\\npolyacanthocephalus',\n",
" 'scientificName': 'M. polyacanthocephalus',\n",
" 'offsetStart': 18396,\n",
" 'offsetEnd': 18418},\n",
" {'verbatim': 'Esox lucius.',\n",
" 'scientificName': 'Esox lucius',\n",
" 'offsetStart': 33892,\n",
" 'offsetEnd': 33904},\n",
" {'verbatim': 'Paralichthys\\n\\nolivaceus',\n",
" 'scientificName': 'Paralichthys olivaceus',\n",
" 'offsetStart': 34092,\n",
" 'offsetEnd': 34115},\n",
" {'verbatim': '(Albula vulpes)',\n",
" 'scientificName': 'Albula vulpes',\n",
" 'offsetStart': 34272,\n",
" 'offsetEnd': 34287},\n",
" {'verbatim': 'Oncorhynchus\\n\\nketa',\n",
" 'scientificName': 'Oncorhynchus keta',\n",
" 'offsetStart': 34557,\n",
" 'offsetEnd': 34575},\n",
" {'verbatim': 'Quintana,',\n",
" 'scientificName': 'Quintana',\n",
" 'offsetStart': 35086,\n",
" 'offsetEnd': 35095}]"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result1.json()['names']"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "a4908b5a-e900-4825-b15a-0f097021cea4",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Started processing PDF reports with GNRD web services at 2021-06-04 15:43:59.930634\n",
"0...E34..50....E99.100.....150.....200.....250.....300.....350.....400.....450.....500.....550.....600.....650.....700.....750.....800.....850.....900.....950.....1000E1007.....1050.....1100.....1150.....1200...."
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-97-174075bb9b97>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfilepath\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_directory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mreport_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfilepath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m gnrd_result = requests.post(gnrd_api_url,\n\u001b[0m\u001b[1;32m 8\u001b[0m files={ \"file\": filepath.read_bytes() })\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/api.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m 117\u001b[0m \"\"\"\n\u001b[1;32m 118\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'post'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 62\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 540\u001b[0m }\n\u001b[1;32m 541\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 542\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 543\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 544\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 653\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 654\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 655\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 656\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 657\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mchunked\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 439\u001b[0;31m resp = conn.urlopen(\n\u001b[0m\u001b[1;32m 440\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 697\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 698\u001b[0m \u001b[0;31m# Make the request on the httplib connection object.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 699\u001b[0;31m httplib_response = self._make_request(\n\u001b[0m\u001b[1;32m 700\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 701\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 392\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 393\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 394\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 395\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 396\u001b[0m \u001b[0;31m# We are swallowing BrokenPipeError (errno.EPIPE) since the server is\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"user-agent\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mensure_str\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"User-Agent\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_default_user_agent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 234\u001b[0;31m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 235\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 236\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers, encode_chunked)\u001b[0m\n\u001b[1;32m 1228\u001b[0m encode_chunked=False):\n\u001b[1;32m 1229\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1230\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1231\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1232\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers, encode_chunked)\u001b[0m\n\u001b[1;32m 1274\u001b[0m \u001b[0;31m# default charset of iso-8859-1.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1275\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_encode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1276\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1277\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1278\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body, encode_chunked)\u001b[0m\n\u001b[1;32m 1223\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1224\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1225\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1227\u001b[0m def request(self, method, url, body=None, headers={}, *,\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body, encode_chunked)\u001b[0m\n\u001b[1;32m 1041\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf'{len(chunk):X}\\r\\n'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ascii'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mchunk\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1042\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34mb'\\r\\n'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1043\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1044\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1045\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mencode_chunked\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_http_vsn\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m11\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 963\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 965\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msendall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 966\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcollections\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIterable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"gnrd_names_papers = []\n",
"errors = []\n",
"start = datetime.datetime.utcnow()\n",
"print(f\"Started processing PDF reports with GNRD web services at {start}\")\n",
"for i,filepath in enumerate(pdf_directory.iterdir()):\n",
" report_name = filepath.name.split(\".\")[:-1][0]\n",
" gnrd_result = requests.post(gnrd_api_url,\n",
" files={ \"file\": filepath.read_bytes() })\n",
"\n",
" if gnrd_result.status_code < 400:\n",
" retrieved_on = datetime.datetime.utcnow().isoformat()\n",
" for name in gnrd_result.json().get('names', []):\n",
" gnrd_names_papers.append( {\"report\": report_name, \n",
" \"scientificName\": name['scientificName'],\n",
" \"retrieved\": retrieved_on})\n",
" else:\n",
" errors.append({ \"report\": report_name, \"http_status\": gnrd_result.status_code})\n",
" print(f\"E{i}\", end=\"\")\n",
" if not i%10 and i > 0:\n",
" print(\".\", end=\"\")\n",
" if not i%50:\n",
" print(f\"{i}\", end=\"\")\n",
"end = datetime.datetime.utcnow()\n",
"print(f\"Finished processing at {end}, total time {(end-start).seconds / 60.} for {len(gnrd_names_papers)}\")"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "494c15aa-6129-43f3-b5d6-5b5a8743a856",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"43272"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(gnrd_names_papers)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "1ff40074-d4b8-42e3-b07f-446533da396c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'report': ['fhl_2011_Ho_26600'], 'http_status': 500},\n",
" {'report': ['fhl_2011_Witt_25966'], 'http_status': 500},\n",
" {'report': ['hms_hg072nm6762'], 'http_status': 413}]"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"errors"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "c51e0d86-1799-4e6a-b8fd-0650c12f1d3c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['hms_gr619jn4381']"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report_name"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "bd393635-74a1-4a0f-b345-291583a03616",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'report': 'hms_hd008gc7852',\n",
" 'scientificName': 'Chthamalus dalli',\n",
" 'retrieved': '2021-06-04T23:06:25.344051'}"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnrd_names_papers[-1]"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "3c241f59-6615-4fb4-9675-6b2b73380a0f",
"metadata": {},
"outputs": [],
"source": [
"for in gnrd_names_papers:\n",
" row['report'] = row['report'][0]"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "4aeca82c-ae29-436e-b613-8172754f3c44",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"with open(\"/Volumes/GoogleDrive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/gnrd_report.csv\",\n",
" 'w+', newline='') as fo:\n",
" field_names = ['report', 'scientificName', 'retrieved']\n",
" csv_writer = csv.DictWriter(fo, field_names)\n",
" \n",
" csv_writer.writeheader()\n",
" for row in gnrd_names_papers:\n",
" csv_writer.writerow(row)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"id": "63fde6aa-ce1c-452a-9ec9-a8f4e3d52a53",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1243"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"i"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d34a5761-5913-4b0d-8b4c-66b4200934eb",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Started processing PDF reports with GNRD web services at 2021-06-25 15:04:16.417841\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-df05c57cec09>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mreport_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfilepath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m gnrd_result = requests.post(gnrd_api_url,\n\u001b[0m\u001b[1;32m 10\u001b[0m files={ \"file\": filepath.read_bytes() })\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/api.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m 117\u001b[0m \"\"\"\n\u001b[1;32m 118\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'post'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 62\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 540\u001b[0m }\n\u001b[1;32m 541\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 542\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 543\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 544\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 653\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 654\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 655\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 656\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 657\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mchunked\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 439\u001b[0;31m resp = conn.urlopen(\n\u001b[0m\u001b[1;32m 440\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 697\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 698\u001b[0m \u001b[0;31m# Make the request on the httplib connection object.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 699\u001b[0;31m httplib_response = self._make_request(\n\u001b[0m\u001b[1;32m 700\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 701\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 392\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 393\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 394\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 395\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 396\u001b[0m \u001b[0;31m# We are swallowing BrokenPipeError (errno.EPIPE) since the server is\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/02021/sul-dlss/labs/ml-env/lib/python3.8/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"user-agent\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mensure_str\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"User-Agent\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_default_user_agent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 234\u001b[0;31m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 235\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 236\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers, encode_chunked)\u001b[0m\n\u001b[1;32m 1228\u001b[0m encode_chunked=False):\n\u001b[1;32m 1229\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1230\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1231\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1232\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers, encode_chunked)\u001b[0m\n\u001b[1;32m 1274\u001b[0m \u001b[0;31m# default charset of iso-8859-1.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1275\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_encode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1276\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1277\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1278\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body, encode_chunked)\u001b[0m\n\u001b[1;32m 1223\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1224\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1225\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1227\u001b[0m def request(self, method, url, body=None, headers={}, *,\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body, encode_chunked)\u001b[0m\n\u001b[1;32m 1041\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf'{len(chunk):X}\\r\\n'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ascii'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mchunk\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1042\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34mb'\\r\\n'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1043\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1044\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1045\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mencode_chunked\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_http_vsn\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m11\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 963\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 965\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msendall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 966\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcollections\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIterable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"gnrd_names_papers_remaining = []\n",
"errors_remaining = []\n",
"start = datetime.datetime.utcnow()\n",
"print(f\"Started processing PDF reports with GNRD web services at {start}\")\n",
"for i,filepath in enumerate(pdf_directory.iterdir()):\n",
" if i <= 1243:\n",
" continue\n",
" report_name = filepath.name.split(\".\")[:-1][0]\n",
" gnrd_result = requests.post(gnrd_api_url,\n",
" files={ \"file\": filepath.read_bytes() })\n",
"\n",
" if gnrd_result.status_code < 400:\n",
" retrieved_on = datetime.datetime.utcnow().isoformat()\n",
" for name in gnrd_result.json().get('names', []):\n",
" gnrd_names_papers_remaining.append( {\"report\": report_name, \n",
" \"scientificName\": name['scientificName'],\n",
" \"retrieved\": retrieved_on})\n",
" else:\n",
" errors_remaining.append({ \"report\": report_name, \"http_status\": gnrd_result.status_code})\n",
" print(f\"E{i}\", end=\"\")\n",
" if not i%10 and i > 0:\n",
" print(\".\", end=\"\")\n",
" if not i%50:\n",
" print(f\"{i}\", end=\"\")\n",
"end = datetime.datetime.utcnow()\n",
"print(f\"Finished processing at {end}, total time {(end-start).seconds / 60.} for {len(gnrd_names_papers)}\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "666d59f4-0efb-4fe9-8b92-ef78a7c04bc0",
"metadata": {},
"outputs": [],
"source": [
"taxa_report_names = []\n",
"for row in pdf_directory.iterdir():\n",
" taxa_report_names.append(row.name)\n",
"end = datetime.datetime.utcnow()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ed80001a-f37d-43b0-95ca-1c882aba6c13",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1705"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(taxa_report_names)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "0670442f-0145-43ec-8fdb-62965d9ad174",
"metadata": {},
"outputs": [],
"source": [
"import multiprocessing"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "10d0d26e-8a18-4731-952f-75811fdbb1c8",
"metadata": {},
"outputs": [],
"source": [
"def retrieve_names(report):\n",
" result = requests.post(gnrd_api_url,\n",
" files={'file': report.read_bytes()})\n",
" if result.status_code < 400:\n",
" return { 'report': filename, \n",
" 'names': result.json().get('names', []),\n",
" 'retrieved': datetime.datetime.utcnow().isoformat() }\n",
" return { 'report': filename, 'names': [], 'error': result.text }"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "e0f181a6-e21e-47ff-9a2a-222e373400c2",
"metadata": {},
"outputs": [],
"source": [
"def get_gnrd_names():\n",
" PROCESSES = 4\n",
" start = datetime.datetime.utcnow()\n",
" with multiprocessing.Pool(PROCESSES) as pool:\n",
" results = []\n",
" for i,filepath in enumerate(pdf_directory.iterdir()):\n",
" results.append(pool.apply_async(retrieve_names, filepath))\n",
" if not i%10 and i > 0:\n",
" print(\".\", end=\"\")\n",
" if not i%50:\n",
" print(f\"{i}\", end=\"\")\n",
" end = datetime.datetime.utcnow()\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "9b207fce-93be-40e9-b154-5eb3e59eae6b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.....50.....100.....150.....200.....250.....300.....350.....400.....450.....500.....550.....600.....650.....700.....750.....800.....850.....900.....950.....1000.....1050.....1100.....1150.....1200.....1250.....1300.....1350.....1400.....1450.....1500.....1550.....1600.....1650.....1700"
]
}
],
"source": [
"results = get_gnrd_names()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "250741bb-6663-453f-b6db-e5134a68fa54",
"metadata": {},
"outputs": [],
"source": [
"for i,row in enumerate(results):\n",
" if row.ready():\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9a73c93a-e7be-4090-bbbb-390d7f97cf23",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"502"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1705-1203"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b5f2155b-f186-459e-a62e-d2808de22f86",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1539"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1371 + 168"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "6d5a7e20-6dc5-4933-be4a-e5a31b9b1c85",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished at 2021-06-28 20:02:29.528724 total time 0.0\n"
]
}
],
"source": [
"start = datetime.datetime.utcnow()\n",
"result_text = requests.post(gnrd_api_url, \n",
" data={ \"text\": all_text, \"with_verification\": 'true'})\n",
"end = datetime.datetime.utcnow()\n",
"print(f\"Finished at {end} total time {(end-start).seconds / 60.}\")"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "441de6e8-0ccb-4d05-85c9-eca0c006fcce",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'token_url': 'http://gnrd.globalnames.org/name_finder.json?token=5bfmm7fkcg',\n",
" 'input_url': None,\n",
" 'file': None,\n",
" 'status': 200,\n",
" 'engine': 'gnfinder',\n",
" 'unique': False,\n",
" 'verbatim': True,\n",
" 'parameters': {'return_content': False,\n",
" 'with_verification': True,\n",
" 'preferred_data_sources': [],\n",
" 'detect_language': False,\n",
" 'engine': 0,\n",
" 'no_bayes': False},\n",
" 'names': [{'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 404,\n",
" 'offsetEnd': 413},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 547,\n",
" 'offsetEnd': 556},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 596,\n",
" 'offsetEnd': 605},\n",
" {'verbatim': 'Pelvetia fastiqiata',\n",
" 'scientificName': 'Pelvetia fastiqiata',\n",
" 'offsetStart': 798,\n",
" 'offsetEnd': 817},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 1399,\n",
" 'offsetEnd': 1408},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 1461,\n",
" 'offsetEnd': 1470},\n",
" {'verbatim': 'Cyanonlax hartwegii',\n",
" 'scientificName': 'Cyanonlax hartwegii',\n",
" 'offsetStart': 1829,\n",
" 'offsetEnd': 1848},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 2251,\n",
" 'offsetEnd': 2270},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 2806,\n",
" 'offsetEnd': 2825},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 3492,\n",
" 'offsetEnd': 3501},\n",
" {'verbatim': 'Cyanoplax.',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 3682,\n",
" 'offsetEnd': 3692},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 3759,\n",
" 'offsetEnd': 3778},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 3797,\n",
" 'offsetEnd': 3816},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 5765,\n",
" 'offsetEnd': 5784},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 6063,\n",
" 'offsetEnd': 6072},\n",
" {'verbatim': 'Cyanoplax hartwegii.',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 6128,\n",
" 'offsetEnd': 6148},\n",
" {'verbatim': 'Polyplacophora',\n",
" 'scientificName': 'Polyplacophora',\n",
" 'offsetStart': 6330,\n",
" 'offsetEnd': 6344},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 6432,\n",
" 'offsetEnd': 6451},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 6715,\n",
" 'offsetEnd': 6724},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 7103,\n",
" 'offsetEnd': 7112},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 7298,\n",
" 'offsetEnd': 7317},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 7429,\n",
" 'offsetEnd': 7448},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 7530,\n",
" 'offsetEnd': 7539},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 7581,\n",
" 'offsetEnd': 7600},\n",
" {'verbatim': 'Cyanonlax hartwegii',\n",
" 'scientificName': 'Cyanonlax hartwegii',\n",
" 'offsetStart': 7626,\n",
" 'offsetEnd': 7645},\n",
" {'verbatim': 'Sypharochiton',\n",
" 'scientificName': 'Sypharochiton',\n",
" 'offsetStart': 7910,\n",
" 'offsetEnd': 7923},\n",
" {'verbatim': 'Nuttallina',\n",
" 'scientificName': 'Nuttallina',\n",
" 'offsetStart': 7961,\n",
" 'offsetEnd': 7971},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 7992,\n",
" 'offsetEnd': 8001},\n",
" {'verbatim': '(Mollusca:',\n",
" 'scientificName': 'Mollusca',\n",
" 'offsetStart': 8060,\n",
" 'offsetEnd': 8070},\n",
" {'verbatim': 'Polyplacophora).',\n",
" 'scientificName': 'Polyplacophora',\n",
" 'offsetStart': 8071,\n",
" 'offsetEnd': 8087},\n",
" {'verbatim': '(Mollusca:',\n",
" 'scientificName': 'Mollusca',\n",
" 'offsetStart': 8116,\n",
" 'offsetEnd': 8126},\n",
" {'verbatim': 'Polyplacophora).',\n",
" 'scientificName': 'Polyplacophora',\n",
" 'offsetStart': 8127,\n",
" 'offsetEnd': 8143},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 8329,\n",
" 'offsetEnd': 8338},\n",
" {'verbatim': 'Sypharochiton pelliserpentis',\n",
" 'scientificName': 'Sypharochiton pelliserpentis',\n",
" 'offsetStart': 8757,\n",
" 'offsetEnd': 8785},\n",
" {'verbatim': 'Polyplacophora).',\n",
" 'scientificName': 'Polyplacophora',\n",
" 'offsetStart': 8797,\n",
" 'offsetEnd': 8813},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 9053,\n",
" 'offsetEnd': 9072},\n",
" {'verbatim': 'Acmaea',\n",
" 'scientificName': 'Acmaea',\n",
" 'offsetStart': 9326,\n",
" 'offsetEnd': 9332},\n",
" {'verbatim': 'Aplysia',\n",
" 'scientificName': 'Aplysia',\n",
" 'offsetStart': 9417,\n",
" 'offsetEnd': 9424},\n",
" {'verbatim': 'Henrietta fasciata',\n",
" 'scientificName': 'Henrietta fasciata',\n",
" 'offsetStart': 9495,\n",
" 'offsetEnd': 9513},\n",
" {'verbatim': 'Mollusca',\n",
" 'scientificName': 'Mollusca',\n",
" 'offsetStart': 9656,\n",
" 'offsetEnd': 9664},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 9858,\n",
" 'offsetEnd': 9867},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 10487,\n",
" 'offsetEnd': 10496},\n",
" {'verbatim': 'Cyanoplax hartwegii',\n",
" 'scientificName': 'Cyanoplax hartwegii',\n",
" 'offsetStart': 10525,\n",
" 'offsetEnd': 10544},\n",
" {'verbatim': 'Mollusca,',\n",
" 'scientificName': 'Mollusca',\n",
" 'offsetStart': 10843,\n",
" 'offsetEnd': 10852},\n",
" {'verbatim': 'Cyanoplax',\n",
" 'scientificName': 'Cyanoplax',\n",
" 'offsetStart': 10925,\n",
" 'offsetEnd': 10934},\n",
" {'verbatim': 'Cyanonlax hartwegii',\n",
" 'scientificName': 'Cyanonlax hartwegii',\n",
" 'offsetStart': 11346,\n",
" 'offsetEnd': 11365}],\n",
" 'language_used': 'eng',\n",
" 'execution_time': {'text_preparation_duration': 0.06183362007141113,\n",
" 'find_names_duration': 0.23911786079406738,\n",
" 'total_duration': 0.30721473693847656},\n",
" 'verified_names': [{'supplied_name_string': 'Cyanoplax',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Cyanoplax',\n",
" 'current_name_string': 'Cyanoplax',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Polyplacophora|Chitonida|Mopalioidea|Lepidochitonidae|Cyanoplax',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Pelvetia fastiqiata',\n",
" 'is_known_name': False,\n",
" 'results': {'match_value': 'FUZZY',\n",
" 'name_string': 'Pelvetia fastigiata (J. Agardh) De Toni, 1895',\n",
" 'current_name_string': 'Pelvetia fastigiata (J. Agardh) De Toni, 1895',\n",
" 'data_source_id': 8,\n",
" 'data_source_title': 'The Interim Register of Marine and Nonmarine Genera',\n",
" 'classification_path': 'Protista|Heterokontophyta|Phaeophyceae|Fucales|Fucaceae|Pelvetia|Pelvetia fastigiata',\n",
" 'edit_distance': 1},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Cyanonlax hartwegii',\n",
" 'is_known_name': False,\n",
" 'results': {'match_value': 'FUZZY',\n",
" 'name_string': 'Cyanoplax hartwegii (Carpenter, 1855)',\n",
" 'current_name_string': 'Cyanoplax hartwegii (Carpenter, 1855)',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Polyplacophora|Chitonida|Mopalioidea|Lepidochitonidae|Cyanoplax|Cyanoplax hartwegii',\n",
" 'edit_distance': 1},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Cyanoplax hartwegii',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Cyanoplax hartwegii (Carpenter, 1855)',\n",
" 'current_name_string': 'Cyanoplax hartwegii (Carpenter, 1855)',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Polyplacophora|Chitonida|Mopalioidea|Lepidochitonidae|Cyanoplax|Cyanoplax hartwegii',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Polyplacophora',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Polyplacophora',\n",
" 'current_name_string': 'Polyplacophora',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Polyplacophora',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Sypharochiton',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Sypharochiton',\n",
" 'current_name_string': 'Sypharochiton',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Polyplacophora|Chitonida|Chitonoidea|Chitonidae|Sypharochiton',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Nuttallina',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Nuttallina',\n",
" 'current_name_string': 'Nuttallina',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Polyplacophora|Chitonida|Mopalioidea|Lepidochitonidae|Nuttallina',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Mollusca',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Mollusca',\n",
" 'current_name_string': 'Mollusca',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Sypharochiton pelliserpentis',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Sypharochiton pelliserpentis (Quoy & Gaimard, 1835)',\n",
" 'current_name_string': 'Sypharochiton pelliserpentis (Quoy & Gaimard, 1835)',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Polyplacophora|Chitonida|Chitonoidea|Chitonidae|Sypharochiton|Sypharochiton pelliserpentis',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Acmaea',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Acmaea',\n",
" 'current_name_string': 'Acmaea',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Gastropoda|Not assigned|Lottioidea|Acmaeidae|Acmaea',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Aplysia',\n",
" 'is_known_name': True,\n",
" 'results': {'match_value': 'EXACT',\n",
" 'name_string': 'Aplysia',\n",
" 'current_name_string': 'Aplysia',\n",
" 'data_source_id': 1,\n",
" 'data_source_title': 'Catalogue of Life',\n",
" 'classification_path': 'Animalia|Mollusca|Gastropoda|Aplysiida|Aplysioidea|Aplysiidae|Aplysia',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []},\n",
" {'supplied_name_string': 'Henrietta fasciata',\n",
" 'is_known_name': False,\n",
" 'results': {'match_value': 'PARTIAL_EXACT',\n",
" 'name_string': 'Henrietta Macfad.',\n",
" 'current_name_string': 'Henrietta Macfad.',\n",
" 'data_source_id': 165,\n",
" 'data_source_title': 'Tropicos - Missouri Botanical Garden',\n",
" 'classification_path': '',\n",
" 'edit_distance': 0},\n",
" 'preferred_results': []}],\n",
" 'total': 46}"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_text.json()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "14dfc755-5569-4769-9bc1-5f25087682c8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Started processing PDF reports with GNRD web services at 2021-06-28 21:49:14.404673\n",
"0.....50.....100.....150.....200.....250.....300.....350.....400.....450.....500.....550.....600.....650.....700.....750.....800.....850.....900.....950.....1000.....1050...E1086..1100.....1150.....1200.....1250.....1300..E1329...1350.....1400.....1450.....1500..E1526...1550.....1600..E1622Finished processing at 2021-06-28 22:02:53.102518, total time 13.633333333333333 for 41367\n"
]
}
],
"source": [
"gnrd_names_papers = []\n",
"errors = []\n",
"start = datetime.datetime.utcnow()\n",
"print(f\"Started processing PDF reports with GNRD web services at {start}\")\n",
"for i,filepath in enumerate(papers_tei.iterdir()):\n",
" report_name = filepath.name.split(\".\")[:-1][0]\n",
" try:\n",
" report_xml = etree.XML(filepath.read_bytes())\n",
" except:\n",
" errors.append({\"report\": report_name, \"error\": \"XML Parse Error\"})\n",
" print(f\"E{i}\", end=\"\")\n",
" continue\n",
" report_body = report_xml.find(\"tei:text/tei:body\", namespaces=TEI)\n",
" raw_text = ''\n",
" for row in report_body.itertext():\n",
" raw_text += f\" {row}\"\n",
" gnrd_result = requests.post(gnrd_api_url,\n",
" data={ \"text\": raw_text })\n",
"\n",
" if gnrd_result.status_code < 400:\n",
" retrieved_on = datetime.datetime.utcnow().isoformat()\n",
" for name in gnrd_result.json().get('names', []):\n",
" gnrd_names_papers.append( {\"report\": report_name, \n",
" \"scientificName\": name['scientificName'],\n",
" \"retrieved\": retrieved_on})\n",
" else:\n",
" errors.append({ \"report\": report_name, \"http_status\": gnrd_result.status_code})\n",
" print(f\"E{i}\", end=\"\")\n",
" if not i%10 and i > 0:\n",
" print(\".\", end=\"\")\n",
" if not i%50:\n",
" print(f\"{i}\", end=\"\")\n",
"end = datetime.datetime.utcnow()\n",
"print(f\"Finished processing at {end}, total time {(end-start).seconds / 60.} for {len(gnrd_names_papers)}\")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "24806ad3-a9e5-4b16-b118-2f08dbf29e01",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"41367"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(gnrd_names_papers )"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "c2674eed-592c-495e-8270-4ecb9ed7b41e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(errors)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "fcc29adf-6865-4a26-a857-bc41465889c8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'report': 'hms_zw387wm0746', 'http_status': 400},\n",
" {'report': 'usc_1976_Su_INDEX', 'http_status': 400},\n",
" {'report': 'usc_1979_Su_Ryall', 'http_status': 400},\n",
" {'report': '', 'error': 'XML Parse Error'}]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"errors"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "530d93ca-c5de-4cce-956d-7c59159efee7",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[{'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'Mitrella',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'Homalopoma',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'P. hirsutiusculus',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'T. funebralis',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'Littorina',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'Tegula funebralis',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'Pagurus samuelis',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'P. hirsutiusculus',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'Pagurus',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'},\n",
" {'report': 'hms_zx897dq3818',\n",
" 'scientificName': 'Tegula funebralis',\n",
" 'retrieved': '2021-06-28T21:58:41.768659'}]"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnrd_names_papers[30000:30010]"
]
},
{
"cell_type": "markdown",
"id": "93b82f1a-6f61-434d-b572-094f94484917",
"metadata": {},
"source": [
"## Process OSU PDF Papers\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b809e111-8d42-4445-b3f2-d30aa5deb89f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start processing OSU Papers at 2021-07-02 16:28:04.507753\n",
"0 1 2 3 4 5 6 7 8 9 Finish processing OSU Papers at 2021-07-02 16:28:42.247340, total time 0.6166666666666667 minutes\n"
]
}
],
"source": [
"osu_results, osu_errors = [],[] \n",
"start = datetime.datetime.utcnow()\n",
"print(f\"Start processing OSU Papers at {start}\")\n",
"for i,paper in enumerate(papers_pdf.glob(\"osu*\")):\n",
" report_name = paper.name.split(\".\")[0]\n",
" result = requests.post(gnrd_api_url, \n",
" files={ \"file\": paper.read_bytes()}) \n",
" if result.status_code < 400:\n",
" retrieved_on = datetime.datetime.utcnow().isoformat()\n",
" for name in result.json().get('names', []):\n",
" osu_results.append({\"report\": report_name, \n",
" \"scientificName\": name['scientificName'],\n",
" \"retrieved\": retrieved_on})\n",
" print(f\"{i}\", end=\" \")\n",
" else:\n",
" osu_errors.append({ \"report\": report_name, \"http_status\": gnrd_result.status_code})\n",
" print(f\"E{i}\", end=\" \")\n",
"end = datetime.datetime.utcnow()\n",
"print(f\"Finish processing OSU Papers at {end}, total time {(end-start).seconds / 60.} minutes\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "0a2e0bcb-7ab9-4809-963d-2d613e656783",
"metadata": {},
"outputs": [],
"source": [
"with open(\"/Volumes/GoogleDrive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/gnrd_report.csv\") as fo:\n",
" gnrd_names_papers = [row for row in csv.DictReader(fo)]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c28fb725-1d29-4382-9cd2-1e79e4147228",
"metadata": {},
"outputs": [],
"source": [
"with open(\"/Volumes/GoogleDrive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/gnrd_report.csv\",\n",
" 'w+', newline='') as fo:\n",
" field_names = ['report', 'scientificName', 'retrieved']\n",
" csv_writer = csv.DictWriter(fo, field_names)\n",
"\n",
" csv_writer.writeheader()\n",
" for row in gnrd_names_papers:\n",
" csv_writer.writerow(row)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7acc9f3d-3f1a-4ac9-96f4-f39df0115878",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"41367"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(gnrd_names_papers)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2989a9d7-da57-4c1b-a988-cfdc13cd24e7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 237 osu_20200612144428207\n",
"Found 238 osu_20200612144428207\n",
"Found 239 osu_20200612144428207\n",
"Found 240 osu_20200612144428207\n",
"Found 241 osu_20200612144428207\n",
"Found 242 osu_20200612144428207\n",
"Found 243 osu_20200612144428207\n",
"Found 244 osu_20200612144428207\n",
"Found 237 osu_20200612144428207\n",
"Found 238 osu_20200612144428207\n",
"Found 239 osu_20200612144428207\n",
"Found 240 osu_20200612144428207\n",
"Found 241 osu_20200612144428207\n",
"Found 242 osu_20200612144428207\n",
"Found 243 osu_20200612144428207\n",
"Found 244 osu_20200612144428207\n"
]
}
],
"source": [
"for row in gnrd_names_papers:\n",
" for i,osu_report in enumerate(osu_results):\n",
" if row['report'] == osu_report['report']:\n",
" print(f\"Found {i} {osu_report['report']}\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "45bff310-244c-43aa-85f1-017a463e638c",
"metadata": {},
"outputs": [],
"source": [
"for row in osu_results:\n",
" if row['report'].startswith('osu_20200612144428207'):\n",
" continue\n",
" gnrd_names_papers.append(row)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "554fecbe-79a4-4066-a186-8db7a0aec0c9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'report': 'osu_20200612145031481',\n",
" 'scientificName': 'Geobacteraceae',\n",
" 'retrieved': '2021-07-02T16:28:05.956653'},\n",
" {'report': 'osu_20200612145031481',\n",
" 'scientificName': 'Geobacter sulfurreducens',\n",
" 'retrieved': '2021-07-02T16:28:05.956653'},\n",
" {'report': 'osu_20200612154034430',\n",
" 'scientificName': 'Panopea abrupta',\n",
" 'retrieved': '2021-07-02T16:28:09.410947'},\n",
" {'report': 'osu_20200612154034430',\n",
" 'scientificName': 'Panopea abrupta',\n",
" 'retrieved': '2021-07-02T16:28:09.410947'},\n",
" {'report': 'osu_20200612154034430',\n",
" 'scientificName': 'Sebastes',\n",
" 'retrieved': '2021-07-02T16:28:09.410947'},\n",
" {'report': 'osu_20200612154034430',\n",
" 'scientificName': 'Arctica islandica',\n",
" 'retrieved': '2021-07-02T16:28:09.410947'},\n",
" {'report': 'osu_20200612154034430',\n",
" 'scientificName': 'Panopea abrupta',\n",
" 'retrieved': '2021-07-02T16:28:09.410947'},\n",
" {'report': 'osu_20200612154034430',\n",
" 'scientificName': 'P. abrupta',\n",
" 'retrieved': '2021-07-02T16:28:09.410947'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'Lutjanus campechanus',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. pinniger',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. melanops',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. pinniger',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. maliger',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. melanops',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. pinniger',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. maliger',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. mystinus',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. paucispinis',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. auriculatus',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. caurinus',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. elongates',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'S. entomelas',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'Alena',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'Alena',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153848020',\n",
" 'scientificName': 'Torn tunica externus',\n",
" 'retrieved': '2021-07-02T16:28:12.416188'},\n",
" {'report': 'osu_20200612153616194',\n",
" 'scientificName': 'Oncorhynchus tshawytsha',\n",
" 'retrieved': '2021-07-02T16:28:15.793740'},\n",
" {'report': 'osu_20200612153616194',\n",
" 'scientificName': 'Micas',\n",
" 'retrieved': '2021-07-02T16:28:15.793740'},\n",
" {'report': 'osu_20200612153616194',\n",
" 'scientificName': 'Micas',\n",
" 'retrieved': '2021-07-02T16:28:15.793740'},\n",
" {'report': 'osu_20200612153616194',\n",
" 'scientificName': 'Micas',\n",
" 'retrieved': '2021-07-02T16:28:15.793740'},\n",
" {'report': 'osu_20200612153616194',\n",
" 'scientificName': 'Acaca',\n",
" 'retrieved': '2021-07-02T16:28:15.793740'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Gadus macrocepltalus',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Gadus morhua',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Theragra chalcogramma',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Gadus macrocephalus',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Artemia',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Theragra chalcogramma larvae',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Gadus morhua',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Pleuronectes platessa',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Scophthalmus maximus',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Melanogrammus aeglefinus',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Gadus morhua',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Theragra chalcogramma',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Theragra chalcogramma',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153509246',\n",
" 'scientificName': 'Phyllis',\n",
" 'retrieved': '2021-07-02T16:28:18.768755'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Oncorhynchus mykiss',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Jessica',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Oncorhynchus mykiss',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Campana',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Brevoortia',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Morone sexatilis',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153112738',\n",
" 'scientificName': 'Salmo',\n",
" 'retrieved': '2021-07-02T16:28:22.519968'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Oncorhynchus mykiss mykiss',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Oncorhynchus mykiss mykiss',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Plagioporus shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'P. shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Deropegus aspina',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Plagioporus shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Anisakis simplex',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'P. shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Rhadinorhynchus trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'P. shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Hysterothylacium aduncum',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'P. shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'H. aduncum',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'R. trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'R. trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'H. aduncum',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'R. trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'R. trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'R. trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'R. trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Centropages abdominalis',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Pseudocalanus minutus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'C. abdominalis',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'P. minutus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Anisakis',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Deropegus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Hemiurus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Plagioporus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Hysterothylacium',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Rhadinorhynchus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Apophallus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'L. gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'P. shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'P. shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'P. shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Reinhardtius hippoglossoides',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Oncorhynchus kisutch',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Anisakis',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Nematoda',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Mallotus villosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Clupea harengus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Delphinapterus leucas',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Digenea',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Hemiuroidea',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Deropegus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'D. aspina',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Oncorhynchus c�arki clarki',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Plagioporus shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Trematoda',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Opecoelidae',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Plagioporus shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Crepidostomum farionis',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Deropegus aspina',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Parahemiurus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Rhadinorhynchus trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Hysterothylacium aduncum',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Anisakis simplex unknown',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Rhadinorhynchus trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Cestoda',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Nematoda',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Anisakis simplex',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Hysterothylacium aduncum unknown',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Trematoda',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Crepidostomum farionis',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Deropegus aspina',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Parahemiurus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Plagioporus shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Oncorhynchus mykiss',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Rhadinorhynchus trachuri',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Cestoda',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Nematoda',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Anisakis simplex',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Hysterothylacium aduncum',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Trematoda',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Crepidostomum farionis',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Deropegus aspina',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Lecithaster gibbosus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Parahemiurus',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612153337143',\n",
" 'scientificName': 'Plagioporus shawi',\n",
" 'retrieved': '2021-07-02T16:28:28.960387'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Neotrypaea',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Callianassa',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis larvae',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Neotrypaea californiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Neot�ypaea californiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis larvae',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis larvae',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis larvae',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'N. californiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Neotrypaea californiensus larvae',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Callianassa',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Neotrypea californiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Callianassa cal�fomiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Crustacea',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Decapoda',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152633039',\n",
" 'scientificName': 'Callianassa califomiensis',\n",
" 'retrieved': '2021-07-02T16:28:32.403263'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Crustacea',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia pugettensis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione grijfenis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia pugettensis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione grijfenis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia littoralis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia pugettensis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia sex',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Gyge branchialis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Neotrypaea californiensis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia pugettensis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Probopyrus pandalicola',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Palaemonetes paludosus',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Crustacea',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Isopoda',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Crustacea',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Decapoda',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Cornalia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Crustacea',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Gyge branchialis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia littoralis',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Orthione',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612152822066',\n",
" 'scientificName': 'Upogebia',\n",
" 'retrieved': '2021-07-02T16:28:34.997472'},\n",
" {'report': 'osu_20200612144428207',\n",
" 'scientificName': 'Sebastes diploproa',\n",
" 'retrieved': '2021-07-02T16:28:42.246943'},\n",
" {'report': 'osu_20200612144428207',\n",
" 'scientificName': 'Echeverria',\n",
" 'retrieved': '2021-07-02T16:28:42.246943'},\n",
" {'report': 'osu_20200612144428207',\n",
" 'scientificName': 'Sebastes pinniger',\n",
" 'retrieved': '2021-07-02T16:28:42.246943'},\n",
" {'report': 'osu_20200612144428207',\n",
" 'scientificName': 'S. diploproa',\n",
" 'retrieved': '2021-07-02T16:28:42.246943'},\n",
" {'report': 'osu_20200612144428207',\n",
" 'scientificName': 'Scorpaenidae',\n",
" 'retrieved': '2021-07-02T16:28:42.246943'},\n",
" {'report': 'osu_20200612144428207',\n",
" 'scientificName': 'Sebastes',\n",
" 'retrieved': '2021-07-02T16:28:42.246943'},\n",
" {'report': 'osu_20200612144428207',\n",
" 'scientificName': 'Echeverria',\n",
" 'retrieved': '2021-07-02T16:28:42.246943'},\n",
" {'report': 'osu_20200612144428207',\n",
" 'scientificName': 'Sebastes flavidus',\n",
" 'retrieved': '2021-07-02T16:28:42.246943'}]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with open(\"osu_results.pk\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ee63e25-535e-4c74-84fe-7821c9651ff7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment