Skip to content

Instantly share code, notes, and snippets.

@peeter-t2
Created May 20, 2019 18:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peeter-t2/6e9cac6547af63662f490c63f1671b6c to your computer and use it in GitHub Desktop.
Save peeter-t2/6e9cac6547af63662f490c63f1671b6c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#libraries\n",
"import urllib.request"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download the ECCO text of every medical document listed in medical_IDs.txt.\n",
"#\n",
"# The file holds one document ID per line. For each ID the Octavo search\n",
"# endpoint is queried and the JSON response is saved as\n",
"# data/med_texts/<ID>.json (the data/med_texts directory is assumed to exist).\n",
"# Processing stops at the first blank line or after 199 IDs.\n",
"\n",
"filepath = 'medical_IDs.txt'\n",
"with open(filepath) as fp:\n",
"    for cnt, raw_line in enumerate(fp, start=1):\n",
"        line = raw_line.strip()\n",
"        if not line or cnt >= 200:\n",
"            break\n",
"        print(\"Line {}: {}\".format(cnt, line))\n",
"        # Build the URL from the ID that was just printed. Do not advance the\n",
"        # file before downloading: that skips the first ID and ends with a\n",
"        # request for an empty ID at end of file.\n",
"        url = \"https://vm0824.kaj.pouta.csc.fi/octavo/ecco/search??pretty&fieldEnricher=&offsetDataConverter=&query=%3CDOCUMENT%C2%A7documentID%3A\"+line+\"%0A%C2%A7DOCUMENT%3E&field=content&offset=0&limit=20&snippetLimit=20&contextLevel=Sentence&contextExpandLeft=0&contextExpandRight=0&level=&endpoint=https%3A%2F%2Fvm0824.kaj.pouta.csc.fi%2Foctavo%2Fecco%2F\"\n",
"        # A non-2xx response (e.g. 503) raises urllib.error.HTTPError and aborts the run.\n",
"        urllib.request.urlretrieve(url,\"data/med_texts/\"+line+\".json\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download the ECCO text of every history document listed in\n",
"# history_IDs_norepeats.txt.\n",
"#\n",
"# The file holds one document ID per line. For each ID the Octavo search\n",
"# endpoint is queried and the JSON response is saved as\n",
"# data/hist_texts/<ID>.json (the data/hist_texts directory is assumed to exist).\n",
"# Processing stops at the first blank line or at end of file.\n",
"\n",
"filepath = 'history_IDs_norepeats.txt'\n",
"with open(filepath) as fp:\n",
"    for cnt, raw_line in enumerate(fp, start=1):\n",
"        line = raw_line.strip()\n",
"        if not line:\n",
"            break\n",
"        print(\"Line {}: {}\".format(cnt, line))\n",
"        # Build the URL from the ID that was just printed. Do not advance the\n",
"        # file before downloading: that skips the first ID and ends with a\n",
"        # request for an empty ID at end of file.\n",
"        url = \"https://vm0824.kaj.pouta.csc.fi/octavo/ecco/search??pretty&fieldEnricher=&offsetDataConverter=&query=%3CDOCUMENT%C2%A7documentID%3A\"+line+\"%0A%C2%A7DOCUMENT%3E&field=content&offset=0&limit=20&snippetLimit=20&contextLevel=Sentence&contextExpandLeft=0&contextExpandRight=0&level=&endpoint=https%3A%2F%2Fvm0824.kaj.pouta.csc.fi%2Foctavo%2Fecco%2F\"\n",
"        # A non-2xx response raises urllib.error.HTTPError and aborts the run.\n",
"        urllib.request.urlretrieve(url,\"data/hist_texts/\"+line+\".json\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('data/filename.json', <http.client.HTTPMessage at 0x7f2d04040550>)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#import urllib.request\n",
"\n",
"#url = \"https://vm0824.kaj.pouta.csc.fi/octavo/ecco/search??pretty&fieldEnricher=&offsetDataConverter=&query=%3CDOCUMENT%C2%A7documentID%3A0071400200%0A%C2%A7DOCUMENT%3E&field=content&offset=0&limit=20&snippetLimit=20&contextLevel=Sentence&contextExpandLeft=0&contextExpandRight=0&level=&endpoint=https%3A%2F%2Fvm0824.kaj.pouta.csc.fi%2Foctavo%2Fecco%2F\"\n",
"\n",
"#urllib.request.urlretrieve(url,\"data/filename.json\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#import json\n",
"#with open('data/filename.json') as f:\n",
"# d = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'd' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-10-aa11dfae24b0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0md\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"results\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"docs\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'd' is not defined"
]
}
],
"source": [
"#print(d[\"results\"][\"docs\"][0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment