dodijk/ClueWebService.ipynb

## ClueWebService.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              ClueWebService.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Indri Server.ipynb
{
 "metadata": {
  "name": "",
  "signature": "sha256:2916867824ab5339aacf3fe9f893c5d0bfa76e43bb2d1d871da5225a5eb81a76"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "This small webservice will serve an indri index using the [dumpindex](http://sourceforge.net/p/lemur/wiki/dumpdoc,%20dumpterm,%20and%20dumpindex/) command. It was created by [Daan Odijk](http://daan.odijk.me) in December 2014."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from flask import Flask\n",
      "\n",
      "indri = \"~/indri/indri-5.5/bin/dumpindex /datastore/rreinan1/ClueWeb09_English_1.index-nostopwording\"\n",
      "\n",
      "usage_text = \"\"\"This web service allows a user to search indexed documents using an indexed term, \n",
      "and provides access to general statistics about the index.\n",
      "\n",
      "The webservice is more or less a \"Swiss-army knife\" for various index functions.\n",
      "\n",
      "Commands for retrieving data from a repository are as follows:\n",
      "\n",
      "Command               Argument(s)  Description\n",
      "/                     (None)       Print this usage description\n",
      "/term (/t)            Term text    Print inverted list for a term\n",
      "/termpositions (/tp)  Term text    Print inverted list for a term, with positions\n",
      "/fieldpositions (/fp) Field name   Print inverted list for a field, with positions\n",
      "/documentname (/dn)   Document ID  Print the text representation of a document ID \n",
      "/documenttext (/dt)   Document ID  Print the text of a document\n",
      "/documentdata (/dd)   Document ID  Print the full representation of a document\n",
      "/documentvector (/dv) Document ID  Print the document vector of a document\n",
      "/spam  (/sp)          Document ID  Print the spamminess percentile score*\n",
      "/stats (/s)           (None)       Print statistics for the Repository\n",
      "\n",
      "* The percentile score indicates the percentage of the documents in the corpus that are \"spammier.\" \n",
      "  That is, the spammiest 1% of the documents have percentile-score=0, the next spammiest have \n",
      "  percentile-score=1, and so on. The least spammy 1% have percentile-score=99. If you just want to \n",
      "  label pages as spam or not, label those with percentile-score<70 to be spam, and the rest non-spam.\"\"\"\n",
      "\n",
      "app = Flask(\"Flask\")\n",
      "@app.route(\"/\")\n",
      "def usage():\n",
      "    return usage_text, 200, {'Content-Type': 'text/plain; charset=utf-8'}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import cPickle as pickle\n",
      "with open(\"/datastore/dodijk2/ClueWeb09-SpamRankings.subsetB.pickle\") as f:\n",
      "    spamscores = pickle.load(f)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "@app.route(\"/spamcw/<docid>\")\n",
      "def spam_clueweb(docid):\n",
      "    print docid\n",
      "    a, b, c = docid[12:].split(\"-\")\n",
      "    print a, b, c\n",
      "    out = spamscores[a][int(b)][int(c)]\n",
      "    print out\n",
      "    return str(out), 200, {'Content-Type': 'text/plain; charset=utf-8'}\n",
      "\n",
      "@app.route(\"/sp/<int:docid>\")\n",
      "@app.route(\"/spam/<int:docid>\")\n",
      "def spam(docid):\n",
      "    docname = !$indri documentname $docid\n",
      "    return spam_clueweb(\"\\n\".join(docname))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import re\n",
      "\n",
      "allowed_commands = \"term t termpositions tp fieldpositions fp documentname dn documenttext dt documentdata dd documentvector dv stats s\".split()\n",
      "@app.route(\"/<command>\")\n",
      "@app.route(\"/<command>/<query>\")\n",
      "def command(command, query=\"\"):\n",
      "    if not command in allowed_commands: return usage()\n",
      "    match = re.search(u\"[\\w-]+\", query, re.UNICODE) # Keep only the first run of word characters and hyphens.\n",
      "    if not command in (\"stats\", \"s\"):\n",
      "        if not match or not match.group(0): return usage()\n",
      "        print command, \"|\", match.group(0), \"|\", query\n",
      "        query = match.group(0)\n",
      "    else: print command\n",
      "    out = !$indri $command $query\n",
      "    return \"\\n\".join(out), 200, {'Content-Type': 'text/plain; charset=utf-8'}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "app.run(host=\"0.0.0.0\", port=8003, debug=True, use_reloader=False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Spam\n",
      "\n",
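      "Process the [SpamRanking](http://durum0.uwaterloo.ca/clueweb09spam/) scores for the subset of ClueWeb09 segments used here and store them in the pickle file that the web service loads."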
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Process SpamRanking\n",
      "\n",
      "from random import randint\n",
      "spamscores = {}\n",
      "with open(\"/datastore/rreinan1/ClueWeb09-SpamRanking/clueweb09spam.Fusion\") as f:\n",
      "    for line in f:\n",
      "        score, doc = line.split()\n",
      "        include = doc[12:14] == \"wp\" and int(doc[14:16]) < 4\n",
      "        include |= doc[12:14] == \"00\" and int(doc[14:16]) < 12\n",
      "        if not include: continue\n",
      "        \n",
      "        assert doc[:12] == \"clueweb09-en\"\n",
      "        a, b, c = doc[12:].split(\"-\")\n",
      "        if a not in spamscores: spamscores[a] = []\n",
      "        if int(b) == len(spamscores[a]): spamscores[a].append([])\n",
      "        assert int(c) == len(spamscores[a][int(b)])\n",
      "        spamscores[a][int(b)].append(int(score))\n",
      "        if randint(0, 1000000) == 0: print doc\n",
      "            \n",
      "import cPickle as pickle\n",
      "with open(\"/datastore/dodijk2/ClueWeb09-SpamRankings.subsetB.pickle\", \"w\") as out:\n",
      "    pickle.dump(spamscores, out)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import cPickle as pickle\n",
      "pickle.dump(spamscores, open(\"/datastore/dodijk2/ClueWeb09-SpamRankings.subsetB.pickle\", \"w\"))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": "",
	"signature": "sha256:2916867824ab5339aacf3fe9f893c5d0bfa76e43bb2d1d871da5225a5eb81a76"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"This small webservice will serve an indri index using the [dumpindex](http://sourceforge.net/p/lemur/wiki/dumpdoc,%20dumpterm,%20and%20dumpindex/) command. It was created by [Daan Odijk](http://daan.odijk.me) in December 2014."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from flask import Flask\n",
	"\n",
	"indri = \"~/indri/indri-5.5/bin/dumpindex /datastore/rreinan1/ClueWeb09_English_1.index-nostopwording\"\n",
	"\n",
	"usage_text = \"\"\"This web service allows a user to search indexed documents using an indexed term, \n",
	"and provides access to general statistics about the index.\n",
	"\n",
	"The webservice is more or less a \"Swiss-army knife\" for various index functions.\n",
	"\n",
	"Commands for retrieving data from a repository are as follows:\n",
	"\n",
	"Command Argument(s) Description\n",
	"/ (None) Print this usage description\n",
	"/term (/t) Term text Print inverted list for a term\n",
	"/termpositions (/tp) Term text Print inverted list for a term, with positions\n",
	"/fieldpositions (/fp) Field name Print inverted list for a field, with positions\n",
	"/documentname (/dn) Document ID Print the text representation of a document ID \n",
	"/documenttext (/dt) Document ID Print the text of a document\n",
	"/documentdata (/dd) Document ID Print the full representation of a document\n",
	"/documentvector (/dv) Document ID Print the document vector of a document\n",
	"/spam (/sp) Document ID Print the spamminess percentile score*\n",
	"/stats (/s) (None) Print statistics for the Repository\n",
	"\n",
	"* The percentile score indicates the percentage of the documents in the corpus that are \"spammier.\" \n",
	" That is, the spammiest 1% of the documents have percentile-score=0, the next spammiest have \n",
	" percentile-score=1, and so on. The least spammy 1% have percentile-score=99. If you just want to \n",
	" label pages as spam or not, label those with percentile-score<70 to be spam, and the rest non-spam.\"\"\"\n",
	"\n",
	"app = Flask(\"Flask\")\n",
	"@app.route(\"/\")\n",
	"def usage():\n",
	" return usage_text, 200, {'Content-Type': 'text/plain; charset=utf-8'}"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import cPickle as pickle\n",
	"with open(\"/datastore/dodijk2/ClueWeb09-SpamRankings.subsetB.pickle\") as f:\n",
	" spamscores = pickle.load(f)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"@app.route(\"/spamcw/<docid>\")\n",
	"def spam_clueweb(docid):\n",
	" print docid\n",
	" a, b, c = docid[12:].split(\"-\")\n",
	" print a, b, c\n",
	" out = spamscores[a][int(b)][int(c)]\n",
	" print out\n",
	" return str(out), 200, {'Content-Type': 'text/plain; charset=utf-8'}\n",
	"\n",
	"@app.route(\"/sp/<int:docid>\")\n",
	"@app.route(\"/spam/<int:docid>\")\n",
	"def spam(docid):\n",
	" docname = !$indri documentname $docid\n",
	" return spam_clueweb(\"\\n\".join(docname))"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import re\n",
	"\n",
	"allowed_commands = \"term t termpositions tp fieldpositions fp documentname dn documenttext dt documentdata dd documentvector dv stats s\".split()\n",
	"@app.route(\"/<command>\")\n",
	"@app.route(\"/<command>/<query>\")\n",
	"def command(command, query=\"\"):\n",
	" if not command in allowed_commands: return usage()\n",
	" match = re.search(u\"[\\w-]+\", query, re.UNICODE) # Keep only the first run of word characters and hyphens.\n",
	" if not command in (\"stats\", \"s\"):\n",
	" if not match or not match.group(0): return usage()\n",
	" print command, \"|\", match.group(0), \"|\", query\n",
	" query = match.group(0)\n",
	" else: print command\n",
	" out = !$indri $command $query\n",
	" return \"\\n\".join(out), 200, {'Content-Type': 'text/plain; charset=utf-8'}"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"app.run(host=\"0.0.0.0\", port=8003, debug=True, use_reloader=False)"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Spam\n",
	"\n",
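	"Process the [SpamRanking](http://durum0.uwaterloo.ca/clueweb09spam/) scores for the subset of ClueWeb09 segments used here and store them in the pickle file that the web service loads."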
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Process SpamRanking\n",
	"\n",
	"from random import randint\n",
	"spamscores = {}\n",
	"with open(\"/datastore/rreinan1/ClueWeb09-SpamRanking/clueweb09spam.Fusion\") as f:\n",
	" for line in f:\n",
	" score, doc = line.split()\n",
	" include = doc[12:14] == \"wp\" and int(doc[14:16]) < 4\n",
	" include |= doc[12:14] == \"00\" and int(doc[14:16]) < 12\n",
	" if not include: continue\n",
	" \n",
	" assert doc[:12] == \"clueweb09-en\"\n",
	" a, b, c = doc[12:].split(\"-\")\n",
	" if a not in spamscores: spamscores[a] = []\n",
	" if int(b) == len(spamscores[a]): spamscores[a].append([])\n",
	" assert int(c) == len(spamscores[a][int(b)])\n",
	" spamscores[a][int(b)].append(int(score))\n",
	" if randint(0, 1000000) == 0: print doc\n",
	" \n",
	"import cPickle as pickle\n",
	"with open(\"/datastore/dodijk2/ClueWeb09-SpamRankings.subsetB.pickle\", \"w\") as out:\n",
	" pickle.dump(spamscores, out)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 26
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import cPickle as pickle\n",
	"pickle.dump(spamscores, open(\"/datastore/dodijk2/ClueWeb09-SpamRankings.subsetB.pickle\", \"w\"))"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}