JnBrymn-EB/Toxic Comment Elasticsearch.ipynb

## Toxic Comment Elasticsearch.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#bin/elasticsearch -Dhttp.port=9201 -Dtransport.tcp.port=9301\n",
    "from collections import Counter\n",
    "\n",
    "import pandas as pd\n",
    "from elasticsearch import Elasticsearch, helpers\n",
    "ES_HOST = 'localhost:9201'  # HTTP port 9201, not the default 9200\n",
    "es = Elasticsearch(ES_HOST)\n",
    "es.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Keep the dataset location in one place so the notebook can be repointed by editing a single line.\n",
    "DATA_DIR = '/Users/johnb/Personal/data_science/kaggle/toxic-comment/data'\n",
    "\n",
    "sample_submission = pd.read_csv(DATA_DIR + '/sample_submission.csv')\n",
    "test = pd.read_csv(DATA_DIR + '/test.csv')\n",
    "train = pd.read_csv(DATA_DIR + '/train.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name the index here too: on a fresh kernel this cell runs before the cell that defines `index`.\n",
    "index = 'toxic_comments'\n",
    "es.indices.delete(index=index, ignore=404)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index = 'toxic_comments'\n",
    "doc_type = 'comment'\n",
    "\n",
    "# The comment text is indexed under the standard analyzer and, through copy_to,\n",
    "# again under the english analyzer.\n",
    "text_properties = {\n",
    "    'comment_text': {\n",
    "        'type': 'string',\n",
    "        'analyzer': 'standard',\n",
    "        'copy_to': 'comment_text_english',\n",
    "    },\n",
    "    'comment_text_english': {\n",
    "        'type': 'string',\n",
    "        'analyzer': 'english',\n",
    "    },\n",
    "}\n",
    "\n",
    "# Every target label is mapped as a boolean field.\n",
    "label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
    "label_properties = {name: {'type': 'boolean'} for name in label_names}\n",
    "\n",
    "properties = dict(text_properties)\n",
    "properties.update(label_properties)\n",
    "\n",
    "settings = {\n",
    "    'settings': {'number_of_shards': 1, 'number_of_replicas': 0},\n",
    "    'mappings': {\n",
    "        doc_type: {\n",
    "            '_all': {'enabled': False},\n",
    "            'properties': properties,\n",
    "        },\n",
    "    },\n",
    "}\n",
    "es.indices.create(index, body=settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_action(docs, index, op_type='update', type='event'):\n",
    "    for doc in docs:\n",
    "        if op_type == 'update':\n",
    "            action = {\n",
    "                '_op_type': op_type,\n",
    "                '_index': index,\n",
    "                '_type': type,\n",
    "                'doc': doc,\n",
    "                'doc_as_upsert': True\n",
    "            }\n",
    "        elif op_type == 'create':\n",
    "            action = {\n",
    "                '_op_type': op_type,\n",
    "                '_index': index,\n",
    "                '_type': type,\n",
    "                '_source': doc\n",
    "            }\n",
    "        if 'id' in doc:\n",
    "            action['_id'] = doc['id']\n",
    "        yield action\n",
    "            \n",
    "        \n",
    "def get_training_iterator():\n",
    "    \"\"\"Yield each row of the `train` frame as a dict, in iteration order.\"\"\"\n",
    "    for _, record in train.iterrows():\n",
    "        yield record.to_dict()\n",
    "\n",
    "docs = get_training_iterator()\n",
    "actions = make_action(docs, index, 'create', 'comment')\n",
    "\n",
    "# Collect the detail of every item that streaming_bulk reports as not ok.\n",
    "# NOTE(review): with the helper's default raise_on_error setting a failed item may raise\n",
    "# instead of reaching the `not ok` branch -- confirm whether failures are meant to be collected.\n",
    "details = []\n",
    "count = 0\n",
    "for ok, detail in helpers.streaming_bulk(es, actions):\n",
    "    count += 1\n",
    "    if not count % 1000:\n",
    "        print(count)\n",
    "    if not ok:\n",
    "        details.append(detail)  # call the method; subscripting it raised TypeError\n",
    "\n",
    "print(len(details))\n",
    "\n",
    "# Merge down to a single segment now that indexing is finished.\n",
    "es.indices.forcemerge(index=index, max_num_segments=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "size = 25\n",
    "\n",
    "def make_query(row):\n",
    "    body = {\n",
    "        'query': {\n",
    "            'multi_match': {\n",
    "                'query': row['comment_text'],\n",
    "                'fields': ['comment_text', 'comment_text_english'],\n",
    "                'type': 'best_fields',\n",
    "            }\n",
    "        },\n",
    "        'size': size,\n",
    "        'fields': ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],\n",
    "    }\n",
    "    return body\n",
    "\n",
    "def process_response(row, resp):\n",
    "    if 'hits' not in resp or resp['hits']['total'] == 0:\n",
    "        print('no hits for {}'.format(row['id']))\n",
    "        print(resp)\n",
    "        answer = dict(zip(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], [0,0,0,0,0,0]))\n",
    "        answer['id'] = row['id']\n",
    "        return answer\n",
    "    hits = [hit['fields'] for hit in resp['hits']['hits']]\n",
    "    counter = Counter()\n",
    "    for hit in hits:\n",
    "        for k,v in hit.items():\n",
    "            hit[k] = v[0]\n",
    "        counter.update(hit)\n",
    "    for k,v in counter.items():\n",
    "        counter[k] /= size\n",
    "    answer = dict(counter)\n",
    "    answer['id'] = row['id']\n",
    "    return answer\n",
    "\n",
    "def get_test_iterator():\n",
    "    \"\"\"Yield each row of the `test` frame as a dict, in iteration order.\"\"\"\n",
    "    for _, record in test.iterrows():\n",
    "        yield record.to_dict()\n",
    "        \n",
    "def get_batch_iterator(row_iterator, batch_size):\n",
    "    items = []\n",
    "    for item in row_iterator:\n",
    "        items.append(item)\n",
    "        if len(items) == batch_size:\n",
    "            yield items\n",
    "            items = []\n",
    "    yield items\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_iterator = get_test_iterator()\n",
    "\n",
    "batch_size = 13*7\n",
    "batch_count = 0\n",
    "report_count = 1000\n",
    "# Print progress about every report_count rows; keep the interval >= 1 to avoid modulo by zero.\n",
    "report_batch = max(1, report_count // batch_size)\n",
    "\n",
    "answers = []\n",
    "for batch in get_batch_iterator(test_iterator, batch_size):\n",
    "    if not batch:\n",
    "        continue  # an empty batch has nothing to search for\n",
    "\n",
    "    # The request alternates a header entry and a query body, one pair per row.\n",
    "    msearch_request = []\n",
    "    for row in batch:\n",
    "        msearch_request.append({'index': index, 'type': doc_type})\n",
    "        msearch_request.append(make_query(row))\n",
    "\n",
    "    responses = es.msearch(body=msearch_request)['responses']\n",
    "\n",
    "    answers.extend(process_response(row, resp) for row, resp in zip(batch, responses))\n",
    "\n",
    "    batch_count += 1\n",
    "    if not batch_count % report_batch:\n",
    "        print(len(answers))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# DataFrame.reindex_axis is deprecated and gone from current pandas; reindex(columns=...) selects and orders the same columns.\n",
    "answers = pd.DataFrame(answers).reindex(columns=['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "submission_path = '../first_submission.csv'\n",
    "answers.to_csv(submission_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parameters to explore\n",
    "* k (in k-NN); k might be different per target dimension\n",
    "* boosting different fields\n",
    "* different analysis chains - like keeping CAPS and dropping very uncommon words (which score highly)\n",
    "* multimatch type\n",
    "* do something about punctuation and about repetitiveness of text"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"#bin/elasticsearch -Dhttp.port=9201 -Dtransport.tcp.port=9301\n",
	"from collections import Counter\n",
	"\n",
	"import pandas as pd\n",
	"from elasticsearch import Elasticsearch, helpers\n",
	"ES_HOST = 'localhost:9201'  # HTTP port 9201, not the default 9200\n",
	"es = Elasticsearch(ES_HOST)\n",
	"es.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Keep the dataset location in one place so the notebook can be repointed by editing a single line.\n",
	"DATA_DIR = '/Users/johnb/Personal/data_science/kaggle/toxic-comment/data'\n",
	"\n",
	"sample_submission = pd.read_csv(DATA_DIR + '/sample_submission.csv')\n",
	"test = pd.read_csv(DATA_DIR + '/test.csv')\n",
	"train = pd.read_csv(DATA_DIR + '/train.csv')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Index"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Name the index here too: on a fresh kernel this cell runs before the cell that defines `index`.\n",
	"index = 'toxic_comments'\n",
	"es.indices.delete(index=index, ignore=404)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"index = 'toxic_comments'\n",
	"doc_type = 'comment'\n",
	"\n",
	"# The comment text is indexed under the standard analyzer and, through copy_to,\n",
	"# again under the english analyzer.\n",
	"text_properties = {\n",
	"    'comment_text': {\n",
	"        'type': 'string',\n",
	"        'analyzer': 'standard',\n",
	"        'copy_to': 'comment_text_english',\n",
	"    },\n",
	"    'comment_text_english': {\n",
	"        'type': 'string',\n",
	"        'analyzer': 'english',\n",
	"    },\n",
	"}\n",
	"\n",
	"# Every target label is mapped as a boolean field.\n",
	"label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
	"label_properties = {name: {'type': 'boolean'} for name in label_names}\n",
	"\n",
	"properties = dict(text_properties)\n",
	"properties.update(label_properties)\n",
	"\n",
	"settings = {\n",
	"    'settings': {'number_of_shards': 1, 'number_of_replicas': 0},\n",
	"    'mappings': {\n",
	"        doc_type: {\n",
	"            '_all': {'enabled': False},\n",
	"            'properties': properties,\n",
	"        },\n",
	"    },\n",
	"}\n",
	"es.indices.create(index, body=settings)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def make_action(docs, index, op_type='update', type='event'):\n",
	" for doc in docs:\n",
	" if op_type == 'update':\n",
	" action = {\n",
	" '_op_type': op_type,\n",
	" '_index': index,\n",
	" '_type': type,\n",
	" 'doc': doc,\n",
	" 'doc_as_upsert': True\n",
	" }\n",
	" elif op_type == 'create':\n",
	" action = {\n",
	" '_op_type': op_type,\n",
	" '_index': index,\n",
	" '_type': type,\n",
	" '_source': doc\n",
	" }\n",
	" if 'id' in doc:\n",
	" action['_id'] = doc['id']\n",
	" yield action\n",
	" \n",
	" \n",
	"def get_training_iterator():\n",
	"    \"\"\"Yield each row of the `train` frame as a dict, in iteration order.\"\"\"\n",
	"    # Nesting restored: the loop body must be indented under the `for`.\n",
	"    for _, record in train.iterrows():\n",
	"        yield record.to_dict()\n",
	"\n",
	"docs = get_training_iterator()\n",
	"actions = make_action(docs, index, 'create', 'comment')\n",
	"\n",
	"# Collect the detail of every item that streaming_bulk reports as not ok.\n",
	"# NOTE(review): with the helper's default raise_on_error setting a failed item may raise\n",
	"# instead of reaching the `not ok` branch -- confirm whether failures are meant to be collected.\n",
	"details = []\n",
	"count = 0\n",
	"for ok, detail in helpers.streaming_bulk(es, actions):\n",
	"    count += 1\n",
	"    if not count % 1000:\n",
	"        print(count)\n",
	"    if not ok:\n",
	"        details.append(detail)  # call the method; subscripting it raised TypeError\n",
	"\n",
	"print(len(details))\n",
	"\n",
	"# Merge down to a single segment now that indexing is finished.\n",
	"es.indices.forcemerge(index=index, max_num_segments=1)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Query"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"size = 25\n",
	"\n",
	"def make_query(row):\n",
	" body = {\n",
	" 'query': {\n",
	" 'multi_match': {\n",
	" 'query': row['comment_text'],\n",
	" 'fields': ['comment_text', 'comment_text_english'],\n",
	" 'type': 'best_fields',\n",
	" }\n",
	" },\n",
	" 'size': size,\n",
	" 'fields': ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],\n",
	" }\n",
	" return body\n",
	"\n",
	"def process_response(row, resp):\n",
	" if 'hits' not in resp or resp['hits']['total'] == 0:\n",
	" print('no hits for {}'.format(row['id']))\n",
	" print(resp)\n",
	" answer = dict(zip(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], [0,0,0,0,0,0]))\n",
	" answer['id'] = row['id']\n",
	" return answer\n",
	" hits = [hit['fields'] for hit in resp['hits']['hits']]\n",
	" counter = Counter()\n",
	" for hit in hits:\n",
	" for k,v in hit.items():\n",
	" hit[k] = v[0]\n",
	" counter.update(hit)\n",
	" for k,v in counter.items():\n",
	" counter[k] /= size\n",
	" answer = dict(counter)\n",
	" answer['id'] = row['id']\n",
	" return answer\n",
	"\n",
	"def get_test_iterator():\n",
	"    \"\"\"Yield each row of the `test` frame as a dict, in iteration order.\"\"\"\n",
	"    # Nesting restored: the loop body must be indented under the `for`.\n",
	"    for _, record in test.iterrows():\n",
	"        yield record.to_dict()\n",
	" \n",
	"def get_batch_iterator(row_iterator, batch_size):\n",
	" items = []\n",
	" for item in row_iterator:\n",
	" items.append(item)\n",
	" if len(items) == batch_size:\n",
	" yield items\n",
	" items = []\n",
	" yield items\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"test_iterator = get_test_iterator()\n",
	"\n",
	"batch_size = 13*7\n",
	"batch_count = 0\n",
	"report_count = 1000\n",
	"# Print progress about every report_count rows; keep the interval >= 1 to avoid modulo by zero.\n",
	"report_batch = max(1, report_count // batch_size)\n",
	"\n",
	"answers = []\n",
	"for batch in get_batch_iterator(test_iterator, batch_size):\n",
	"    if not batch:\n",
	"        continue  # an empty batch has nothing to search for\n",
	"\n",
	"    # The request alternates a header entry and a query body, one pair per row.\n",
	"    msearch_request = []\n",
	"    for row in batch:\n",
	"        msearch_request.append({'index': index, 'type': doc_type})\n",
	"        msearch_request.append(make_query(row))\n",
	"\n",
	"    responses = es.msearch(body=msearch_request)['responses']\n",
	"\n",
	"    answers.extend(process_response(row, resp) for row, resp in zip(batch, responses))\n",
	"\n",
	"    batch_count += 1\n",
	"    if not batch_count % report_batch:\n",
	"        print(len(answers))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# DataFrame.reindex_axis is deprecated and gone from current pandas; reindex(columns=...) selects and orders the same columns.\n",
	"answers = pd.DataFrame(answers).reindex(columns=['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"submission_path = '../first_submission.csv'\n",
	"answers.to_csv(submission_path, index=False)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Parameters to explore\n",
	"* k (in k-NN); k might be different per target dimension\n",
	"* boosting different fields\n",
	"* different analysis chains - like keeping CAPS and dropping very uncommon words (which score highly)\n",
	"* multimatch type\n",
	"* do something about punctuation and about repetitiveness of text"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}