Skip to content

Instantly share code, notes, and snippets.

@JnBrymn-EB
Created March 13, 2018 02:57
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JnBrymn-EB/d3c8582cc81a55f758d6505a7a4079e3 to your computer and use it in GitHub Desktop.
Save JnBrymn-EB/d3c8582cc81a55f758d6505a7a4079e3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#bin/elasticsearch -Dhttp.port=9201 -Dtransport.tcp.port=9301\n",
"from collections import Counter\n",
"\n",
"import pandas as pd\n",
"from elasticsearch import Elasticsearch, helpers\n",
"es = Elasticsearch('localhost:9201') # 9201!!!\n",
"es.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sample_submission = pd.read_csv('/Users/johnb/Personal/data_science/kaggle/toxic-comment/data/sample_submission.csv')\n",
"test = pd.read_csv('/Users/johnb/Personal/data_science/kaggle/toxic-comment/data/test.csv')\n",
"train = pd.read_csv('/Users/johnb/Personal/data_science/kaggle/toxic-comment/data/train.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"es.indices.delete(index, ignore=404)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"index = 'toxic_comments'\n",
"doc_type = 'comment'\n",
"settings = {\n",
" 'settings': {\n",
" 'number_of_shards': 1,\n",
" 'number_of_replicas': 0,\n",
" },\n",
" 'mappings': {\n",
" doc_type: {\n",
" '_all': {'enabled': False},\n",
" 'properties': {\n",
" 'comment_text': {\n",
" 'type': 'string',\n",
" 'analyzer': 'standard',\n",
" 'copy_to': 'comment_text_english',\n",
" },\n",
" 'comment_text_english': {\n",
" 'type': 'string',\n",
" 'analyzer': 'english',\n",
" },\n",
" 'toxic': {\n",
" 'type': 'boolean',\n",
" },\n",
" 'severe_toxic': {\n",
" 'type': 'boolean',\n",
" },\n",
" 'obscene': {\n",
" 'type': 'boolean',\n",
" },\n",
" 'threat': {\n",
" 'type': 'boolean',\n",
" },\n",
" 'insult': {\n",
" 'type': 'boolean',\n",
" },\n",
" 'identity_hate': {\n",
" 'type': 'boolean',\n",
" },\n",
" }\n",
" }\n",
" },\n",
"}\n",
"es.indices.create(index, body=settings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def make_action(docs, index, op_type='update', type='event'):\n",
" for doc in docs:\n",
" if op_type == 'update':\n",
" action = {\n",
" '_op_type': op_type,\n",
" '_index': index,\n",
" '_type': type,\n",
" 'doc': doc,\n",
" 'doc_as_upsert': True\n",
" }\n",
" elif op_type == 'create':\n",
" action = {\n",
" '_op_type': op_type,\n",
" '_index': index,\n",
" '_type': type,\n",
" '_source': doc\n",
" }\n",
" if 'id' in doc:\n",
" action['_id'] = doc['id']\n",
" yield action\n",
" \n",
" \n",
"def get_training_iterator():\n",
" for index, row in train.iterrows():\n",
" yield row.to_dict()\n",
"\n",
"docs = get_training_iterator()\n",
"actions = make_action(docs, index, 'create', 'comment')\n",
"\n",
"details = []\n",
"count = 0\n",
"for ok, detail in helpers.streaming_bulk(es, actions):\n",
" count += 1\n",
" if not count % 1000:\n",
" print(count)\n",
" if not ok:\n",
" details.append[detail]\n",
"\n",
"print(len(details))\n",
"\n",
"es.indices.forcemerge(index='toxic_comments', max_num_segments=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Query"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"size = 25\n",
"\n",
"def make_query(row):\n",
" body = {\n",
" 'query': {\n",
" 'multi_match': {\n",
" 'query': row['comment_text'],\n",
" 'fields': ['comment_text', 'comment_text_english'],\n",
" 'type': 'best_fields',\n",
" }\n",
" },\n",
" 'size': size,\n",
" 'fields': ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],\n",
" }\n",
" return body\n",
"\n",
"def process_response(row, resp):\n",
" if 'hits' not in resp or resp['hits']['total'] == 0:\n",
" print('no hits for {}'.format(row['id']))\n",
" print(resp)\n",
" answer = dict(zip(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], [0,0,0,0,0,0]))\n",
" answer['id'] = row['id']\n",
" return answer\n",
" hits = [hit['fields'] for hit in resp['hits']['hits']]\n",
" counter = Counter()\n",
" for hit in hits:\n",
" for k,v in hit.items():\n",
" hit[k] = v[0]\n",
" counter.update(hit)\n",
" for k,v in counter.items():\n",
" counter[k] /= size\n",
" answer = dict(counter)\n",
" answer['id'] = row['id']\n",
" return answer\n",
"\n",
"def get_test_iterator():\n",
" for index, row in test.iterrows():\n",
" yield row.to_dict()\n",
" \n",
"def get_batch_iterator(row_iterator, batch_size):\n",
" items = []\n",
" for item in row_iterator:\n",
" items.append(item)\n",
" if len(items) == batch_size:\n",
" yield items\n",
" items = []\n",
" yield items\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_iterator = get_test_iterator()\n",
"\n",
"batch_size = 13*7\n",
"batch_count = 0\n",
"report_count = 1000\n",
"report_batch = int(report_count/batch_size)\n",
"\n",
"answers = []\n",
"for batch in get_batch_iterator(test_iterator, batch_size):\n",
" headers = [{'index': index, 'type': doc_type} for _ in range(batch_size)]\n",
" queries = [make_query(row) for row in batch]\n",
" msearch_request = []\n",
"\n",
" for header, query in zip(headers, queries):\n",
" msearch_request.append(header)\n",
" msearch_request.append(query)\n",
"\n",
" responses = es.msearch(msearch_request)['responses']\n",
"\n",
" answers.extend([process_response(row, resp) for row, resp in zip(batch, responses)])\n",
" \n",
" batch_count += 1\n",
" if not batch_count % report_batch:\n",
" print(batch_count*batch_size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"answers = pd.DataFrame(answers).reindex_axis(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"answers.to_csv('../first_submission.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parameters to explore\n",
"* k (in k-NN) and k might be different per target dimenstion\n",
"* boosting different fields\n",
"* different analysis chains - like keeping CAPS and dropping very uncommon words (which score highly)\n",
"* multimatch type\n",
"* do something about punctuation and about repeditivity of text"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment