JnBrymn-EB/Experimenting with Tagging.ipynb

## Experimenting with Tagging.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from xml.etree import ElementTree\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.naive_bayes import MultinomialNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "XML_RE = re.compile(r'<[^>]+>')\n",
    "TAG_RE = re.compile(r'<([^>]+)>')\n",
    "\n",
    "tree = ElementTree.parse('../data/pets.stackexchange.com/Posts.xml')\n",
    "root = tree.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "titles = []\n",
    "bodies = []\n",
    "tagses = []\n",
    "\n",
    "for child in root:\n",
    "    attrib = child.attrib\n",
    "    if attrib['PostTypeId'] == '1':  # a question\n",
    "        titles.append(attrib['Title'])\n",
    "        bodies.append(XML_RE.sub(' ', attrib['Body']))\n",
    "        tagses.append(' '.join(TAG_RE.findall(attrib['Tags'])).replace('-','_'))\n",
    "        \n",
    "titles_train, titles_test, bodies_train, bodies_test, tagses_train, tagses_test = train_test_split(\n",
    "    titles,\n",
    "    bodies,\n",
    "    tagses,\n",
    "    test_size=0.10,\n",
    "    random_state=42,\n",
    ")        \n",
    "           \n",
    "title_processor = Pipeline([\n",
    "    ('vect', CountVectorizer(stop_words='english')),\n",
    "    ('tfidf', TfidfTransformer(use_idf=True)),\n",
    "])\n",
    "tfidf_titles_train = title_processor.fit_transform(titles_train)\n",
    "tfidf_titles_test = title_processor.transform(titles_test)\n",
    "\n",
    "body_processor = Pipeline([\n",
    "    ('vect', CountVectorizer(stop_words='english')),\n",
    "    ('tfidf', TfidfTransformer(use_idf=True)),\n",
    "])\n",
    "tfidf_bodies_train = body_processor.fit_transform(bodies_train)\n",
    "tfidf_bodies_test = body_processor.transform(bodies_test)\n",
    "\n",
    "tags_processor = CountVectorizer()\n",
    "vect_tagses_train = tags_processor.fit_transform(tagses_train)\n",
    "vect_tagses_test = tags_processor.transform(tagses_test)\n",
    "\n",
    "# might not be able to do this later\n",
    "dense_tfidf_titles_train = tfidf_titles_train.todense()\n",
    "dense_tfidf_titles_test = tfidf_titles_test.todense()\n",
    "dense_tfidf_bodies_train = tfidf_bodies_train.todense()\n",
    "dense_tfidf_bodies_test = tfidf_bodies_test.todense()\n",
    "dense_vect_tagses_train = vect_tagses_train.todense()\n",
    "dense_vect_tagses_test = vect_tagses_test.todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 377,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "field = 'body'\n",
    "if field == 'body':\n",
    "    X_train = dense_tfidf_bodies_train\n",
    "    X_test = dense_tfidf_bodies_test\n",
    "    X_test_raw = bodies_test\n",
    "    processor = body_processor\n",
    "elif field == 'title':\n",
    "    X_train = dense_tfidf_titles_train\n",
    "    X_test = dense_tfidf_titles_test\n",
    "    X_test_raw = bodies_test\n",
    "    processor = title_processor\n",
    "\n",
    "y_train = dense_vect_tagses_train\n",
    "y_test = dense_vect_tagses_test\n",
    "y_test_raw = tagses_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 370,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://keras.io/getting-started/faq/#what-does-sample-batch-epoch-mean\n",
    "from keras.layers import Input, Dense\n",
    "from keras.models import Model\n",
    "\n",
    "inputs = Input(\n",
    "    shape=(X_train.shape[1],), \n",
    "    dtype='float', \n",
    "    name='inputs',\n",
    ")\n",
    "middle = Dense(y_train.shape[1], name='middle', activation='relu')(inputs)\n",
    "outputs = Dense(y_train.shape[1], name='outputs')(middle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 371,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Model(inputs=inputs, outputs=outputs)\n",
    "model.compile(optimizer='rmsprop', loss='mean_squared_error')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 375,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 0.0054\n",
      "Epoch 2/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 0.0043\n",
      "Epoch 3/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 0.0035\n",
      "Epoch 4/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 0.0029A: \n",
      "Epoch 5/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 0.0024\n",
      "Epoch 6/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 0.0020\n",
      "Epoch 7/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 0.0017\n",
      "Epoch 8/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 0.0015\n",
      "Epoch 9/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 0.0013\n",
      "Epoch 10/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 0.0012\n",
      "Epoch 11/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 0.0010\n",
      "Epoch 12/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 9.4797e-04\n",
      "Epoch 13/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 8.6780e-04\n",
      "Epoch 14/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 8.0661e-04A: \n",
      "Epoch 15/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 7.5197e-04\n",
      "Epoch 16/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 7.0781e-04\n",
      "Epoch 17/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 6.6906e-04\n",
      "Epoch 18/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 6.3558e-04\n",
      "Epoch 19/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 6.0527e-04A: 0s - loss: 6.0576e-0\n",
      "Epoch 20/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 5.8076e-04\n",
      "Epoch 21/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 5.5946e-04\n",
      "Epoch 22/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 5.3983e-04\n",
      "Epoch 23/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 5.2202e-04\n",
      "Epoch 24/25\n",
      "4215/4215 [==============================] - 9s 2ms/step - loss: 5.0677e-04\n",
      "Epoch 25/25\n",
      "4215/4215 [==============================] - 8s 2ms/step - loss: 4.9229e-04\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x1232f3470>"
      ]
     },
     "execution_count": 375,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X_train, y_train,epochs=25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 418,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "predicted: ['aquarium' 'betta' 'fish']\n",
      "truth: ['fish', 'betta']\n",
      "original text:\n",
      " My kids saw the \"My Fun Fish Tank\" on TV. \n",
      "\n",
      "   \n",
      "\n",
      " (Not mine, but an example of what the tank looks like) \n",
      "\n",
      " I got it for them and after I put it together I went to the pet store and got a male betta. I change the water every 2 days with distilled water, and we feed him like the instructions that came with the tank told us to, but I'm worried that the fish doesn't have enough oxygen because he doesn't move very much, he stays at the bottom like he's gotten stuck under the rocks. \n",
      "\n",
      " Why is my betta acting like this, and what can I do to make him better? \n",
      "\n"
     ]
    }
   ],
   "source": [
    "def sample(i, limit=0.4):\n",
    "    print('predicted: {}'.format(list(tags_processor.inverse_transform((model.predict(X_test[i])>limit)+0.0))[0]))\n",
    "    print('truth: {}'.format(y_test_raw[i].split()))\n",
    "    print('original text:\\n{}'.format(X_test_raw[i]))\n",
    "sample(44, 0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 420,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "predicted: ['dental_care' 'dogs' 'health']\n"
     ]
    }
   ],
   "source": [
    "def fake_text(text, limit = 0.4):\n",
    "    vect = processor.transform([text]).todense()[0]\n",
    "    print('predicted: {}'.format(list(tags_processor.inverse_transform((model.predict(vect)>limit)+0.0))[0]))\n",
    "    \n",
    "fake_text('''\n",
    "Why Does My puppy Have teeth?\n",
    "''')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 128,
	"metadata": {},
	"outputs": [],
	"source": [
	"import re\n",
	"from xml.etree import ElementTree\n",
	"\n",
	"import numpy as np\n",
	"from sklearn.pipeline import Pipeline\n",
	"from sklearn.feature_extraction.text import CountVectorizer\n",
	"from sklearn.feature_extraction.text import TfidfTransformer\n",
	"from sklearn.naive_bayes import MultinomialNB"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 106,
	"metadata": {},
	"outputs": [],
	"source": [
	"XML_RE = re.compile(r'<[^>]+>')\n",
	"TAG_RE = re.compile(r'<([^>]+)>')\n",
	"\n",
	"tree = ElementTree.parse('../data/pets.stackexchange.com/Posts.xml')\n",
	"root = tree.getroot()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 354,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.model_selection import train_test_split\n",
	"\n",
	"titles = []\n",
	"bodies = []\n",
	"tagses = []\n",
	"\n",
	"for child in root:\n",
	" attrib = child.attrib\n",
	" if attrib['PostTypeId'] == '1': # a question\n",
	" titles.append(attrib['Title'])\n",
	" bodies.append(XML_RE.sub(' ', attrib['Body']))\n",
	" tagses.append(' '.join(TAG_RE.findall(attrib['Tags'])).replace('-','_'))\n",
	" \n",
	"titles_train, titles_test, bodies_train, bodies_test, tagses_train, tagses_test = train_test_split(\n",
	" titles,\n",
	" bodies,\n",
	" tagses,\n",
	" test_size=0.10,\n",
	" random_state=42,\n",
	") \n",
	" \n",
	"title_processor = Pipeline([\n",
	" ('vect', CountVectorizer(stop_words='english')),\n",
	" ('tfidf', TfidfTransformer(use_idf=True)),\n",
	"])\n",
	"tfidf_titles_train = title_processor.fit_transform(titles_train)\n",
	"tfidf_titles_test = title_processor.transform(titles_test)\n",
	"\n",
	"body_processor = Pipeline([\n",
	" ('vect', CountVectorizer(stop_words='english')),\n",
	" ('tfidf', TfidfTransformer(use_idf=True)),\n",
	"])\n",
	"tfidf_bodies_train = body_processor.fit_transform(bodies_train)\n",
	"tfidf_bodies_test = body_processor.transform(bodies_test)\n",
	"\n",
	"tags_processor = CountVectorizer()\n",
	"vect_tagses_train = tags_processor.fit_transform(tagses_train)\n",
	"vect_tagses_test = tags_processor.transform(tagses_test)\n",
	"\n",
	"# might not be able to do this later\n",
	"dense_tfidf_titles_train = tfidf_titles_train.todense()\n",
	"dense_tfidf_titles_test = tfidf_titles_test.todense()\n",
	"dense_tfidf_bodies_train = tfidf_bodies_train.todense()\n",
	"dense_tfidf_bodies_test = tfidf_bodies_test.todense()\n",
	"dense_vect_tagses_train = vect_tagses_train.todense()\n",
	"dense_vect_tagses_test = vect_tagses_test.todense()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 377,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"field = 'body'\n",
	"if field == 'body':\n",
	" X_train = dense_tfidf_bodies_train\n",
	" X_test = dense_tfidf_bodies_test\n",
	" X_test_raw = bodies_test\n",
	" processor = body_processor\n",
	"elif field == 'title':\n",
	" X_train = dense_tfidf_titles_train\n",
	" X_test = dense_tfidf_titles_test\n",
	" X_test_raw = bodies_test\n",
	" processor = title_processor\n",
	"\n",
	"y_train = dense_vect_tagses_train\n",
	"y_test = dense_vect_tagses_test\n",
	"y_test_raw = tagses_test"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 370,
	"metadata": {},
	"outputs": [],
	"source": [
	"# https://keras.io/getting-started/faq/#what-does-sample-batch-epoch-mean\n",
	"from keras.layers import Input, Dense\n",
	"from keras.models import Model\n",
	"\n",
	"inputs = Input(\n",
	" shape=(X_train.shape[1],), \n",
	" dtype='float', \n",
	" name='inputs',\n",
	")\n",
	"middle = Dense(y_train.shape[1], name='middle', activation='relu')(inputs)\n",
	"outputs = Dense(y_train.shape[1], name='outputs')(middle)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 371,
	"metadata": {},
	"outputs": [],
	"source": [
	"model = Model(inputs=inputs, outputs=outputs)\n",
	"model.compile(optimizer='rmsprop', loss='mean_squared_error')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 375,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 1/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 0.0054\n",
	"Epoch 2/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 0.0043\n",
	"Epoch 3/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 0.0035\n",
	"Epoch 4/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 0.0029A: \n",
	"Epoch 5/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 0.0024\n",
	"Epoch 6/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 0.0020\n",
	"Epoch 7/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 0.0017\n",
	"Epoch 8/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 0.0015\n",
	"Epoch 9/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 0.0013\n",
	"Epoch 10/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 0.0012\n",
	"Epoch 11/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 0.0010\n",
	"Epoch 12/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 9.4797e-04\n",
	"Epoch 13/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 8.6780e-04\n",
	"Epoch 14/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 8.0661e-04A: \n",
	"Epoch 15/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 7.5197e-04\n",
	"Epoch 16/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 7.0781e-04\n",
	"Epoch 17/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 6.6906e-04\n",
	"Epoch 18/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 6.3558e-04\n",
	"Epoch 19/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 6.0527e-04A: 0s - loss: 6.0576e-0\n",
	"Epoch 20/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 5.8076e-04\n",
	"Epoch 21/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 5.5946e-04\n",
	"Epoch 22/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 5.3983e-04\n",
	"Epoch 23/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 5.2202e-04\n",
	"Epoch 24/25\n",
	"4215/4215 [==============================] - 9s 2ms/step - loss: 5.0677e-04\n",
	"Epoch 25/25\n",
	"4215/4215 [==============================] - 8s 2ms/step - loss: 4.9229e-04\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"<keras.callbacks.History at 0x1232f3470>"
	]
	},
	"execution_count": 375,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"model.fit(X_train, y_train,epochs=25)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 418,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"predicted: ['aquarium' 'betta' 'fish']\n",
	"truth: ['fish', 'betta']\n",
	"original text:\n",
	" My kids saw the \"My Fun Fish Tank\" on TV. \n",
	"\n",
	" \n",
	"\n",
	" (Not mine, but an example of what the tank looks like) \n",
	"\n",
	" I got it for them and after I put it together I went to the pet store and got a male betta. I change the water every 2 days with distilled water, and we feed him like the instructions that came with the tank told us to, but I'm worried that the fish doesn't have enough oxygen because he doesn't move very much, he stays at the bottom like he's gotten stuck under the rocks. \n",
	"\n",
	" Why is my betta acting like this, and what can I do to make him better? \n",
	"\n"
	]
	}
	],
	"source": [
	"def sample(i, limit=0.4):\n",
	" print('predicted: {}'.format(list(tags_processor.inverse_transform((model.predict(X_test[i])>limit)+0.0))[0]))\n",
	" print('truth: {}'.format(y_test_raw[i].split()))\n",
	" print('original text:\\n{}'.format(X_test_raw[i]))\n",
	"sample(44, 0.3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 420,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"predicted: ['dental_care' 'dogs' 'health']\n"
	]
	}
	],
	"source": [
	"def fake_text(text, limit = 0.4):\n",
	" vect = processor.transform([text]).todense()[0]\n",
	" print('predicted: {}'.format(list(tags_processor.inverse_transform((model.predict(vect)>limit)+0.0))[0]))\n",
	" \n",
	"fake_text('''\n",
	"Why Does My puppy Have teeth?\n",
	"''')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}