Skip to content

Instantly share code, notes, and snippets.

@guenter
Created April 20, 2018 00:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save guenter/db335fdb9ea915432524dcb6b5d5edd1 to your computer and use it in GitHub Desktop.
Save guenter/db335fdb9ea915432524dcb6b5d5edd1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import os \nimport datetime\n\nimport numpy as np\nimport pandas as pd\nimport nltk\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\nfrom sklearn.metrics import classification_report, confusion_matrix\nfrom sklearn.model_selection import train_test_split\nimport keras\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Dropout\nimport yaml",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model_config = {\n 'tfidf': True,\n 'norm': 'l2',\n 'stemmer': False,\n 'ngram_min': 1,\n 'ngram_max': 1,\n 'max_features': None,\n 'language': 'english'\n}\n\nmodel_id = \"godfather_\" + datetime.datetime.now().replace(microsecond=0).isoformat()\nmodel_export_path = 'model'\nmodel_log_dir = 'logs'\n\ndata = pd.read_csv(\"metacritic.csv\")\ntext_column = 'Quote'\ncategory_column = 'Polarity'\n\n# Remove neutral category\ndata = data[data[category_column] != 0]\n\ncategories = data[category_column].unique().tolist()",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "class StemmingCountVectorizer(CountVectorizer):\n def build_analyzer(self):\n stemmer = nltk.stem.snowball.SnowballStemmer(model_config['language'])\n analyzer = super(StemmingCountVectorizer, self).build_analyzer()\n return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])\n\nvectorizer_args = {\n 'ngram_range': (model_config['ngram_min'], model_config['ngram_max']),\n 'stop_words': model_config['language'],\n 'max_features': model_config['max_features']\n}\n\nif model_config['stemmer']:\n vectorizer = StemmingCountVectorizer(**vectorizer_args)\nelse:\n vectorizer = CountVectorizer(**vectorizer_args)\nx = vectorizer.fit_transform(data[text_column])\n\nif model_config['tfidf']:\n tf_transformer = TfidfTransformer(norm=model_config['norm']).fit(x)\n x = tf_transformer.transform(x)\n\ny = np.empty(len(data), dtype='uint8')\n\nfor i, c in enumerate(data[category_column]):\n y[i] = categories.index(c)\n\ntrain_x, test_x, train_y, test_y = train_test_split(x, y,\n test_size=0.25,\n random_state=42,\n stratify=y)\n\ntrain_y = keras.utils.to_categorical(train_y, num_classes=len(categories))\ntest_y = keras.utils.to_categorical(test_y, num_classes=len(categories))\n\nprint(\"{} features\".format(x.shape[1]))",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model = Sequential()\nmodel.add(Dense(8, activation='relu', input_dim=x.shape[1]))\nmodel.add(Dense(8, activation='relu'))\nmodel.add(Dense(units=len(categories), activation='softmax'))\n\nmodel.compile(optimizer='rmsprop',\n loss='binary_crossentropy',\n metrics=['accuracy'])\n\nstop_callback = keras.callbacks.EarlyStopping(monitor='val_loss',\n patience=0)\n\ntb_callback = keras.callbacks.TensorBoard(log_dir=os.path.join(model_log_dir, model_id),\n histogram_freq=0,\n write_graph=True,\n write_images=True)\n\nhist = model.fit(train_x, train_y, epochs=20, validation_data=(test_x, test_y), callbacks=[stop_callback, tb_callback])",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model_file = os.path.join(model_export_path, model_id + '.tfl')\nmodel.save(model_file)\n\nmodel_config_file = os.path.join(model_export_path, model_id + '.yml')\nstream = open(model_config_file, 'w')\nyaml.dump({'model_config': model_config}, stream, default_flow_style=False)\n\nprint(\"Saved model and config as {}\".format(model_id))",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "predicted_y = model.predict_classes(test_x)\n\ntarget_names = [str(c) for c in categories]\n\nprint(classification_report(np.argmax(test_y, axis=1), predicted_y, target_names=target_names))\nprint(\"Confusion Matrix\")\nprint(confusion_matrix(np.argmax(test_y, axis=1), predicted_y))",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"version_major": 2,
"version_minor": 0,
"state": {}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment