In [1]:
import nltk
import pandas as pd

In [2]:
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/pvankessel/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!

Out[2]: True

In [3]:
rows = []
for fileid in nltk.corpus.movie_reviews.fileids():
    rows.append({"text": nltk.corpus.movie_reviews.raw(fileid)})
df = pd.DataFrame(rows)
print(len(df))

2000

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_df=.5,
    min_df=10,
    max_features=None,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False
)
vectorizer = vectorizer.fit(df['text'])
tfidf = vectorizer.transform(df['text'])
vocab = vectorizer.get_feature_names()
print(len(vocab))

21886

In [5]:
from corextopic import corextopic as ct

In [6]:
model = ct.Corex(n_hidden=8, seed=42)
model = model.fit(
    tfidf,
    words=vocab
)

In [7]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: see, me, had, really, don, know, think, my, because, how
Topic #2: life, he is, both, never, it is, of his, that he, world, performance, to his
Topic #3: the first, the most, films, from the, many, by the, since, such, at the, while
Topic #4: comedy, funny, jokes, humor, laughs, funniest, the funniest, hilarious, the jokes, joke
Topic #5: young, opening, music, follow, portrayal, cinematography, mars, aspect, art, shown
Topic #6: murder, crime, thriller, police, killer, dead, the police, he has, turns, prison
Topic #7: plot, action, case, critique, the plot, suspense, none, blair witch, seem, cool
Topic #8: horror, horror film, scream, slasher, did last, horror films, scary, you did, williamson

In [8]:
# Anchors designed to nudge the model towards measuring specific genres
anchors = [
    ["action", "adventure"],
    ["drama"],
    ["comedy", "funny"],
    ["horror", "suspense"],
    ["animated", "animation"],
    ["sci fi", "alien"],
    ["romance", "romantic"],
    ["fantasy"]
]
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchors
]

model = ct.Corex(n_hidden=8, seed=42)
model = model.fit(
    tfidf,
    words=vocab,
    anchors=anchors,  # Pass the anchors in here
    anchor_strength=3  # Tell the model how much it should rely on the anchors
)

In [9]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: action, adventure, the action, scenes, action sequences, where, action scenes, an action, action film, sequences
Topic #2: drama, performance, mother, director, both, while, and his, to his, role, performances
Topic #3: comedy, funny, jokes, laughs, humor, funny and, hilarious, very funny, gags, laugh
Topic #4: horror, really, think, had, me, did, how, see, because, were
Topic #5: animated, animation, disney, children, the animation, computer, adults, years, voice of, voice
Topic #6: alien, sci fi, effects, special effects, fi, aliens, sci, planet, special, earth
Topic #7: romantic, romance, she, love, with her, of her, that she, relationship, woman, romantic comedy
Topic #8: life, he is, fantasy, world, it is, that the, perhaps, point, does, through

In [10]:
topic_df = pd.DataFrame(
    model.transform(tfidf),
    columns=["topic_{}".format(i+1) for i in range(8)]
).astype(float)
topic_df.index = df.index
df = pd.concat([df, topic_df], axis=1)

In [11]:
df.sample(5, random_state=42)

Out[11]:
                                                   text  topic_1  topic_2  topic_3  topic_4  topic_5  topic_6  topic_7  topic_8
1860  the verdict : spine-chilling drama from horror...      1.0      1.0      0.0      1.0      0.0      1.0      1.0      0.0
353   " the 44 caliber killer has struck again . " ...       0.0      1.0      0.0      1.0      0.0      0.0      0.0      1.0
1333  in the company of men made a splash at the sun...      0.0      1.0      1.0      1.0      0.0      1.0      1.0      1.0
905   in the year 2029 , captain leo davidson ( mark...      0.0      0.0      0.0      0.0      0.0      1.0      1.0      0.0
1289  [note that followups are directed to rec . art...      1.0      0.0      1.0      0.0      0.0      1.0      0.0      0.0
@patrickvankessel, I tried running the CorEx model on a dataset of 600K documents and it's taking more than an hour to process. Is this expected?
I've never tried scaling it up to a dataset that large - it may eventually finish, but it could take hours or days, especially if you have longer documents and a large vocabulary. If you want a faster option, you could fit the model on a sample of 50-100k documents and then apply the model to the full dataset afterwards. You could also try narrowing the vocabulary by tweaking the TF-IDF vectorizer parameters and setting a max_features limit.
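Roughly, that sample-then-apply workflow would look like this (a sketch building on the notebook above; the 100k sample size and the 20,000 max_features cap are illustrative values, not tested recommendations):

# Fit the vectorizer and the CorEx model on a random sample only
sample = df.sample(100000, random_state=42)

vectorizer = TfidfVectorizer(
    max_df=.5, min_df=10,
    max_features=20000,  # cap the vocabulary (illustrative value)
    ngram_range=(1, 2), norm=None, binary=True,
    use_idf=False, sublinear_tf=False
)
vectorizer = vectorizer.fit(sample["text"])
vocab = vectorizer.get_feature_names()

model = ct.Corex(n_hidden=8, seed=42)
model = model.fit(vectorizer.transform(sample["text"]), words=vocab)

# Then score the full corpus with the already-fitted objects
full_labels = model.transform(vectorizer.transform(df["text"]))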
@patrickvankessel, the whole corpus has a vocabulary size of 850k; I'm trying it on 100k data points.
This was very helpful! Thanks @patrickvankessel
Hi @patrickvankessel, in your examples there are 8 topics. Is it obligatory to give anchors for all of the topics?
Can I give anchors for only 6 topics? I want the model to naturally learn the other two topics.
You can provide anchors for as many or as few topics as you want - it's perfectly fine to leave some (or all) of them empty!
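For example, with the notebook's setup you could anchor six of the eight topics and leave the last two unanchored (a sketch; as far as I know, corextopic assigns anchor list i to topic i, so topics without a list are learned freely):

anchors = [
    ["action", "adventure"],
    ["drama"],
    ["comedy", "funny"],
    ["horror", "suspense"],
    ["animated", "animation"],
    ["sci fi", "alien"],
]  # six anchor lists; topics 7 and 8 have no anchors
anchors = [[a for a in topic if a in vocab] for topic in anchors]  # keep only terms in the vocabulary, as in the notebook

model = ct.Corex(n_hidden=8, seed=42)
model = model.fit(tfidf, words=vocab, anchors=anchors, anchor_strength=3)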
Hi @patrickvankessel, I really appreciate your example; it's exactly what I want to implement! However, I have one question about the final dataframe: each row of text has several topics marked 1.0. How can I limit the output to a single topic per text? And can I get float values other than 0 or 1 out of CorEx?
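One possible approach, sketched under the assumption that the fitted corextopic model exposes soft per-document topic probabilities as model.p_y_given_x (the values the 0/1 labels are thresholded from):

import numpy as np

probs = np.asarray(model.p_y_given_x)  # assumed shape: (n_documents, n_topics), floats in [0, 1]
df["top_topic"] = probs.argmax(axis=1) + 1  # single most probable topic, 1-based to match topic_1..topic_8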
How do you predict topics for new documents? I ran into an issue when calling model.predict() on new documents.
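In case it helps, the pattern that works with the notebook's objects is to reuse the already-fitted vectorizer and call model.transform() (new_docs below is a hypothetical list of raw strings):

new_docs = [
    "a taut thriller with a clever plot twist",
    "a sweet romantic comedy about two old friends",
]  # hypothetical examples
new_tfidf = vectorizer.transform(new_docs)  # same vectorizer fitted above; do NOT refit
new_labels = model.transform(new_tfidf)     # binary topic labels, one row per document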
Hi @patrickvankessel, I like your blog. I went through the same process, thinking "it's a big mess!", and after some adjustments, with all the topics "overcooked", I was shocked by how incoherent the results seemed (to me)!
My question: you did not remove the stop words. I keep them until the last step in my process, but your topics retained them. Was there any reason?
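If you do want stop words removed, one place to do it is in the vectorizer itself (a sketch; stop_words="english" uses scikit-learn's built-in English stop word list, and the other parameters mirror the notebook above):

vectorizer = TfidfVectorizer(
    stop_words="english",  # drop common English words before building the vocabulary
    max_df=.5, min_df=10, ngram_range=(1, 2),
    norm=None, binary=True, use_idf=False, sublinear_tf=False
)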