slate-dev/$README.md

## $README.md

      
    Raw
  

              $README.md
            
          
    What does Twitter Say about Self-Regulated Learning? Mapping Tweets from 2011 to 2021

  
## 03 Wordcloud.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              03 Wordcloud.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## 04 Topic modeling.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!python -W ignore::DeprecationWarning\n",
    "\n# this script for generating the topic modeling\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Gensim\n",
    "import gensim\n",
    "import gensim.corpora as corpora\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.models import CoherenceModel\n",
    "from gensim.test.utils import datapath\n",
    "\n",
    "# Plotting tools\n",
    "import pyLDAvis\n",
    "import pyLDAvis.gensim_models  # don't skip this\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "# Enable logging for gensim - optional\n",
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\",category=DeprecationWarning)\n",
    "\n",
    "random_state = 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'tweets_lemmatization.fz' # Provide your own file path\n",
    "dataframe = pd.read_feather(filename)\n",
    "dataframe_column = \"lemma_tokens\" # Provide your own column name\n",
    "data_lemmatized = dataframe[dataframe_column]\n",
    "\n",
    "#Create dictionary\n",
    "id2word = corpora.Dictionary(data_lemmatized)\n",
    "id2word.filter_extremes(no_below=2, no_above=.99)\n",
    "\n",
    "#Create corpus\n",
    "corpus = [id2word.doc2bow(d) for d in data_lemmatized]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda\n",
    "def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):\n",
    "    \"\"\"\n",
    "    Compute c_v coherence for various number of topics\n",
    "\n",
    "    Parameters:\n",
    "    ----------\n",
    "    dictionary : Gensim dictionary\n",
    "    corpus : Gensim corpus\n",
    "    texts : List of input texts\n",
    "    limit : Max num of topics\n",
    "\n",
    "    Returns:\n",
    "    -------\n",
    "    model_list : List of LDA topic models\n",
    "    coherence_values : Coherence values corresponding to the LDA model with respective number of topics\n",
    "    \"\"\"\n",
    "    coherence_values = []\n",
    "    model_list = []\n",
    "    for num_topics in range(start, limit, step):\n",
    "        # model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)\n",
    "        model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word, workers=8, random_state=random_state)\n",
    "        print(\"Topic \" + str(num_topics) + \" done!\")\n",
    "        model_list.append(model)\n",
    "        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n",
    "        coherence_values.append(coherencemodel.get_coherence())\n",
    "\n",
    "    return model_list, coherence_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=68, step=4)\n",
    "m_cv = model_list, coherence_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show graph\n",
    "limit=68; start=2; step=4\n",
    "x = range(start, limit, step)\n",
    "fig = plt.figure()\n",
    "fig.set_dpi(1200)\n",
    "plt.plot(x, coherence_values)\n",
    "plt.xlabel(\"Num Topics\")\n",
    "plt.ylabel(\"Coherence score\")\n",
    "plt.legend((\"Coherence score\",), loc='best')\n",
    "plt.tight_layout()\n",
    "plt.savefig('coherence.png', pad_inches = 0, bbox_inches='tight', facecolor='white', edgecolor='none')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep track of the highest coherence number and the \n",
    "# corresponding topic number.\n",
    "optimal_coherence_topics = -1\n",
    "optimal_coherence_value = 0.2987\n",
    "\n",
    "# Print the coherence scores\n",
    "for m, cv in zip(x, coherence_values):\n",
    "    if cv > optimal_coherence_value:\n",
    "        optimal_coherence_value = cv\n",
    "        optimal_coherence_topics = m\n",
    "        optimal_model_from_cv = m\n",
    "    print(\"Num Topics =\", m, \" has Coherence Value of\", round(cv, 4))\n",
    "\n",
    "print(\"Optimal number of topics \" + str(optimal_coherence_topics) + \" has a value of \" + str(optimal_coherence_value))\n",
    "\n",
    "# Not elegant, but we need the index of the highest coherence value.\n",
    "# Use this to get the corresponding model from the model list.\n",
    "# optimal_coherence_value = 0.32098441742895534\n",
    "\n",
    "index_of_optimal_cv = coherence_values.index(optimal_coherence_value)\n",
    "optimal_model = model_list[index_of_optimal_cv]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the optimal model.\n",
    "pyLDAvis.enable_notebook()\n",
    "vis_opt = pyLDAvis.gensim_models.prepare(optimal_model, corpus, id2word)\n",
    "vis_opt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model_list[2].show_topics(num_words=10000, formatted=False)\n",
    "\n",
    "mixture = [dict(model_list[2][x]) for x in corpus]\n",
    "pd.DataFrame(mixture).to_csv(\"topic_mixture.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_words_per_topic = []\n",
    "for t in range(optimal_model.num_topics):\n",
    "    top_words_per_topic.extend([(t, ) + x for x in optimal_model.show_topic(t, topn = 50)])\n",
    "\n",
    "pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv(\"top_words.csv\")\n"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "c26465253d500f9b1e4188c7a861c868c3d4a64a89438e34be0db151202be497"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"#!python -W ignore::DeprecationWarning\n",
	"\n# this script for generating the topic modeling\n",
	"\n",
	"import pandas as pd\n",
	"\n",
	"# Gensim\n",
	"import gensim\n",
	"import gensim.corpora as corpora\n",
	"from gensim.utils import simple_preprocess\n",
	"from gensim.models import CoherenceModel\n",
	"from gensim.test.utils import datapath\n",
	"\n",
	"# Plotting tools\n",
	"import pyLDAvis\n",
	"import pyLDAvis.gensim_models # don't skip this\n",
	"import matplotlib.pyplot as plt\n",
	"\n",
	"%matplotlib inline\n",
	"\n",
	"# Enable logging for gensim - optional\n",
	"import logging\n",
	"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)\n",
	"\n",
	"import warnings\n",
	"warnings.filterwarnings(\"ignore\",category=DeprecationWarning)\n",
	"\n",
	"random_state = 16"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"filename = 'tweets_lemmatization.fz' # Provide your own file path\n",
	"dataframe = pd.read_feather(filename)\n",
	"dataframe_column = \"lemma_tokens\" # Provide your own column name\n",
	"data_lemmatized = dataframe[dataframe_column]\n",
	"\n",
	"#Create dictionary\n",
	"id2word = corpora.Dictionary(data_lemmatized)\n",
	"id2word.filter_extremes(no_below=2, no_above=.99)\n",
	"\n",
	"#Create corpus\n",
	"corpus = [id2word.doc2bow(d) for d in data_lemmatized]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda\n",
	"def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):\n",
	" \"\"\"\n",
	" Compute c_v coherence for various number of topics\n",
	"\n",
	" Parameters:\n",
	" ----------\n",
	" dictionary : Gensim dictionary\n",
	" corpus : Gensim corpus\n",
	" texts : List of input texts\n",
	" limit : Max num of topics\n",
	"\n",
	" Returns:\n",
	" -------\n",
	" model_list : List of LDA topic models\n",
	" coherence_values : Coherence values corresponding to the LDA model with respective number of topics\n",
	" \"\"\"\n",
	" coherence_values = []\n",
	" model_list = []\n",
	" for num_topics in range(start, limit, step):\n",
	" # model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)\n",
	" model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word, workers=8, random_state=random_state)\n",
	" print(\"Topic \" + str(num_topics) + \" done!\")\n",
	" model_list.append(model)\n",
	" coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n",
	" coherence_values.append(coherencemodel.get_coherence())\n",
	"\n",
	" return model_list, coherence_values"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=68, step=4)\n",
	"m_cv = model_list, coherence_values"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Show graph\n",
	"limit=68; start=2; step=4\n",
	"x = range(start, limit, step)\n",
	"fig = plt.figure()\n",
	"fig.set_dpi(1200)\n",
	"plt.plot(x, coherence_values)\n",
	"plt.xlabel(\"Num Topics\")\n",
	"plt.ylabel(\"Coherence score\")\n",
	"plt.legend((\"Coherence score\",), loc='best')\n",
	"plt.tight_layout()\n",
	"plt.savefig('coherence.png', pad_inches = 0, bbox_inches='tight', facecolor='white', edgecolor='none')\n",
	"plt.show()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep track of the highest coherence number and the \n",
	"# corresponding topic number.\n",
	"optimal_coherence_topics = -1\n",
	"optimal_coherence_value = 0.2987\n",
	"\n",
	"# Print the coherence scores\n",
	"for m, cv in zip(x, coherence_values):\n",
	" if cv > optimal_coherence_value:\n",
	" optimal_coherence_value = cv\n",
	" optimal_coherence_topics = m\n",
	" optimal_model_from_cv = m\n",
	" print(\"Num Topics =\", m, \" has Coherence Value of\", round(cv, 4))\n",
	"\n",
	"print(\"Optimal number of topics \" + str(optimal_coherence_topics) + \" has a value of \" + str(optimal_coherence_value))\n",
	"\n",
	"# Not elegant, but we need the index of the highest coherence value.\n",
	"# Use this to get the corresponding model from the model list.\n",
	"# optimal_coherence_value = 0.32098441742895534\n",
	"\n",
	"index_of_optimal_cv = coherence_values.index(optimal_coherence_value)\n",
	"optimal_model = model_list[index_of_optimal_cv]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Visualize the optimal model.\n",
	"pyLDAvis.enable_notebook()\n",
	"vis_opt = pyLDAvis.gensim_models.prepare(optimal_model, corpus, id2word)\n",
	"vis_opt"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# model_list[2].show_topics(num_words=10000, formatted=False)\n",
	"\n",
	"mixture = [dict(model_list[2][x]) for x in corpus]\n",
	"pd.DataFrame(mixture).to_csv(\"topic_mixture.csv\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"top_words_per_topic = []\n",
	"for t in range(optimal_model.num_topics):\n",
	" top_words_per_topic.extend([(t, ) + x for x in optimal_model.show_topic(t, topn = 50)])\n",
	"\n",
	"pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv(\"top_words.csv\")\n"
	]
	}
	],
	"metadata": {
	"interpreter": {
	"hash": "c26465253d500f9b1e4188c7a861c868c3d4a64a89438e34be0db151202be497"
	},
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}