Skip to content

Instantly share code, notes, and snippets.

@slate-dev
Last active January 24, 2022 08:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slate-dev/8f59772a790e5a3ed70788fab70d5343 to your computer and use it in GitHub Desktop.
Save slate-dev/8f59772a790e5a3ed70788fab70d5343 to your computer and use it in GitHub Desktop.

What does Twitter Say about Self-Regulated Learning? Mapping Tweets from 2011 to 2021

Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!python -W ignore::DeprecationWarning\n",
"\n",
"# This notebook generates the topic models.\n",
"\n",
"# Standard library\n",
"import logging\n",
"import warnings\n",
"\n",
"import pandas as pd\n",
"\n",
"# Gensim\n",
"import gensim\n",
"import gensim.corpora as corpora\n",
"from gensim.models import CoherenceModel\n",
"from gensim.test.utils import datapath\n",
"from gensim.utils import simple_preprocess\n",
"\n",
"# Plotting tools\n",
"import matplotlib.pyplot as plt\n",
"import pyLDAvis\n",
"import pyLDAvis.gensim_models  # explicit submodule import; pyLDAvis.gensim_models.prepare is used later\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Optional: configure logging so that only errors are shown\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
"\n",
"# Seed passed to the LDA models\n",
"random_state = 16"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the lemmatized tweets; replace the path and column name with your own.\n",
"filename = 'tweets_lemmatization.fz'\n",
"dataframe_column = \"lemma_tokens\"\n",
"dataframe = pd.read_feather(filename)\n",
"data_lemmatized = dataframe[dataframe_column]\n",
"\n",
"# Token <-> id mapping; drop tokens found in fewer than 2 documents\n",
"# or in more than 99% of the documents.\n",
"id2word = corpora.Dictionary(data_lemmatized)\n",
"id2word.filter_extremes(no_below=2, no_above=.99)\n",
"\n",
"# Bag-of-words representation of every document.\n",
"corpus = list(map(id2word.doc2bow, data_lemmatized))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda\n",
"def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):\n",
"    \"\"\"\n",
"    Train one LDA model per candidate number of topics and compute its c_v coherence.\n",
"\n",
"    Parameters:\n",
"    ----------\n",
"    dictionary : Gensim dictionary, used both to train each model and to score it\n",
"    corpus : Gensim corpus\n",
"    texts : List of input texts\n",
"    limit : Exclusive upper bound on the number of topics\n",
"    start : First number of topics to try\n",
"    step : Increment between consecutive numbers of topics\n",
"\n",
"    Returns:\n",
"    -------\n",
"    model_list : List of LDA topic models, one per number of topics tried\n",
"    coherence_values : Coherence values corresponding to the LDA model with respective number of topics\n",
"\n",
"    Note: the seed is read from the notebook-level variable `random_state`.\n",
"    \"\"\"\n",
"    coherence_values = []\n",
"    model_list = []\n",
"    for num_topics in range(start, limit, step):\n",
"        # Use the dictionary argument so the model and its coherence score share one vocabulary.\n",
"        model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, workers=8, random_state=random_state)\n",
"        print(\"Topic \" + str(num_topics) + \" done!\")\n",
"        model_list.append(model)\n",
"        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n",
"        coherence_values.append(coherencemodel.get_coherence())\n",
"\n",
"    return model_list, coherence_values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sweep the number of topics over 2, 6, ..., 66; m_cv bundles both result lists.\n",
"m_cv = compute_coherence_values(\n",
"    dictionary=id2word,\n",
"    corpus=corpus,\n",
"    texts=data_lemmatized,\n",
"    start=2,\n",
"    limit=68,\n",
"    step=4,\n",
")\n",
"model_list, coherence_values = m_cv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot the coherence score against the number of topics.\n",
"# start/limit/step must match the values passed to compute_coherence_values.\n",
"start, limit, step = 2, 68, 4\n",
"x = range(start, limit, step)\n",
"\n",
"fig = plt.figure()\n",
"fig.set_dpi(1200)\n",
"plt.plot(x, coherence_values)\n",
"plt.xlabel(\"Num Topics\")\n",
"plt.ylabel(\"Coherence score\")\n",
"plt.legend([\"Coherence score\"], loc='best')\n",
"plt.tight_layout()\n",
"plt.savefig(\n",
"    'coherence.png',\n",
"    bbox_inches='tight',\n",
"    pad_inches=0,\n",
"    facecolor='white',\n",
"    edgecolor='none',\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the coherence score for every candidate number of topics.\n",
"for m, cv in zip(x, coherence_values):\n",
"    print(\"Num Topics =\", m, \" has Coherence Value of\", round(cv, 4))\n",
"\n",
"# Index of the highest coherence value (the first one on ties). x, coherence_values\n",
"# and model_list are ordered the same way, so this one index addresses all three.\n",
"# Taking the maximum directly avoids seeding the search with a hard-coded score,\n",
"# which would fail whenever no model happened to exceed it.\n",
"index_of_optimal_cv = max(range(len(coherence_values)), key=coherence_values.__getitem__)\n",
"optimal_coherence_value = coherence_values[index_of_optimal_cv]\n",
"optimal_coherence_topics = x[index_of_optimal_cv]\n",
"optimal_model = model_list[index_of_optimal_cv]\n",
"\n",
"print(\"Optimal number of topics \" + str(optimal_coherence_topics) + \" has a value of \" + str(optimal_coherence_value))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the interactive pyLDAvis view of the selected model inside the notebook.\n",
"pyLDAvis.enable_notebook()\n",
"vis_opt = pyLDAvis.gensim_models.prepare(\n",
"    topic_model=optimal_model,\n",
"    corpus=corpus,\n",
"    dictionary=id2word,\n",
")\n",
"vis_opt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Per-document topic mixture of the selected model: one row per document, one\n",
"# column per topic id. Uses optimal_model so this export describes the same model\n",
"# as the visualisation and top_words.csv. A topic the model does not report for a\n",
"# document is missing from that row's dict and is left empty in the CSV.\n",
"mixture = [dict(optimal_model[bow]) for bow in corpus]\n",
"pd.DataFrame(mixture).to_csv(\"topic_mixture.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collect the 50 highest-probability words of every topic as (topic, word, probability) rows.\n",
"top_words_per_topic = [\n",
"    (topic_id, word, prob)\n",
"    for topic_id in range(optimal_model.num_topics)\n",
"    for word, prob in optimal_model.show_topic(topic_id, topn=50)\n",
"]\n",
"\n",
"top_words_frame = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])\n",
"top_words_frame.to_csv(\"top_words.csv\")\n"
]
}
],
"metadata": {
"interpreter": {
"hash": "c26465253d500f9b1e4188c7a861c868c3d4a64a89438e34be0db151202be497"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment