What does Twitter Say about Self-Regulated Learning? Mapping Tweets from 2011 to 2021
Last active
January 24, 2022 08:47
-
-
Save slate-dev/8f59772a790e5a3ed70788fab70d5343 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "#!python -W ignore::DeprecationWarning\n",
  "\n",
  "# This script generates the topic models (LDA) for the tweet corpus.\n",
  "\n",
  "import pandas as pd\n",
  "\n",
  "# Gensim\n",
  "import gensim\n",
  "import gensim.corpora as corpora\n",
  "from gensim.utils import simple_preprocess\n",
  "from gensim.models import CoherenceModel\n",
  "from gensim.test.utils import datapath\n",
  "\n",
  "# Plotting tools\n",
  "import pyLDAvis\n",
  "import pyLDAvis.gensim_models  # don't skip this\n",
  "import matplotlib.pyplot as plt\n",
  "\n",
  "%matplotlib inline\n",
  "\n",
  "# Gensim logging; set to ERROR to keep the notebook output quiet.\n",
  "import logging\n",
  "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)\n",
  "\n",
  "import warnings\n",
  "warnings.filterwarnings(\"ignore\",category=DeprecationWarning)\n",
  "\n",
  "# Seed shared by every model run below, for reproducibility.\n",
  "random_state = 16"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Load the lemmatized tweets (feather format) and pull out the token column.\n",
  "input_path = 'tweets_lemmatization.fz'  # Provide your own file path\n",
  "token_column = \"lemma_tokens\"  # Provide your own column name\n",
  "data_lemmatized = pd.read_feather(input_path)[token_column]\n",
  "\n",
  "# Build the term dictionary, dropping very rare and near-ubiquitous tokens.\n",
  "id2word = corpora.Dictionary(data_lemmatized)\n",
  "id2word.filter_extremes(no_below=2, no_above=.99)\n",
  "\n",
  "# Convert every document to its bag-of-words representation.\n",
  "corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Adapted from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda\n",
  "def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):\n",
  "    \"\"\"\n",
  "    Compute c_v coherence for various numbers of topics.\n",
  "\n",
  "    Parameters:\n",
  "    ----------\n",
  "    dictionary : Gensim dictionary\n",
  "    corpus : Gensim corpus (bag-of-words documents)\n",
  "    texts : List of tokenized input texts\n",
  "    limit : Max num of topics (exclusive upper bound)\n",
  "    start : First number of topics to try\n",
  "    step : Increment between successive topic counts\n",
  "\n",
  "    Returns:\n",
  "    -------\n",
  "    model_list : List of LDA topic models\n",
  "    coherence_values : Coherence value corresponding to each model in model_list\n",
  "    \"\"\"\n",
  "    coherence_values = []\n",
  "    model_list = []\n",
  "    for num_topics in range(start, limit, step):\n",
  "        # Bug fix: pass the `dictionary` parameter to the model -- the original\n",
  "        # used the global id2word, silently ignoring the argument.\n",
  "        # NOTE: relies on the module-level `random_state` defined in the setup cell.\n",
  "        model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, workers=8, random_state=random_state)\n",
  "        print(\"Topic \" + str(num_topics) + \" done!\")\n",
  "        model_list.append(model)\n",
  "        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n",
  "        coherence_values.append(coherencemodel.get_coherence())\n",
  "\n",
  "    return model_list, coherence_values"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Train one LDA model per candidate topic count (2, 6, 10, ..., 66)\n",
  "# and score each with c_v coherence. This is the slow cell.\n",
  "model_list, coherence_values = compute_coherence_values(\n",
  "    dictionary=id2word,\n",
  "    corpus=corpus,\n",
  "    texts=data_lemmatized,\n",
  "    start=2,\n",
  "    limit=68,\n",
  "    step=4,\n",
  ")\n",
  "m_cv = model_list, coherence_values"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Plot the coherence score against the number of topics.\n",
  "# These values must match the arguments passed to compute_coherence_values above.\n",
  "limit = 68; start = 2; step = 4\n",
  "x = range(start, limit, step)  # NOTE: `x` is reused by the next cell\n",
  "\n",
  "fig, ax = plt.subplots()\n",
  "fig.set_dpi(1200)\n",
  "ax.plot(x, coherence_values)\n",
  "ax.set_xlabel(\"Num Topics\")\n",
  "ax.set_ylabel(\"Coherence score\")\n",
  "ax.legend((\"Coherence score\",), loc='best')\n",
  "fig.tight_layout()\n",
  "fig.savefig('coherence.png', pad_inches=0, bbox_inches='tight', facecolor='white', edgecolor='none')\n",
  "plt.show()"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Report the coherence score for every candidate model and select the one\n",
  "# whose coherence is highest.\n",
  "#\n",
  "# Bug fix: the original seeded the search with a magic threshold (0.2987);\n",
  "# if no model beat it, coherence_values.index(...) raised ValueError and\n",
  "# optimal_coherence_topics stayed -1. Taking the argmax directly avoids\n",
  "# both failure modes (and removes the dead optimal_model_from_cv variable).\n",
  "for num_topics, cv in zip(x, coherence_values):\n",
  "    print(\"Num Topics =\", num_topics, \" has Coherence Value of\", round(cv, 4))\n",
  "\n",
  "index_of_optimal_cv = coherence_values.index(max(coherence_values))\n",
  "optimal_coherence_value = coherence_values[index_of_optimal_cv]\n",
  "optimal_coherence_topics = x[index_of_optimal_cv]  # range supports indexing\n",
  "\n",
  "print(\"Optimal number of topics \" + str(optimal_coherence_topics) + \" has a value of \" + str(optimal_coherence_value))\n",
  "\n",
  "# The model whose coherence score is highest; used by the cells below.\n",
  "optimal_model = model_list[index_of_optimal_cv]"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Render an interactive pyLDAvis view of the best-scoring model;\n",
  "# the bare variable on the last line displays it inline.\n",
  "pyLDAvis.enable_notebook()\n",
  "vis_opt = pyLDAvis.gensim_models.prepare(optimal_model, corpus, id2word)\n",
  "vis_opt"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Export the per-document topic mixture as a CSV (one row per tweet,\n",
  "# one column per topic id).\n",
  "# NOTE(review): model_list[2] is hard-coded (num_topics = 10 for the\n",
  "# start=2/step=4 grid) rather than derived from optimal_model -- confirm\n",
  "# this is the intended model before relying on the export.\n",
  "mixture = [dict(model_list[2][bow]) for bow in corpus]\n",
  "pd.DataFrame(mixture).to_csv(\"topic_mixture.csv\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Export the 50 highest-probability words of every topic in the optimal\n",
  "# model as (topic, word, probability) rows.\n",
  "top_words_per_topic = [\n",
  "    (topic_id,) + word_prob\n",
  "    for topic_id in range(optimal_model.num_topics)\n",
  "    for word_prob in optimal_model.show_topic(topic_id, topn=50)\n",
  "]\n",
  "\n",
  "pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv(\"top_words.csv\")\n"
 ]
}
], | |
"metadata": { | |
"interpreter": { | |
"hash": "c26465253d500f9b1e4188c7a861c868c3d4a64a89438e34be0db151202be497" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment