Skip to content

Instantly share code, notes, and snippets.

@CliffordAnderson
Created October 23, 2023 20:50
Show Gist options
  • Save CliffordAnderson/e2afa63efcea7702c186fd77884cfc14 to your computer and use it in GitHub Desktop.
Save CliffordAnderson/e2afa63efcea7702c186fd77884cfc14 to your computer and use it in GitHub Desktop.
News Topics.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"private_outputs": true,
"provenance": [],
"machine_shape": "hm",
"gpuType": "V100",
"authorship_tag": "ABX9TyN4HVZSF7k08EFLSyhwMw5M",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/CliffordAnderson/e2afa63efcea7702c186fd77884cfc14/news-topics.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XkpcnCKXBYX-"
},
"outputs": [],
"source": [
"# Install the notebook's dependencies with the %pip magic so they land in the\n",
"# environment of the running kernel rather than whatever `pip` is on the shell PATH.\n",
"%pip install requests top2vec umap-learn matplotlib seaborn"
]
},
{
"cell_type": "code",
"source": [
"import requests\n",
"\n",
"# See the NewArticles dataset: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/GMFCTR/IZQODZ&version=1.0\n",
"url = \"https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/GMFCTR/IZQODZ\"\n",
"\n",
"# Without a timeout, requests waits indefinitely if the server stops responding.\n",
"# The value bounds the connect and per-read waits, not the total download time.\n",
"response = requests.get(url, timeout=60)\n",
"\n",
"# Only write the file on HTTP 200 so an error page is never saved as data.csv.\n",
"if response.status_code == 200:\n",
"    with open('data.csv', 'wb') as file:\n",
"        file.write(response.content)\n",
"    print(\"File downloaded successfully.\")\n",
"else:\n",
"    print(f\"Failed to retrieve file: {response.status_code}\")\n"
],
"metadata": {
"id": "WPNBR75oBdQ3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import csv\n",
"\n",
"# Peek at the header row to see which columns the dataset provides.\n",
"# The encoding matches the one used for the pandas read of this file (latin-1),\n",
"# and newline='' is how the csv module expects text files to be opened.\n",
"with open('data.csv', 'r', encoding='latin-1', newline='') as f:\n",
"    reader = csv.reader(f)\n",
"    print(next(reader))\n"
],
"metadata": {
"id": "8R-flNySBssL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from top2vec import Top2Vec\n",
"\n",
"# Read the file from the working directory, which is where the download cell\n",
"# saves it, instead of a hard-coded absolute /content path.\n",
"data = pd.read_csv('data.csv', encoding='latin-1')\n",
"\n",
"# Build a plain list of article texts; rows with a missing 'text' value are dropped.\n",
"documents = data['text'].dropna().tolist()\n"
],
"metadata": {
"id": "Bi0m0xBKCHL0"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Fit a Top2Vec model on the list of article texts.\n",
"model = Top2Vec(\n",
"    documents=documents,\n",
"    speed=\"learn\",\n",
")"
],
"metadata": {
"id": "cZrOQpzaDHkY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# get_topics() rejects a request for more topics than the model found,\n",
"# so cap the number shown at the model's actual topic count.\n",
"num_topics_to_show = min(25, model.get_num_topics())\n",
"topic_words, word_scores, topic_scores = model.get_topics(num_topics_to_show)\n",
"\n",
"# Print each topic's words on one line, numbering topics from 1.\n",
"for i, topic in enumerate(topic_words, 1):\n",
"    print(f\"Topic {i}: {', '.join(topic)}\")\n"
],
"metadata": {
"id": "5DWNWol5DV4Q"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import umap\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"sns.set(style='white', palette='muted')\n",
"\n",
"topic_vectors = model.topic_vectors\n",
"topic_words, word_scores, topic_scores = model.get_topics(model.get_num_topics())\n",
"\n",
"# Project the topic vectors down to 2D; random_state fixes UMAP's seed.\n",
"umap_model = umap.UMAP(n_neighbors=3, random_state=42)\n",
"embedding = umap_model.fit_transform(topic_vectors)\n",
"\n",
"# One colour value per point: the topic's index. Without `c`, matplotlib ignores\n",
"# `cmap`, and the 'Topic Number' colorbar would not correspond to anything plotted.\n",
"topic_indices = list(range(len(embedding)))\n",
"\n",
"plt.figure(figsize=(12, 10))\n",
"scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=topic_indices, s=60, cmap='viridis', alpha=0.7)\n",
"plt.title(\"2D UMAP projection of Topics\", fontsize=16)\n",
"plt.xlabel(\"UMAP 1\", fontsize=14)\n",
"plt.ylabel(\"UMAP 2\", fontsize=14)\n",
"\n",
"for i, (x, y) in enumerate(embedding):\n",
"    label = ', '.join(topic_words[i][:1])  # Label each point with its topic's top word\n",
"    plt.text(x, y, label, ha='center', va='center', fontsize=10, color='black')\n",
"\n",
"cbar = plt.colorbar(scatter)\n",
"cbar.set_label('Topic Number', rotation=270, labelpad=15, fontsize=12)\n",
"\n",
"sns.despine(left=True, bottom=True)\n",
"\n",
"plt.show()\n"
],
"metadata": {
"id": "HeOANcw_D-I1"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment