CliffordAnderson/news-topics.ipynb

## news-topics.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "private_outputs": true,
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "V100",
      "authorship_tag": "ABX9TyN4HVZSF7k08EFLSyhwMw5M",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/CliffordAnderson/e2afa63efcea7702c186fd77884cfc14/news-topics.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "XkpcnCKXBYX-"
      },
      "outputs": [],
      "source": [
        "# %pip (rather than !pip) runs pip within the currently running kernel.\n",
        "%pip install requests top2vec umap-learn matplotlib seaborn"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import requests\n",
        "\n",
        "# Dataset landing page (Harvard Dataverse):\n",
        "# https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/GMFCTR/IZQODZ&version=1.0\n",
        "url = \"https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/GMFCTR/IZQODZ\"\n",
        "\n",
        "# The timeout stops the cell from hanging forever if the server goes quiet;\n",
        "# streaming writes the file chunk by chunk instead of holding it all in memory.\n",
        "with requests.get(url, stream=True, timeout=60) as response:\n",
        "    if response.status_code == 200:\n",
        "        with open('data.csv', 'wb') as file:\n",
        "            for chunk in response.iter_content(chunk_size=1 << 20):\n",
        "                file.write(chunk)\n",
        "        print(\"File downloaded successfully.\")\n",
        "    else:\n",
        "        print(f\"Failed to retrieve file: {response.status_code}\")\n"
      ],
      "metadata": {
        "id": "WPNBR75oBdQ3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import csv\n",
        "\n",
        "# Print the first row of the downloaded file to see the column names.\n",
        "# newline='' is what the csv module expects for file objects, and latin-1\n",
        "# matches the encoding used when the same file is loaded with pandas.\n",
        "with open('data.csv', 'r', newline='', encoding='latin-1') as f:\n",
        "    reader = csv.reader(f)\n",
        "    print(next(reader))\n"
      ],
      "metadata": {
        "id": "8R-flNySBssL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "from top2vec import Top2Vec\n",
        "\n",
        "# Read the file from the working directory, where the download cell saved it,\n",
        "# instead of assuming the Colab-specific /content directory.\n",
        "data = pd.read_csv('data.csv', encoding='latin-1')\n",
        "\n",
        "# Keep only rows that actually have a value in the 'text' column.\n",
        "documents = data['text'].dropna().tolist()\n"
      ],
      "metadata": {
        "id": "Bi0m0xBKCHL0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Fit the topic model on the list of article texts built in the previous cell.\n",
        "model = Top2Vec(\n",
        "    documents,\n",
        "    speed=\"learn\",\n",
        ")"
      ],
      "metadata": {
        "id": "cZrOQpzaDHkY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# get_topics rejects a request for more topics than the model found,\n",
        "# so cap the request at the number of topics available.\n",
        "num_topics = min(25, model.get_num_topics())\n",
        "topic_words, word_scores, topic_scores = model.get_topics(num_topics)\n",
        "\n",
        "# Number the topics from 1 and print each topic's words on one line.\n",
        "for i, topic in enumerate(topic_words, 1):\n",
        "    print(f\"Topic {i}: {', '.join(topic)}\")\n"
      ],
      "metadata": {
        "id": "5DWNWol5DV4Q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import umap\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "\n",
        "sns.set(style='white', palette='muted')\n",
        "\n",
        "# Request the words for every topic so topic_words[i] can label row i of the embedding.\n",
        "topic_vectors = model.topic_vectors\n",
        "topic_words, word_scores, topic_scores = model.get_topics(model.get_num_topics())\n",
        "\n",
        "# Seed UMAP so reruns start from the same random state.\n",
        "umap_model = umap.UMAP(n_neighbors=3, random_state=42)\n",
        "embedding = umap_model.fit_transform(topic_vectors)\n",
        "\n",
        "# Colour each point by its topic number (1-based, like the printed topic list).\n",
        "# Without `c`, matplotlib ignores `cmap` and the colorbar has nothing to map.\n",
        "topic_numbers = list(range(1, len(embedding) + 1))\n",
        "\n",
        "plt.figure(figsize=(12, 10))\n",
        "scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=topic_numbers, s=60, cmap='viridis', alpha=0.7)\n",
        "plt.title(\"2D UMAP projection of Topics\", fontsize=16)\n",
        "plt.xlabel(\"UMAP 1\", fontsize=14)\n",
        "plt.ylabel(\"UMAP 2\", fontsize=14)\n",
        "\n",
        "for i, (x, y) in enumerate(embedding):\n",
        "    label = ', '.join(topic_words[i][:1])  # Label each point with the first word listed for its topic\n",
        "    plt.text(x, y, label, ha='center', va='center', fontsize=10, color='black')\n",
        "\n",
        "cbar = plt.colorbar(scatter)\n",
        "cbar.set_label('Topic Number', rotation=270, labelpad=15, fontsize=12)\n",
        "\n",
        "sns.despine(left=True, bottom=True)\n",
        "\n",
        "plt.show()\n"
      ],
      "metadata": {
        "id": "HeOANcw_D-I1"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"private_outputs": true,
	"provenance": [],
	"machine_shape": "hm",
	"gpuType": "V100",
	"authorship_tag": "ABX9TyN4HVZSF7k08EFLSyhwMw5M",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/CliffordAnderson/e2afa63efcea7702c186fd77884cfc14/news-topics.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "XkpcnCKXBYX-"
	},
	"outputs": [],
	"source": [
	"# %pip (rather than !pip) runs pip within the currently running kernel.\n",
	"%pip install requests top2vec umap-learn matplotlib seaborn"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"import requests\n",
	"\n",
	"# Dataset landing page (Harvard Dataverse):\n",
	"# https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/GMFCTR/IZQODZ&version=1.0\n",
	"url = \"https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/GMFCTR/IZQODZ\"\n",
	"\n",
	"# The timeout stops the cell from hanging forever if the server goes quiet;\n",
	"# streaming writes the file chunk by chunk instead of holding it all in memory.\n",
	"with requests.get(url, stream=True, timeout=60) as response:\n",
	"    if response.status_code == 200:\n",
	"        with open('data.csv', 'wb') as file:\n",
	"            for chunk in response.iter_content(chunk_size=1 << 20):\n",
	"                file.write(chunk)\n",
	"        print(\"File downloaded successfully.\")\n",
	"    else:\n",
	"        print(f\"Failed to retrieve file: {response.status_code}\")\n"
	],
	"metadata": {
	"id": "WPNBR75oBdQ3"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import csv\n",
	"\n",
	"# Print the first row of the downloaded file to see the column names.\n",
	"# newline='' is what the csv module expects for file objects, and latin-1\n",
	"# matches the encoding used when the same file is loaded with pandas.\n",
	"with open('data.csv', 'r', newline='', encoding='latin-1') as f:\n",
	"    reader = csv.reader(f)\n",
	"    print(next(reader))\n"
	],
	"metadata": {
	"id": "8R-flNySBssL"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import pandas as pd\n",
	"from top2vec import Top2Vec\n",
	"\n",
	"# Read the file from the working directory, where the download cell saved it,\n",
	"# instead of assuming the Colab-specific /content directory.\n",
	"data = pd.read_csv('data.csv', encoding='latin-1')\n",
	"\n",
	"# Keep only rows that actually have a value in the 'text' column.\n",
	"documents = data['text'].dropna().tolist()\n"
	],
	"metadata": {
	"id": "Bi0m0xBKCHL0"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Fit the topic model on the list of article texts built in the previous cell.\n",
	"model = Top2Vec(\n",
	"    documents,\n",
	"    speed=\"learn\",\n",
	")"
	],
	"metadata": {
	"id": "cZrOQpzaDHkY"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# get_topics rejects a request for more topics than the model found,\n",
	"# so cap the request at the number of topics available.\n",
	"num_topics = min(25, model.get_num_topics())\n",
	"topic_words, word_scores, topic_scores = model.get_topics(num_topics)\n",
	"\n",
	"# Number the topics from 1 and print each topic's words on one line.\n",
	"for i, topic in enumerate(topic_words, 1):\n",
	"    print(f\"Topic {i}: {', '.join(topic)}\")\n"
	],
	"metadata": {
	"id": "5DWNWol5DV4Q"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import umap\n",
	"import matplotlib.pyplot as plt\n",
	"import seaborn as sns\n",
	"\n",
	"sns.set(style='white', palette='muted')\n",
	"\n",
	"# Request the words for every topic so topic_words[i] can label row i of the embedding.\n",
	"topic_vectors = model.topic_vectors\n",
	"topic_words, word_scores, topic_scores = model.get_topics(model.get_num_topics())\n",
	"\n",
	"# Seed UMAP so reruns start from the same random state.\n",
	"umap_model = umap.UMAP(n_neighbors=3, random_state=42)\n",
	"embedding = umap_model.fit_transform(topic_vectors)\n",
	"\n",
	"# Colour each point by its topic number (1-based, like the printed topic list).\n",
	"# Without `c`, matplotlib ignores `cmap` and the colorbar has nothing to map.\n",
	"topic_numbers = list(range(1, len(embedding) + 1))\n",
	"\n",
	"plt.figure(figsize=(12, 10))\n",
	"scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=topic_numbers, s=60, cmap='viridis', alpha=0.7)\n",
	"plt.title(\"2D UMAP projection of Topics\", fontsize=16)\n",
	"plt.xlabel(\"UMAP 1\", fontsize=14)\n",
	"plt.ylabel(\"UMAP 2\", fontsize=14)\n",
	"\n",
	"for i, (x, y) in enumerate(embedding):\n",
	"    label = ', '.join(topic_words[i][:1])  # Label each point with the first word listed for its topic\n",
	"    plt.text(x, y, label, ha='center', va='center', fontsize=10, color='black')\n",
	"\n",
	"cbar = plt.colorbar(scatter)\n",
	"cbar.set_label('Topic Number', rotation=270, labelpad=15, fontsize=12)\n",
	"\n",
	"sns.despine(left=True, bottom=True)\n",
	"\n",
	"plt.show()\n"
	],
	"metadata": {
	"id": "HeOANcw_D-I1"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}