ViksaaSkool/get_twitter_data_example.ipynb

## get_twitter_data_example.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "get_twitter_data_example.ipynb",
      "provenance": [],
      "authorship_tag": "ABX9TyO1Se22kIxi80h9Hi7lr5Pr",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ViksaaSkool/5fb024ea5b470017a607891d5febda34/get_twitter_data_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "B4e7o03xlELR",
        "colab_type": "text"
      },
      "source": [
        "First, install the necessary libraries.\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OS7TYuR8k7-h",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!pip install nasty"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aU-X44_qlDhZ",
        "colab_type": "text"
      },
      "source": [
        "add imports"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vEfBhahpl5lX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import nasty\n",
        "import pandas as pd\n",
        "import json\n",
        "import time\n",
        "from datetime import date\n",
        "from google.colab import drive\n",
        "from datetime import datetime\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8TrtFowLmKvL",
        "colab_type": "text"
      },
      "source": [
        "Mount Google Drive.\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "C-XtW7McmOV0",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Directory inside the Colab VM under which Google Drive gets mounted.\n",
        "GOOGLE_DRIVE_ROOT = \"/content/drive\"\n",
        "drive.mount(GOOGLE_DRIVE_ROOT)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zuYgdfc4mRGR",
        "colab_type": "text"
      },
      "source": [
        "define constants"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0uJpC-romS3K",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# File-open mode flags and the strptime date layout.\n",
        "READ = 'r'\n",
        "WRITE = 'w'\n",
        "DATE_FORMAT = \"%Y-%m-%d\"\n",
        "\n",
        "# Upper bound on tweets requested per period (month).\n",
        "MAX_TWEETS = 5000\n",
        "\n",
        "# Colab Notebooks folder on the mounted Drive; note that the value ends with a slash.\n",
        "GOOGLE_DRIVE_COLAB_PATH = f\"{GOOGLE_DRIVE_ROOT}/My Drive/Colab Notebooks/\"\n",
        "\n",
        "# CSV of accounts to scrape, and the path prefix of the generated tweets file.\n",
        "STAND_UP_COMEDIANS_TWITTER_ACCOUNTS = f\"{GOOGLE_DRIVE_COLAB_PATH}/stand_up_comedians_twitter_accounts.csv\"\n",
        "STAND_UP_COMEDIANS_DATA = f\"{GOOGLE_DRIVE_COLAB_PATH}/tweets_stand_up_comedians\"\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LPw6d2zMnoJt",
        "colab_type": "text"
      },
      "source": [
        "declare methods"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uX0pkRvEnp1R",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def generate_data_csv_path(path, since, until):\n",
        "    return f\"{path}_{since}_{until}.csv\"     \n",
        "\n",
        "def save_to_json_file(data, file_path):\n",
        "    \"\"\"Write `data` as JSON to `file_path`.\n",
        "\n",
        "    The file is obtained from `create_data_file`; values the json module cannot\n",
        "    encode natively are converted through `serialize`.\n",
        "    \"\"\"\n",
        "    output_file = create_data_file(file_path)\n",
        "    with output_file:\n",
        "        json.dump(data, output_file, default=serialize)\n",
        "\n",
        "def create_data_file(file_path):\n",
        "    \"\"\"Open `file_path` in WRITE mode as UTF-8 text and return the open handle.\n",
        "\n",
        "    The caller is responsible for closing the returned file object.\n",
        "    \"\"\"\n",
        "    handle = open(file_path, mode=WRITE, encoding=\"utf-8\")\n",
        "    return handle\n",
        "\n",
        "def serialize(obj):\n",
        "    if isinstance(obj, date):\n",
        "        serial = obj.isoformat()\n",
        "        return serial\n",
        "    return obj.__dict__   \n",
        "\n",
        "def to_date_time(str_date):\n",
        "    \"\"\"Parse a date string laid out as DATE_FORMAT into a `datetime`.\"\"\"\n",
        "    parsed = datetime.strptime(str_date, DATE_FORMAT)\n",
        "    return parsed\n",
        "\n",
        "def get_month_intervals(str_year):\n",
        "    months = []\n",
        "    for i in range(1, 13):\n",
        "        if i < 10:\n",
        "            months.append(f\"{str_year}-0{i}-01\")\n",
        "        else:\n",
        "            if i == 12:\n",
        "                months.append(f\"{str_year}-{i}-31\")\n",
        "            else:\n",
        "                months.append(f\"{str_year}-{i}-01\")\n",
        "    print(months)\n",
        "    return months  \n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ewjUn9x5nv7m",
        "colab_type": "text"
      },
      "source": [
        "Use the nasty library to run Twitter search queries of the form `from:<account>`, bounded by `since=<date>`, `until=<date>` and `max_tweets=<number_of_tweets>`."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8t7gPZsunxId",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#see nasty library: https://github.com/lschmelzeisen/nasty\n",
        "def scrape_tweets_from(account, since, until, no_max_tweets=MAX_TWEETS):\n",
        "    \"\"\"Collect tweets posted by `account` between `since` and `until`.\n",
        "\n",
        "    Args:\n",
        "        account: Twitter handle, inserted into a \"from:<account>\" search query.\n",
        "        since: start date as a DATE_FORMAT string.\n",
        "        until: end date as a DATE_FORMAT string.\n",
        "        no_max_tweets: upper bound handed to nasty.Search as `max_tweets`.\n",
        "\n",
        "    Returns:\n",
        "        list: the tweet objects yielded by the search request.\n",
        "    \"\"\"\n",
        "    start_time = time.time()\n",
        "    tweet_stream = nasty.Search(f\"from:{account}\", since=to_date_time(since), until=to_date_time(until),\n",
        "                                max_tweets=no_max_tweets).request()\n",
        "\n",
        "    # Materialise the stream inside the timed section.\n",
        "    tweets = list(tweet_stream)\n",
        "\n",
        "    elapsed_time = time.time() - start_time\n",
        "    duration = time.strftime(\"%H:%M:%S\", time.gmtime(elapsed_time))  # elapsed wall time as HH:MM:SS\n",
        "    # Label the cap as what it is; the number actually scraped is len(tweets).\n",
        "    print(\n",
        "        f\"scrape_tweets for {account} | max tweets = {no_max_tweets}, duration = {duration}, tweets = {len(tweets)}\")\n",
        "    return tweets\n",
        "\n",
        "#take year - scrape tweets month by month from given list\n",
        "def scrape_tweet_data_by_month(sources, output_data_path, year):\n",
        "    \"\"\"Scrape one year of tweets, period by period, for every account listed in `sources`.\n",
        "\n",
        "    Args:\n",
        "        sources: path of a CSV file with a \"twitter_account\" column.\n",
        "        output_data_path: path prefix of the output CSV; the year range and\n",
        "            \".csv\" are appended by `generate_data_csv_path`.\n",
        "        year: year to scrape; must be accepted by `int()`.\n",
        "\n",
        "    For each period the tweets of all accounts are gathered into one flat list\n",
        "    and handed to `update_csv` together with the output file path.\n",
        "    \"\"\"\n",
        "    twitter_account_list = pd.read_csv(sources)[\"twitter_account\"].dropna().tolist()\n",
        "    date_list = get_month_intervals(year)\n",
        "\n",
        "    data_file_path = generate_data_csv_path(output_data_path, year, int(year) + 1)\n",
        "    # Create (or truncate) the output file up front and close it right away so the handle is not leaked.\n",
        "    create_data_file(data_file_path).close()\n",
        "\n",
        "    for i in range(len(date_list) - 1):\n",
        "        since, until = date_list[i], date_list[i + 1]\n",
        "        print(f\"since: {since}, until: {until} \")\n",
        "\n",
        "        tweets = []  # flat list of this period's tweets across all accounts\n",
        "        for twitter_account in twitter_account_list:\n",
        "            account_tweets = scrape_tweets_from(twitter_account, since, until)\n",
        "            tweets.extend(account_tweets)\n",
        "            print(f\"tweets from: {twitter_account}, count: {len(account_tweets)}\")\n",
        "\n",
        "        # NOTE(review): update_csv is not defined in this notebook - it must be provided before this cell runs.\n",
        "        update_csv(tweets, data_file_path)\n",
        "        print(f\"total tweets in month {i} : {len(tweets)}\")"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "__nqkYGSof6q",
        "colab_type": "text"
      },
      "source": [
        "Get the tweets (from a list of accounts, for a given year)."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "AfkggTlGom-u",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "year = \"2019\"  # example value: set this to the year you want to scrape\n",
        "scrape_tweet_data_by_month(STAND_UP_COMEDIANS_TWITTER_ACCOUNTS, STAND_UP_COMEDIANS_DATA, year)"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "get_twitter_data_example.ipynb",
	"provenance": [],
	"authorship_tag": "ABX9TyO1Se22kIxi80h9Hi7lr5Pr",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/ViksaaSkool/5fb024ea5b470017a607891d5febda34/get_twitter_data_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "B4e7o03xlELR",
	"colab_type": "text"
	},
	"source": [
	"First, install the necessary libraries.\n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "OS7TYuR8k7-h",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"!pip install nasty"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "aU-X44_qlDhZ",
	"colab_type": "text"
	},
	"source": [
	"add imports"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "vEfBhahpl5lX",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import nasty\n",
	"import pandas as pd\n",
	"import json\n",
	"import time\n",
	"from datetime import date\n",
	"from google.colab import drive\n",
	"from datetime import datetime\n"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "8TrtFowLmKvL",
	"colab_type": "text"
	},
	"source": [
	"Mount Google Drive.\n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "C-XtW7McmOV0",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# Directory inside the Colab VM under which Google Drive gets mounted.\n",
	"GOOGLE_DRIVE_ROOT = \"/content/drive\"\n",
	"drive.mount(GOOGLE_DRIVE_ROOT)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "zuYgdfc4mRGR",
	"colab_type": "text"
	},
	"source": [
	"define constants"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "0uJpC-romS3K",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# File-open mode flags and the strptime date layout.\n",
	"READ = 'r'\n",
	"WRITE = 'w'\n",
	"DATE_FORMAT = \"%Y-%m-%d\"\n",
	"\n",
	"# Upper bound on tweets requested per period (month).\n",
	"MAX_TWEETS = 5000\n",
	"\n",
	"# Colab Notebooks folder on the mounted Drive; note that the value ends with a slash.\n",
	"GOOGLE_DRIVE_COLAB_PATH = f\"{GOOGLE_DRIVE_ROOT}/My Drive/Colab Notebooks/\"\n",
	"\n",
	"# CSV of accounts to scrape, and the path prefix of the generated tweets file.\n",
	"STAND_UP_COMEDIANS_TWITTER_ACCOUNTS = f\"{GOOGLE_DRIVE_COLAB_PATH}/stand_up_comedians_twitter_accounts.csv\"\n",
	"STAND_UP_COMEDIANS_DATA = f\"{GOOGLE_DRIVE_COLAB_PATH}/tweets_stand_up_comedians\"\n"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "LPw6d2zMnoJt",
	"colab_type": "text"
	},
	"source": [
	"declare methods"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "uX0pkRvEnp1R",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"def generate_data_csv_path(path, since, until):\n",
	" return f\"{path}_{since}_{until}.csv\" \n",
	"\n",
	"def save_to_json_file(data, file_path):\n",
	"    \"\"\"Write `data` as JSON to `file_path`.\n",
	"\n",
	"    The file is obtained from `create_data_file`; values the json module cannot\n",
	"    encode natively are converted through `serialize`.\n",
	"    \"\"\"\n",
	"    # Indentation restored: the `with` body must be nested for this to parse.\n",
	"    with create_data_file(file_path) as json_file:\n",
	"        json.dump(data, json_file, default=serialize)\n",
	"\n",
	"def create_data_file(file_path):\n",
	"    \"\"\"Open `file_path` in WRITE mode as UTF-8 text and return the open handle.\n",
	"\n",
	"    The caller is responsible for closing the returned file object.\n",
	"    \"\"\"\n",
	"    handle = open(file_path, mode=WRITE, encoding=\"utf-8\")\n",
	"    return handle\n",
	"\n",
	"def serialize(obj):\n",
	" if isinstance(obj, date):\n",
	" serial = obj.isoformat()\n",
	" return serial\n",
	" return obj.__dict__ \n",
	"\n",
	"def to_date_time(str_date):\n",
	"    \"\"\"Parse a date string laid out as DATE_FORMAT into a `datetime`.\"\"\"\n",
	"    parsed = datetime.strptime(str_date, DATE_FORMAT)\n",
	"    return parsed\n",
	"\n",
	"def get_month_intervals(str_year):\n",
	" months = []\n",
	" for i in range(1, 13):\n",
	" if i < 10:\n",
	" months.append(f\"{str_year}-0{i}-01\")\n",
	" else:\n",
	" if i == 12:\n",
	" months.append(f\"{str_year}-{i}-31\")\n",
	" else:\n",
	" months.append(f\"{str_year}-{i}-01\")\n",
	" print(months)\n",
	" return months \n"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "ewjUn9x5nv7m",
	"colab_type": "text"
	},
	"source": [
	"Use the nasty library to run Twitter search queries of the form `from:<account>`, bounded by `since=<date>`, `until=<date>` and `max_tweets=<number_of_tweets>`."
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "8t7gPZsunxId",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"#see nasty library: https://github.com/lschmelzeisen/nasty\n",
	"def scrape_tweets_from(account, since, until, no_max_tweets=MAX_TWEETS):\n",
	"    \"\"\"Collect tweets posted by `account` between `since` and `until`.\n",
	"\n",
	"    Args:\n",
	"        account: Twitter handle, inserted into a \"from:<account>\" search query.\n",
	"        since: start date as a DATE_FORMAT string.\n",
	"        until: end date as a DATE_FORMAT string.\n",
	"        no_max_tweets: upper bound handed to nasty.Search as `max_tweets`.\n",
	"\n",
	"    Returns:\n",
	"        list: the tweet objects yielded by the search request.\n",
	"    \"\"\"\n",
	"    start_time = time.time()\n",
	"    tweet_stream = nasty.Search(f\"from:{account}\", since=to_date_time(since), until=to_date_time(until),\n",
	"                                max_tweets=no_max_tweets).request()\n",
	"\n",
	"    # Materialise the stream inside the timed section.\n",
	"    tweets = list(tweet_stream)\n",
	"\n",
	"    elapsed_time = time.time() - start_time\n",
	"    duration = time.strftime(\"%H:%M:%S\", time.gmtime(elapsed_time))  # elapsed wall time as HH:MM:SS\n",
	"    # Label the cap as what it is; the number actually scraped is len(tweets).\n",
	"    print(\n",
	"        f\"scrape_tweets for {account} | max tweets = {no_max_tweets}, duration = {duration}, tweets = {len(tweets)}\")\n",
	"    return tweets\n",
	"\n",
	"#take year - scrape tweets month by month from given list\n",
	"def scrape_tweet_data_by_month(sources, output_data_path, year):\n",
	"    \"\"\"Scrape one year of tweets, period by period, for every account listed in `sources`.\n",
	"\n",
	"    Args:\n",
	"        sources: path of a CSV file with a \"twitter_account\" column.\n",
	"        output_data_path: path prefix of the output CSV; the year range and\n",
	"            \".csv\" are appended by `generate_data_csv_path`.\n",
	"        year: year to scrape; must be accepted by `int()`.\n",
	"\n",
	"    For each period the tweets of all accounts are gathered into one flat list\n",
	"    and handed to `update_csv` together with the output file path.\n",
	"    \"\"\"\n",
	"    twitter_account_list = pd.read_csv(sources)[\"twitter_account\"].dropna().tolist()\n",
	"    date_list = get_month_intervals(year)\n",
	"\n",
	"    data_file_path = generate_data_csv_path(output_data_path, year, int(year) + 1)\n",
	"    # Create (or truncate) the output file up front and close it right away so the handle is not leaked.\n",
	"    create_data_file(data_file_path).close()\n",
	"\n",
	"    for i in range(len(date_list) - 1):\n",
	"        since, until = date_list[i], date_list[i + 1]\n",
	"        print(f\"since: {since}, until: {until} \")\n",
	"\n",
	"        tweets = []  # flat list of this period's tweets across all accounts\n",
	"        for twitter_account in twitter_account_list:\n",
	"            account_tweets = scrape_tweets_from(twitter_account, since, until)\n",
	"            tweets.extend(account_tweets)\n",
	"            print(f\"tweets from: {twitter_account}, count: {len(account_tweets)}\")\n",
	"\n",
	"        # NOTE(review): update_csv is not defined in this notebook - it must be provided before this cell runs.\n",
	"        update_csv(tweets, data_file_path)\n",
	"        print(f\"total tweets in month {i} : {len(tweets)}\")"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "__nqkYGSof6q",
	"colab_type": "text"
	},
	"source": [
	"Get the tweets (from a list of accounts, for a given year)."
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "AfkggTlGom-u",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"year = \"2019\"  # example value: set this to the year you want to scrape\n",
	"scrape_tweet_data_by_month(STAND_UP_COMEDIANS_TWITTER_ACCOUNTS, STAND_UP_COMEDIANS_DATA, year)"
	],
	"execution_count": 0,
	"outputs": []
	}
	]
	}