Skip to content

Instantly share code, notes, and snippets.

@ViksaaSkool
Created April 10, 2020 01:37
Show Gist options
  • Save ViksaaSkool/5fb024ea5b470017a607891d5febda34 to your computer and use it in GitHub Desktop.
Save ViksaaSkool/5fb024ea5b470017a607891d5febda34 to your computer and use it in GitHub Desktop.
get_twitter_data_example.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "get_twitter_data_example.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyO1Se22kIxi80h9Hi7lr5Pr",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ViksaaSkool/5fb024ea5b470017a607891d5febda34/get_twitter_data_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "B4e7o03xlELR",
"colab_type": "text"
},
"source": [
"First need to install the necessary libraries\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "OS7TYuR8k7-h",
"colab_type": "code",
"colab": {}
},
"source": [
"!pip install nasty"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "aU-X44_qlDhZ",
"colab_type": "text"
},
"source": [
"add imports"
]
},
{
"cell_type": "code",
"metadata": {
"id": "vEfBhahpl5lX",
"colab_type": "code",
"colab": {}
},
"source": [
"import nasty\n",
"import pandas as pd\n",
"import json\n",
"import time\n",
"from datetime import date\n",
"from google.colab import drive\n",
"from datetime import datetime\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "8TrtFowLmKvL",
"colab_type": "text"
},
"source": [
"mout the google drive \n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "C-XtW7McmOV0",
"colab_type": "code",
"colab": {}
},
"source": [
"GOOGLE_DRIVE_ROOT = \"/content/drive\"\n",
"drive.mount(GOOGLE_DRIVE_ROOT)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zuYgdfc4mRGR",
"colab_type": "text"
},
"source": [
"define constants"
]
},
{
"cell_type": "code",
"metadata": {
"id": "0uJpC-romS3K",
"colab_type": "code",
"colab": {}
},
"source": [
"GOOGLE_DRIVE_COLAB_PATH = f\"{GOOGLE_DRIVE_ROOT}/My Drive/Colab Notebooks/\"\n",
"READ = 'r'\n",
"WRITE = 'w'\n",
"DATE_FORMAT = \"%Y-%m-%d\"\n",
"\n",
"STAND_UP_COMEDIANS_TWITTER_ACCOUNTS = f\"{GOOGLE_DRIVE_COLAB_PATH}/stand_up_comedians_twitter_accounts.csv\"\n",
"STAND_UP_COMEDIANS_DATA = f\"{GOOGLE_DRIVE_COLAB_PATH}/tweets_stand_up_comedians\"\n",
"\n",
"MAX_TWEETS = 5000 #per period (month)\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "LPw6d2zMnoJt",
"colab_type": "text"
},
"source": [
"declare methods"
]
},
{
"cell_type": "code",
"metadata": {
"id": "uX0pkRvEnp1R",
"colab_type": "code",
"colab": {}
},
"source": [
"def generate_data_csv_path(path, since, until):\n",
" return f\"{path}_{since}_{until}.csv\" \n",
"\n",
"def save_to_json_file(data, file_path):\n",
" with create_data_file(file_path) as jason_file:\n",
" json.dump(data, jason_file, default=serialize) \n",
"\n",
"def create_data_file(file_path):\n",
" return open(file_path, WRITE, encoding=\"utf-8\") \n",
"\n",
"def serialize(obj):\n",
" if isinstance(obj, date):\n",
" serial = obj.isoformat()\n",
" return serial\n",
" return obj.__dict__ \n",
"\n",
"def to_date_time(str_date):\n",
" return datetime.strptime(str_date, DATE_FORMAT)\n",
"\n",
"def get_month_intervals(str_year):\n",
" months = []\n",
" for i in range(1, 13):\n",
" if i < 10:\n",
" months.append(f\"{str_year}-0{i}-01\")\n",
" else:\n",
" if i == 12:\n",
" months.append(f\"{str_year}-{i}-31\")\n",
" else:\n",
" months.append(f\"{str_year}-{i}-01\")\n",
" print(months)\n",
" return months \n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ewjUn9x5nv7m",
"colab_type": "text"
},
"source": [
"use nasty library and twitter search queries (\"from: user, since=\"date\", until=date, max_tweets=number_of_tweets\")"
]
},
{
"cell_type": "code",
"metadata": {
"id": "8t7gPZsunxId",
"colab_type": "code",
"colab": {}
},
"source": [
"#see nasty library: https://github.com/lschmelzeisen/nasty\n",
"def scrape_tweets_from(account, since, until, no_max_tweets=MAX_TWEETS):\n",
" tweets = []\n",
" start_time = time.time()\n",
" tweet_stream = nasty.Search(f\"from:{account}\", since=to_date_time(since), until=to_date_time(until),\n",
" max_tweets=no_max_tweets).request()\n",
"\n",
" #tweet_streem - up to 50 to tweets in any give period from a given account\n",
" for tweet in tweet_stream:\n",
" tweets.append(tweet)\n",
"\n",
" elapsed_time = time.time() - start_time\n",
" duration = time.strftime(\"%H:%M:%S\", time.gmtime(elapsed_time)) #log time need to scrape tweets\n",
" print(\n",
" f\"scrape_tweets for {account} | scraped tweets = {no_max_tweets}, duration = {duration}, tweets = {len(tweets)}\")\n",
" return tweets \n",
"\n",
"#take year - scrape tweets month by month from given list\n",
"def scrape_tweet_data_by_month(sources, output_data_path, year):\n",
" twitter_account_list = pd.read_csv(sources)[\"twitter_account\"].dropna().tolist()\n",
" date_list = get_month_intervals(year)\n",
"\n",
" tweets = []\n",
" data_file_path = generate_data_csv_path(output_data_path, year, int(year) + 1)\n",
" create_data_file(data_file_path)\n",
" for i in range(len(date_list) - 1):\n",
" since = date_list[i]\n",
" until = date_list[i + 1]\n",
" print(f\"since: {since}, until: {until} \")\n",
" for twitter_account in twitter_account_list:\n",
" tweets.append(scrape_tweets_from(twitter_account, since, until))\n",
" print(f\"tweets from: {twitter_account}, count: {len(tweets)}\")\n",
"\n",
" update_csv(reduce(lambda x, y: x + y, tweets), data_file_path)\n",
" print(f\"total tweets in month {i} : {len(tweets)}\")\n",
" tweets.clear()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "__nqkYGSof6q",
"colab_type": "text"
},
"source": [
"get the tweets (from list of account for a given year)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "AfkggTlGom-u",
"colab_type": "code",
"colab": {}
},
"source": [
"scrape_tweet_data_by_month(STAND_UP_COMEDIANS_TWITTER_ACCOUNTS, STAND_UP_COMEDIANS_DATA, year) "
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment