Created
April 10, 2020 01:37
-
-
Save ViksaaSkool/5fb024ea5b470017a607891d5febda34 to your computer and use it in GitHub Desktop.
get_twitter_data_example.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "get_twitter_data_example.ipynb", | |
"provenance": [], | |
"authorship_tag": "ABX9TyO1Se22kIxi80h9Hi7lr5Pr", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/ViksaaSkool/5fb024ea5b470017a607891d5febda34/get_twitter_data_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "B4e7o03xlELR", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"First need to install the necessary libraries\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "OS7TYuR8k7-h", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"!pip install nasty" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "aU-X44_qlDhZ", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"add imports" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vEfBhahpl5lX", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import nasty\n", | |
"import pandas as pd\n", | |
"import json\n", | |
"import time\n", | |
"from datetime import date\n", | |
"from google.colab import drive\n", | |
"from datetime import datetime\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "8TrtFowLmKvL", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"mout the google drive \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "C-XtW7McmOV0", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"GOOGLE_DRIVE_ROOT = \"/content/drive\"\n", | |
"drive.mount(GOOGLE_DRIVE_ROOT)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "zuYgdfc4mRGR", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"define constants" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0uJpC-romS3K", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"GOOGLE_DRIVE_COLAB_PATH = f\"{GOOGLE_DRIVE_ROOT}/My Drive/Colab Notebooks/\"\n", | |
"READ = 'r'\n", | |
"WRITE = 'w'\n", | |
"DATE_FORMAT = \"%Y-%m-%d\"\n", | |
"\n", | |
"STAND_UP_COMEDIANS_TWITTER_ACCOUNTS = f\"{GOOGLE_DRIVE_COLAB_PATH}/stand_up_comedians_twitter_accounts.csv\"\n", | |
"STAND_UP_COMEDIANS_DATA = f\"{GOOGLE_DRIVE_COLAB_PATH}/tweets_stand_up_comedians\"\n", | |
"\n", | |
"MAX_TWEETS = 5000 #per period (month)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "LPw6d2zMnoJt", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"declare methods" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "uX0pkRvEnp1R", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def generate_data_csv_path(path, since, until):\n", | |
" return f\"{path}_{since}_{until}.csv\" \n", | |
"\n", | |
"def save_to_json_file(data, file_path):\n", | |
" with create_data_file(file_path) as jason_file:\n", | |
" json.dump(data, jason_file, default=serialize) \n", | |
"\n", | |
"def create_data_file(file_path):\n", | |
" return open(file_path, WRITE, encoding=\"utf-8\") \n", | |
"\n", | |
"def serialize(obj):\n", | |
" if isinstance(obj, date):\n", | |
" serial = obj.isoformat()\n", | |
" return serial\n", | |
" return obj.__dict__ \n", | |
"\n", | |
"def to_date_time(str_date):\n", | |
" return datetime.strptime(str_date, DATE_FORMAT)\n", | |
"\n", | |
"def get_month_intervals(str_year):\n", | |
" months = []\n", | |
" for i in range(1, 13):\n", | |
" if i < 10:\n", | |
" months.append(f\"{str_year}-0{i}-01\")\n", | |
" else:\n", | |
" if i == 12:\n", | |
" months.append(f\"{str_year}-{i}-31\")\n", | |
" else:\n", | |
" months.append(f\"{str_year}-{i}-01\")\n", | |
" print(months)\n", | |
" return months \n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ewjUn9x5nv7m", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"use nasty library and twitter search queries (\"from: user, since=\"date\", until=date, max_tweets=number_of_tweets\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "8t7gPZsunxId", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#see nasty library: https://github.com/lschmelzeisen/nasty\n", | |
"def scrape_tweets_from(account, since, until, no_max_tweets=MAX_TWEETS):\n", | |
" tweets = []\n", | |
" start_time = time.time()\n", | |
" tweet_stream = nasty.Search(f\"from:{account}\", since=to_date_time(since), until=to_date_time(until),\n", | |
" max_tweets=no_max_tweets).request()\n", | |
"\n", | |
" #tweet_streem - up to 50 to tweets in any give period from a given account\n", | |
" for tweet in tweet_stream:\n", | |
" tweets.append(tweet)\n", | |
"\n", | |
" elapsed_time = time.time() - start_time\n", | |
" duration = time.strftime(\"%H:%M:%S\", time.gmtime(elapsed_time)) #log time need to scrape tweets\n", | |
" print(\n", | |
" f\"scrape_tweets for {account} | scraped tweets = {no_max_tweets}, duration = {duration}, tweets = {len(tweets)}\")\n", | |
" return tweets \n", | |
"\n", | |
"#take year - scrape tweets month by month from given list\n", | |
"def scrape_tweet_data_by_month(sources, output_data_path, year):\n", | |
" twitter_account_list = pd.read_csv(sources)[\"twitter_account\"].dropna().tolist()\n", | |
" date_list = get_month_intervals(year)\n", | |
"\n", | |
" tweets = []\n", | |
" data_file_path = generate_data_csv_path(output_data_path, year, int(year) + 1)\n", | |
" create_data_file(data_file_path)\n", | |
" for i in range(len(date_list) - 1):\n", | |
" since = date_list[i]\n", | |
" until = date_list[i + 1]\n", | |
" print(f\"since: {since}, until: {until} \")\n", | |
" for twitter_account in twitter_account_list:\n", | |
" tweets.append(scrape_tweets_from(twitter_account, since, until))\n", | |
" print(f\"tweets from: {twitter_account}, count: {len(tweets)}\")\n", | |
"\n", | |
" update_csv(reduce(lambda x, y: x + y, tweets), data_file_path)\n", | |
" print(f\"total tweets in month {i} : {len(tweets)}\")\n", | |
" tweets.clear()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "__nqkYGSof6q", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"get the tweets (from list of account for a given year)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "AfkggTlGom-u", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"scrape_tweet_data_by_month(STAND_UP_COMEDIANS_TWITTER_ACCOUNTS, STAND_UP_COMEDIANS_DATA, year) " | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment