Skip to content

Instantly share code, notes, and snippets.

@iamvee
Last active January 31, 2021 21:51
Show Gist options
  • Save iamvee/86f60f60f1a9376175a4aecb7c6b1746 to your computer and use it in GitHub Desktop.
Save iamvee/86f60f60f1a9376175a4aecb7c6b1746 to your computer and use it in GitHub Desktop.
tweets
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "reserved-journalism",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import tweepy\n",
"import csv\n",
"import pandas as pd\n",
"import time\n",
"import collections\n",
"\n",
"consumer_key = ''\n",
"consumer_secret = ''\n",
"\n",
"access_token = ''\n",
"access_token_secret = ''\n",
"\n",
"\n",
"auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
"auth.set_access_token(access_token, access_token_secret)\n",
"api = tweepy.API(auth,wait_on_rate_limit=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "suffering-capitol",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"hashtag = \"#hashtag\"\n",
"lim = 100\n",
"\n",
"steps = lim\n",
"\n",
"current_time = time.ctime().replace(\" \",\"_\")\n",
"dir_name = f\"./{hashtag[1:]}\"\n",
"path = f\"./{hashtag[1:]}/{current_time}\"\n",
"outfile = f\"{path}/out.csv\"\n",
"outusers = f\"{path}/users.csv\"\n",
"\n",
"\n",
"try:\n",
" os.mkdir(f\"./{hashtag[1:]}\")\n",
"except FileExistsError:\n",
" print(\"directory exists\")\n",
"finally:\n",
" os.mkdir(path)\n",
" \n",
" \n",
"csv_file = open(outfile, 'a')\n",
"csv_writer_tweets = csv.writer(csv_file)\n",
"csv_writer_tweets.writerow(\n",
" ['created_at', 'id', 'id_str', 'user_id', 'screen name', 'account created at', 'text', 'truncated',\n",
" 'in_reply_to_status_id', 'in_reply_to_status_id_str', \n",
" 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', \n",
" 'geo', 'coordinates', 'place', 'contributors', \n",
" 'is_quote_status', 'retweet_count', 'favorite_count', \n",
" 'favorited', 'retweeted', 'lang'])\n",
"\n",
"csv_users = open(outusers, 'a')\n",
"csv_writer_users = csv.writer(csv_users)\n",
"csv_writer_users.writerow(\n",
" ['id', 'id_str', 'name', 'screen_name', 'location', 'description', \n",
" 'url','protected', 'followers_count', 'friends_count', \n",
" 'listed_count', 'created_at', 'favourites_count', 'utc_offset', \n",
" 'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang', \n",
" 'contributors_enabled', 'is_translator', 'is_translation_enabled', \n",
" 'has_extended_profile', 'default_profile', 'default_profile_image', \n",
" 'following', 'follow_request_sent', 'notifications', 'translator_type'])\n",
"\n",
"\n",
"\n",
"print(time.ctime())\n",
"for i, tweet in enumerate(tweepy.Cursor(api.search,q=hashtag,count=lim).items()):\n",
" if not tweet.user.id in users: \n",
" csv_writer_users.writerow([\n",
" tweet.user._json['id'], tweet.user._json['id_str'], tweet.user._json['name'],\n",
" tweet.user._json['screen_name'], tweet.user._json['location'],\n",
" tweet.user._json['description'], tweet.user._json['url'], \n",
" tweet.user._json['protected'], tweet.user._json['followers_count'], \n",
" tweet.user._json['friends_count'], tweet.user._json['listed_count'],\n",
" tweet.user._json['created_at'], tweet.user._json['favourites_count'], \n",
" tweet.user._json['utc_offset'], tweet.user._json['time_zone'], \n",
" tweet.user._json['geo_enabled'], tweet.user._json['verified'], \n",
" tweet.user._json['statuses_count'], tweet.user._json['lang'], \n",
" tweet.user._json['contributors_enabled'], tweet.user._json['is_translator'], \n",
" tweet.user._json['is_translation_enabled'], tweet.user._json['has_extended_profile'],\n",
" tweet.user._json['default_profile'], tweet.user._json['default_profile_image'], \n",
" tweet.user._json['following'], tweet.user._json['follow_request_sent'], \n",
" tweet.user._json['notifications'], tweet.user._json['translator_type']\n",
" ])\n",
" \n",
" csv_writer_tweets.writerow([\n",
" tweet._json['created_at'], tweet._json['id'], tweet._json['id_str'], \n",
" tweet.user._json['id'], tweet.user._json['screen_name'], str(tweet.user.created_at),\n",
" tweet._json['text'], tweet._json['truncated'], tweet._json['in_reply_to_status_id'],\n",
" tweet._json['in_reply_to_status_id_str'], tweet._json['in_reply_to_user_id'],\n",
" tweet._json['in_reply_to_user_id_str'], tweet._json['in_reply_to_screen_name'], \n",
" tweet._json['geo'], tweet._json['coordinates'], tweet._json['place'],\n",
" tweet._json['contributors'], tweet._json['is_quote_status'], \n",
" tweet._json['retweet_count'], tweet._json['favorite_count'], \n",
" tweet._json['favorited'], tweet._json['retweeted'], tweet._json['lang']])\n",
" if i > lim:\n",
" print(f\"{i:8<}\", flush=True, end=\" \")\n",
" lim += steps"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "enabling-manner",
"metadata": {},
"outputs": [],
"source": [
"df_tweets = pd.read_csv(outfile)\n",
"df_users = pd.read_csv(outusers)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "undefined-documentary",
"metadata": {},
"outputs": [],
"source": [
"df = df_tweets\n",
"df[\"RT\"] = df[\"text\"].map(lambda x: x.startswith(\"RT\"))\n",
"\n",
"x = df[df[\"RT\"]==False][\"text\"].map(\n",
" lambda x: re.sub(\"#\\S+\", \"\", x).replace('\\n', '')).map(\n",
" lambda x: re.sub(\"@\\w+\", \"\", x)).map(\n",
" lambda x: re.sub(\"https://t.co/\\S+\", \"\", x)).map(\n",
" lambda x:re.sub(\"\\s\", \"\", x)).map(\n",
" lambda x: x[:20])\n",
"\n",
"y = sorted(set(x))\n",
"\n",
"print(f\"uniq {len(y)}\\noriginal {len(df[df['RT']==False])}\\nall {len(df['RT'])}\")\n",
"print(\"accounts \", len(set(df[\"screen name\"])))\n",
"\n",
"df.tail(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "rubber-operation",
"metadata": {},
"outputs": [],
"source": [
"account_number = collections.Counter([x[:4] for x in set(df[\"account created at\"])])\n",
"tweet_number = collections.Counter(df[\"account created at\"].map(lambda x: x[:4]))\n",
"\n",
"\n",
"print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':15}\")\n",
"\n",
"for k in sorted(account_number.keys()):\n",
" print(f\"{k:5}, {account_number[k]:9}, {tweet_number[k]:8}, {tweet_number[k]/ account_number[k]:12.1f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "frequent-immunology",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "sticky-expression",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "infinite-shoot",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment