Skip to content

Instantly share code, notes, and snippets.

@chi-feng
Created January 27, 2021 07:54
Show Gist options
  • Save chi-feng/cfeecf0329f473559439c21d88898a5f to your computer and use it in GitHub Desktop.
Save chi-feng/cfeecf0329f473559439c21d88898a5f to your computer and use it in GitHub Desktop.
Twitter reply scraper using Tweepy
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import logging\n",
"import json\n",
"\n",
"import pandas as pd\n",
"\n",
"import tweepy\n",
"from tqdm.notebook import tqdm\n",
"from pprint import pprint\n",
"\n",
"from credentials import consumer_key, consumer_secret, access_token, access_token_secret"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_api():\n",
"    \"\"\"Return a tweepy API client authenticated with the imported credentials.\"\"\"\n",
"    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
"    handler.set_access_token(access_token, access_token_secret)\n",
"    return tweepy.API(handler)\n",
"\n",
"\n",
"api = get_api()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Screen name of the account whose replies are searched (used in the `to:` query).\n",
"name = \"marclamonthill\"\n",
"\n",
"# ID of the status whose replies are collected.\n",
"# Swap in another ID (e.g. \"1352680581809790977\") to scrape a different status.\n",
"status_id_str = \"1352680218520137728\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
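"## Method 1: Run until StopIteration\n",
"\n",
"Iterate over the search results until the cursor is exhausted, keeping only the tweets that reply to the target status. A rate-limit error pauses the loop for 60 seconds before it retries."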
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"scraped 100, found 0\n",
"scraped 200, found 1\n",
"scraped 300, found 1\n",
"scraped 400, found 2\n",
"scraped 500, found 3\n",
"scraped 600, found 4\n",
"scraped 700, found 5\n",
"scraped 800, found 6\n",
"scraped 900, found 12\n",
"scraped 1000, found 13\n",
"scraped 1100, found 14\n",
"scraped 1200, found 15\n",
"scraped 1300, found 16\n",
"scraped 1400, found 17\n",
"scraped 1500, found 21\n",
"scraped 1600, found 27\n",
"scraped 1700, found 33\n",
"scraped 1800, found 36\n",
"scraped 1900, found 38\n",
"scraped 2000, found 43\n"
]
}
],
"source": [
"# Log progress and matched reply IDs to '<name>.log'.\n",
"logger = logging.getLogger(f\"{name}_{status_id_str}\")\n",
"logger.setLevel(logging.DEBUG)\n",
"# logging.getLogger returns the same logger object for the same name, so only\n",
"# attach the file handler once; otherwise every re-run of this cell adds another\n",
"# handler and each message is written to the log file multiple times.\n",
"if not logger.handlers:\n",
"    fh = logging.FileHandler('%s.log' % name)\n",
"    fh.setLevel(logging.INFO)\n",
"    formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')\n",
"    fh.setFormatter(formatter)\n",
"    logger.addHandler(fh)\n",
"\n",
"replies = []\n",
"\n",
"# Recent tweets addressed to the account, restricted to IDs above the target status.\n",
"cursor = tweepy.Cursor(\n",
"    api.search,\n",
"    q=f\"to:{name}\",\n",
"    result_type=\"recent\",\n",
"    since_id=status_id_str,\n",
"    tweet_mode='extended',\n",
"    count=100\n",
").items()\n",
"\n",
"count = 0\n",
"while True:\n",
"    try:\n",
"        reply = next(cursor)\n",
"        count += 1\n",
"        # Report progress every 100 tweets examined.\n",
"        if count % 100 == 0:\n",
"            print(f\"scraped {count}, found {len(replies)}\")\n",
"            logger.info(f\"scraped {count}, found {len(replies)} \")\n",
"        if not hasattr(reply, 'in_reply_to_status_id_str'):\n",
"            continue\n",
"        # Keep only direct replies to the target status.\n",
"        if reply.in_reply_to_status_id_str == status_id_str:\n",
"            replies.append(reply)\n",
"            logger.info(reply.id_str)\n",
"\n",
"    except tweepy.RateLimitError:\n",
"        # Pause, then ask the same cursor again on the next loop iteration.\n",
"        logger.error(\"Twitter api rate limit reached\")\n",
"        time.sleep(60)\n",
"        continue\n",
"\n",
"    except tweepy.TweepError as e:\n",
"        logger.error(f\"Tweepy error occured: {e}\")\n",
"        break\n",
"\n",
"    except StopIteration:\n",
"        # The cursor is exhausted: there are no more results to examine.\n",
"        break\n",
"\n",
"    except Exception as e:\n",
"        logger.error(f\"Failed while fetching replies {e}\")\n",
"        break"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
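"## Method 2: Setting an explicit limit\n",
"\n",
"Examine at most `limit` search results, with a progress bar, and keep the tweets that reply to the target status."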
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "48f8662001e0407898867abf5e31663f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3000.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
".................................................\n",
"found 49 replies\n"
]
}
],
"source": [
"limit = 3000  # maximum number of search results to examine\n",
"api = get_api()\n",
"replies = []\n",
"\n",
"search_results = tweepy.Cursor(\n",
"    api.search,\n",
"    q=f\"to:{name}\",\n",
"    result_type=\"recent\",\n",
"    # A reply is always newer than the status it answers, so stop paging once the\n",
"    # results reach the target status (the same filter Method 1 uses).\n",
"    since_id=status_id_str,\n",
"    tweet_mode=\"extended\",\n",
"    count=100,\n",
"    # NOTE(review): this is forwarded to the search call together with the other\n",
"    # keyword arguments; confirm it has the intended effect on request timeouts.\n",
"    timeout=999999\n",
").items(limit)\n",
"\n",
"for tweet in tqdm(search_results, total=limit):\n",
"    # getattr with a default also covers tweets that lack the attribute entirely.\n",
"    if getattr(tweet, \"in_reply_to_status_id_str\", None) == status_id_str:\n",
"        replies.append(tweet)\n",
"        print(\".\", end=\"\")\n",
"\n",
"print(f\"found {len(replies)} replies\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Write replies to files"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Raw payload stored on each collected status object (`_json`).\n",
"replies_raw = [reply._json for reply in replies]\n",
"with open(f\"{name}_{status_id_str}_raw.json\", \"w\") as outfile:\n",
"    json.dump(replies_raw, outfile, indent=2)\n",
"\n",
"# Flattened summary: one record per reply with selected tweet and author fields.\n",
"replies_list = [dict(\n",
"    id_str=reply[\"id_str\"],\n",
"    full_text=reply[\"full_text\"],\n",
"    retweet_count=reply[\"retweet_count\"],\n",
"    favorite_count=reply[\"favorite_count\"],\n",
"    name=reply[\"user\"][\"name\"],\n",
"    screen_name=reply[\"user\"][\"screen_name\"],\n",
"    followers_count=reply[\"user\"][\"followers_count\"],\n",
"    profile_image_url_https=reply[\"user\"][\"profile_image_url_https\"]\n",
") for reply in replies_raw]\n",
"\n",
"with open(f\"{name}_{status_id_str}.json\", \"w\") as outfile:\n",
"    json.dump(replies_list, outfile, indent=2)\n",
"\n",
"# Build the CSV straight from the in-memory records. Re-reading the JSON file with\n",
"# pd.read_json would re-infer column types (e.g. turn the id_str strings into\n",
"# integers), so the records are handed to pandas as they are.\n",
"data = pd.DataFrame(replies_list)\n",
"data.to_csv(f\"{name}_{status_id_str}.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment