chi-feng/twitter_replies.ipynb

## twitter_replies.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import logging\n",
    "import json\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "import tweepy\n",
    "from tqdm.notebook import tqdm\n",
    "from pprint import pprint\n",
    "\n",
    "from credentials import consumer_key, consumer_secret, access_token, access_token_secret"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_api():\n",
    "    \"\"\"Build a tweepy API client from the imported credentials.\n",
    "\n",
    "    Returns:\n",
    "        A ``tweepy.API`` object wrapping an ``OAuthHandler`` configured with the\n",
    "        consumer key/secret and the access token/secret.\n",
    "    \"\"\"\n",
    "    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
    "    handler.set_access_token(access_token, access_token_secret)\n",
    "    return tweepy.API(handler)\n",
    "\n",
    "\n",
    "# Module-level client used by the search cells.\n",
    "api = get_api()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Screen name used in the `to:` search query.\n",
    "name = \"marclamonthill\"\n",
    "# ID of the tweet whose replies are collected.\n",
    "# A second ID, \"1352680581809790977\", used to be assigned first and was then\n",
    "# immediately overwritten; it is kept here only for reference.\n",
    "status_id_str = \"1352680218520137728\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Method 1: Run until StopIteration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scraped 100, found 0\n",
      "scraped 200, found 1\n",
      "scraped 300, found 1\n",
      "scraped 400, found 2\n",
      "scraped 500, found 3\n",
      "scraped 600, found 4\n",
      "scraped 700, found 5\n",
      "scraped 800, found 6\n",
      "scraped 900, found 12\n",
      "scraped 1000, found 13\n",
      "scraped 1100, found 14\n",
      "scraped 1200, found 15\n",
      "scraped 1300, found 16\n",
      "scraped 1400, found 17\n",
      "scraped 1500, found 21\n",
      "scraped 1600, found 27\n",
      "scraped 1700, found 33\n",
      "scraped 1800, found 36\n",
      "scraped 1900, found 38\n",
      "scraped 2000, found 43\n"
     ]
    }
   ],
   "source": [
    "# Log progress and the IDs of matched replies to '<name>.log'.\n",
    "logger = logging.getLogger(f\"{name}_{status_id_str}\")\n",
    "logger.setLevel(logging.DEBUG)\n",
    "# logging.getLogger returns the same logger object when this cell is re-run, so\n",
    "# attach the file handler only once; otherwise every message is written repeatedly.\n",
    "if not logger.handlers:\n",
    "    fh = logging.FileHandler('%s.log' % name)\n",
    "    fh.setLevel(logging.INFO)\n",
    "    formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')\n",
    "    fh.setFormatter(formatter)\n",
    "    logger.addHandler(fh)\n",
    "\n",
    "replies = []\n",
    "\n",
    "# Iterate over recent tweets addressed to the account, restricted by since_id to\n",
    "# tweets newer than the target tweet.\n",
    "cursor = tweepy.Cursor(\n",
    "    api.search,\n",
    "    q=f\"to:{name}\",\n",
    "    result_type=\"recent\",\n",
    "    since_id=status_id_str,\n",
    "    tweet_mode='extended',\n",
    "    count=100\n",
    ").items()\n",
    "\n",
    "count = 0\n",
    "while True:\n",
    "    try:\n",
    "        reply = next(cursor)\n",
    "        count += 1\n",
    "        # Progress is reported before the current tweet is examined.\n",
    "        if count % 100 == 0:\n",
    "            print(f\"scraped {count}, found {len(replies)}\")\n",
    "            logger.info(f\"scraped {count}, found {len(replies)} \")\n",
    "        # Keep only direct replies to the target tweet; tweets without the\n",
    "        # attribute compare as None and are skipped.\n",
    "        if getattr(reply, 'in_reply_to_status_id_str', None) == status_id_str:\n",
    "            replies.append(reply)\n",
    "            logger.info(reply.id_str)\n",
    "\n",
    "    except tweepy.RateLimitError:\n",
    "        # Back off for a minute, then try fetching again.\n",
    "        logger.error(\"Twitter api rate limit reached\")\n",
    "        time.sleep(60)\n",
    "        continue\n",
    "\n",
    "    except tweepy.TweepError as e:\n",
    "        logger.error(f\"Tweepy error occured: {e}\")\n",
    "        break\n",
    "\n",
    "    except StopIteration:\n",
    "        # The cursor is exhausted: every available result has been examined.\n",
    "        break\n",
    "\n",
    "    except Exception as e:\n",
    "        logger.error(f\"Failed while fetching replies {e}\")\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Method 2: Setting explicit limit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "48f8662001e0407898867abf5e31663f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3000.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ".................................................\n",
      "found 49 replies\n"
     ]
    }
   ],
   "source": [
    "# Upper bound on the number of search results examined.\n",
    "limit = 3000\n",
    "api = get_api()\n",
    "replies = []\n",
    "\n",
    "# since_id mirrors Method 1: tweets older than the target tweet cannot be replies\n",
    "# to it, so they should not consume the `limit` budget.\n",
    "search_results = tweepy.Cursor(\n",
    "    api.search,\n",
    "    q=f\"to:{name}\",\n",
    "    result_type=\"recent\",\n",
    "    since_id=status_id_str,\n",
    "    tweet_mode=\"extended\",\n",
    "    count=100,\n",
    "    # NOTE(review): this is forwarded to api.search as a keyword argument;\n",
    "    # presumably it was meant as an HTTP timeout -- confirm tweepy honors it here\n",
    "    # rather than only on tweepy.API(timeout=...).\n",
    "    timeout=999999\n",
    ").items(limit)\n",
    "\n",
    "for tweet in tqdm(search_results, total=limit):\n",
    "    # Keep only direct replies to the target tweet; print one dot per match.\n",
    "    if getattr(tweet, \"in_reply_to_status_id_str\", None) == status_id_str:\n",
    "        replies.append(tweet)\n",
    "        print(\".\", end=\"\")\n",
    "\n",
    "print(f\"found {len(replies)} replies\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write replies to files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Full payload of every collected reply, as stored on the tweepy objects.\n",
    "replies_raw = [reply._json for reply in replies]\n",
    "with open(f\"{name}_{status_id_str}_raw.json\", \"w\") as outfile:\n",
    "    json.dump(replies_raw, outfile, indent=2)\n",
    "\n",
    "# Trimmed records: tweet text, engagement counts and basic author fields.\n",
    "replies_list = [\n",
    "    {\n",
    "        \"id_str\": reply[\"id_str\"],\n",
    "        \"full_text\": reply[\"full_text\"],\n",
    "        \"retweet_count\": reply[\"retweet_count\"],\n",
    "        \"favorite_count\": reply[\"favorite_count\"],\n",
    "        \"name\": reply[\"user\"][\"name\"],\n",
    "        \"screen_name\": reply[\"user\"][\"screen_name\"],\n",
    "        \"followers_count\": reply[\"user\"][\"followers_count\"],\n",
    "        \"profile_image_url_https\": reply[\"user\"][\"profile_image_url_https\"],\n",
    "    }\n",
    "    for reply in replies_raw\n",
    "]\n",
    "\n",
    "with open(f\"{name}_{status_id_str}.json\", \"w\") as outfile:\n",
    "    json.dump(replies_list, outfile, indent=2)\n",
    "\n",
    "# Build the CSV from the in-memory records instead of re-reading the JSON file:\n",
    "# pd.read_json infers dtypes and may convert the all-digit id_str strings to\n",
    "# numbers, which risks altering 19-digit tweet IDs.\n",
    "data = pd.DataFrame(replies_list)\n",
    "data.to_csv(f\"{name}_{status_id_str}.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"import time\n",
	"import logging\n",
	"import json\n",
	"\n",
	"import pandas as pd\n",
	"\n",
	"import tweepy\n",
	"from tqdm.notebook import tqdm\n",
	"from pprint import pprint\n",
	"\n",
	"from credentials import consumer_key, consumer_secret, access_token, access_token_secret"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_api():\n",
	"    \"\"\"Build a tweepy API client from the imported credentials.\n",
	"\n",
	"    Returns:\n",
	"        A ``tweepy.API`` object wrapping an ``OAuthHandler`` configured with the\n",
	"        consumer key/secret and the access token/secret.\n",
	"    \"\"\"\n",
	"    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
	"    handler.set_access_token(access_token, access_token_secret)\n",
	"    return tweepy.API(handler)\n",
	"\n",
	"\n",
	"# Module-level client used by the search cells.\n",
	"api = get_api()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Screen name used in the `to:` search query.\n",
	"name = \"marclamonthill\"\n",
	"# ID of the tweet whose replies are collected.\n",
	"# A second ID, \"1352680581809790977\", used to be assigned first and was then\n",
	"# immediately overwritten; it is kept here only for reference.\n",
	"status_id_str = \"1352680218520137728\""
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Method 1: Run until StopIteration"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"scraped 100, found 0\n",
	"scraped 200, found 1\n",
	"scraped 300, found 1\n",
	"scraped 400, found 2\n",
	"scraped 500, found 3\n",
	"scraped 600, found 4\n",
	"scraped 700, found 5\n",
	"scraped 800, found 6\n",
	"scraped 900, found 12\n",
	"scraped 1000, found 13\n",
	"scraped 1100, found 14\n",
	"scraped 1200, found 15\n",
	"scraped 1300, found 16\n",
	"scraped 1400, found 17\n",
	"scraped 1500, found 21\n",
	"scraped 1600, found 27\n",
	"scraped 1700, found 33\n",
	"scraped 1800, found 36\n",
	"scraped 1900, found 38\n",
	"scraped 2000, found 43\n"
	]
	}
	],
	"source": [
	"# Log progress and the IDs of matched replies to '<name>.log'.\n",
	"logger = logging.getLogger(f\"{name}_{status_id_str}\")\n",
	"logger.setLevel(logging.DEBUG)\n",
	"# logging.getLogger returns the same logger object when this cell is re-run, so\n",
	"# attach the file handler only once; otherwise every message is written repeatedly.\n",
	"if not logger.handlers:\n",
	"    fh = logging.FileHandler('%s.log' % name)\n",
	"    fh.setLevel(logging.INFO)\n",
	"    formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')\n",
	"    fh.setFormatter(formatter)\n",
	"    logger.addHandler(fh)\n",
	"\n",
	"replies = []\n",
	"\n",
	"# Iterate over recent tweets addressed to the account, restricted by since_id to\n",
	"# tweets newer than the target tweet.\n",
	"cursor = tweepy.Cursor(\n",
	"    api.search,\n",
	"    q=f\"to:{name}\",\n",
	"    result_type=\"recent\",\n",
	"    since_id=status_id_str,\n",
	"    tweet_mode='extended',\n",
	"    count=100\n",
	").items()\n",
	"\n",
	"count = 0\n",
	"while True:\n",
	"    try:\n",
	"        reply = next(cursor)\n",
	"        count += 1\n",
	"        # Progress is reported before the current tweet is examined.\n",
	"        if count % 100 == 0:\n",
	"            print(f\"scraped {count}, found {len(replies)}\")\n",
	"            logger.info(f\"scraped {count}, found {len(replies)} \")\n",
	"        # Keep only direct replies to the target tweet; tweets without the\n",
	"        # attribute compare as None and are skipped.\n",
	"        if getattr(reply, 'in_reply_to_status_id_str', None) == status_id_str:\n",
	"            replies.append(reply)\n",
	"            logger.info(reply.id_str)\n",
	"\n",
	"    except tweepy.RateLimitError:\n",
	"        # Back off for a minute, then try fetching again.\n",
	"        logger.error(\"Twitter api rate limit reached\")\n",
	"        time.sleep(60)\n",
	"        continue\n",
	"\n",
	"    except tweepy.TweepError as e:\n",
	"        logger.error(f\"Tweepy error occured: {e}\")\n",
	"        break\n",
	"\n",
	"    except StopIteration:\n",
	"        # The cursor is exhausted: every available result has been examined.\n",
	"        break\n",
	"\n",
	"    except Exception as e:\n",
	"        logger.error(f\"Failed while fetching replies {e}\")\n",
	"        break"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Method 2: Setting explicit limit"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "48f8662001e0407898867abf5e31663f",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3000.0), HTML(value='')))"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	".................................................\n",
	"found 49 replies\n"
	]
	}
	],
	"source": [
	"# Upper bound on the number of search results examined.\n",
	"limit = 3000\n",
	"api = get_api()\n",
	"replies = []\n",
	"\n",
	"# since_id mirrors Method 1: tweets older than the target tweet cannot be replies\n",
	"# to it, so they should not consume the `limit` budget.\n",
	"search_results = tweepy.Cursor(\n",
	"    api.search,\n",
	"    q=f\"to:{name}\",\n",
	"    result_type=\"recent\",\n",
	"    since_id=status_id_str,\n",
	"    tweet_mode=\"extended\",\n",
	"    count=100,\n",
	"    # NOTE(review): this is forwarded to api.search as a keyword argument;\n",
	"    # presumably it was meant as an HTTP timeout -- confirm tweepy honors it here\n",
	"    # rather than only on tweepy.API(timeout=...).\n",
	"    timeout=999999\n",
	").items(limit)\n",
	"\n",
	"for tweet in tqdm(search_results, total=limit):\n",
	"    # Keep only direct replies to the target tweet; print one dot per match.\n",
	"    if getattr(tweet, \"in_reply_to_status_id_str\", None) == status_id_str:\n",
	"        replies.append(tweet)\n",
	"        print(\".\", end=\"\")\n",
	"\n",
	"print(f\"found {len(replies)} replies\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Write replies to files"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Full payload of every collected reply, as stored on the tweepy objects.\n",
	"replies_raw = [reply._json for reply in replies]\n",
	"with open(f\"{name}_{status_id_str}_raw.json\", \"w\") as outfile:\n",
	"    json.dump(replies_raw, outfile, indent=2)\n",
	"\n",
	"# Trimmed records: tweet text, engagement counts and basic author fields.\n",
	"replies_list = [\n",
	"    {\n",
	"        \"id_str\": reply[\"id_str\"],\n",
	"        \"full_text\": reply[\"full_text\"],\n",
	"        \"retweet_count\": reply[\"retweet_count\"],\n",
	"        \"favorite_count\": reply[\"favorite_count\"],\n",
	"        \"name\": reply[\"user\"][\"name\"],\n",
	"        \"screen_name\": reply[\"user\"][\"screen_name\"],\n",
	"        \"followers_count\": reply[\"user\"][\"followers_count\"],\n",
	"        \"profile_image_url_https\": reply[\"user\"][\"profile_image_url_https\"],\n",
	"    }\n",
	"    for reply in replies_raw\n",
	"]\n",
	"\n",
	"with open(f\"{name}_{status_id_str}.json\", \"w\") as outfile:\n",
	"    json.dump(replies_list, outfile, indent=2)\n",
	"\n",
	"# Build the CSV from the in-memory records instead of re-reading the JSON file:\n",
	"# pd.read_json infers dtypes and may convert the all-digit id_str strings to\n",
	"# numbers, which risks altering 19-digit tweet IDs.\n",
	"data = pd.DataFrame(replies_list)\n",
	"data.to_csv(f\"{name}_{status_id_str}.csv\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}