Skip to content

Instantly share code, notes, and snippets.

@TheRinger
Forked from NikosKoufos/Twitter Scraper.ipynb
Created December 19, 2019 15:14
Show Gist options
  • Save TheRinger/f4bc23240f0d4924f7ea83e99e444bcf to your computer and use it in GitHub Desktop.
Save TheRinger/f4bc23240f0d4924f7ea83e99e444bcf to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import time\n",
"from urllib.parse import quote_plus\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_tweets(browser, limit=None, max_stalled_rounds=5):\n",
"    \"\"\"Scroll the loaded search page until enough tweets are present.\n",
"\n",
"    Args:\n",
"        browser: Selenium WebDriver that has already opened the search page.\n",
"        limit: Minimum number of tweet elements to collect. When omitted,\n",
"            the notebook-level ``num_of_tweets`` variable is read instead.\n",
"        max_stalled_rounds: Stop after this many consecutive scroll rounds\n",
"            that bring no additional tweets.\n",
"\n",
"    Returns:\n",
"        The list of elements with class ``content`` found on the page. It\n",
"        can be longer than the requested number, or shorter when the page\n",
"        stops loading new tweets.\n",
"    \"\"\"\n",
"    target = num_of_tweets if limit is None else limit\n",
"    body = browser.find_element(By.TAG_NAME, 'body')\n",
"    tweets = []\n",
"    stalled_rounds = 0\n",
"    while len(tweets) < target and stalled_rounds < max_stalled_rounds:\n",
"        seen_before = len(tweets)\n",
"        for _ in range(5):\n",
"            body.send_keys(Keys.PAGE_DOWN)\n",
"            # Give the page time to load the next batch of tweets.\n",
"            time.sleep(1)\n",
"        tweets = browser.find_elements(By.CLASS_NAME, 'content')\n",
"        # Count rounds without progress so the loop ends when the page has\n",
"        # fewer tweets than requested instead of scrolling forever.\n",
"        if len(tweets) > seen_before:\n",
"            stalled_rounds = 0\n",
"        else:\n",
"            stalled_rounds += 1\n",
"\n",
"    return tweets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def _count_text(stats, index):\n",
"    \"\"\"Return the text of ``stats[index]``, or '0' if it is blank or missing.\"\"\"\n",
"    if index >= len(stats):\n",
"        return '0'\n",
"    return stats[index].text or '0'\n",
"\n",
"\n",
"def _joined_texts(elements, separator):\n",
"    \"\"\"Join the elements' texts with ``separator``; '-' when nothing is left.\"\"\"\n",
"    return separator.join(element.text for element in elements) or '-'\n",
"\n",
"\n",
"def parse_and_save(tweets, num_of_tweets, query):\n",
"    \"\"\"Write one CSV row per tweet to ``<query>_twitter_data.csv``.\n",
"\n",
"    Args:\n",
"        tweets: Tweet elements, e.g. the list returned by ``get_tweets``.\n",
"        num_of_tweets: Maximum number of rows to write.\n",
"        query: Search term; used as the prefix of the output file name.\n",
"    \"\"\"\n",
"    tags_separator = '|'\n",
"    header = [\"Username\", \"Text\", \"Reply\", \"Retweet\", \"Favourite\", \"RefUsers\", \"Hashtags\"]\n",
"\n",
"    # newline='' is what the csv module expects; UTF-8 keeps emoji and\n",
"    # non-Latin text that the platform default encoding could drop.\n",
"    with open(query + \"_twitter_data.csv\", 'w', newline='',\n",
"              encoding='utf-8', errors='ignore') as f:\n",
"        # csv.writer quotes any field containing a comma or a quote, so such\n",
"        # a value can no longer shift the columns of its row.\n",
"        writer = csv.writer(f, lineterminator='\\n')\n",
"        writer.writerow(header)\n",
"\n",
"        for counter, tweet in enumerate(tweets):\n",
"            if counter == num_of_tweets:\n",
"                break\n",
"            user = tweet.find_element(By.CLASS_NAME, 'username').text\n",
"            # Keep each tweet on one line; commas are still replaced by\n",
"            # spaces so the Text column keeps its established form.\n",
"            text = tweet.find_element(By.CLASS_NAME, 'tweet-text').text\n",
"            text = text.replace('\\n', ' ').replace(',', ' ')\n",
"            stats = tweet.find_elements(By.CLASS_NAME, 'ProfileTweet-actionCountForPresentation')\n",
"\n",
"            # The favourite count is read from index 3, not 2 -- presumably\n",
"            # index 2 is a duplicate counter; TODO confirm against the markup.\n",
"            reply = _count_text(stats, 0)\n",
"            retweet = _count_text(stats, 1)\n",
"            favourite = _count_text(stats, 3)\n",
"\n",
"            ref_users = _joined_texts(\n",
"                tweet.find_elements(By.CLASS_NAME, 'twitter-atreply'), tags_separator)\n",
"            hashtags = _joined_texts(\n",
"                tweet.find_elements(By.CLASS_NAME, 'twitter-hashtag'), tags_separator)\n",
"\n",
"            writer.writerow([user, text, reply, retweet, favourite, ref_users, hashtags])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Open a Twitter search for `query`, collect the tweets and save them as CSV.\n",
"browser = webdriver.Chrome()\n",
"base_url = \"https://twitter.com/search?q=\"\n",
"query = \"Trump\"\n",
"# Percent-encode the term so spaces, '#' and '&' survive in the query string.\n",
"url = base_url + quote_plus(query)\n",
"num_of_tweets = 100\n",
"\n",
"try:\n",
"    browser.get(url)\n",
"    # Let the first results render before scrolling starts.\n",
"    time.sleep(1)\n",
"    tweets = get_tweets(browser)\n",
"    parse_and_save(tweets, num_of_tweets, query)\n",
"finally:\n",
"    # quit() instead of close(): it also ends the driver process, and the\n",
"    # finally clause runs it even when scraping raised.\n",
"    browser.quit()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment