Created
August 4, 2019 13:07
-
-
Save aussetg/d0dd149268c29fe8fffe9787ee3adfdc to your computer and use it in GitHub Desktop.
Spider to scrape Twitter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
import datetime | |
import json | |
from TwitterScraper.items import Tweet | |
from urllib.parse import quote | |
from bs4 import BeautifulSoup | |
from scrapy.http import HtmlResponse | |
from dateutil.parser import parse | |
# TODO: We should use urlencode to build urls, not weird string concatenation with quoting | |
class SearchSpider(scrapy.Spider):
    """Scrape tweets from Twitter's legacy search-timeline endpoint.

    Spider arguments (passed with ``scrapy crawl search -a name=value``):
        q:     search query string (defaults to "@BNPParibas").
        since: start date, "YYYY-MM-DD" (defaults to today).
        until: end date, "YYYY-MM-DD", inclusive (defaults to since + 1 day).
        tag:   free-form tag stored on the spider instance; not used in requests.

    One timeline URL is generated per day in [since, until]; each day is then
    paginated via the ``max_position`` cursor in :meth:`parse`.
    """

    name = "search"

    def __init__(self, q=None, since=None, until=None, tag=None, *args, **kwargs):
        super(SearchSpider, self).__init__(*args, **kwargs)
        # TODO: We should use urlencode to build urls, not weird string
        # concatenation with quoting.
        base_url = "https://twitter.com/i/search/timeline?f=tweets&q={}%20since%3A{}%20until%3A{}"
        if q is None:
            q = "@BNPParibas"
        if since is None:
            start = datetime.datetime.today()
        else:
            start = datetime.datetime.strptime(since, "%Y-%m-%d")
        if until is None:
            stop = start + datetime.timedelta(1)
        else:
            # +1 day so the 'until' date itself is covered by the search window.
            stop = datetime.datetime.strptime(until, "%Y-%m-%d") + datetime.timedelta(1)
        query = q
        # self.query mirrors what a user would type in the search box; the
        # actual request URLs are built per-day below from base_url.
        if since is None:
            self.query = query
        else:
            self.query = "{} since:{} until:{}".format(
                quote(query),
                datetime.datetime.strftime(start, "%Y-%m-%d"),
                datetime.datetime.strftime(stop, "%Y-%m-%d"))
        self.tag = tag
        numdays = (stop - start).days
        self.logger.info('numdays: {}'.format(numdays))
        # One URL per day; Twitter's timeline endpoint paginates each day
        # independently, so splitting by day keeps every cursor chain short.
        self.urls = [base_url.format(
                         quote(query),
                         datetime.datetime.strftime(start + datetime.timedelta(days=d), "%Y-%m-%d"),
                         datetime.datetime.strftime(start + datetime.timedelta(days=d + 1), "%Y-%m-%d"))
                     for d in range(0, numdays + 1)]
        self.logger.info('URLs for scrapping: {}'.format(self.urls))

    def start_requests(self):
        """Issue one request per daily search URL, remembering the cursor-less
        base URL in request.meta so parse() can append pagination cursors."""
        for url in self.urls:
            request = scrapy.Request(url=url, callback=self.parse)
            request.meta["base_url"] = url
            yield request

    @staticmethod
    def _extract_text(text_html):
        """Return the tweet text with emojis restored.

        Twitter serves emojis as <img> elements whose 'alt' attribute holds
        the actual unicode character (probably for compatibility), so a plain
        string() extraction would drop them.  Re-insert each alt as the img's
        text before flattening.
        """
        soup = BeautifulSoup(text_html, 'html.parser')
        for emoji in soup.find_all('img'):
            emoji.string = emoji.attrs['alt']
        return soup.text

    @staticmethod
    def _action_count(tweet, action_class):
        """Extract a reply/retweet/like counter for *tweet*; absent counters
        (no span rendered) mean zero."""
        value = tweet.css('button.ProfileTweet-actionButton.js-actionButton.' + action_class) \
            .css('span.ProfileTweet-actionCountForPresentation::text') \
            .extract_first()
        return 0 if value is None else int(value)

    def parse(self, response):
        """Parse one page of the JSON search timeline and yield Tweet items.

        Also schedules (a) a permalink request for every tweet that is part of
        a conversation, and (b) the next page of this day's timeline when the
        page is full (20 tweets).
        """
        jsonresponse = json.loads(response.body_as_unicode())
        base_url = response.meta["base_url"]
        self.logger.info("Parsing URL: {}".format(response.url))
        # The JSON payload wraps an HTML fragment; re-parse it with a Selector.
        html = scrapy.Selector(text=jsonresponse['items_html'], type="html")
        tweets = html.css('div.original-tweet')
        for tweet in tweets:
            # Aria labels list the three counters in a fixed order; each text
            # node starts with the number.
            replies, retweets, likes = [
                int(x[0]) for x in tweet.css('span.ProfileTweet-actionCountForAria::text').extract()]
            tweet_id = int(tweet.css('div.original-tweet::attr(data-tweet-id)').extract_first())
            user_id = int(tweet.css('div.original-tweet::attr(data-user-id)').extract_first())
            timestamp = datetime.datetime.fromtimestamp(
                int(tweet.css('span._timestamp::attr(data-time)').extract_first())).strftime('%Y-%m-%d %H:%M:%S')
            text_html = tweet.xpath("descendant-or-self::p[@class and contains(concat(' ', normalize-space(@class), "
                "' '), ' TweetTextSize ') and (@class and contains(concat(' ', normalize-space(@class), "
                "' '), ' js-tweet-text ')) and (@class and contains(concat(' ', normalize-space(@class), ' '),"
                " ' tweet-text '))]").extract_first()
            text = self._extract_text(text_html)
            # TODO: Fix cases where it's "text pic.twitter.com"; those are
            # currently flattened to "textpic.twitter.com".
            lang = tweet.css('p.tweet-text::attr(lang)').extract_first()
            # TODO: We would like the tweet we reply to; instead we get the
            # user.  Could follow the permalink, read the last
            # ThreadedConversation-tweet and take its data-item-id.
            replies_to = tweet.css(
                'div.ReplyingToContextBelowAuthor a.pretty-link.js-user-profile-link::attr(data-user-id)'
            ).extract_first()
            if replies_to is not None:
                replies_to = int(replies_to)
                # Only scrape permalinks of tweets known to be part of a
                # conversation, to limit the number of requests.  A database
                # presence check could further prevent over-scraping.
                # Fixed: domain was misspelled "twiter.com", sending every
                # permalink request to the wrong host.
                tweet_url = "https://twitter.com" + tweet.css(
                    'div.tweet.js-stream-tweet.js-actionable-tweet::attr(data-permalink-path)').extract_first()
                yield scrapy.Request(tweet_url, self.parse_single)
            retweet_of = tweet.css(
                'div.QuoteTweet-innerContainer.u-cf.js-permalink.js-media-container::attr(data-item-id)'
            ).extract_first()
            if retweet_of is not None:
                retweet_of = int(retweet_of)
            yield Tweet(tweet_id=tweet_id, user_id=user_id, timestamp=timestamp,
                        text=text, lang=lang, replies=replies, retweets=retweets, likes=likes,
                        replies_to=replies_to, retweet_of=retweet_of)
        # A full page (20 tweets) means more results may follow: build the
        # cursor from the first/last tweet ids and request the next page.
        if len(tweets) == 20:
            # NOTE(review): the first tweet on the page is bound to min_tweet
            # and the last to max_tweet; ordering preserved from the original,
            # confirm against the endpoint's expected cursor format.
            min_tweet = tweets[0].css('div.original-tweet::attr(data-tweet-id)').extract_first()
            max_tweet = tweets[-1].css('div.original-tweet::attr(data-tweet-id)').extract_first()
            cursor = "TWEET-{}-{}-" \
                "BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                .format(max_tweet, min_tweet)
            url = base_url + "&max_position={}".format(cursor)
            request = scrapy.Request(url, self.parse)
            request.meta["base_url"] = base_url
            yield request

    def _parse_thread_section(self, section, main, lang, initial_id):
        """Yield Tweet items for one conversation section (parents or replies).

        replies_to chains each tweet to the previous one in document order;
        the first tweet of the section points at *initial_id* (None for the
        parents above the permalink tweet, the permalink tweet's id for the
        replies below it).
        """
        current_id = initial_id
        for tweet in section.css('div.ThreadedConversation-tweet'):
            # NOTE(review): user_id is read from the permalink tweet ('main'),
            # so every thread tweet is attributed to the root author — looks
            # like a selector bug in the original; behavior preserved, confirm.
            user_id = int(main.css('a.account-group.js-account-group'
                                   '.js-action-profile.js-user-profile-link'
                                   '.js-nav::attr(data-user-id)').extract_first())
            tweet_id = int(tweet.css("div.tweet.js-stream-tweet::attr(data-item-id)").extract_first())
            timestamp = datetime.datetime.fromtimestamp(
                int(tweet.css("span._timestamp.js-short-timestamp::attr(data-time)").extract_first()))\
                .strftime('%Y-%m-%d %H:%M:%S')
            text = self._extract_text(
                tweet.css("p.TweetTextSize.js-tweet-text.tweet-text").extract_first())
            replies_to = current_id
            current_id = tweet_id
            yield Tweet(tweet_id=tweet_id, user_id=user_id, timestamp=timestamp,
                        text=text, lang=lang,
                        replies=self._action_count(tweet, 'js-actionReply'),
                        retweets=self._action_count(tweet, 'js-actionRetweet'),
                        likes=self._action_count(tweet, 'js-actionFavorite'),
                        replies_to=replies_to, retweet_of=None)

    def parse_single(self, response):
        """Parse a tweet permalink page: yield the parent tweets above and the
        reply tweets below the permalink tweet, chained via replies_to."""
        self.logger.info('Parsing a single tweet dependencies: {}'.format(response.url))
        html = scrapy.Selector(text=response.body_as_unicode(), type="html")
        # The permalink tweet provides infos common to the whole thread.
        main = html.css('div.permalink-inner.permalink-tweet-container')
        lang = main.css('p.tweet-text::attr(lang)').extract_first()
        initial_id = int(main.css('div.tweet.permalink-tweet.js-actionable-user.'
                                  'js-actionable-tweet.js-original-tweet::attr(data-tweet-id)').extract_first())
        parents = html.css('div.permalink-in-reply-tos')
        children = html.css('div.replies-to')
        # Parents above the permalink tweet: the first one has no known parent.
        for item in self._parse_thread_section(parents, main, lang, None):
            yield item
        # Replies below: the first reply answers the permalink tweet itself.
        for item in self._parse_thread_section(children, main, lang, initial_id):
            yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment