Skip to content

Instantly share code, notes, and snippets.

@aussetg
Created August 4, 2019 13:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aussetg/d0dd149268c29fe8fffe9787ee3adfdc to your computer and use it in GitHub Desktop.
Spider to scrape Twitter
# -*- coding: utf-8 -*-
import scrapy
import datetime
import json
from TwitterScraper.items import Tweet
from urllib.parse import quote
from bs4 import BeautifulSoup
from scrapy.http import HtmlResponse
from dateutil.parser import parse
# TODO: We should use urlencode to build urls, not weird string concatenation with quoting
class SearchSpider(scrapy.Spider):
    """Scrape tweets from Twitter's legacy search-timeline endpoint.

    One search URL is issued per day of the requested window; each response
    is a JSON envelope whose ``items_html`` field contains an HTML fragment
    with up to 20 tweets.  Full pages are paginated with a ``max_position``
    cursor.  Every tweet found also triggers a request for its permalink page
    (``parse_single``) so the parent and reply tweets of the conversation can
    be scraped too.

    Spider arguments (all optional, passed with ``-a name=value``):
        q:     search query string (default ``"@BNPParibas"``).
        since: first day to scrape, ``YYYY-MM-DD`` (default: today).
        until: last day to scrape, ``YYYY-MM-DD`` (default: the day after
               ``since``; Twitter's ``until:`` operator is exclusive, hence
               the extra day added below).
        tag:   free-form label stored on the spider instance.
    """

    name = "search"

    def __init__(self, q=None, since=None, until=None, tag=None, *args, **kwargs):
        super(SearchSpider, self).__init__(*args, **kwargs)
        # f=tweets selects the "Latest" tab; since/until narrow the window.
        base_url = "https://twitter.com/i/search/timeline?f=tweets&q={}%20since%3A{}%20until%3A{}"
        if q is None:
            q = "@BNPParibas"
        if since is None:
            start = datetime.datetime.today()
        else:
            start = datetime.datetime.strptime(since, "%Y-%m-%d")
        if until is None:
            stop = start + datetime.timedelta(1)
        else:
            # "until:" is exclusive on Twitter's side, so add one day to make
            # the command-line argument inclusive.
            stop = datetime.datetime.strptime(until, "%Y-%m-%d") + datetime.timedelta(1)
        query = q
        if since is None:
            self.query = query
        else:
            self.query = "{} since:{} until:{}".format(
                quote(query),
                datetime.datetime.strftime(start, "%Y-%m-%d"),
                datetime.datetime.strftime(stop, "%Y-%m-%d"))
        self.tag = tag
        numdays = (stop - start).days
        self.logger.info('numdays: {}'.format(numdays))
        # One URL per day keeps each result set small enough to paginate.
        self.urls = [
            base_url.format(
                quote(query),
                datetime.datetime.strftime(start + datetime.timedelta(days=d), "%Y-%m-%d"),
                datetime.datetime.strftime(start + datetime.timedelta(days=d + 1), "%Y-%m-%d"))
            for d in range(0, numdays + 1)]
        self.logger.info('URLs for scraping: {}'.format(self.urls))

    def start_requests(self):
        """Emit one request per daily search URL.

        The URL is stashed in ``meta["base_url"]`` so ``parse`` can append a
        pagination cursor to it later.
        """
        for url in self.urls:
            request = scrapy.Request(url=url, callback=self.parse)
            request.meta["base_url"] = url
            yield request

    @staticmethod
    def _restore_emojis(text_html):
        """Return the plain text of a tweet HTML fragment with emojis kept.

        Twitter renders emojis as ``<img>`` tags whose ``alt`` attribute holds
        the actual unicode character (probably for compatibility), so a naive
        text extraction would drop them.  We copy each ``alt`` back in as the
        tag's text before flattening.
        """
        soup = BeautifulSoup(text_html, 'html.parser')
        for emoji in soup.find_all('img'):
            emoji.string = emoji.attrs['alt']
        return soup.text

    @staticmethod
    def _action_count(tweet, action_class):
        """Return the integer shown on an action button, or 0 when absent.

        ``action_class`` is one of ``js-actionReply`` / ``js-actionRetweet`` /
        ``js-actionFavorite``; the count span is missing when the count is 0.
        """
        count = tweet.css(
            'button.ProfileTweet-actionButton.js-actionButton.' + action_class) \
            .css('span.ProfileTweet-actionCountForPresentation::text') \
            .extract_first()
        return 0 if count is None else int(count)

    def parse(self, response):
        """Parse one page of search results and chase conversations.

        Yields ``Tweet`` items plus follow-up requests: one per tweet
        permalink (handled by ``parse_single``) and, for full pages, one for
        the next cursor position.
        """
        jsonresponse = json.loads(response.body_as_unicode())
        base_url = response.meta["base_url"]
        self.logger.info("Parsing URL: {}".format(response.url))
        html = scrapy.Selector(text=jsonresponse['items_html'], type="html")
        tweets = html.css('div.original-tweet')
        for tweet in tweets:
            # Aria labels look like "42 replies": take the first whitespace-
            # separated token and drop thousands separators.  (BUG FIX: the
            # original used int(x[0]), i.e. only the first *character*, which
            # silently truncated any count of 10 or more.)
            replies, retweets, likes = [
                int(c.split()[0].replace(',', ''))
                for c in tweet.css('span.ProfileTweet-actionCountForAria::text').extract()]
            tweet_id = int(tweet.css('div.original-tweet::attr(data-tweet-id)').extract_first())
            user_id = int(tweet.css('div.original-tweet::attr(data-user-id)').extract_first())
            timestamp = datetime.datetime.fromtimestamp(
                int(tweet.css('span._timestamp::attr(data-time)').extract_first())
            ).strftime('%Y-%m-%d %H:%M:%S')
            # Match <p> elements carrying all three of the TweetTextSize,
            # js-tweet-text and tweet-text classes (class-token matching in
            # XPath 1.0), then flatten with the emojis restored.
            text_html = tweet.xpath(
                "descendant-or-self::p[@class and contains(concat(' ', normalize-space(@class), "
                "' '), ' TweetTextSize ') and (@class and contains(concat(' ', normalize-space(@class), "
                "' '), ' js-tweet-text ')) and (@class and contains(concat(' ', normalize-space(@class), ' '),"
                " ' tweet-text '))]").extract_first()
            text = self._restore_emojis(text_html)
            # FIXME: "text pic.twitter.com" comes out as "textpic.twitter.com".
            lang = tweet.css('p.tweet-text::attr(lang)').extract_first()
            # NOTE(review): this is the *user* being replied to, not the parent
            # tweet id; retrieving the latter requires following the permalink
            # (see parse_single, which walks the ThreadedConversation tweets).
            replies_to = tweet.css('div.ReplyingToContextBelowAuthor a.pretty-link.js-user-profile-link::attr(data-user-id)').extract_first()
            if replies_to is not None:
                replies_to = int(replies_to)
            # Follow the permalink to scrape the surrounding conversation.
            # We could check the database first to avoid over-scraping tweets
            # we already hold.
            # BUG FIX: the original concatenated "https://twiter.com" (missing
            # a 't'), sending every conversation request to the wrong domain.
            tweet_url = "https://twitter.com" + tweet.css(
                'div.tweet.js-stream-tweet.js-actionable-tweet::attr(data-permalink-path)').extract_first()
            yield scrapy.Request(tweet_url, self.parse_single)
            retweet_of = tweet.css('div.QuoteTweet-innerContainer.u-cf.js-permalink.js-media-container::attr(data-item-id)').extract_first()
            if retweet_of is not None:
                retweet_of = int(retweet_of)
            yield Tweet(tweet_id=tweet_id, user_id=user_id, timestamp=timestamp,
                        text=text, lang=lang, replies=replies, retweets=retweets,
                        likes=likes, replies_to=replies_to, retweet_of=retweet_of)
        # A full page holds 20 tweets; if we got one, request the next page by
        # appending a max_position cursor built from this page's extreme ids.
        if len(tweets) == 20:
            min_tweet = tweets[0].css('div.original-tweet::attr(data-tweet-id)').extract_first()
            max_tweet = tweets[-1].css('div.original-tweet::attr(data-tweet-id)').extract_first()
            cursor = "TWEET-{}-{}-" \
                     "BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                     "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                     "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                .format(max_tweet, min_tweet)
            url = base_url + "&max_position={}".format(cursor)
            request = scrapy.Request(url, self.parse)
            request.meta["base_url"] = base_url
            yield request

    def _parse_thread(self, thread_tweets, user_id, lang, previous_id):
        """Yield a ``Tweet`` per conversation tweet, chaining ``replies_to``.

        Each tweet is recorded as replying to the one yielded just before it;
        ``previous_id`` seeds the chain (``None`` for the parents above the
        permalink tweet, the permalink tweet's id for the replies below it).

        NOTE(review): ``user_id`` is the permalink tweet's author, applied to
        every tweet in the thread — preserved from the original code, but it
        looks wrong for threads with multiple participants; verify.
        """
        for tweet in thread_tweets:
            tweet_id = int(tweet.css("div.tweet.js-stream-tweet::attr(data-item-id)").extract_first())
            timestamp = datetime.datetime.fromtimestamp(
                int(tweet.css("span._timestamp.js-short-timestamp::attr(data-time)").extract_first())
            ).strftime('%Y-%m-%d %H:%M:%S')
            text = self._restore_emojis(
                tweet.css("p.TweetTextSize.js-tweet-text.tweet-text").extract_first())
            yield Tweet(tweet_id=tweet_id, user_id=user_id, timestamp=timestamp,
                        text=text, lang=lang,
                        replies=self._action_count(tweet, 'js-actionReply'),
                        retweets=self._action_count(tweet, 'js-actionRetweet'),
                        likes=self._action_count(tweet, 'js-actionFavorite'),
                        replies_to=previous_id,
                        retweet_of=None)
            previous_id = tweet_id

    def parse_single(self, response):
        """Parse a tweet permalink page for its conversation tweets.

        Yields ``Tweet`` items for the parent tweets above the permalink
        tweet and the reply tweets below it (the permalink tweet itself was
        already yielded by ``parse``).
        """
        self.logger.info('Parsing a single tweet dependencies: {}'.format(response.url))
        html = scrapy.Selector(text=response.body_as_unicode(), type="html")
        # Some infos are common to every tweet on the page.
        main = html.css('div.permalink-inner.permalink-tweet-container')
        lang = main.css('p.tweet-text::attr(lang)').extract_first()
        initial_id = int(main.css('div.tweet.permalink-tweet.js-actionable-user.'
                                  'js-actionable-tweet.js-original-tweet::attr(data-tweet-id)').extract_first())
        # Loop-invariant: the same main-container lookup was repeated for
        # every thread tweet in the original; hoist it.
        user_id = int(main.css('a.account-group.js-account-group'
                               '.js-action-profile.js-user-profile-link'
                               '.js-nav::attr(data-user-id)').extract_first())
        parents = html.css('div.permalink-in-reply-tos')
        children = html.css('div.replies-to')
        # Parents first (chain starts unattached), then the replies (chain
        # starts at the permalink tweet).
        for item in self._parse_thread(parents.css('div.ThreadedConversation-tweet'),
                                       user_id, lang, None):
            yield item
        for item in self._parse_thread(children.css('div.ThreadedConversation-tweet'),
                                       user_id, lang, initial_id):
            yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment