Created
August 4, 2019 13:07
-
-
Save aussetg/d0dd149268c29fe8fffe9787ee3adfdc to your computer and use it in GitHub Desktop.
Spider to scrape Twitter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
import datetime | |
import json | |
from TwitterScraper.items import Tweet | |
from urllib.parse import quote | |
from bs4 import BeautifulSoup | |
from scrapy.http import HtmlResponse | |
from dateutil.parser import parse | |
# TODO: We should use urlencode to build urls, not weird string concatenation with quoting | |
class SearchSpider(scrapy.Spider):
    """Scrape tweets from Twitter's legacy search-timeline endpoint.

    Spider arguments (passed with ``scrapy crawl search -a name=value``):
        q:     search query string (defaults to "@BNPParibas").
        since: start date, "YYYY-MM-DD" (defaults to today).
        until: end date, "YYYY-MM-DD", inclusive (defaults to since + 1 day).
        tag:   free-form tag stored on the spider instance; not used in requests.

    One timeline URL is generated per day in [since, until]; each day is then
    paginated via the ``max_position`` cursor in :meth:`parse`.
    """

    name = "search"

    def __init__(self, q=None, since=None, until=None, tag=None, *args, **kwargs):
        super(SearchSpider, self).__init__(*args, **kwargs)
        # TODO: We should use urlencode to build urls, not weird string
        # concatenation with quoting.
        base_url = "https://twitter.com/i/search/timeline?f=tweets&q={}%20since%3A{}%20until%3A{}"
        if q is None:
            q = "@BNPParibas"
        if since is None:
            start = datetime.datetime.today()
        else:
            start = datetime.datetime.strptime(since, "%Y-%m-%d")
        if until is None:
            stop = start + datetime.timedelta(1)
        else:
            # +1 day so the 'until' date itself is covered by the search window.
            stop = datetime.datetime.strptime(until, "%Y-%m-%d") + datetime.timedelta(1)
        query = q
        # self.query mirrors what a user would type in the search box; the
        # actual request URLs are built per-day below from base_url.
        if since is None:
            self.query = query
        else:
            self.query = "{} since:{} until:{}".format(
                quote(query),
                datetime.datetime.strftime(start, "%Y-%m-%d"),
                datetime.datetime.strftime(stop, "%Y-%m-%d"))
        self.tag = tag
        numdays = (stop - start).days
        self.logger.info('numdays: {}'.format(numdays))
        # One URL per day; Twitter's timeline endpoint paginates each day
        # independently, so splitting by day keeps every cursor chain short.
        self.urls = [base_url.format(
                         quote(query),
                         datetime.datetime.strftime(start + datetime.timedelta(days=d), "%Y-%m-%d"),
                         datetime.datetime.strftime(start + datetime.timedelta(days=d + 1), "%Y-%m-%d"))
                     for d in range(0, numdays + 1)]
        self.logger.info('URLs for scrapping: {}'.format(self.urls))

    def start_requests(self):
        """Issue one request per daily search URL, remembering the cursor-less
        base URL in request.meta so parse() can append pagination cursors."""
        for url in self.urls:
            request = scrapy.Request(url=url, callback=self.parse)
            request.meta["base_url"] = url
            yield request

    @staticmethod
    def _extract_text(text_html):
        """Return the tweet text with emojis restored.

        Twitter serves emojis as <img> elements whose 'alt' attribute holds
        the actual unicode character (probably for compatibility), so a plain
        string() extraction would drop them.  Re-insert each alt as the img's
        text before flattening.
        """
        soup = BeautifulSoup(text_html, 'html.parser')
        for emoji in soup.find_all('img'):
            emoji.string = emoji.attrs['alt']
        return soup.text

    @staticmethod
    def _action_count(tweet, action_class):
        """Extract a reply/retweet/like counter for *tweet*; absent counters
        (no span rendered) mean zero."""
        value = tweet.css('button.ProfileTweet-actionButton.js-actionButton.' + action_class) \
            .css('span.ProfileTweet-actionCountForPresentation::text') \
            .extract_first()
        return 0 if value is None else int(value)

    def parse(self, response):
        """Parse one page of the JSON search timeline and yield Tweet items.

        Also schedules (a) a permalink request for every tweet that is part of
        a conversation, and (b) the next page of this day's timeline when the
        page is full (20 tweets).
        """
        jsonresponse = json.loads(response.body_as_unicode())
        base_url = response.meta["base_url"]
        self.logger.info("Parsing URL: {}".format(response.url))
        # The JSON payload wraps an HTML fragment; re-parse it with a Selector.
        html = scrapy.Selector(text=jsonresponse['items_html'], type="html")
        tweets = html.css('div.original-tweet')
        for tweet in tweets:
            # Aria labels list the three counters in a fixed order; each text
            # node starts with the number.
            replies, retweets, likes = [
                int(x[0]) for x in tweet.css('span.ProfileTweet-actionCountForAria::text').extract()]
            tweet_id = int(tweet.css('div.original-tweet::attr(data-tweet-id)').extract_first())
            user_id = int(tweet.css('div.original-tweet::attr(data-user-id)').extract_first())
            timestamp = datetime.datetime.fromtimestamp(
                int(tweet.css('span._timestamp::attr(data-time)').extract_first())).strftime('%Y-%m-%d %H:%M:%S')
            text_html = tweet.xpath("descendant-or-self::p[@class and contains(concat(' ', normalize-space(@class), "
                "' '), ' TweetTextSize ') and (@class and contains(concat(' ', normalize-space(@class), "
                "' '), ' js-tweet-text ')) and (@class and contains(concat(' ', normalize-space(@class), ' '),"
                " ' tweet-text '))]").extract_first()
            text = self._extract_text(text_html)
            # TODO: Fix cases where it's "text pic.twitter.com"; those are
            # currently flattened to "textpic.twitter.com".
            lang = tweet.css('p.tweet-text::attr(lang)').extract_first()
            # TODO: We would like the tweet we reply to; instead we get the
            # user.  Could follow the permalink, read the last
            # ThreadedConversation-tweet and take its data-item-id.
            replies_to = tweet.css(
                'div.ReplyingToContextBelowAuthor a.pretty-link.js-user-profile-link::attr(data-user-id)'
            ).extract_first()
            if replies_to is not None:
                replies_to = int(replies_to)
                # Only scrape permalinks of tweets known to be part of a
                # conversation, to limit the number of requests.  A database
                # presence check could further prevent over-scraping.
                # Fixed: domain was misspelled "twiter.com", sending every
                # permalink request to the wrong host.
                tweet_url = "https://twitter.com" + tweet.css(
                    'div.tweet.js-stream-tweet.js-actionable-tweet::attr(data-permalink-path)').extract_first()
                yield scrapy.Request(tweet_url, self.parse_single)
            retweet_of = tweet.css(
                'div.QuoteTweet-innerContainer.u-cf.js-permalink.js-media-container::attr(data-item-id)'
            ).extract_first()
            if retweet_of is not None:
                retweet_of = int(retweet_of)
            yield Tweet(tweet_id=tweet_id, user_id=user_id, timestamp=timestamp,
                        text=text, lang=lang, replies=replies, retweets=retweets, likes=likes,
                        replies_to=replies_to, retweet_of=retweet_of)
        # A full page (20 tweets) means more results may follow: build the
        # cursor from the first/last tweet ids and request the next page.
        if len(tweets) == 20:
            # NOTE(review): the first tweet on the page is bound to min_tweet
            # and the last to max_tweet; ordering preserved from the original,
            # confirm against the endpoint's expected cursor format.
            min_tweet = tweets[0].css('div.original-tweet::attr(data-tweet-id)').extract_first()
            max_tweet = tweets[-1].css('div.original-tweet::attr(data-tweet-id)').extract_first()
            cursor = "TWEET-{}-{}-" \
                "BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
                .format(max_tweet, min_tweet)
            url = base_url + "&max_position={}".format(cursor)
            request = scrapy.Request(url, self.parse)
            request.meta["base_url"] = base_url
            yield request

    def _parse_thread_section(self, section, main, lang, initial_id):
        """Yield Tweet items for one conversation section (parents or replies).

        replies_to chains each tweet to the previous one in document order;
        the first tweet of the section points at *initial_id* (None for the
        parents above the permalink tweet, the permalink tweet's id for the
        replies below it).
        """
        current_id = initial_id
        for tweet in section.css('div.ThreadedConversation-tweet'):
            # NOTE(review): user_id is read from the permalink tweet ('main'),
            # so every thread tweet is attributed to the root author — looks
            # like a selector bug in the original; behavior preserved, confirm.
            user_id = int(main.css('a.account-group.js-account-group'
                                   '.js-action-profile.js-user-profile-link'
                                   '.js-nav::attr(data-user-id)').extract_first())
            tweet_id = int(tweet.css("div.tweet.js-stream-tweet::attr(data-item-id)").extract_first())
            timestamp = datetime.datetime.fromtimestamp(
                int(tweet.css("span._timestamp.js-short-timestamp::attr(data-time)").extract_first()))\
                .strftime('%Y-%m-%d %H:%M:%S')
            text = self._extract_text(
                tweet.css("p.TweetTextSize.js-tweet-text.tweet-text").extract_first())
            replies_to = current_id
            current_id = tweet_id
            yield Tweet(tweet_id=tweet_id, user_id=user_id, timestamp=timestamp,
                        text=text, lang=lang,
                        replies=self._action_count(tweet, 'js-actionReply'),
                        retweets=self._action_count(tweet, 'js-actionRetweet'),
                        likes=self._action_count(tweet, 'js-actionFavorite'),
                        replies_to=replies_to, retweet_of=None)

    def parse_single(self, response):
        """Parse a tweet permalink page: yield the parent tweets above and the
        reply tweets below the permalink tweet, chained via replies_to."""
        self.logger.info('Parsing a single tweet dependencies: {}'.format(response.url))
        html = scrapy.Selector(text=response.body_as_unicode(), type="html")
        # The permalink tweet provides infos common to the whole thread.
        main = html.css('div.permalink-inner.permalink-tweet-container')
        lang = main.css('p.tweet-text::attr(lang)').extract_first()
        initial_id = int(main.css('div.tweet.permalink-tweet.js-actionable-user.'
                                  'js-actionable-tweet.js-original-tweet::attr(data-tweet-id)').extract_first())
        parents = html.css('div.permalink-in-reply-tos')
        children = html.css('div.replies-to')
        # Parents above the permalink tweet: the first one has no known parent.
        for item in self._parse_thread_section(parents, main, lang, None):
            yield item
        # Replies below: the first reply answers the permalink tweet itself.
        for item in self._parse_thread_section(children, main, lang, initial_id):
            yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment