@ameerkat
Last active December 31, 2022 17:08
from scrapingbee import ScrapingBeeClient
import time
import logging
import json
SCRAPING_BEE_API_KEY = "RBUHWF4Y0ORC8RGXVRG07VNCBNFN3AH3083P3CHJKEF00HIFGQD2Z0BIMXD4C7AHF14S361H85NZ5TYF" # replace with your API key
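# A minimal sketch, assuming you prefer not to hard-code the key: fall back to an
# environment variable. The SCRAPING_BEE_API_KEY variable name used here is an
# assumption, not something the gist defines; the hard-coded value above stays as
# the default if the variable is unset.
import os
SCRAPING_BEE_API_KEY = os.environ.get("SCRAPING_BEE_API_KEY", SCRAPING_BEE_API_KEY)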
class ScrapingBeeClientWrapper:
    """Thin wrapper around ScrapingBeeClient that retries failed requests with a growing delay."""

    def __init__(self, client, client_config):
        self.client = client
        self.client_config = client_config

    def get(self, url, params={}):
        retry_delay = self.client_config["retry_delay_ms"] / 1000.0
        response = None  # returned as-is if every attempt fails without raising
        for i in range(self.client_config["max_retries"]):
            try:
                response = self.client.get(url, params=params)
                if response.ok:
                    return response
            except Exception as e:
                logging.error("Woah! That request failed with:")
                logging.error(e)
            if i != self.client_config["max_retries"] - 1:
                time.sleep(retry_delay)
                retry_delay *= self.client_config["retry_delay_growth_factor"]
        return response
client = ScrapingBeeClientWrapper(ScrapingBeeClient(api_key=SCRAPING_BEE_API_KEY), {
    "max_retries": 5,
    "retry_delay_ms": 2000,
    "retry_delay_growth_factor": 2  # set to 1 to keep the delay static
})
search_term = "google.com"
target_url = f"https://twitter.com/search?q={search_term}&src=typed_query&f=live"
tweet_response = client.get(target_url, params={
    'render_js': 'True',
    'window_height': 4320,
    'wait': 5000,
    # The JS scenario here is quite tricky, as the site only keeps the
    # last X tweets in the DOM. You have to capture the data, then
    # scroll, then capture the next chunk almost tweet by tweet. Our
    # samples could actually be quite small though. (See the sketch
    # after this request for what the instructions might look like.)
    # 'js_scenario': {
    #     "instructions": [
    #         # Scroll and wait, then scroll and wait again if possible, to load
    #         # the latest tweets. Figuring out when to stop scrolling
    #         # can be a little tricky. We might want to use frequency
    #         # to estimate based on the sample we get.
    #     ]
    # },
    'extract_rules': {
        "tweets": {
            "selector": "article[data-testid='tweet']",
            "type": "list",
            "output": {
                "handle": "div[data-testid='User-Names'] a[tabindex='-1'] span",
                "permalink": {
                    "selector": "div[data-testid='User-Names'] a[dir='auto']",
                    "output": "@href"
                },
                "time": {
                    "selector": "div[data-testid='User-Names'] time",
                    "output": "@datetime"
                },
                "text": "div[data-testid='tweetText']",
                "replies": "div[data-testid='reply']",
                "retweets": "div[data-testid='retweet']",
                "likes": "div[data-testid='like']"
            }
        }
    }
})
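# A rough sketch of what the commented-out js_scenario above might look like, assuming
# ScrapingBee's scroll_y / wait instruction names (double-check the current docs before
# relying on them). The idea is to scroll roughly one viewport at a time and pause so
# newly loaded tweets can render before extract_rules captures whatever is in the DOM.
# It is not wired into the request above; pass it as the 'js_scenario' param to try it.
example_js_scenario = {
    "instructions": [
        {"scroll_y": 4320},  # scroll about one window_height down
        {"wait": 2000},      # give newly loaded tweets time to render
        {"scroll_y": 4320},
        {"wait": 2000}
    ]
}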
if tweet_response.status_code != 200:
    print(f"Failed to get the Twitter search page ({target_url}) with response code {tweet_response.status_code}")
    print(tweet_response.content)

json_result = json.loads(tweet_response.content)

# This check is optional: if you think something is wrong with the code above, for example,
# and you aren't getting the output you expect, try running the screenshot request below.
if not json_result["tweets"]:
    print("Failed to find any tweets. Check the screenshot to see if the page loaded correctly.")
    screenshot_response = client.get(target_url, params={
        'render_js': 'True',  # they've changed it to have some redirect
        'window_height': 4320,
        'timeout': 20000,
        'wait': 5000,
        'screenshot': True
    })
    if not screenshot_response.ok:
        logging.warning(f"Failed to get a screenshot of the target page {target_url}. {screenshot_response.content}")
    else:
        logging.warning("Writing screenshot to file.")
        target_file = "./twitter.png"
        try:
            with open(target_file, "wb") as f:
                f.write(screenshot_response.content)
            logging.warning(f"Wrote screenshot to file {target_file}")
        except Exception as e:
            logging.error(f"Failed to write screenshot due to exception {e}.")
else:
    # do something with the response
    print(json.dumps(json_result, indent=2))
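    # A small sketch of what "do something" might look like: walk the extracted tweets
    # and print a one-line summary each. The field names (handle, time, text, replies,
    # retweets, likes) come from the extract_rules above; adjust them if you change
    # those rules.
    for tweet in json_result["tweets"]:
        print(f"{tweet.get('time', '?')} {tweet.get('handle', '?')}: {tweet.get('text', '')}"
              f" (replies: {tweet.get('replies', '0')}, retweets: {tweet.get('retweets', '0')},"
              f" likes: {tweet.get('likes', '0')})")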