@ameerkat
Last active December 31, 2022 17:08
from scrapingbee import ScrapingBeeClient
import time
import logging
import json
SCRAPING_BEE_API_KEY = "RBUHWF4Y0ORC8RGXVRG07VNCBNFN3AH3083P3CHJKEF00HIFGQD2Z0BIMXD4C7AHF14S361H85NZ5TYF" # replace with your API key
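# A minimal sketch, assuming you prefer not to hard-code the key: fall back to an
# environment variable. The SCRAPING_BEE_API_KEY variable name used here is an
# assumption, not something the gist defines; the hard-coded value above stays as
# the default if the variable is unset.
import os
SCRAPING_BEE_API_KEY = os.environ.get("SCRAPING_BEE_API_KEY", SCRAPING_BEE_API_KEY)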
class ScrapingBeeClientWrapper:
    """Thin wrapper around ScrapingBeeClient that retries failed requests with a growing delay."""

    def __init__(self, client, client_config):
        self.client = client
        self.client_config = client_config

    def get(self, url, params={}):
        retry_delay = self.client_config["retry_delay_ms"] / 1000.0
        response = None  # returned as-is if every attempt fails without raising
        for i in range(self.client_config["max_retries"]):
            try:
                response = self.client.get(url, params=params)
                if response.ok:
                    return response
            except Exception as e:
                logging.error("Woah! That request failed with:")
                logging.error(e)
            if i != self.client_config["max_retries"] - 1:
                time.sleep(retry_delay)
                retry_delay *= self.client_config["retry_delay_growth_factor"]
        return response
client = ScrapingBeeClientWrapper(ScrapingBeeClient(api_key=SCRAPING_BEE_API_KEY), {
    "max_retries": 5,
    "retry_delay_ms": 2000,
    "retry_delay_growth_factor": 2  # set to 1 to keep the delay static
})
search_term = "google.com"
target_url = f"https://twitter.com/search?q={search_term}&src=typed_query&f=live"
tweet_response = client.get(target_url, params={
    'render_js': 'True',
    'window_height': 4320,
    'wait': 5000,
    # The JS scenario here is quite tricky, as the site only keeps the
    # last X tweets in the DOM. You have to capture the data, then
    # scroll, then capture the next chunk almost tweet by tweet. Our
    # samples could actually be quite small though. (See the sketch
    # after this request for what the instructions might look like.)
    # 'js_scenario': {
    #     "instructions": [
    #         # Scroll and wait, then scroll and wait again if possible, to load
    #         # the latest tweets. Figuring out when to stop scrolling
    #         # can be a little tricky. We might want to use frequency
    #         # to estimate based on the sample we get.
    #     ]
    # },
    'extract_rules': {
        "tweets": {
            "selector": "article[data-testid='tweet']",
            "type": "list",
            "output": {
                "handle": "div[data-testid='User-Names'] a[tabindex='-1'] span",
                "permalink": {
                    "selector": "div[data-testid='User-Names'] a[dir='auto']",
                    "output": "@href"
                },
                "time": {
                    "selector": "div[data-testid='User-Names'] time",
                    "output": "@datetime"
                },
                "text": "div[data-testid='tweetText']",
                "replies": "div[data-testid='reply']",
                "retweets": "div[data-testid='retweet']",
                "likes": "div[data-testid='like']"
            }
        }
    }
})
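# A rough sketch of what the commented-out js_scenario above might look like, assuming
# ScrapingBee's scroll_y / wait instruction names (double-check the current docs before
# relying on them). The idea is to scroll roughly one viewport at a time and pause so
# newly loaded tweets can render before extract_rules captures whatever is in the DOM.
# It is not wired into the request above; pass it as the 'js_scenario' param to try it.
example_js_scenario = {
    "instructions": [
        {"scroll_y": 4320},  # scroll about one window_height down
        {"wait": 2000},      # give newly loaded tweets time to render
        {"scroll_y": 4320},
        {"wait": 2000}
    ]
}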
if tweet_response.status_code != 200:
    print(f"Failed to get the Twitter search page ({target_url}) with response code {tweet_response.status_code}")
    print(tweet_response.content)

json_result = json.loads(tweet_response.content)

# This check is optional: if you think something is wrong with the code above, for example,
# and you aren't getting the output you expect, try running the screenshot request below.
if not json_result["tweets"]:
    print("Failed to find any tweets. Check the screenshot to see if the page loaded correctly.")
    screenshot_response = client.get(target_url, params={
        'render_js': 'True',  # they've changed it to have some redirect
        'window_height': 4320,
        'timeout': 20000,
        'wait': 5000,
        'screenshot': True
    })
    if not screenshot_response.ok:
        logging.warning(f"Failed to get a screenshot of the target page {target_url}. {screenshot_response.content}")
    else:
        logging.warning("Writing screenshot to file.")
        target_file = "./twitter.png"
        try:
            with open(target_file, "wb") as f:
                f.write(screenshot_response.content)
            logging.warning(f"Wrote screenshot to file {target_file}")
        except Exception as e:
            logging.error(f"Failed to write screenshot due to exception {e}.")
else:
    # do something with the response
    print(json.dumps(json_result, indent=2))
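    # A small sketch of what "do something" might look like: walk the extracted tweets
    # and print a one-line summary each. The field names (handle, time, text, replies,
    # retweets, likes) come from the extract_rules above; adjust them if you change
    # those rules.
    for tweet in json_result["tweets"]:
        print(f"{tweet.get('time', '?')} {tweet.get('handle', '?')}: {tweet.get('text', '')}"
              f" (replies: {tweet.get('replies', '0')}, retweets: {tweet.get('retweets', '0')},"
              f" likes: {tweet.get('likes', '0')})")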