Skip to content

Instantly share code, notes, and snippets.

@LovingMelody
Last active June 1, 2019 23:15
Show Gist options
  • Save LovingMelody/cb2cd3f8d3dd02829343b56a1c4cf983 to your computer and use it in GitHub Desktop.
Save LovingMelody/cb2cd3f8d3dd02829343b56a1c4cf983 to your computer and use it in GitHub Desktop.
Basic Twitter scraper for media (images only)
#!/usr/bin/env python3
import json
import random
import sys
from time import sleep
import requests
from bs4 import BeautifulSoup
# Bounds, in whole seconds, for the pause chosen by sleep_time().
# MIN_SLEEP is inclusive, MAX_SLEEP is exclusive.
MIN_SLEEP = 0
MAX_SLEEP = 10


def sleep_time():
    """Pick a random whole number of seconds n with MIN_SLEEP <= n < MAX_SLEEP."""
    longest = MAX_SLEEP - 1
    return random.randint(MIN_SLEEP, longest)
def parse_user(user, timeout=30):
    """Collect the image entries shown on a user's mobile Twitter media grid.

    The grid page is requested once to read the media count from the last
    space-separated token of the ``.title`` element, then requested once per
    index (``idx`` query parameter) to pull out a single image and its tweet
    text. A random pause from ``sleep_time()`` follows every per-index
    attempt, whether it succeeded or not.

    Args:
        user: Account name, interpolated into the media grid URL.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``'media'`` (the image ``src``),
        ``'id'`` (the image element's ``id``) and ``'content'`` (text of the
        ``.tweet-content`` element). The list is empty when the title element
        is missing or its count cannot be parsed; indices that fail to fetch
        or parse are skipped.

    Raises:
        Exception: Whatever the initial page request raises (for example
            ``requests.RequestException``) is propagated to the caller.
    """
    grid_url = f"https://mobile.twitter.com/{user}/media/grid"
    # A session carries cookies from earlier responses into later requests
    # (previously done by hand with cookies=r.cookies) and reuses connections.
    with requests.Session() as session:
        r = session.get(grid_url, timeout=timeout)
        soup = BeautifulSoup(r.text, features="html5lib")
        title = soup.select_one('.title')
        if title is None:
            print("Failed to get media")
            return []
        try:
            ct = int(title.text.strip().split(' ')[-1])
        except ValueError:
            # int() is the only call here that can fail on a str input.
            print("Failed to media count, aborting")
            return []
        print(f"Fetching {ct} images for {user}")
        tweets = []
        for idx in range(ct):
            try:
                r = session.get(grid_url, params={'idx': idx}, timeout=timeout)
                soup = BeautifulSoup(r.text, features="html5lib")
                link = soup.select_one('.media > img:nth-child(1)')
                tweets.append({
                    'media': link['src'],
                    'id': link['id'],
                    'content': soup.select_one('.tweet-content').text,
                })
                # "\033[K" clears the rest of the line left by the
                # "Sleeping for ..." status, which ends in "\r".
                sys.stdout.write(f"Fetched {idx}\033[K\n")
            except Exception:
                # Best effort: network errors and missing elements skip this
                # index. KeyboardInterrupt/SystemExit are deliberately not
                # caught so the run can still be stopped.
                print(f"Failed to fetch {idx}")
            finally:
                delay = sleep_time()
                print(f"Sleeping for {delay}s", end='\r')
                sleep(delay)
    print("Finished collecting for", user)
    return tweets
def main():
    """Scrape every user named on the command line and write ``data.json``.

    Each argument in ``sys.argv[1:]`` is passed to ``parse_user``; the result
    is stored under that user name. A user whose scrape raises an ordinary
    exception is recorded with an empty list. Pressing Ctrl+C stops the loop,
    and whatever was collected up to that point is still written out.
    """
    data = {}
    try:
        for user in sys.argv[1:]:
            try:
                data[user] = parse_user(user)
            except Exception:
                # Only ordinary errors are handled here; KeyboardInterrupt
                # must reach the outer handler so the run can be stopped.
                print(f"Failed to find {user}.")
                data[user] = []
    except KeyboardInterrupt:
        print("Interrupted, saving the data collected so far.")
    with open("data.json", "w", encoding="utf-8") as f:
        json.dump(data, f)


if __name__ == "__main__":
    main()
beautifulsoup4>=4.7.1
html5lib>=1.0.1
requests>=2.22.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment