Last active
June 1, 2019 23:15
-
-
Save LovingMelody/cb2cd3f8d3dd02829343b56a1c4cf983 to your computer and use it in GitHub Desktop.
Basic Twitter scraper for media (images only)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import json | |
import random | |
import sys | |
from time import sleep | |
import requests | |
from bs4 import BeautifulSoup | |
# Inclusive bounds, in whole seconds, for the random pause between requests.
MIN_SLEEP = 0
MAX_SLEEP = 10


def sleep_time():
    """Return a random whole number of seconds to pause between requests.

    The value is drawn uniformly from ``MIN_SLEEP`` to ``MAX_SLEEP``, both
    ends included.

    Returns:
        An ``int`` with ``MIN_SLEEP <= value <= MAX_SLEEP``.
    """
    # randint is inclusive on both ends; randrange(MIN, MAX) could never
    # return MAX_SLEEP and raised ValueError when the two bounds were equal.
    return random.randint(MIN_SLEEP, MAX_SLEEP)
def parse_user(user, timeout=30):
    """Collect image entries from a user's media grid on mobile.twitter.com.

    The grid page is fetched once to read the media count, taken from the
    last space-separated token of the ``.title`` element's text. The same URL
    is then requested once per index with an ``idx`` query parameter, and the
    first ``img`` child under ``.media`` plus the text of ``.tweet-content``
    are recorded for each page.

    Args:
        user: Account name interpolated into the URL path.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``'media'`` (the image ``src``),
        ``'id'`` (the image ``id`` attribute) and ``'content'`` (tweet text).
        The list is empty when the title element is missing or the count
        cannot be parsed; indexes that fail to fetch or parse are skipped.

    Raises:
        requests.RequestException: If the initial grid request fails.
    """
    url = f"https://mobile.twitter.com/{user}/media/grid"
    # A Session carries cookies from earlier responses into later requests,
    # replacing the manual cookies=r.cookies hand-off, and reuses the
    # connection across the per-index requests.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, features="html5lib")
        title = soup.select_one('.title')
        if title is None:
            print("Failed to get media")
            return []
        try:
            ct = int(title.text.strip().split(' ')[-1])
        except ValueError:
            # The last token of the title was not a plain integer.
            print("Failed to media count, aborting")
            return []
        print(f"Fetching {ct} images for {user}")
        tweets = []
        for idx in range(ct):
            try:
                response = session.get(
                    url, params={'idx': idx}, timeout=timeout
                )
                soup = BeautifulSoup(response.text, features="html5lib")
                link = soup.select_one('.media > img:nth-child(1)')
                tweets.append({
                    'media': link['src'],
                    'id': link['id'],
                    'content': soup.select_one('.tweet-content').text,
                })
                sys.stdout.write(f"Fetched {idx}\033[K\n")
            except Exception:
                # Best effort: a network error or a page missing the expected
                # elements skips this index. KeyboardInterrupt is not an
                # Exception subclass, so Ctrl-C still stops the scrape.
                print(f"Failed to fetch {idx}")
            # Pause after every attempt, successful or not. This lives outside
            # a `finally` so that an interrupt is not delayed by the sleep.
            delay = sleep_time()
            print(f"Sleeping for {delay}s", end='\r')
            sleep(delay)
    print("Finished collecting for", user)
    return tweets
def main():
    """Scrape every user named on the command line and save the results.

    Each argument after the program name is passed to ``parse_user``. The
    results are written to ``data.json`` in the current directory as a JSON
    object mapping each user name to its list of entries. A user whose scrape
    raises an error is recorded with an empty list. Pressing Ctrl-C stops the
    run early, and whatever was collected for fully processed users is still
    written.
    """
    data = {}
    try:
        for user in sys.argv[1:]:
            try:
                data[user] = parse_user(user)
            except Exception:
                # Only ordinary errors are treated as "user not found";
                # KeyboardInterrupt passes through to the outer handler.
                print(f"Failed to find {user}.")
                data[user] = []
    except KeyboardInterrupt:
        # Stop scraping but fall through so the partial results are saved.
        print("Interrupted, saving what was collected so far")
    with open("data.json", "w") as f:
        json.dump(data, f)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
beautifulsoup4>=4.7.1
html5lib>=1.0.1
requests>=2.22.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment