Skip to content

Instantly share code, notes, and snippets.

@SansPapyrus683
Created February 26, 2024 21:17
Show Gist options
  • Save SansPapyrus683/c551e6b48d6cefdf9e8345720fc2e098 to your computer and use it in GitHub Desktop.
Save SansPapyrus683/c551e6b48d6cefdf9e8345720fc2e098 to your computer and use it in GitHub Desktop.
download all your twitter anime girls!
import json
import re
import os
import shutil
import requests
def load_twt_obj(file: str) -> list:
    """Parse one of the archive's ``.js`` data files into Python objects.

    The file is expected to look like a JavaScript assignment
    (``<something> = <JSON>``): everything up to and including the first
    ``=`` is discarded and the remainder is decoded as JSON. If the file
    contains no ``=`` at all, the whole content is decoded as JSON.

    Args:
        file: Path of the file to read.

    Returns:
        The decoded JSON value (the callers in this script expect a list).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the text after the first ``=`` is not
            valid JSON.
    """
    # Explicit UTF-8 so that non-ASCII tweet text decodes the same way on
    # every platform; the context manager closes the handle promptly.
    with open(file, encoding="utf-8") as fh:
        raw = fh.read()
    # str.find returns -1 when "=" is absent, which makes the slice start
    # at 0 and keeps the entire text.
    return json.loads(raw[raw.find("=") + 1:])
# Live and deleted tweets are handled identically further down, so merge
# them into a single list up front.
tweets = (
    load_twt_obj("data/tweets.js")
    + load_twt_obj("data/deleted-tweets.js")
)

del_dir = "data/deleted_tweets_media"
gen_dir = "data/tweets_media"

# Copy every file from the deleted-tweets media folder into the general
# media folder so that one directory holds all archived media.
with os.scandir(del_dir) as deleted_entries:
    for entry in deleted_entries:
        shutil.copy(entry.path, gen_dir)
# after getting the actual images this isn't needed but just in case
# Build a lookup of the archived media files:
#     all_media[post_id][img_id] -> file extension (including the dot)
# File names are split as "<post_id>-<img_id><ext>": the post id is the
# text before the first "-", the image id runs from there to the last ".".
all_raw_media = os.listdir(gen_dir)
all_media = {}
for media_name in all_raw_media:
    dash_at = media_name.find("-")
    dot_at = media_name.rfind(".")
    suffix = os.path.splitext(media_name)[1]
    per_post = all_media.setdefault(media_name[:dash_at], {})
    per_post[media_name[dash_at + 1:dot_at]] = suffix
# Captures the handle out of a tweet text that starts with "RT @<handle>:".
handle_fmt = re.compile(r"RT @([^:]*):")
# Captures the media id out of a pbs.twimg.com media URL (the part between
# "/media/" and the first following ".").
img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^\.*]*)\.")
os.makedirs("good_media", exist_ok=True)


def _fetch_original(img_id, ext, archive_path, out_path):
    """Write the ":orig" rendition of one image to ``out_path``.

    If the download fails (network error, timeout, or an HTTP error status)
    the copy already present in the archive at ``archive_path`` is used
    instead, so an error response body is never saved as if it were an image.
    """
    dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(dl_url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as err:
        print(f"could not download {dl_url} ({err}); using archived copy")
        shutil.copy(archive_path, out_path)
        return
    with open(out_path, "wb") as written:
        written.write(resp.content)


def _save_retweet_media(t):
    """Save the images of one tweet record if it is a retweet with media.

    Tweets whose text does not start with "RT @<handle>:" and tweets without
    a "media" entry in their entities are ignored. Saved files are named
    ``good_media/<handle>_<source status id>_<n><ext>`` where ``n`` counts
    the images actually saved for this tweet, starting at 0.
    """
    match = handle_fmt.match(t["full_text"])
    if match is None:
        return
    if "media" not in t["entities"]:
        return

    handle = match.group(1)
    og_id = t["id"]
    media = t["extended_entities"]["media"]

    # just a sanity check: every attachment must name the same source status
    assert len({m["source_status_id"] for m in media}) == 1
    src_id = media[0]["source_status_id"]

    # Only media that also exists in the archive folder is saved; the
    # archive's file name supplies the extension.
    archived = all_media.get(og_id, {})
    img_at = 0
    for m in media:
        found = img_id_fmt.match(m["media_url"])
        # sometimes you have things like ext_tw_video_thumb or tweet_video_thumb
        if found is None:
            continue
        img_id = found.group(1)
        if img_id not in archived:
            continue

        ext = archived[img_id]
        stupid_path = os.path.join(gen_dir, f"{og_id}-{img_id}{ext}")
        sigma_path = f"good_media/{handle}_{src_id}_{img_at}{ext}"
        _fetch_original(img_id, ext, stupid_path, sigma_path)
        img_at += 1


for v, t in enumerate(tweets):
    _save_retweet_media(t["tweet"])
    # Report progress for every 100th tweet, whether or not it was saved.
    if (v + 1) % 100 == 0:
        print(f"at tweet #{v + 1}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment