SansPapyrus683/twitter.py

## twitter.py
import json
import re
import os
import shutil
import requests


def load_twt_obj(file: str) -> list:
    """Parse the JSON payload of a ``<prefix> = <json>`` style data file.

    Everything after the first ``=`` in the file is passed to
    ``json.loads``.  If the file contains no ``=`` at all, the whole text
    is parsed.

    Args:
        file: Path of the file to read.

    Returns:
        The decoded JSON value (this script expects a list of records).

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the text after the ``=`` is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default so non-ASCII
    # text survives on every OS; the ``with`` block closes the handle.
    with open(file, encoding="utf-8") as fh:
        raw = fh.read()
    # ``find`` returns -1 when there is no "=", which makes the slice start at 0.
    return json.loads(raw[raw.find("=") + 1:])


# Combine the regular and the deleted tweet records into a single list.
live_tweets = load_twt_obj("data/tweets.js")
deleted_tweets = load_twt_obj("data/deleted-tweets.js")
tweets = live_tweets + deleted_tweets

del_dir = "data/deleted_tweets_media"
gen_dir = "data/tweets_media"
# Pool the deleted tweets' media files into the general media directory.
for deleted_name in os.listdir(del_dir):
    deleted_path = os.path.join(del_dir, deleted_name)
    shutil.copy(deleted_path, gen_dir)

# after getting the actual images this isn't needed but just in case
# Index the media files by name: all_media[post_id][img_id] = extension,
# where a file name is split as "<post_id>-<img_id><ext>".
all_raw_media = os.listdir(gen_dir)
all_media = {}
for media_name in all_raw_media:
    dash_at = media_name.find("-")
    post_id = media_name[:dash_at]
    img_id = media_name[dash_at + 1:media_name.rfind(".")]
    ext = os.path.splitext(media_name)[1]
    all_media.setdefault(post_id, {})[img_id] = ext

handle_fmt = re.compile(r"RT @([^:]*):")
img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^\.*]*)\.")
os.makedirs("good_media", exist_ok=True)


def _save_retweet_media(tweet: dict) -> None:
    """Download the ``:orig`` images attached to one retweet.

    Tweets whose text does not start with ``RT @<handle>:`` or whose
    ``entities`` list no media are ignored.  Every media entry whose URL
    matches ``img_id_fmt`` and that also has a file recorded in ``all_media``
    for this tweet is fetched and written to
    ``good_media/<handle>_<source status id>_<index><ext>``.

    Args:
        tweet: The dict stored under the ``"tweet"`` key of an archive record.
    """
    match = handle_fmt.match(tweet["full_text"])
    if match is None:
        return

    handle = match.group(1)
    og_id = tweet["id"]
    if "media" not in tweet["entities"]:
        return

    media = tweet["extended_entities"]["media"]
    src_ids = {m["source_status_id"] for m in media}
    assert len(src_ids) == 1  # just a sanity check
    src_id = media[0]["source_status_id"]

    known_files = all_media.get(og_id, {})
    img_at = 0
    for m in media:
        id_match = img_id_fmt.match(m["media_url"])
        # sometimes you have things like ext_tw_video_thumb or tweet_video_thumb
        if id_match is None:
            continue

        img_id = id_match.group(1)
        if img_id not in known_files:
            continue

        ext = known_files[img_id]
        sigma_path = f"good_media/{handle}_{src_id}_{img_at}{ext}"
        # Claim the index up front so a failed download leaves a gap instead
        # of shifting the numbering of the images that follow.
        img_at += 1

        dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
        try:
            # The timeout keeps one stalled connection from hanging the run;
            # raise_for_status keeps HTTP error pages from being saved as images.
            response = requests.get(dl_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            print(f"could not download {dl_url}: {err}")
            continue

        with open(sigma_path, "wb") as written:
            written.write(response.content)


for v, t in enumerate(tweets):
    _save_retweet_media(t["tweet"])
    # Reported for every 100th tweet, including the ones that were skipped.
    if (v + 1) % 100 == 0:
        print(f"at tweet #{v + 1}")
	import json
	import re
	import os
	import shutil
	import requests


	def load_twt_obj(file: str) -> list:
	    """Parse the JSON payload of a ``<prefix> = <json>`` style data file.

	    Everything after the first ``=`` in the file is passed to
	    ``json.loads``.  If the file contains no ``=`` at all, the whole text
	    is parsed.

	    Args:
	        file: Path of the file to read.

	    Returns:
	        The decoded JSON value (this script expects a list of records).

	    Raises:
	        OSError: If the file cannot be opened or read.
	        json.JSONDecodeError: If the text after the ``=`` is not valid JSON.
	    """
	    # Decode explicitly as UTF-8 instead of the platform default so
	    # non-ASCII text survives on every OS; ``with`` closes the handle.
	    with open(file, encoding="utf-8") as fh:
	        raw = fh.read()
	    # ``find`` returns -1 when there is no "=", so the slice starts at 0.
	    return json.loads(raw[raw.find("=") + 1:])


	# Combine the regular and the deleted tweet records into a single list.
	tweets = load_twt_obj("data/tweets.js") + load_twt_obj("data/deleted-tweets.js")

	del_dir = "data/deleted_tweets_media"
	gen_dir = "data/tweets_media"
	# Pool the deleted tweets' media files into the general media directory.
	for fn in os.listdir(del_dir):
	    shutil.copy(os.path.join(del_dir, fn), gen_dir)

	# after getting the actual images this isn't needed but just in case
	# Index the media files by name: all_media[post_id][img_id] = extension,
	# where a file name is split as "<post_id>-<img_id><ext>".
	all_raw_media = os.listdir(gen_dir)
	all_media = {}
	for i in all_raw_media:
	    post_id = i[:i.find("-")]
	    img_id = i[i.find("-") + 1:i.rfind(".")]
	    _, ext = os.path.splitext(i)
	    if post_id not in all_media:
	        all_media[post_id] = {}
	    all_media[post_id][img_id] = ext

	handle_fmt = re.compile(r"RT @([^:]*):")
	# The capture group needs the ``*`` quantifier: without it only
	# one-character media ids could ever match.
	img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^\.*]*)\.")
	os.makedirs("good_media", exist_ok=True)


	def _save_retweet_media(tweet: dict) -> None:
	    """Download the ``:orig`` images attached to one retweet.

	    Tweets whose text does not start with ``RT @<handle>:`` or whose
	    ``entities`` list no media are ignored.  Every media entry whose URL
	    matches ``img_id_fmt`` and that also has a file recorded in
	    ``all_media`` for this tweet is fetched and written to
	    ``good_media/<handle>_<source status id>_<index><ext>``.

	    Args:
	        tweet: The dict stored under the ``"tweet"`` key of an archive record.
	    """
	    match = handle_fmt.match(tweet["full_text"])
	    if match is None:
	        return

	    handle = match.group(1)
	    og_id = tweet["id"]
	    if "media" not in tweet["entities"]:
	        return

	    media = tweet["extended_entities"]["media"]
	    src_ids = {m["source_status_id"] for m in media}
	    assert len(src_ids) == 1  # just a sanity check
	    src_id = media[0]["source_status_id"]

	    known_files = all_media.get(og_id, {})
	    img_at = 0
	    for m in media:
	        id_match = img_id_fmt.match(m["media_url"])
	        # sometimes you have things like ext_tw_video_thumb or tweet_video_thumb
	        if id_match is None:
	            continue

	        img_id = id_match.group(1)
	        if img_id not in known_files:
	            continue

	        ext = known_files[img_id]
	        sigma_path = f"good_media/{handle}_{src_id}_{img_at}{ext}"
	        # Claim the index up front so a failed download leaves a gap
	        # instead of shifting the numbering of the images that follow.
	        img_at += 1

	        dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
	        try:
	            # The timeout keeps one stalled connection from hanging the
	            # run; raise_for_status keeps HTTP error pages from being
	            # saved as images.
	            response = requests.get(dl_url, timeout=30)
	            response.raise_for_status()
	        except requests.RequestException as err:
	            print(f"could not download {dl_url}: {err}")
	            continue

	        with open(sigma_path, "wb") as written:
	            written.write(response.content)


	for v, t in enumerate(tweets):
	    _save_retweet_media(t["tweet"])
	    # Reported for every 100th tweet, including the ones that were skipped.
	    if (v + 1) % 100 == 0:
	        print(f"at tweet #{v + 1}")