Created
July 19, 2021 19:20
-
-
Save rebane2001/1b7f2c8e10130698c0390298c58e5069 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import requests | |
import re | |
import os | |
import time | |
import json | |
from datetime import datetime, timezone | |
import random | |
# Credentials for the YouTube Data API v3; one is picked at random per request.
api_keys = [
    "API-KEY", # Recommended to only use one to avoid ban
]
# Every video ID already recorded in any output file (rebuilt by updateIDs()).
allids = set()
def updateIDs():
    """Rebuild the global ``allids`` set from the output files on disk.

    Reads api-filtered.txt, api-all.txt and api-all-plain.txt and collects
    the bare video IDs they contain, so already-processed videos can be
    skipped. Files that do not exist yet are silently skipped.
    """
    global allids
    allids = set()
    for fname in ["api-filtered.txt", "api-all.txt", "api-all-plain.txt"]:
        # The output files are only ever created later in append mode, so on
        # the very first run they are absent -- tolerate that instead of
        # crashing with FileNotFoundError.
        if not os.path.exists(fname):
            continue
        with open(fname, "r", encoding="UTF-8") as f:
            for line in f:
                # Strip unnecessary parts and leave only the video ID
                videoid = line.replace("https://www.youtube.com/watch?v=", "").replace("youtube ", "").strip()
                allids.add(videoid)
def videoAlreadyAdded(videoid):
    """Return True when ``videoid`` is already recorded in an output file."""
    seen = allids
    return videoid in seen
def get_videos_page(video_ids):
    """Fetch YouTube API metadata for up to 50 video IDs in one request.

    Parameters:
        video_ids: list of video-ID strings (at most 50 -- the API page limit).

    Returns:
        The list of video resource dicts from the response's ``items`` field,
        or an empty list when the request fails, so callers can always
        iterate and filter the result.
    """
    api_key = random.choice(api_keys)
    response = requests.get(
        (
            f'https://www.googleapis.com/youtube/v3/videos?'
            f'id={",".join(video_ids)}'
            f'&part=status,snippet,contentDetails,statistics'
            f'&maxResults=50'
            f'&key={api_key}'
        )
    )
    if response.status_code != 200:
        print("Something not right!")
        print(response.text, api_key)
        # Bug fix: this path previously returned the tuple ([], None) while
        # the success path returns a list; iterating the tuple would feed
        # None into filter_ids() and crash. Keep the contract consistent.
        return []
    return response.json()["items"]
def get_all_videos_from_ids(video_ids):
    """Fetch API data for every not-yet-seen ID, 50 at a time, and append the
    results to the various output files."""
    updateIDs()
    video_ids = [v for v in video_ids if not videoAlreadyAdded(v)]
    print("Fetching API data...")
    while video_ids:
        print(f"⏳ {len(video_ids)} to go...")
        batch = video_ids[:50]
        videos = get_videos_page(batch)
        # Raw API responses, one JSON object per line.
        with open("api-dump.jsonl", "a", encoding="UTF-8") as f:
            for video in videos:
                f.write(f"{json.dumps(video)}\n")
        # IDs that pass the unlisted / pre-2017 filter.
        with open("api-filtered.txt", "a") as f:
            for video_id in filter_ids(videos):
                print(f"🦄 {video_id}")
                f.write(f"{video_id}\n")
        # Every ID the API actually returned data for.
        with open("api-all.txt", "a") as f:
            for returned_id in (vid["id"] for vid in videos):
                f.write(f"{returned_id}\n")
        # Every ID we attempted, even if the API returned nothing for it.
        with open("api-all-plain.txt", "a") as f:
            for attempted_id in batch:
                f.write(f"{attempted_id}\n")
        video_ids = video_ids[50:]
def parse_date_format(date_str):
    """Parse a YouTube API timestamp (e.g. '2016-05-01T12:00:00Z') into a
    timezone-aware UTC datetime."""
    naive = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
    return naive.replace(tzinfo=timezone.utc)
def filter_ids(videos):
    """Return the IDs of videos that are unlisted AND were published before
    2017-01-02 UTC (after which YouTube scrambled unlisted-video URLs)."""
    cutoff = datetime(year=2017, month=1, day=2, tzinfo=timezone.utc)
    kept = []
    for video in videos:
        if video["status"]["privacyStatus"] != "unlisted":
            continue
        if parse_date_format(video["snippet"]["publishedAt"]) >= cutoff:
            continue
        kept.append(video["id"])
    return kept
# Script entry: read the candidate video IDs (one per line) and process them.
print("Loading IDs...")
with open("sorted.txt", "r") as f:
    video_ids = [line.strip() for line in f]
get_all_videos_from_ids(video_ids)
print("All done?")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.