Gist by @rebane2001 · Created July 19, 2021 19:20

#!/usr/bin/env python3
import requests
import os
import json
from datetime import datetime, timezone
import random

api_keys = [
    "API-KEY",  # It is recommended to use only one key to avoid a ban
]
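
# Note (an assumption, not from the original gist): each request picks a key
# at random, and the YouTube Data API enforces a per-key daily quota
# (10,000 units by default at the time of writing), so large scans can
# exhaust a key's quota quickly.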

allids = set()

def updateIDs():
    # Rebuild the set of already-processed IDs from the three output files
    global allids
    allids = set()
    for file in ["api-filtered.txt", "api-all.txt", "api-all-plain.txt"]:
        # The output files don't exist yet on the very first run
        if not os.path.exists(file):
            continue
        with open(file, "r", encoding="UTF-8") as f:
            for l in f:
                # Strip unnecessary parts and leave only the video ID
                videoid = l.replace("https://www.youtube.com/watch?v=", "").replace("youtube ", "").strip()
                allids.add(videoid)

def videoAlreadyAdded(videoid):
    return videoid in allids
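
# For example, both of these input lines normalize to the same ID
# (the ID shown is purely illustrative):
#   "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  ->  "dQw4w9WgXcQ"
#   "dQw4w9WgXcQ"                                  ->  "dQw4w9WgXcQ"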

def get_videos_page(video_ids):
    # Fetch metadata for up to 50 videos in a single API call
    api_key = random.choice(api_keys)
    response = requests.get(
        f'https://www.googleapis.com/youtube/v3/videos?'
        f'id={",".join(video_ids)}'
        f'&part=status,snippet,contentDetails,statistics'
        f'&maxResults=50'
        f'&key={api_key}'
    )
    # print(response.text)
    if response.status_code != 200:
        print("Something not right!")
        print(response.text, api_key)
        return []
    precious_data = json.loads(response.text)
    return precious_data["items"]

def get_all_videos_from_ids(video_ids):
    # Drop IDs that were already written out by a previous run
    updateIDs()
    video_ids = list(filter(lambda x: not videoAlreadyAdded(x), video_ids))
    print("Fetching API data...")
    while len(video_ids) > 0:
        print(f"⏳ {len(video_ids)} to go...")
        videos = get_videos_page(video_ids[:50])
        # api-dump.jsonl: raw API items, one JSON object per line
        with open("api-dump.jsonl", "a", encoding="UTF-8") as f:
            for video in videos:
                f.write(f"{json.dumps(video)}\n")
        # api-filtered.txt: IDs that pass the unlisted/pre-2017 filter
        with open("api-filtered.txt", "a") as f:
            for video_id in filter_ids(videos):
                print(f"🦄 {video_id}")
                f.write(f"{video_id}\n")
        # api-all.txt: every ID the API actually returned
        with open("api-all.txt", "a") as f:
            for video_id in [vid["id"] for vid in videos]:
                f.write(f"{video_id}\n")
        # api-all-plain.txt: every ID we asked about, returned or not
        with open("api-all-plain.txt", "a") as f:
            for video_id in video_ids[:50]:
                f.write(f"{video_id}\n")
        video_ids = video_ids[50:]

def parse_date_format(date_str):
    # Parse the API's ISO 8601 UTC timestamps into timezone-aware datetimes
    return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
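
# e.g. parse_date_format("2016-05-01T12:00:00Z")
#      == datetime(2016, 5, 1, 12, 0, tzinfo=timezone.utc)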

def filter_ids(videos):
    # Keep only unlisted videos published before January 2, 2017
    critical_datetime = datetime(year=2017, month=1, day=2, tzinfo=timezone.utc)
    videos = list(filter(lambda x: x["status"]["privacyStatus"] == "unlisted", videos))
    videos = [(x, parse_date_format(x["snippet"]["publishedAt"])) for x in videos]
    videos = list(filter(lambda x: x[1] < critical_datetime, videos))
    videos = [x[0] for x in videos]
    video_ids = [vid["id"] for vid in videos]
    return video_ids
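
# A minimal sketch of the filter on a hand-written item (values are
# illustrative, not real API data):
#
#   filter_ids([{
#       "id": "dQw4w9WgXcQ",
#       "status": {"privacyStatus": "unlisted"},
#       "snippet": {"publishedAt": "2016-05-01T12:00:00Z"},
#   }])
#   # -> ["dQw4w9WgXcQ"]  (unlisted and published before the cutoff)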
print("Loading IDs...")
video_ids = []
with open("sorted.txt","r") as f:
for l in f:
video_ids.append(l.strip())
get_all_videos_from_ids(video_ids)
print("All done?")