Created
July 19, 2021 19:20
-
-
Save rebane2001/1b7f2c8e10130698c0390298c58e5069 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import requests | |
import re | |
import os | |
import time | |
import json | |
from datetime import datetime, timezone | |
import random | |
# Credentials for the YouTube Data API v3; one is picked at random per request.
api_keys = [
    "API-KEY", # Recommended to only use one to avoid ban
]
# Every video ID already recorded in any output file (rebuilt by updateIDs()).
allids = set()
def updateIDs():
    """Rebuild the global ``allids`` set from the output files on disk.

    Reads api-filtered.txt, api-all.txt and api-all-plain.txt and collects
    the bare video IDs they contain, so already-processed videos can be
    skipped. Files that do not exist yet are silently skipped.
    """
    global allids
    allids = set()
    for fname in ["api-filtered.txt", "api-all.txt", "api-all-plain.txt"]:
        # The output files are only ever created later in append mode, so on
        # the very first run they are absent -- tolerate that instead of
        # crashing with FileNotFoundError.
        if not os.path.exists(fname):
            continue
        with open(fname, "r", encoding="UTF-8") as f:
            for line in f:
                # Strip unnecessary parts and leave only the video ID
                videoid = line.replace("https://www.youtube.com/watch?v=", "").replace("youtube ", "").strip()
                allids.add(videoid)
def videoAlreadyAdded(videoid):
    """Return True when ``videoid`` is already recorded in an output file."""
    seen = allids
    return videoid in seen
def get_videos_page(video_ids):
    """Fetch YouTube API metadata for up to 50 video IDs in one request.

    Parameters:
        video_ids: list of video-ID strings (at most 50 -- the API page limit).

    Returns:
        The list of video resource dicts from the response's ``items`` field,
        or an empty list when the request fails, so callers can always
        iterate and filter the result.
    """
    api_key = random.choice(api_keys)
    response = requests.get(
        (
            f'https://www.googleapis.com/youtube/v3/videos?'
            f'id={",".join(video_ids)}'
            f'&part=status,snippet,contentDetails,statistics'
            f'&maxResults=50'
            f'&key={api_key}'
        )
    )
    if response.status_code != 200:
        print("Something not right!")
        print(response.text, api_key)
        # Bug fix: this path previously returned the tuple ([], None) while
        # the success path returns a list; iterating the tuple would feed
        # None into filter_ids() and crash. Keep the contract consistent.
        return []
    return response.json()["items"]
def get_all_videos_from_ids(video_ids):
    """Fetch API data for every not-yet-seen ID, 50 at a time, and append the
    results to the various output files."""
    updateIDs()
    video_ids = [v for v in video_ids if not videoAlreadyAdded(v)]
    print("Fetching API data...")
    while video_ids:
        print(f"⏳ {len(video_ids)} to go...")
        batch = video_ids[:50]
        videos = get_videos_page(batch)
        # Raw API responses, one JSON object per line.
        with open("api-dump.jsonl", "a", encoding="UTF-8") as f:
            for video in videos:
                f.write(f"{json.dumps(video)}\n")
        # IDs that pass the unlisted / pre-2017 filter.
        with open("api-filtered.txt", "a") as f:
            for video_id in filter_ids(videos):
                print(f"🦄 {video_id}")
                f.write(f"{video_id}\n")
        # Every ID the API actually returned data for.
        with open("api-all.txt", "a") as f:
            for returned_id in (vid["id"] for vid in videos):
                f.write(f"{returned_id}\n")
        # Every ID we attempted, even if the API returned nothing for it.
        with open("api-all-plain.txt", "a") as f:
            for attempted_id in batch:
                f.write(f"{attempted_id}\n")
        video_ids = video_ids[50:]
def parse_date_format(date_str):
    """Parse a YouTube API timestamp (e.g. '2016-05-01T12:00:00Z') into a
    timezone-aware UTC datetime."""
    naive = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
    return naive.replace(tzinfo=timezone.utc)
def filter_ids(videos):
    """Return the IDs of videos that are unlisted AND were published before
    2017-01-02 UTC (after which YouTube scrambled unlisted-video URLs)."""
    cutoff = datetime(year=2017, month=1, day=2, tzinfo=timezone.utc)
    kept = []
    for video in videos:
        if video["status"]["privacyStatus"] != "unlisted":
            continue
        if parse_date_format(video["snippet"]["publishedAt"]) >= cutoff:
            continue
        kept.append(video["id"])
    return kept
# Script entry: read the candidate video IDs (one per line) and process them.
print("Loading IDs...")
with open("sorted.txt", "r") as f:
    video_ids = [line.strip() for line in f]
get_all_videos_from_ids(video_ids)
print("All done?")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.