teemow/fetch_subtitles.sh

## fetch_subtitles.sh
#!/bin/bash
#
# Download the English auto-generated subtitles of every video in a playlist
# and store one "<title>.content" file per video in FOLDER. Each file holds
# the video's title and description followed by the converted subtitle text.
#
# Usage: fetch_subtitles.sh FOLDER PLAYLIST
#   FOLDER    directory that receives the .content files (created if missing)
#   PLAYLIST  playlist reference handed to yt-dlp
#
# Needs yt-dlp and vtt2text on PATH. Videos whose .content file already
# exists in FOLDER are skipped, so the script can be re-run to resume.
# The list of video URLs is written to playlist.txt in the current directory.

set -eu

FOLDER=${1:?usage: fetch_subtitles.sh FOLDER PLAYLIST}
PLAYLIST=${2:?usage: fetch_subtitles.sh FOLDER PLAYLIST}

# Start from an empty list; yt-dlp appends one URL per line to it.
: > playlist.txt
mkdir -p -- "$FOLDER"
# The per-video work runs inside a scratch directory, so FOLDER must be
# absolute to stay valid after the cd.
FOLDER=$(CDPATH= cd -- "$FOLDER" && pwd)

yt-dlp --flat-playlist -i --print-to-file url playlist.txt "$PLAYLIST"

# Private scratch directory instead of ./tmp, so an unrelated "tmp" directory
# next to the script's working directory is never deleted. Removed on exit.
workdir=$(mktemp -d)
trap 'rm -rf -- "$workdir"' EXIT

# The playlist is read on fd 3 so that commands in the loop body cannot
# swallow the remaining URLs through stdin.
while IFS= read -r -u 3 url || [ -n "$url" ]; do
    [ -n "$url" ] || continue

    # A video whose title cannot be fetched is reported and skipped instead
    # of aborting the whole run.
    if ! title=$(yt-dlp --get-title --skip-download "$url") || [ -z "$title" ]; then
        printf 'warning: cannot get title of %s, skipping\n' "$url" >&2
        continue
    fi

    # Same character set as before plus '/', which cannot be part of a file
    # name. The trailing newline is translated as well, which keeps the
    # "<title>_.content" names produced by earlier runs recognisable.
    FILENAME=$(printf '%s\n' "$title" | tr -s '[[:space:]]/' '_').content

    if [ -f "$FOLDER/$FILENAME" ]; then
        continue
    fi

    rm -rf -- "$workdir"
    mkdir -p -- "$workdir"

    # Subshell: the cd only lasts for this video. A failing command inside
    # still stops the script because of set -e.
    (
        cd -- "$workdir"

        # fetch subtitle
        yt-dlp --skip-download \
            --sub-lang en-orig \
            --write-auto-sub \
            "$url"

        # Collect the matches in an array: a bare "[ -f *.vtt ]" is an error
        # as soon as more than one file matches.
        subtitles=(*.vtt)
        if [ -f "${subtitles[0]}" ]; then
            # convert subtitle
            for vtt in "${subtitles[@]}"; do
                vtt2text "$vtt"
            done

            # get title and description
            yt-dlp --get-title --get-description --skip-download "$url" > "$FILENAME"
            cat -- *.txt >> "$FILENAME"

            mv -- "$FILENAME" "$FOLDER/$FILENAME"
        fi
    )
done 3< playlist.txt

## train-with-subtitles.py
import os
import logging
import sys
import textwrap

from llama_index import (
    GPTKeywordTableIndex,
    Document,
    SimpleDirectoryReader,
    LLMPredictor,
)
from langchain import OpenAI


if __name__ == "__main__":
    # Interactive question/answer loop over a keyword index built from the
    # documents in the folder given as first command-line argument. The index
    # is cached in "index.json" in the current directory; when that file
    # exists it is loaded and the folder argument is not needed.
    logging.basicConfig(stream=sys.stdout, level=logging.CRITICAL)
    # NOTE(review): basicConfig already attaches a stdout handler to the root
    # logger, so this looks like it adds a second one and every emitted record
    # would be printed twice — confirm this is intended.
    logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

    if not os.path.exists("index.json"):
        # Fail with a usage message instead of an IndexError traceback.
        if len(sys.argv) < 2:
            sys.exit(f"usage: {sys.argv[0]} SUBTITLES_FOLDER")
        subtitles_folder = sys.argv[1]
        documents = SimpleDirectoryReader(subtitles_folder).load_data()
        llm_predictor = LLMPredictor(
            llm=OpenAI(
                temperature=0,
                model_name="text-davinci-003",
                max_tokens=2048,
            )
        )
        index = GPTKeywordTableIndex(documents, llm_predictor=llm_predictor)
        index.save_to_disk("index.json")
    else:
        index = GPTKeywordTableIndex.load_from_disk("index.json")

    while True:
        try:
            prompt = input("What should I figure out? ").strip()
            # Nothing to ask: do not spend a query on an empty prompt.
            if not prompt:
                continue
            response = str(index.query(prompt)).strip()
            if not response:
                continue
            for line in textwrap.wrap(response, width=75):
                print(line)
            print("-----")
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C, Ctrl-D or a closed stdin all end the session quietly.
            break
	#!/bin/bash
	#
	# Download the English auto-generated subtitles of every video in a playlist
	# and store one "<title>.content" file per video in FOLDER. Each file holds
	# the video's title and description followed by the converted subtitle text.
	#
	# Usage: fetch_subtitles.sh FOLDER PLAYLIST
	#   FOLDER    directory that receives the .content files (created if missing)
	#   PLAYLIST  playlist reference handed to yt-dlp
	#
	# Needs yt-dlp and vtt2text on PATH. Videos whose .content file already
	# exists in FOLDER are skipped, so the script can be re-run to resume.
	# The list of video URLs is written to playlist.txt in the current directory.

	set -eu

	FOLDER=${1:?usage: fetch_subtitles.sh FOLDER PLAYLIST}
	PLAYLIST=${2:?usage: fetch_subtitles.sh FOLDER PLAYLIST}

	# Start from an empty list; yt-dlp appends one URL per line to it.
	: > playlist.txt
	mkdir -p -- "$FOLDER"
	# The per-video work runs inside a scratch directory, so FOLDER must be
	# absolute to stay valid after the cd.
	FOLDER=$(CDPATH= cd -- "$FOLDER" && pwd)

	yt-dlp --flat-playlist -i --print-to-file url playlist.txt "$PLAYLIST"

	# Private scratch directory instead of ./tmp, so an unrelated "tmp" directory
	# next to the script's working directory is never deleted. Removed on exit.
	workdir=$(mktemp -d)
	trap 'rm -rf -- "$workdir"' EXIT

	# The playlist is read on fd 3 so that commands in the loop body cannot
	# swallow the remaining URLs through stdin.
	while IFS= read -r -u 3 url || [ -n "$url" ]; do
	    [ -n "$url" ] || continue

	    # A video whose title cannot be fetched is reported and skipped instead
	    # of aborting the whole run.
	    if ! title=$(yt-dlp --get-title --skip-download "$url") || [ -z "$title" ]; then
	        printf 'warning: cannot get title of %s, skipping\n' "$url" >&2
	        continue
	    fi

	    # Whitespace and '/' become '_' so the title is usable as a file name.
	    # The trailing newline is translated as well, giving names of the form
	    # "<title>_.content".
	    FILENAME=$(printf '%s\n' "$title" | tr -s '[[:space:]]/' '_').content

	    if [ -f "$FOLDER/$FILENAME" ]; then
	        continue
	    fi

	    rm -rf -- "$workdir"
	    mkdir -p -- "$workdir"

	    # Subshell: the cd only lasts for this video. A failing command inside
	    # still stops the script because of set -e.
	    (
	        cd -- "$workdir"

	        # fetch subtitle
	        yt-dlp --skip-download \
	            --sub-lang en-orig \
	            --write-auto-sub \
	            "$url"

	        # Collect the matches in an array: a bare "[ -f *.vtt ]" is an error
	        # as soon as more than one file matches.
	        subtitles=(*.vtt)
	        if [ -f "${subtitles[0]}" ]; then
	            # convert subtitle
	            for vtt in "${subtitles[@]}"; do
	                vtt2text "$vtt"
	            done

	            # get title and description
	            yt-dlp --get-title --get-description --skip-download "$url" > "$FILENAME"
	            cat -- *.txt >> "$FILENAME"

	            mv -- "$FILENAME" "$FOLDER/$FILENAME"
	        fi
	    )
	done 3< playlist.txt
	import os
	import logging
	import sys
	import textwrap

	from llama_index import (
	GPTKeywordTableIndex,
	Document,
	SimpleDirectoryReader,
	LLMPredictor,
	)
	from langchain import OpenAI


	if __name__ == "__main__":
	    # Interactive question/answer loop over a keyword index built from the
	    # documents in the folder given as first command-line argument. The index
	    # is cached in "index.json" in the current directory; when that file
	    # exists it is loaded and the folder argument is not needed.
	    logging.basicConfig(stream=sys.stdout, level=logging.CRITICAL)
	    # NOTE(review): basicConfig already attaches a stdout handler to the root
	    # logger, so this looks like it adds a second one and every emitted record
	    # would be printed twice — confirm this is intended.
	    logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

	    if not os.path.exists("index.json"):
	        # Fail with a usage message instead of an IndexError traceback.
	        if len(sys.argv) < 2:
	            sys.exit(f"usage: {sys.argv[0]} SUBTITLES_FOLDER")
	        subtitles_folder = sys.argv[1]
	        documents = SimpleDirectoryReader(subtitles_folder).load_data()
	        llm_predictor = LLMPredictor(
	            llm=OpenAI(
	                temperature=0,
	                model_name="text-davinci-003",
	                max_tokens=2048,
	            )
	        )
	        index = GPTKeywordTableIndex(documents, llm_predictor=llm_predictor)
	        index.save_to_disk("index.json")
	    else:
	        index = GPTKeywordTableIndex.load_from_disk("index.json")

	    while True:
	        try:
	            prompt = input("What should I figure out? ").strip()
	            # Nothing to ask: do not spend a query on an empty prompt.
	            if not prompt:
	                continue
	            response = str(index.query(prompt)).strip()
	            if not response:
	                continue
	            for line in textwrap.wrap(response, width=75):
	                print(line)
	            print("-----")
	        except (KeyboardInterrupt, EOFError):
	            # Ctrl-C, Ctrl-D or a closed stdin all end the session quietly.
	            break