# Phoenix-Effect/main.py

## main.py
import pafy
import os
import glob
import re
from airtable import Airtable
from airtable.auth import AirtableAuth
import requests
from pathlib import Path
from webvtt import WebVTT


# Airtable connection settings; push_to_airtable passes them to
# Airtable(baseurl, tableid, apikey).
# NOTE(review): the values below look like placeholders -- presumably they
# must be replaced with real settings before the script is run; confirm.
apikey = "airtable api key"
baseurl = "base url of airtable"
tableid = "table id"


# Takes in a string and tells you if it's a number or not
def is_number(s):
    """Return True if ``s`` can be interpreted as a number, else False.

    Two checks are tried in order:

    1. ``float(s)`` -- accepts anything ``float`` accepts (integers,
       decimals, exponents, and strings such as ``"nan"`` or ``"inf"``).
    2. ``unicodedata.numeric(s)`` -- accepts a single Unicode character
       that has a numeric value (for example a vulgar-fraction character).

    Inputs that are neither strings nor numbers (for example ``None``)
    return False instead of raising.
    """
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # ValueError: not a float literal; TypeError: not a str/number at all.
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: not a single character; ValueError: no numeric value.
        pass
    return False


# takes in a line and cleans html tags
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The pattern is non-greedy, so each ``<...>`` span is removed on its own
    and the text between spans is kept unchanged.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)


# Takes in a youtube url and returns video ID
def url_to_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    The URL is matched from its first character (``re.match``), with an
    optional ``http://``/``https://`` scheme and optional ``www.`` prefix,
    a host of the form ``(youtube|youtu|youtube-nocookie).(com|be)``, and an
    optional path prefix (``watch?v=``, ``embed/``, ``v/`` or anything ending
    in ``?v=``) before the ID.

    Returns:
        The ID string, or None when ``url`` does not match.
    """
    # Every fragment is a raw string so that ``\.`` and ``\?`` reach the
    # regex engine as written instead of being invalid string escapes.
    youtube_regex = (
        r'(https?://)?(www\.)?'
        r'(youtube|youtu|youtube-nocookie)\.(com|be)/'
        r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})')

    match = re.match(youtube_regex, url)
    if match is None:
        return None
    # Groups 1-5 are scheme, www, host, TLD and path prefix; 6 is the ID.
    return match.group(6)


# Optionally takes in a file name and returns all lines in an array
def videos_array(filename="videos.txt"):
    """Read ``filename`` and return its lines as a list of strings.

    Trailing newline characters are stripped from each line. Blank lines
    are kept and appear as empty strings in the result.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the file as soon as the list is built.
    with open(filename) as handle:
        return [line.rstrip('\n') for line in handle]


# Takes video url, downloads subtitles and returns its filename
def download_subs(video_url, lang="en"):
    """Download auto-generated subtitles for ``video_url`` with youtube-dl.

    Args:
        video_url: URL of the video; its ID is parsed with url_to_video_id.
        lang: Subtitle language code passed to ``--sub-lang``.

    Returns:
        The name of the first file in the current directory matching
        ``*<video id>.*``, or None when the ID cannot be parsed from the URL
        or no such file exists.
    """
    import subprocess

    videoID = url_to_video_id(video_url)
    if videoID is None:
        # Without an ID the downloaded file could not be located afterwards.
        return None

    cmd = [
        "youtube-dl",
        "--skip-download",
        "--write-auto-sub",
        "--sub-lang",
        lang,
        video_url
    ]
    try:
        # Pass the argument list directly (no shell) so characters such as
        # '&' in the URL are not interpreted as shell syntax.
        subprocess.run(cmd, check=False)
    except OSError:
        # youtube-dl could not be started; fall through to the lookup below
        # in case a matching subtitle file is already present.
        pass

    # Escape the ID so it is matched literally inside the glob pattern.
    matches = glob.glob('*' + glob.escape(videoID) + '.*')
    return matches[0] if matches else None


def filetotranscript(filename):
    """Build a transcript string from a subtitle file and delete the file.

    When ``filename`` is not an existing file, ``"No captions"`` is returned.
    Otherwise the file is read with WebVTT, every third caption (positions
    0, 3, 6, ...) is kept, each kept text is right-stripped and followed by
    a single space, the file is removed, and the concatenation is returned.
    """
    if not os.path.isfile(filename):
        return "No captions"

    captions = WebVTT().read(filename)
    # NOTE(review): only every third caption is used -- presumably because
    # consecutive cues in these files repeat the same text; confirm.
    kept = [
        caption.text.rstrip() + " "
        for position, caption in enumerate(captions)
        if position % 3 == 0
    ]
    os.remove(filename)
    return "".join(kept)


def youtube_info_object(url):
    """Collect metadata and a transcript for the video at ``url``.

    Creates a pafy object for the URL and returns its attribute dictionary
    (the result of ``vars()``) with three keys set on it: ``'transcript'``,
    ``'description'`` and ``'duration'``.
    """
    video = pafy.new(url)
    # vars() returns the object's own attribute dictionary rather than a
    # copy, so the assignments below are made on ``video`` itself.
    record = vars(video)

    subs_file = download_subs(url)
    # A single space stands in for "no subtitle file" when nothing was found.
    transcript_source = " " if subs_file is None else subs_file
    record['transcript'] = filetotranscript(transcript_source)
    record['description'] = video.description
    record['duration'] = video.duration
    return record


def _airtable_fields(line, hosting):
    """Map a video-info dictionary to the field dictionary sent to Airtable."""
    video_id = str(line['videoid'])
    custom = str(line['_title'])

    # A title written as "Title | Speaker" carries the speaker after the
    # first pipe; surrounding whitespace is dropped from both parts.
    parts = custom.split('|')
    if len(parts) > 1:
        title = parts[0].strip()
        speaker = parts[1].strip()
    else:
        title = custom
        speaker = "Stephen Stearns"

    return {
        "sys.yt_id": video_id,
        "URL": str(line['watchv_url']),
        "Title": title,
        "sys.yt_transcript": str(line['transcript']),
        # Tolerate a missing keyword list (None) by joining an empty one.
        "KeyWords": ", ".join(line['_keywords'] or []),
        "Length": line['_length'],
        "Creator_FirstLast": speaker,
        "Sponsor": str(line['_username']),
        "Abstract/Description": str(line['_description']),
        "Hosting site": hosting,
        # Keep only the text before the first '-' of the published value.
        "Year": str(line['_published']).split('-')[0],
        "Kind of resource": ["Video"],
        "Kind tex": "Video",
        "sys.approved": False,
        "Image": [{"url": 'https://img.youtube.com/vi/' + video_id + '/hqdefault.jpg'}],
    }


def push_to_airtable(line, hosting="YouTube"):
    """Insert one video record into the configured Airtable table.

    Args:
        line: Dictionary with the keys ``watchv_url``, ``videoid``,
            ``_title``, ``transcript``, ``_keywords``, ``_username``,
            ``_description``, ``_published`` and ``_length``.
        hosting: Value stored in the "Hosting site" field.
            NOTE(review): this name was previously undefined, which raised
            NameError on every call; "YouTube" is an assumed default --
            confirm it against the values the Airtable field accepts.

    Raises:
        KeyError: if ``line`` lacks one of the keys listed above.
    """
    airtable = Airtable(baseurl, tableid, apikey)
    airtable.insert(_airtable_fields(line, hosting))


# MAIN program
# Process every URL listed in the default videos file: gather its metadata
# and transcript, then store the record in Airtable.
videos = videos_array()
for video in videos:
    # Blank lines (for example a trailing empty line) are not URLs; skip
    # them instead of handing them to the downloader.
    if not video.strip():
        continue
    vidObj = youtube_info_object(video)
    push_to_airtable(vidObj)
	import pafy
	import os
	import glob
	import re
	from airtable import Airtable
	from airtable.auth import AirtableAuth
	import requests
	from pathlib import Path
	from webvtt import WebVTT


	# Airtable connection settings; push_to_airtable passes them to
	# Airtable(baseurl, tableid, apikey).
	# NOTE(review): the values below look like placeholders -- presumably they
	# must be replaced with real settings before the script is run; confirm.
	apikey = "airtable api key"
	baseurl = "base url of airtable"
	tableid = "table id"


	# Takes in a string and tells you if it's a number or not
	def is_number(s):
		"""Return True if ``s`` can be interpreted as a number, else False.

		Two checks are tried in order:

		1. ``float(s)`` -- accepts anything ``float`` accepts (integers,
		   decimals, exponents, and strings such as ``"nan"`` or ``"inf"``).
		2. ``unicodedata.numeric(s)`` -- accepts a single Unicode character
		   that has a numeric value (for example a vulgar-fraction character).

		Inputs that are neither strings nor numbers (for example ``None``)
		return False instead of raising.
		"""
		import unicodedata

		try:
			float(s)
			return True
		except (TypeError, ValueError):
			# ValueError: not a float literal; TypeError: not a str/number.
			pass

		try:
			unicodedata.numeric(s)
			return True
		except (TypeError, ValueError):
			# TypeError: not a single character; ValueError: no numeric value.
			pass
		return False


	# takes in a line and cleans html tags
	def cleanhtml(raw_html):
		"""Return ``raw_html`` with every ``<...>`` span removed.

		The pattern is non-greedy, so each ``<...>`` span is removed on its
		own and the text between spans is kept unchanged.
		"""
		cleanr = re.compile('<.*?>')
		cleantext = re.sub(cleanr, '', raw_html)
		return cleantext


	# Takes in a youtube url and returns video ID
	def url_to_video_id(url):
		"""Extract the 11-character video ID from a YouTube URL.

		The URL is matched from its first character (``re.match``), with an
		optional ``http://``/``https://`` scheme and optional ``www.`` prefix,
		a host of the form ``(youtube|youtu|youtube-nocookie).(com|be)``, and
		an optional path prefix (``watch?v=``, ``embed/``, ``v/`` or anything
		ending in ``?v=``) before the ID.

		Returns:
			The ID string, or None when ``url`` does not match.
		"""
		# The alternation bars must be plain ``|`` (a backslash-escaped bar
		# would match a literal pipe character), and every fragment is a raw
		# string so ``\.`` and ``\?`` are not invalid string escapes.
		youtube_regex = (
			r'(https?://)?(www\.)?'
			r'(youtube|youtu|youtube-nocookie)\.(com|be)/'
			r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})')

		match = re.match(youtube_regex, url)
		if match is None:
			return None
		# Groups 1-5 are scheme, www, host, TLD and path prefix; 6 is the ID.
		return match.group(6)


	# Optionally takes in a file name and returns all lines in an array
	def videos_array(filename="videos.txt"):
		"""Read ``filename`` and return its lines as a list of strings.

		Trailing newline characters are stripped from each line. Blank lines
		are kept and appear as empty strings in the result.

		Raises:
			OSError: if the file cannot be opened.
		"""
		# The context manager closes the file as soon as the list is built.
		with open(filename) as handle:
			return [line.rstrip('\n') for line in handle]


	# Takes video url, downloads subtitles and returns its filename
	def download_subs(video_url, lang="en"):
		"""Download auto-generated subtitles for ``video_url`` with youtube-dl.

		Args:
			video_url: URL of the video; its ID is parsed with
				url_to_video_id.
			lang: Subtitle language code passed to ``--sub-lang``.

		Returns:
			The name of the first file in the current directory matching
			``*<video id>.*``, or None when the ID cannot be parsed from the
			URL or no such file exists.
		"""
		import subprocess

		videoID = url_to_video_id(video_url)
		if videoID is None:
			# Without an ID the downloaded file could not be located.
			return None

		cmd = [
			"youtube-dl",
			"--skip-download",
			"--write-auto-sub",
			"--sub-lang",
			lang,
			video_url
		]
		try:
			# Pass the argument list directly (no shell) so characters such
			# as '&' in the URL are not interpreted as shell syntax.
			subprocess.run(cmd, check=False)
		except OSError:
			# youtube-dl could not be started; fall through to the lookup
			# below in case a matching subtitle file is already present.
			pass

		# Wildcards on both sides of the ID: any prefix before it and any
		# extension after it. The ID itself is escaped to match literally.
		matches = glob.glob('*' + glob.escape(videoID) + '.*')
		return matches[0] if matches else None


	def filetotranscript(filename):
		"""Build a transcript string from a subtitle file and delete the file.

		When ``filename`` is not an existing file, ``"No captions"`` is
		returned. Otherwise the file is read with WebVTT, every third caption
		(positions 0, 3, 6, ...) is kept, each kept text is right-stripped and
		followed by a single space, the file is removed, and the
		concatenation is returned.
		"""
		if not os.path.isfile(filename):
			return "No captions"

		captions = WebVTT().read(filename)
		# NOTE(review): only every third caption is used -- presumably because
		# consecutive cues in these files repeat the same text; confirm.
		kept = [
			caption.text.rstrip() + " "
			for position, caption in enumerate(captions)
			if position % 3 == 0
		]
		os.remove(filename)
		return "".join(kept)


	def youtube_info_object(url):
		"""Collect metadata and a transcript for the video at ``url``.

		Creates a pafy object for the URL and returns its attribute
		dictionary (the result of ``vars()``) with three keys set on it:
		``'transcript'``, ``'description'`` and ``'duration'``.
		"""
		info = pafy.new(url)
		# vars() returns the object's own attribute dictionary rather than a
		# copy, so the assignments below are made on ``info`` itself.
		infoObj = vars(info)
		filename = download_subs(url)
		if filename is None:
			# A single space stands in for "no subtitle file".
			filename = " "
		infoObj['transcript'] = filetotranscript(filename)
		infoObj['description'] = info.description
		infoObj['duration'] = info.duration
		return infoObj


	def _airtable_fields(line, hosting):
		"""Map a video-info dictionary to the field dictionary for Airtable."""
		video_id = str(line['videoid'])
		custom = str(line['_title'])

		# A title written as "Title | Speaker" carries the speaker after the
		# first pipe; the separator is a plain '|' and surrounding whitespace
		# is dropped from both parts.
		parts = custom.split('|')
		if len(parts) > 1:
			title = parts[0].strip()
			speaker = parts[1].strip()
		else:
			title = custom
			speaker = "Stephen Stearns"

		return {
			"sys.yt_id": video_id,
			"URL": str(line['watchv_url']),
			"Title": title,
			"sys.yt_transcript": str(line['transcript']),
			# Tolerate a missing keyword list (None) by joining an empty one.
			"KeyWords": ", ".join(line['_keywords'] or []),
			"Length": line['_length'],
			"Creator_FirstLast": speaker,
			"Sponsor": str(line['_username']),
			"Abstract/Description": str(line['_description']),
			"Hosting site": hosting,
			# Keep only the text before the first '-' of the published value.
			"Year": str(line['_published']).split('-')[0],
			"Kind of resource": ["Video"],
			"Kind tex": "Video",
			"sys.approved": False,
			"Image": [{"url": 'https://img.youtube.com/vi/' + video_id + '/hqdefault.jpg'}],
		}


	def push_to_airtable(line, hosting="YouTube"):
		"""Insert one video record into the configured Airtable table.

		Args:
			line: Dictionary with the keys ``watchv_url``, ``videoid``,
				``_title``, ``transcript``, ``_keywords``, ``_username``,
				``_description``, ``_published`` and ``_length``.
			hosting: Value stored in the "Hosting site" field.
				NOTE(review): this name was previously undefined, which
				raised NameError on every call; "YouTube" is an assumed
				default -- confirm it against the Airtable field.

		Raises:
			KeyError: if ``line`` lacks one of the keys listed above.
		"""
		airtable = Airtable(baseurl, tableid, apikey)
		airtable.insert(_airtable_fields(line, hosting))


	# MAIN program
	# Process every URL listed in the default videos file: gather its metadata
	# and transcript, then store the record in Airtable.
	videos = videos_array()
	for video in videos:
		# Blank lines (for example a trailing empty line) are not URLs; skip
		# them instead of handing them to the downloader.
		if not video.strip():
			continue
		vidObj = youtube_info_object(video)
		push_to_airtable(vidObj)