Last active February 11, 2022 22:00
Scrape video data from YouTube and push it to Airtable. By default it reads URLs from videos.txt, but a different file can be given as an argument.
import pafy
import os
import glob
import re
from airtable import Airtable
from airtable.auth import AirtableAuth
import requests
from pathlib import Path
from webvtt import WebVTT
# Airtable connection settings, passed as Airtable(baseurl, tableid, apikey)
# in push_to_airtable. The values below appear to be placeholders and must be
# replaced with real credentials before the script is run.
apikey = "airtable api key"
baseurl = "base url of airtable"
tableid = "table id"
# takes in a string and tells you if it's a number or not
def is_number(s):
    """Return True if *s* can be interpreted as a number, else False.

    Two checks are tried in order:

    1. ``float(s)`` -- covers integers, decimals, exponent notation, and the
       special spellings accepted by ``float`` (e.g. "nan", "inf").
    2. ``unicodedata.numeric(s)`` -- covers a single Unicode numeric
       character such as "½" that ``float`` rejects.

    Args:
        s: The value to test, normally a string.

    Returns:
        bool: True when either check succeeds, False otherwise (including
        for ``None`` and the empty string).
    """
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # Not parseable by float(); fall through to the Unicode check.
        pass

    try:
        import unicodedata

        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: s is not a single character; ValueError: not numeric.
        return False
# takes in a line and cleans html tags
def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is removed individually and the text
    between tags is kept. ``.`` does not match a newline here, so a tag that
    spans several lines is left untouched.

    Args:
        raw_html: Text that may contain HTML/XML-style tags.

    Returns:
        str: The text with the tags stripped out.
    """
    return re.sub(r'<.*?>', '', raw_html)
# Takes in a youtube url and returns video ID
def url_to_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Args:
        url: A YouTube URL such as ``https://www.youtube.com/watch?v=<id>``,
            ``https://youtu.be/<id>`` or ``https://www.youtube.com/embed/<id>``.
            Surrounding whitespace is ignored.

    Returns:
        str | None: The video ID, or None when *url* is not recognised as a
        YouTube video URL. A plain string is returned (not the match object)
        because download_subs concatenates the result into a glob pattern.
    """
    # NOTE(review): the pattern literal was missing from the reviewed source;
    # this is the commonly used YouTube URL pattern -- confirm it covers the
    # URL shapes that appear in the input file.
    youtube_regex = (
        r'(?:https?://)?(?:www\.)?'
        r'(?:youtube|youtu|youtube-nocookie)\.(?:com|be)/'
        r'(?:watch\?v=|embed/|v/|.+\?v=)?'
        r'(?P<video_id>[^&=%\?]{11})'
    )
    youtube_regex_match = re.match(youtube_regex, url.strip())
    if youtube_regex_match:
        return youtube_regex_match.group('video_id')
    return None
# Optionally takes in a file name and returns all lines in an array
def videos_array(filename="videos.txt"):
    """Read *filename* and return its non-blank lines as a list.

    Args:
        filename: Path of a text file with one entry per line. Defaults to
            "videos.txt" in the current working directory.

    Returns:
        list[str]: Each line with its trailing newline removed. Lines that
        are empty or whitespace-only are skipped so that a trailing blank
        line does not produce an empty entry.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, which the bare
    # open() inside a comprehension did not.
    with open(filename) as handle:
        stripped = (line.rstrip('\n') for line in handle)
        return [line for line in stripped if line.strip()]
# Takes video url, downloads subtitles and returns its filename
def download_subs(video_url, lang="en"):
    """Download the subtitles of a video and return the resulting filename.

    Args:
        video_url: URL of the YouTube video.
        lang: Subtitle language code to request. Defaults to "en".

    Returns:
        str | None: The first file in the current directory whose name matches
        ``*<video id>.*``, or None when the video ID cannot be determined or
        no such file exists.
    """
    # Local import so this block does not depend on a file-level import.
    import subprocess

    # NOTE(review): the contents of the command list were missing from the
    # reviewed source. These youtube-dl options (subtitles only, manual or
    # auto-generated) are a reconstruction -- confirm tool and flags.
    cmd = [
        "youtube-dl",
        "--skip-download",
        "--write-sub",
        "--write-auto-sub",
        "--sub-lang",
        lang,
        video_url,
    ]
    try:
        # An argument list (no shell) keeps characters in the URL from being
        # interpreted as shell syntax, unlike os.system(" ".join(cmd)).
        subprocess.run(cmd, check=False)
    except OSError:
        # Downloader not installed/executable: stay best-effort, as os.system
        # was, and still look for a subtitle file that may already exist.
        pass

    videoID = url_to_video_id(video_url)
    if not videoID:
        return None
    for file in glob.glob('*' + videoID + '.*'):
        return file
    return None
def filetotranscript(filename):
    """Build a plain-text transcript from a WebVTT subtitle file.

    Args:
        filename: Path of the subtitle file. None, an empty string, or a path
            that is not an existing file all mean "no subtitles".

    Returns:
        str: The text of every third caption (indices 0, 3, 6, ...), each
        right-stripped and followed by a single space; or the literal
        "No captions" when there is no subtitle file.
    """
    if not filename or not os.path.isfile(filename):
        return "No captions"

    webvtt = WebVTT().read(filename)
    # Only every third caption is kept.
    # NOTE(review): presumably this de-duplicates the overlapping cues of
    # auto-generated captions -- confirm against a real subtitle file.
    kept = [
        caption.text.rstrip()
        for index, caption in enumerate(webvtt)
        if index % 3 == 0
    ]
    return "".join(text + " " for text in kept)
def youtube_info_object(url):
    """Collect the metadata and transcript of one video into a dict.

    Args:
        url: URL of the YouTube video.

    Returns:
        dict: The attribute dictionary of the video-info object, extended with
        the keys 'transcript', 'description' and 'duration'.
    """
    # NOTE(review): the right-hand side of this assignment was missing from
    # the reviewed source; pafy.new(url) is assumed because pafy is imported
    # and the keys read in push_to_airtable look like its attributes -- confirm.
    info = pafy.new(url)

    # vars() returns the object's live attribute dict, not a copy, so
    # attributes set on `info` later also appear in infoObj.
    infoObj = vars(info)

    filename = download_subs(url)
    if filename is None:
        # No subtitle file: pass a name that is not a file so that
        # filetotranscript reports "No captions".
        filename = " "
    infoObj['transcript'] = filetotranscript(filename)
    infoObj['description'] = info.description
    infoObj['duration'] = info.duration
    return infoObj
def push_to_airtable(line):
    """Insert one video record into the configured Airtable table.

    Args:
        line: Mapping describing a video, as built by youtube_info_object.
            Keys read: 'watchv_url', 'videoid', '_title', 'transcript',
            '_keywords', '_username', '_description', '_published', '_length'.

    The title is split on '|': with a separator present, the first part is the
    title and the second the speaker; otherwise the whole string is the title
    and the default speaker is used.

    Raises:
        KeyError: If *line* lacks one of the keys listed above.
    """
    airtable = Airtable(baseurl, tableid, apikey)

    url = str(line['watchv_url'])
    video_id = str(line['videoid'])  # renamed from `id`, which shadowed the builtin
    custom = str(line['_title'])
    transcript = str(line['transcript'])
    # NOTE(review): the host prefix was an empty string in the reviewed source,
    # which yields a relative path, not a URL. This is YouTube's standard
    # thumbnail location -- confirm.
    thumbnail = 'https://img.youtube.com/vi/' + video_id + '/hqdefault.jpg'
    keywords = ", ".join(line['_keywords'])
    channel_name = str(line['_username'])
    description = str(line['_description'])
    # Keep only the text before the first '-' (the year of a YYYY-MM-DD date).
    published = str(line['_published']).split('-')[0]
    dur = line['_length']
    # NOTE(review): `hosting` was used but never defined in the reviewed
    # source (NameError). "YouTube" is assumed -- confirm.
    hosting = "YouTube"

    parts = custom.split('|')
    if len(parts) > 1:
        title = parts[0]
        speaker = parts[1]
    else:
        title = custom
        speaker = "Stephen Stearns"

    airtable.insert({
        "sys.yt_id": video_id,
        "URL": url,
        "Title": title,
        "sys.yt_transcript": transcript,
        "KeyWords": keywords,
        "Length": dur,
        "Creator_FirstLast": speaker,
        "Sponsor": channel_name,
        "Abstract/Description": description,
        "Hosting site": hosting,
        "Year": published,
        "Kind of resource": ["Video"],
        "Kind tex": "Video",
        "sys.approved": False,
        "Image": [{"url": thumbnail}],
    })
# MAIN program
import sys

# The optional first command-line argument names the file of video URLs;
# without it, videos_array falls back to its default ("videos.txt").
videos = videos_array(sys.argv[1]) if len(sys.argv) > 1 else videos_array()
for video in videos:
    vidObj = youtube_info_object(video)
    # Push each scraped record; without this call nothing reaches Airtable.
    push_to_airtable(vidObj)
