Skip to content

Instantly share code, notes, and snippets.

@dbreunig
Created February 15, 2023 19:05
Show Gist options
  • Save dbreunig/8838ca3aa079a3aa702009114923e603 to your computer and use it in GitHub Desktop.
Save dbreunig/8838ca3aa079a3aa702009114923e603 to your computer and use it in GitHub Desktop.
Download podcast episodes from an RSS (XML) feed, transcribe them with Whisper, and insert the data into a SQLite database.
import feedparser
import whisper
import sqlite3
import requests
# --- Configuration ----------------------------------------------------------
# RSS feed to ingest, and the SQLite file that receives episodes and segments.
podcast_feed_url = "https://feeds.libsyn.com/92106/rss"
db_name = "podcast.db"

# --- Database setup ---------------------------------------------------------
con = sqlite3.connect(db_name)
cur = con.cursor()

# SQLite only enforces foreign keys when the connection opts in; without this
# the ON DELETE CASCADE declared on `segments` below is never applied.
cur.execute("PRAGMA foreign_keys = ON")

# IF NOT EXISTS lets the script be re-run against an existing database instead
# of failing with "table episodes already exists".
cur.execute("""
CREATE TABLE IF NOT EXISTS episodes(
    id TEXT PRIMARY KEY,
    title TEXT,
    pub_date TEXT,
    link TEXT,
    summary TEXT,
    audio_link TEXT,
    processed INTEGER DEFAULT 0
)
""")

# episode_id is TEXT so that it matches the type of the episodes.id key it
# references (feed entry ids are strings, not integers).
cur.execute("""
CREATE TABLE IF NOT EXISTS segments(
    episode_id TEXT,
    seek REAL,
    start REAL,
    end REAL,
    text TEXT,
    FOREIGN KEY (episode_id)
        REFERENCES episodes (id)
        ON DELETE CASCADE
        ON UPDATE NO ACTION
)
""")
con.commit()
# Whisper checkpoint to load (e.g. tiny, base, small, medium, large).
whisper_model_name = "small"
model = whisper.load_model(whisper_model_name)

# Parse the podcast feed found at podcast_feed_url.
feed = feedparser.parse(podcast_feed_url)
# Build one row per feed entry, shaped to match the `episodes` columns:
# (id, title, pub_date, link, summary, audio_link, processed).
episodes = []
for entry in feed.entries:
    # Reset for every entry so an episode without an enclosure can never
    # inherit the audio URL of the entry processed before it.
    audio_link = None
    for link in entry.get('links', []):
        if link.get('rel') == 'enclosure':
            audio_link = link.get('href')

    # Every row in `episodes` is later downloaded and transcribed, so an entry
    # with no audio enclosure cannot be used; skip it rather than guess.
    if not audio_link:
        print(f"Skipping {entry.get('title', entry.get('id'))}: no audio enclosure found")
        continue

    episodes.append((
        entry['id'],
        entry['title'],
        # Optional feed fields: store NULL when the feed omits them.
        entry.get('published'),
        entry.get('link'),
        entry.get('summary'),
        audio_link,
        0,  # processed flag: not yet transcribed
    ))

# Insert the episodes into the db. OR IGNORE keeps rows that are already
# present (same primary key) instead of aborting the whole batch on a re-run.
cur.executemany("INSERT OR IGNORE INTO episodes VALUES(?, ?, ?, ?, ?, ?, ?)", episodes)
con.commit()
# Transcribe the first 10 episodes (adjust the slice to transcribe fewer or more).
# Each tuple in `episodes` is (id, title, pub_date, link, summary, audio_link, processed).
for e in episodes[0:10]:
    episode_id, title, audio_url = e[0], e[1], e[5]

    # Skip episodes whose `processed` flag is already set in the database, so
    # running the script again does not insert duplicate segments.
    row = cur.execute(
        "SELECT processed FROM episodes WHERE id = ?", (episode_id,)
    ).fetchone()
    if row and row[0]:
        print(f"Skipping {title} (already processed)")
        continue

    print(f"Starting {title}")

    # Download
    filename = f"{audio_url.split('/')[-1]}.mp3"
    # A timeout stops a stalled connection from hanging the script forever.
    response = requests.get(audio_url, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as audio.
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed before
    # the transcription step reads it.
    with open(filename, "wb") as audio_file:
        audio_file.write(response.content)

    # Transcribe
    transcription = model.transcribe(filename)

    # Load: one row per transcribed segment, in `segments` column order.
    segments = [
        (episode_id, s['seek'], s['start'], s['end'], s['text'])
        for s in transcription['segments']
    ]
    cur.executemany("INSERT INTO segments VALUES(?, ?, ?, ?, ?)", segments)
    # Mark the episode as done in the same transaction as its segments.
    cur.execute("UPDATE episodes SET processed = 1 WHERE id = ?", (episode_id,))
    con.commit()

    # Report the result
    print(f"Loaded {title}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment