Skip to content

Instantly share code, notes, and snippets.

@Foadsf
Created May 18, 2024 20:51
Show Gist options
  • Save Foadsf/f888714dc073ee11f184bfda8dc5e474 to your computer and use it in GitHub Desktop.
Save Foadsf/f888714dc073ee11f184bfda8dc5e474 to your computer and use it in GitHub Desktop.
PowerShell script to download and extract plain text from autogenerated subtitles of a YouTube playlist
# Script parameter: the ID of the YouTube playlist to process
# (the value that goes after "list=" in the playlist URL).
param (
    [string]$playlistID
)

# Nothing can be done without a playlist ID: report the problem and stop
# with a non-zero exit code.
if ([string]::IsNullOrEmpty($playlistID)) {
    Write-Host "Please provide a playlist ID as a command-line argument."
    exit 1
}
# Playlist URL built from the supplied ID, and the file that receives the
# merged plain-text subtitles of every video in the playlist.
$playlistURL = "https://www.youtube.com/playlist?list=$playlistID"
$outputFile = "all_subtitles.txt"

# Matches one white-coloured caption segment inside a converted subtitle line.
# (?i) keeps the match case-insensitive, which is what the -match operator
# does by default.
$captionPattern = [regex]'(?i)<font color="#ffffff">(.*?)</font>'

try {
    # Ask yt-dlp for one "<id> <title>" line per playlist entry, without
    # downloading any media.
    $playlistInfo = yt-dlp --flat-playlist --print "%(id)s %(title)s" $playlistURL
    if (-not $playlistInfo) {
        throw "Failed to fetch playlist information."
    }
    Write-Host "Fetched playlist information."

    # Create the output file, or truncate it if it already exists, so every
    # run starts from an empty file.
    New-Item -Path $outputFile -ItemType File -Force | Out-Null

    # Normalise the yt-dlp output into an array of non-blank lines. The split
    # tolerates CRLF line endings; @() keeps a single result an array.
    $playlistLines = @($playlistInfo -split "\r?\n" | Where-Object { $_.Trim() })
    Write-Host "Split playlist information into lines."
    if ($playlistLines.Length -eq 0) {
        throw "No videos found in the playlist."
    }

    # Process the videos in playlist order: download the auto-generated
    # English subtitles, keep only the caption text, append it to the output.
    for ($i = 0; $i -lt $playlistLines.Length; $i++) {
        $line = $playlistLines[$i]

        if (-not ($line -match "^(?<id>[\w-]+) (?<title>.+)$")) {
            throw "Line did not match the expected pattern: ${line}"
        }

        $videoID = $matches['id']
        # Reduce the title to word characters, hyphens and underscores so it
        # is safe to use as part of a file name.
        $videoTitle = $matches['title'] -replace '[^\w\s-]', '' -replace '\s+', '_'
        # Prefix with the 1-based playlist position to keep names unique and ordered.
        $subFile = "$($i+1)_$videoTitle"

        yt-dlp --write-auto-subs --sub-lang en --sub-format ttml --convert-subs lrc --skip-download -o "$subFile" "https://www.youtube.com/watch?v=${videoID}"

        # File name the converted subtitles are expected under.
        $fileName = "$subFile.en.lrc"
        if (-not (Test-Path -LiteralPath $fileName)) {
            throw "Subtitle file ${fileName} not found."
        }
        Write-Host "Subtitle file ${fileName} downloaded successfully."

        # Gather the title and all caption text first, then append in a single
        # write instead of reopening the output file for every line.
        $entryLines = New-Object 'System.Collections.Generic.List[string]'
        $entryLines.Add($videoTitle)

        # Read explicitly as UTF-8: Windows PowerShell would otherwise decode
        # a BOM-less file with the ANSI code page and garble non-ASCII text.
        # NOTE(review): assumes yt-dlp writes the subtitle file as UTF-8 - confirm.
        foreach ($subLine in (Get-Content -LiteralPath $fileName -Encoding UTF8)) {
            $segments = $captionPattern.Matches($subLine)
            if ($segments.Count -eq 0) {
                Write-Host "No match found in line: ${subLine}"
                continue
            }
            # A line may carry several caption segments; keep all of them,
            # not just the first one.
            foreach ($segment in $segments) {
                $entryLines.Add($segment.Groups[1].Value)
            }
        }

        Add-Content -LiteralPath $outputFile -Value $entryLines.ToArray() -Encoding UTF8
        # The intermediate subtitle file is no longer needed once merged.
        Remove-Item -LiteralPath $fileName
    }

    Write-Host "All subtitles have been downloaded and merged into $outputFile."
}
catch {
    # Any failure above (including the explicit throws) ends the run with a
    # message and a non-zero exit code.
    Write-Host "An error occurred: $_"
    exit 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment