Skip to content

Instantly share code, notes, and snippets.

@jiacai2050
Created November 27, 2023 13:55
Show Gist options
  • Save jiacai2050/60726d09400493548650269377b99600 to your computer and use it in GitHub Desktop.
Save jiacai2050/60726d09400493548650269377b99600 to your computer and use it in GitHub Desktop.
Download podcasts
package main
import (
"encoding/xml"
"flag"
"fmt"
"go-apps/pkg/flagx"
"go-apps/pkg/util"
"io"
"log"
"net/http"
_ "net/http/pprof"
"os"
"path"
"strings"
"sync"
"sync/atomic"
"time"
)
// Command-line options. Each variable is bound to a flag in init.
var (
// rssURLOrPath is the -rss flag: the podcast RSS URL or a file holding the feed.
rssURLOrPath string
// output is the -output flag: the base output directory (default "/tmp").
output string
// dryRun is the -dry flag: when set, no directory is created and nothing is downloaded.
dryRun bool
// addTitleSeq is the -seq flag: prefix file names with a sequence number
// when the item's pubDate cannot be parsed.
addTitleSeq bool
// parallel is the -parallel flag: the number of download workers.
parallel int
// profile is the -profile flag: serve the net/http/pprof endpoints.
profile bool
// profilePort is the -profile-port flag: localhost port for the profile server.
profilePort int
)
// init configures the standard logger and registers every command-line flag
// before handing over to flagx.Parse.
func init() {
// Prefix each log line with date, time and the short file:line of the caller.
log.SetFlags(log.Lshortfile | log.LstdFlags)
flag.StringVar(&rssURLOrPath, "rss", "", "Podcast RSS URL or file")
flag.StringVar(&output, "output", "/tmp", "Output directory")
flag.BoolVar(&dryRun, "dry", false, "run without any side effect")
flag.BoolVar(&addTitleSeq, "seq", true, "Add sequence number to file, only works when parse pubDate failed")
flag.IntVar(&parallel, "parallel", 8, "How many threads to download")
flag.BoolVar(&profile, "profile", false, "Enable HTTP profile")
flag.IntVar(&profilePort, "profile-port", 6060, "HTTP profile port")
// NOTE(review): flagx is project-local; presumably this wraps flag.Parse and
// also populates flagx.Verbose, which main reads — confirm against go-apps/pkg/flagx.
flagx.Parse()
}
// mimeTypeToFileExt maps a lower-case audio media type to the file extension
// used for the downloaded episode.
//
// https://developer.mozilla.org/en-US/docs/Web/Media/Formats/Containers#browser_compatibility
var mimeTypeToFileExt = map[string]string{
	"audio/mpeg":  "mp3", // It seems most of podcast use this mime, and the audio is mp3, not mpg.
	"audio/mp3":   "mp3",
	"audio/wav":   "wav",
	"audio/webm":  "webm",
	"audio/ogg":   "ogg",
	"audio/aac":   "aac",
	"audio/flac":  "flac",
	"audio/mp4":   "m4a", // AAC in an MP4 container; must not be saved as .mp3.
	"audio/x-m4a": "m4a",
}

// defaultFileExt is the extension used when the media type is missing or unknown.
const defaultFileExt = "mp3"

// getFileExt returns the file extension for mimeType.
//
// The lookup ignores letter case, surrounding whitespace and any media type
// parameters (for example "audio/ogg; codecs=opus"). Unknown or empty types
// yield defaultFileExt.
func getFileExt(mimeType string) string {
	// Keep only the "type/subtype" part; parameters follow the first ';'.
	base, _, _ := strings.Cut(mimeType, ";")
	base = strings.ToLower(strings.TrimSpace(base))
	if ext, ok := mimeTypeToFileExt[base]; ok {
		return ext
	}
	return defaultFileExt
}
// formatPubDate converts an RSS pubDate into a "2006-01-02" date string.
//
// It returns the formatted date and true on success, or "" and false when the
// value matches none of the supported layouts. Surrounding whitespace is
// ignored. Besides strict RFC 1123 it accepts a one-digit day of month and a
// missing day of week, both of which RFC 822 style dates allow.
func formatPubDate(pubDate string) (string, bool) {
	pubDate = strings.TrimSpace(pubDate)
	layouts := []string{
		time.RFC1123, time.RFC1123Z,
		// time.RFC1123 uses "02", which rejects one-digit days such as "7 Nov".
		"Mon, 2 Jan 2006 15:04:05 MST",
		"Mon, 2 Jan 2006 15:04:05 -0700",
		// The day of week is optional in RFC 822 dates.
		"2 Jan 2006 15:04:05 MST",
		"2 Jan 2006 15:04:05 -0700",
	}
	for _, layout := range layouts {
		if dt, err := time.Parse(layout, pubDate); err == nil {
			return dt.Format("2006-01-02"), true
		}
	}
	return "", false
}
// XML model of the subset of an RSS document this program reads:
// rss > channel > item > enclosure.
type (
	// Enclosure is an item's <enclosure> element, i.e. the media attachment.
	Enclosure struct {
		XMLName  xml.Name `xml:"enclosure"`
		URL      string   `xml:"url,attr"`  // value of the url attribute
		MimeType string   `xml:"type,attr"` // value of the type attribute
	}

	// Item is one <item> of the channel.
	Item struct {
		XMLName   xml.Name  `xml:"item"`
		Title     string    `xml:"title"`
		PubDate   string    `xml:"pubDate"` // raw text; parsed later by formatPubDate
		Enclosure Enclosure `xml:"enclosure"`
	}

	// Channel is the <channel> element: the feed title and its items.
	Channel struct {
		XMLName xml.Name `xml:"channel"`
		Title   string   `xml:"title"`
		Items   []Item   `xml:"item"`
	}

	// Rss is the document root (<rss>).
	Rss struct {
		XMLName xml.Name `xml:"rss"`
		Channel Channel  `xml:"channel"`
	}
)
// DownloadItem is one unit of work sent from main to a Worker.
type DownloadItem struct {
// path is the final file path the download is stored at.
path string
// url is the enclosure URL to fetch.
url string
}
func main() {
if rssURLOrPath == "" {
flag.Usage()
os.Exit(1)
}
if profile {
go func() {
log.Println(http.ListenAndServe(fmt.Sprintf("localhost:%d", profilePort), nil))
}()
}
util.InitHTTPClient(util.HTTPOption{
Timeout: 10 * time.Minute,
Verbose: flagx.Verbose,
})
var rss Rss
if strings.HasPrefix(rssURLOrPath, "http") {
bs, err := util.Get(rssURLOrPath)
if err != nil {
panic(err)
}
err = xml.Unmarshal(bs, &rss)
if err != nil {
panic(err)
}
} else {
f, err := os.Open(rssURLOrPath)
if err != nil {
panic(err)
}
defer f.Close()
bs, err := io.ReadAll(f)
if err != nil {
panic(err)
}
err = xml.Unmarshal(bs, &rss)
if err != nil {
panic(err)
}
}
title := rss.Channel.Title
outputDir := path.Join(output, normalizeDir(title))
if dryRun {
log.Printf("Ensure [%s] exist!\n", outputDir)
} else {
util.EnsureDir(outputDir)
}
items := rss.Channel.Items
total := len(items)
done := &atomic.Int32{}
wg := &sync.WaitGroup{}
// Start download workers
ch := make(chan DownloadItem, parallel*2)
for i := 0; i < parallel; i++ {
worker := Worker{
ch: ch,
done: done,
wg: wg,
}
go worker.do()
}
seq := 0
log.Printf("Find %d podcasts to download.\n", total)
for i := len(items) - 1; i >= 0; i-- {
showProgress(done, total)
seq += 1
item := items[i]
url := item.Enclosure.URL
ext := getFileExt(item.Enclosure.MimeType)
filename := normalizeFile(item.Title) + "." + ext
if pubDate, ok := formatPubDate(item.PubDate); ok {
filename = pubDate + "--" + filename
} else {
if addTitleSeq {
filename = fmt.Sprintf("%03d--%s", seq, filename)
}
}
audioFilepath := path.Join(outputDir, filename)
if util.FileIsExists(audioFilepath) {
done.Add(1)
log.Printf("Audio already exists, file:%s\n", audioFilepath)
} else {
wg.Add(1)
ch <- DownloadItem{
url: url,
path: audioFilepath,
}
}
}
waitDownload := make(chan struct{})
go func() {
wg.Wait()
close(waitDownload)
}()
for {
select {
case <-waitDownload:
return
default:
showProgress(done, total)
time.Sleep(5 * time.Second)
}
}
}
// showProgress logs how many of total items have been processed so far, as a
// percentage and as raw counts. done is read atomically. When total is not
// positive there is nothing to process, so the progress is reported as 100%
// instead of dividing by zero.
func showProgress(done *atomic.Int32, total int) {
	currentDone := done.Load()
	progress := 100.0
	if total > 0 {
		progress = float64(currentDone) / float64(total) * 100
	}
	log.Printf("Progress: %.2f%%, done:%d, total:%d\n", progress, currentDone, total)
}
// Worker consumes DownloadItems from a channel and downloads them.
type Worker struct {
// ch is the receive-only queue of items to download.
ch <-chan DownloadItem
// done is incremented once for every item the worker has processed.
done *atomic.Int32
// wg is signalled (Done) once for every item the worker has processed.
wg *sync.WaitGroup
}
// do is the worker loop: it receives items until the channel is closed and,
// for each one, runs the download, bumps the shared counter and marks the
// item finished on the wait group.
func (w *Worker) do() {
	for {
		job, open := <-w.ch
		if !open {
			return
		}
		w.download(job.url, job.path)
		w.done.Add(1)
		w.wg.Done()
	}
}
// download fetches url into audioFilepath.
//
// The data is first written to audioFilepath+".tmp" and renamed to the final
// name only after the download succeeded. Failures are logged, not returned.
// In dry-run mode only the "Begin download" line is logged.
func (w *Worker) download(url, audioFilepath string) {
	log.Printf("Begin download, url:%s, path:%s\n", url, audioFilepath)
	if dryRun {
		return
	}
	tmp := audioFilepath + ".tmp"
	if err := util.Download(url, tmp); err != nil {
		log.Printf("Download url failed, url:%s, err:%+v\n", url, err)
		// Never promote a missing or partial temp file: once renamed it would
		// be treated as an already downloaded episode on the next run.
		return
	}
	if err := os.Rename(tmp, audioFilepath); err != nil {
		log.Printf("Rename tmp failed, file:%s, err:%+v\n", tmp, err)
	}
}
// normalizeDir turns a channel title into a directory name: leading and
// trailing spaces are stripped, then every ':', ' ' and '/' becomes '-'.
func normalizeDir(name string) string {
	trimmed := strings.Trim(name, " ")
	return strings.NewReplacer(":", "-", " ", "-", "/", "-").Replace(trimmed)
}
// normalizeFile turns an item title into a file name: leading and trailing
// spaces are stripped and every '/' becomes '-'. Inner spaces are kept.
func normalizeFile(name string) string {
	return strings.ReplaceAll(strings.Trim(name, " "), "/", "-")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment