Skip to content

Instantly share code, notes, and snippets.

@MonkeyIsNull
Last active January 1, 2016 06:59
Show Gist options
  • Save MonkeyIsNull/8109081 to your computer and use it in GitHub Desktop.
Save MonkeyIsNull/8109081 to your computer and use it in GitHub Desktop.
Downloads the MP3s from a Dropbox server using channels with 10 workers; handles the nasty redirects.
package main
import (
"time"
"fmt"
"io"
"net/http"
"os"
"strings"
"github.com/PuerkitoBio/goquery"
)
// dumpLinks fetches the page at site and returns the unique hrefs of all
// <a> links whose URL contains "MP3".
//
// The Dropbox listing page repeats the same link several times, so hrefs
// are collected into a set before being flattened into a slice.
// Panics if the page cannot be fetched or parsed.
func dumpLinks(site string) []string {
	doc, err := goquery.NewDocument(site)
	if err != nil {
		panic(err.Error())
	}
	// Set of hrefs: deduplicates the repeated links on the page.
	fileSet := make(map[string]bool)
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		if strings.Contains(href, "MP3") {
			fmt.Printf("Selection: %s\n", href)
			fileSet[href] = true
		}
	})
	// Every value stored above is true, so no filtering is needed here
	// (the original `if v == true` check was dead code).
	files := make([]string, 0, len(fileSet))
	for k := range fileSet {
		files = append(files, k)
	}
	fmt.Println("MP3Count: ", len(fileSet))
	fmt.Println("FilesCount: ", len(files))
	return files
}
// grabActualDlLink loads the Dropbox interstitial page at site and returns
// the real download URL taken from the download button's href, or "" if no
// button href contains "MP3".
// Panics if the page cannot be fetched or parsed.
func grabActualDlLink(site string) string {
	doc, err := goquery.NewDocument(site)
	if err != nil {
		panic(err.Error())
	}
	var link string
	doc.Find("#default_content_download_button").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		if strings.Contains(href, "MP3") {
			fmt.Printf("[RealLink] %s\n", href)
			link = href
		}
	})
	return link
}
// cleanUrlMess extracts the bare file name from a download URL: the text
// after the final '/' with any "?query" suffix stripped.
func cleanUrlMess(url string) string {
	segments := strings.Split(url, "/")
	tail := segments[len(segments)-1]
	if q := strings.Index(tail, "?"); q >= 0 {
		tail = tail[:q]
	}
	return tail
}
// grabFile downloads url into a file in the current directory, named after
// the last path segment of the URL (query string stripped). It returns the
// file name on success, or a "<url> FAILED!" marker string on any error —
// the same best-effort convention the callers already rely on.
//
// Note: io.Copy streams the response body to disk, so the whole file is
// never held in memory (the original "loads into memory" warning was wrong).
func grabFile(url string) string {
	fmt.Println("Pulling: " + url)
	fileName := cleanUrlMess(url)
	output, err := os.Create(fileName)
	if err != nil {
		// Original ignored this error and would nil-deref on output below.
		fmt.Println("Lies, all damn lies:", err)
		return url + " FAILED!"
	}
	defer output.Close()
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("Lies, all damn lies:", err)
		return url + " FAILED!"
	}
	defer resp.Body.Close()
	numBytes, err := io.Copy(output, resp.Body)
	if err != nil {
		// A partial file is left on disk; caller only sees the marker string.
		fmt.Println("Lies, all damn lies:", err)
		return url + " FAILED!"
	}
	fmt.Println(numBytes, "bytes Downloaded, saved: ", fileName)
	return fileName
}
// downloadUrl is one worker: it repeatedly takes a redirect URL off urlChan,
// resolves it to the real download link, fetches the file, and reports
// completion on doneChan. It loops until urlChan is closed (here: forever,
// since main never closes it).
//
// The original implementation recursed into itself after every task, adding
// a stack frame per download and leaving an unreachable return; a plain
// range loop does the same job in constant stack space.
func downloadUrl(urlChan chan string, doneChan chan string) {
	for redirLink := range urlChan {
		url := grabActualDlLink(redirLink)
		someFile := grabFile(url)
		doneChan <- "[" + someFile + "] done"
	}
}
func main() {
urlChan := make(chan string, 200)
doneChan := make(chan string, 10) // Because we have 10 workers
// All the task urls to download go here
fmt.Println("Initialize the tasks...")
someFiles := dumpLinks("https://www.dropbox.com/.../someId/SomethingInhere/blahblah")
for i := range someFiles {
fmt.Println(someFiles[i])
urlChan <- someFiles[i]
}
// This is your worker pool, they pull
// the next taskUrl off the masterChannel
fmt.Println("Initialize the workers...")
for worker := range make([]int, 10) {
fmt.Println("[worker]", worker)
go downloadUrl(urlChan, doneChan)
}
// Done counter so we break out
totalFiles := len(someFiles)
i := 0
// Read everything off the doneChan
fmt.Println("Wait for it all..")
for j := range doneChan {
fmt.Println("[DONE]", j)
i++
if(i == totalFiles) { //dont break until all grabbed
break
}
}
fmt.Println("ok, done")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment