Last active
January 1, 2016 06:59
-
-
Save MonkeyIsNull/8109081 to your computer and use it in GitHub Desktop.
Download the MP3s from a Dropbox server using channels with 10 workers; handles the nasty redirects.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"time" | |
"fmt" | |
"io" | |
"net/http" | |
"os" | |
"strings" | |
"github.com/PuerkitoBio/goquery" | |
) | |
func dumpLinks(site string) []string { | |
var doc *goquery.Document | |
var e error | |
files := make([]string,0) // Dumb, but we just want to append | |
fileSet := make(map[string]bool) // we have to keep a set b/c they aren't all unique | |
var mp3Count int | |
mp3Count = 0 | |
doc, e = goquery.NewDocument(site) | |
if(e != nil) { | |
panic(e.Error()) | |
} | |
doc.Find("a").Each(func(i int, s *goquery.Selection) { | |
s_attr, _ := s.Attr("href") | |
if(strings.Contains(s_attr, "MP3")) { | |
fmt.Printf("Selection: %s\n", s_attr) | |
fileSet[s_attr] = true | |
} | |
}) | |
mp3Count = len(fileSet) | |
for k, v := range fileSet { | |
if(v == true) { | |
files = append(files, k) | |
} | |
} | |
fmt.Println("MP3Count: ", mp3Count) | |
fmt.Println("FilesCount: ", len(files)) | |
return files | |
} | |
func grabActualDlLink(site string) string { | |
var doc *goquery.Document | |
var e error | |
doc, e = goquery.NewDocument(site) | |
if(e != nil) { | |
panic(e.Error()) | |
} | |
var link_val string | |
doc.Find("#default_content_download_button").Each(func(i int, s *goquery.Selection) { | |
s_attr, _ := s.Attr("href") | |
if(strings.Contains(s_attr, "MP3")) { | |
fmt.Printf("[RealLink] %s\n", s_attr) | |
link_val = s_attr | |
} | |
}) | |
return link_val | |
} | |
// cleanUrlMess derives a usable local file name from a download URL:
// the final path segment, with any "?query" suffix stripped off.
func cleanUrlMess(url string) string {
	// Everything after the last slash (the whole string when there is none).
	name := url[strings.LastIndex(url, "/")+1:]
	// Drop the query string, if any.
	if q := strings.Index(name, "?"); q >= 0 {
		name = name[:q]
	}
	return name
}
// Warning, this load the whole thing into memory | |
func grabFile(url string) string { | |
fmt.Println("Pulling: " + url) | |
fileName := cleanUrlMess(url) | |
output, err := os.Create(fileName) | |
defer output.Close() | |
resp, err := http.Get(url) | |
if err != nil { | |
fmt.Println("Lies, all damn lies:", err) | |
return(url + " FAILED!") //yeah, like really great error handling. BOO! | |
} | |
defer resp.Body.Close() | |
numBytes, err := io.Copy(output, resp.Body) | |
fmt.Println(numBytes, "bytes Downloaded, saved: ", fileName) | |
return fileName | |
} | |
func downloadUrl(urlChan chan string, doneChan chan string) { | |
redirLink := <-urlChan | |
url := grabActualDlLink(redirLink) | |
someFile := grabFile(url) | |
doneChan <- "[" + someFile + "] done" | |
downloadUrl(urlChan, doneChan) | |
return | |
} | |
func main() { | |
urlChan := make(chan string, 200) | |
doneChan := make(chan string, 10) // Because we have 10 workers | |
// All the task urls to download go here | |
fmt.Println("Initialize the tasks...") | |
someFiles := dumpLinks("https://www.dropbox.com/.../someId/SomethingInhere/blahblah") | |
for i := range someFiles { | |
fmt.Println(someFiles[i]) | |
urlChan <- someFiles[i] | |
} | |
// This is your worker pool, they pull | |
// the next taskUrl off the masterChannel | |
fmt.Println("Initialize the workers...") | |
for worker := range make([]int, 10) { | |
fmt.Println("[worker]", worker) | |
go downloadUrl(urlChan, doneChan) | |
} | |
// Done counter so we break out | |
totalFiles := len(someFiles) | |
i := 0 | |
// Read everything off the doneChan | |
fmt.Println("Wait for it all..") | |
for j := range doneChan { | |
fmt.Println("[DONE]", j) | |
i++ | |
if(i == totalFiles) { //dont break until all grabbed | |
break | |
} | |
} | |
fmt.Println("ok, done") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment