Skip to content

Instantly share code, notes, and snippets.

@coyove
Created July 5, 2016 02:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save coyove/0665ad3d161ba3cd5b55cdc4524abecd to your computer and use it in GitHub Desktop.
Save coyove/0665ad3d161ba3cd5b55cdc4524abecd to your computer and use it in GitHub Desktop.
package main
import (
"flag"
"fmt"
"github.com/PuerkitoBio/goquery"
"io"
"net/http"
"os"
"path/filepath"
"regexp"
"strconv"
"sync"
"sync/atomic"
"time"
)
// Command-line flags selecting which blog is crawled and where images go.
var (
	// MainURL is the base address of the blog to crawl. Page paths such as
	// "/archive" are appended to it directly, so it must not end in a slash.
	MainURL = flag.String("url", "http://schoolgirl2015.tumblr.com", "Main URL (No trailing slash)")

	// MainDir is the directory under which downloaded images are saved.
	MainDir = flag.String("dir", ".", "Save to directory")

	// Before, when non-empty, is appended to the first archive request as
	// its "before_time" query parameter.
	Before = flag.String("before", "", "Start at the archive page with this before_time value (optional)")
)
// sizeCandidates lists the image width suffixes to request, in the order
// they are tried.
var sizeCandidates = []string{"1280", "540", "250"}

// sizedImageRe splits an image URL of the form "<prefix>_<width><ext>" into
// its prefix, width and extension. It is compiled once instead of per call.
var sizedImageRe = regexp.MustCompile(`(http.+tumblr_.+)_(\d+)(\..+)`)

// downloadClient is shared by every download so connections can be reused.
// The timeout bounds the whole exchange, including reading the body.
var downloadClient = &http.Client{Timeout: 10 * time.Second}

// TryDownloadLargest saves one image under mainDir, trying each width in
// sizeCandidates in turn until one downloads successfully.
//
// URL is the image address as found on the page; its width component is
// replaced by each candidate. filename is the post ID and becomes the file's
// base name, with the extension taken from URL. Files are grouped into
// sub-directories named after the ID divided by 500000000.
//
// wg.Done is always called exactly once. succCount is incremented atomically
// only when an image was actually written. Failures are printed, not
// returned, because the function is meant to run as a goroutine.
func TryDownloadLargest(URL, mainDir, filename string, wg *sync.WaitGroup, succCount *int64) {
	defer wg.Done()

	// Reject unusable URLs before touching the file system.
	parts := sizedImageRe.FindStringSubmatch(URL)
	if len(parts) != 4 {
		return
	}

	// ParseInt with 64 bits keeps large IDs working on 32-bit platforms.
	// IDs that do not parse fall into bucket 0.
	tid, err := strconv.ParseInt(filename, 10, 64)
	if err != nil {
		tid = 0
	}
	dir := filepath.Join(mainDir, strconv.FormatInt(tid/500000000, 10))
	if err := os.MkdirAll(dir, 0777); err != nil {
		fmt.Println(err)
		return
	}
	dest := filepath.Join(dir, filename+filepath.Ext(URL))

	for _, size := range sizeCandidates {
		err := downloadFile(parts[1]+"_"+size+parts[3], dest)
		if err == nil {
			atomic.AddInt64(succCount, 1)
			return
		}
		fmt.Println(err)
	}
}

// downloadFile fetches url and writes the response body to dest. Any
// non-200 response is an error, so an error page is never stored as an
// image. A file that could not be written completely is removed again.
func downloadFile(url, dest string) error {
	res, err := downloadClient.Get(url)
	if err != nil {
		return err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return fmt.Errorf("fetching %s: unexpected status %s", url, res.Status)
	}

	f, err := os.Create(dest)
	if err != nil {
		return err
	}
	if _, err := io.Copy(f, res.Body); err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		f.Close()
		os.Remove(dest)
		return fmt.Errorf("saving %s: %v", url, err)
	}
	if err := f.Close(); err != nil {
		os.Remove(dest) // best effort, as above
		return fmt.Errorf("saving %s: %v", url, err)
	}
	return nil
}
// postIDPrefixLen is the number of leading characters cut from a post
// element's id attribute to obtain the name passed to TryDownloadLargest.
// NOTE(review): presumably the length of a "post_micro_" prefix — confirm
// against the archive page markup.
const postIDPrefixLen = 11

// GoPage processes one archive page: it fetches *MainURL + suffix, starts a
// download for every "div.post_thumbnail_container" element, waits for all
// of them and prints a success/failure summary.
//
// suffixDir is the directory the images are saved under. The return values
// are the href of the "#next_page_link" element and whether that attribute
// exists. If the page cannot be fetched, the error is printed and
// ("", false) is returned.
func GoPage(suffix string, suffixDir string) (string, bool) {
	url := *MainURL + suffix
	fmt.Print(time.Now().Format(time.RFC822), " > Start ", suffix, " ...")

	doc, err := goquery.NewDocument(url)
	if err != nil {
		// Without a document there is nothing to download and no next
		// link to follow; report it rather than dereferencing nil.
		fmt.Print(" Error: ", err, "\n")
		return "", false
	}

	var wg sync.WaitGroup
	var succCount, count int64
	doc.Find("div.post_thumbnail_container").Each(func(i int, div *goquery.Selection) {
		// count is only touched from this callback, never from the
		// download goroutines, so it needs no synchronisation.
		count++

		// A missing attribute yields "", which TryDownloadLargest
		// rejects, so that post ends up counted as a failure.
		img, _ := div.Attr("data-imageurl")
		id, _ := div.Parent().Parent().Parent().Attr("id")
		if len(id) <= postIDPrefixLen {
			// No usable ID: count the post as failed without slicing
			// past the end of the string.
			return
		}

		wg.Add(1)
		go TryDownloadLargest(img, suffixDir, id[postIDPrefixLen:], &wg, &succCount)
	})
	wg.Wait()

	fmt.Print(" Success: ", succCount, ", Fail: ", count-succCount, "\n")
	return doc.Find("#next_page_link").Attr("href")
}
// main parses the command-line flags, prints a banner for the selected blog
// and then walks the archive, following the link returned by GoPage from
// page to page until GoPage reports that there is no further page.
func main() {
	flag.Parse()
	fmt.Println(time.Now().Format(time.RFC822), "> ===", *MainURL, "===")

	// The walk starts at the archive root, optionally narrowed by -before.
	page := "/archive"
	if before := *Before; before != "" {
		page += "?before_time=" + before
	}

	// GoPage always runs at least once; each call yields the next page to
	// visit and whether one exists.
	for more := true; more; {
		page, more = GoPage(page, *MainDir)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment