@raydog
Last active March 29, 2016 18:27
Dumps entire subreddits out of imgur into the PWD.
// rDump -- Dumps images from an Imgur subreddit gallery into the current directory.
// Dependencies:
//   go get github.com/PuerkitoBio/goquery
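//
// Example usage (illustrative only; assumes the source is saved as rdump.go and
// uses the flag/argument handling in main() below):
//
//   go build -o rdump rdump.go
//   ./rdump -v pics     # dump /r/pics images into the current directory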
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"path"
	"strconv"
)

// Magic values go here:
const (
	user_agent     string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36"
	base_url_fmt   string = "https://imgur.com/r/%s"
	next_url_fmt   string = "https://imgur.com/r/%s/new/page/%d/hit?scrolled"
	detail_url_fmt string = "https://imgur.com%s"
	ajax_url_fmt   string = "https://imgur.com/ajaxalbums/getimages/%s/hit.json?all=true"
	ajax_img_fmt   string = "https://i.imgur.com/%s%s"
	download_fmt   string = "https:%s"
	album_cutoff   int    = 8
	num_workers    int    = 5
	max_pages      int    = 10
)
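
// Overview of the scraping pipeline (descriptive comment; see the functions below):
//
//   urlGenerator      -> listing-page URLs (front page plus up to max_pages "new" pages)
//   fetchUrlList      -> post links scraped from each listing page
//   fetchDownloadUrls -> direct image URLs for each post (AJAX endpoint for large albums)
//   maybeDownload     -> writes each image into the current working directory
//
// Listing pages are walked one at a time, while num_workers goroutines consume
// the post links concurrently.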

// Used for parsing the AJAX endpoints:
type PostDetail struct {
	Hash      string `json:"hash"`
	Title     string `json:"title"`
	Desc      string `json:"description"`
	Width     int    `json:"width"`
	Height    int    `json:"height"`
	Size      int    `json:"size"`
	Ext       string `json:"ext"`
	Anim      bool   `json:"animated"`
	PreferVid bool   `json:"prefer_video"`
	Looping   bool   `json:"looping"`
	Timestamp string `json:"datetime"`
}

type ListData struct {
	Count  int          `json:"count"`
	Images []PostDetail `json:"images"`
}

type AJAXResponse struct {
	Data    ListData `json:"data"`
	Success bool     `json:"success"`
	Status  int      `json:"status"`
}
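
// The structs above mirror imgur's hit.json album payload. A minimal example of
// the shape they decode (field names come from the json tags above; the values
// are invented purely for illustration):
//
//   {
//     "data": {
//       "count": 1,
//       "images": [
//         {"hash": "abc1234", "ext": ".jpg", "title": "", "description": "",
//          "width": 800, "height": 600, "size": 123456, "animated": false,
//          "prefer_video": false, "looping": false, "datetime": "2016-03-29 18:27:00"}
//       ]
//     },
//     "success": true,
//     "status": 200
//   }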

func (pd PostDetail) GetURL() string {
	if pd.Hash == "" || pd.Ext == "" {
		return ""
	}
	return fmt.Sprintf(ajax_img_fmt, pd.Hash, pd.Ext)
}

// From a subreddit name, fetches all urls from that subreddit:
func fetchAllImageLinks(subreddit string) chan string {
	// We give this channel a buffer, just so that page changes are less likely to
	// block image workers:
	out := make(chan string, 10)
	go (func() {
		defer close(out)
		for link := range urlGenerator(subreddit) {
			pageNo, linkChannel := fetchUrlList(link)
			log.Printf("Entering Page #%d : %s", pageNo, link)
			for link := range linkChannel {
				out <- link
			}
		}
	})()
	return out
}
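
// Note on fetchAllImageLinks: listing pages are fetched strictly in sequence;
// concurrency only kicks in downstream, where the imageWorker goroutines drain
// this channel.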

// Given a subreddit name, returns a channel of URLs to scrape:
func urlGenerator(seed string) chan string {
	out := make(chan string)
	base := fmt.Sprintf(base_url_fmt, seed)
	go (func() {
		out <- base
		for n := 1; n < max_pages; n++ {
			out <- fmt.Sprintf(next_url_fmt, seed, n)
		}
		close(out)
	})()
	return out
}

// Performs an HTTP GET, with the correct fake headers:
func httpGET(url string) (*http.Response, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("User-Agent", user_agent)
	return http.DefaultClient.Do(request)
}

// A stupid hack so we can manipulate our user-agent when fetching pages:
func buildGoQueryDocument(url string) (*goquery.Document, error) {
	resp, err := httpGET(url)
	if err != nil {
		return nil, err
	}
	return goquery.NewDocumentFromResponse(resp)
}

func extractFilename(link string) (string, error) {
	parsed, err := url.Parse(link)
	if err != nil {
		return "", err
	}
	return path.Base(parsed.Path), nil
}
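
// For example, extractFilename("https://i.imgur.com/abc1234.jpg") yields
// "abc1234.jpg" (hypothetical URL, shown only to illustrate the behaviour).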

// Download a file... Unless we already have it:
func maybeDownload(link string) {
	fname, err := extractFilename(link)
	if err != nil {
		log.Printf("Cannot download [%s] : Bad link. %v", link, err)
		return
	}
	stat, err := os.Stat(fname)
	if err == nil && stat.Size() > 0 {
		log.Printf("Already have '%s'. Skipping.", fname)
		return
	}
	destFile, err := os.Create(fname)
	if err != nil {
		log.Printf("Failed to create '%s': %v", fname, err)
		return
	}
	defer destFile.Close()
	httpResp, err := httpGET(link)
	if err != nil {
		log.Printf("Couldn't download '%s': %v", fname, err)
		return
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode > 299 {
		log.Printf("Download failed for '%s': Status code: %d", fname, httpResp.StatusCode)
		return
	}
	n, err := io.Copy(destFile, httpResp.Body)
	if err != nil {
		log.Printf("Download failed for '%s': %v", fname, err)
		return
	}
	log.Printf("Download successful: '%s' (%d bytes)", fname, n)
}
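
// Note on maybeDownload: because any existing non-empty file is skipped, the tool
// can be re-run against the same subreddit and will only fetch new posts. The flip
// side is that a partially written file left over from an interrupted run is also
// treated as "already have" and never re-downloaded.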

// Parses images and the data-page thing out of the entry lists:
func fetchUrlList(link string) (pageNum int, urls chan string) {
	pageNum, urls = -1, make(chan string)
	doc, err := buildGoQueryDocument(link)
	if err != nil {
		log.Printf("Failed to read URL: %s", link)
		close(urls)
		return
	}
	// We receive a single value on this, which is the page num:
	pageNumSent, pageNumChan := false, make(chan int)
	defer close(pageNumChan)
	go (func() {
		defer close(urls)
		doc.Find("a.image-list-link").Each(func(_ int, s *goquery.Selection) {
			page, pageExists := s.Attr("data-page")
			href, hrefExists := s.Attr("href")
			if pageExists && !pageNumSent {
				pageNo, _ := strconv.ParseInt(page, 10, 32)
				pageNumSent = true
				pageNumChan <- int(pageNo)
			}
			if hrefExists {
				urls <- href
			}
		})
		// If page was malformed, and/or had no usable content, just send back page -1
		if !pageNumSent {
			log.Printf("Page [%s] contained no usable data", link)
			pageNumChan <- -1
		}
	})()
	pageNum = <-pageNumChan
	return
}
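
// Note on fetchUrlList: the caller blocks on pageNumChan until the goroutine has
// either seen a data-page attribute or finished the page, and only then starts
// draining urls. This relies on the listing markup putting data-page on the same
// anchors that carry the hrefs (which imgur's gallery pages appear to do); if an
// href ever arrived before any data-page, the unbuffered urls channel could stall
// the goroutine before the page number is sent.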

func httpAJAX(detailLink string) ([]byte, error) {
	albumId, err := extractFilename(detailLink)
	if err != nil {
		return nil, err
	}
	albumUrl := fmt.Sprintf(ajax_url_fmt, albumId)
	resp, err := httpGET(albumUrl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode > 299 {
		return nil, fmt.Errorf("Bad status code: %d", resp.StatusCode)
	}
	return ioutil.ReadAll(resp.Body)
}

// Will use the AJAX endpoint to pluck all images in an album out:
func fetchAJAXUrls(detailLink string) chan string {
	out := make(chan string)
	data, err := httpAJAX(detailLink)
	if err != nil {
		log.Printf("AJAX fetch failed for [%s]: %v", detailLink, err)
		close(out)
		return out
	}
	go (func() {
		defer close(out)
		parsed := AJAXResponse{}
		err = json.Unmarshal(data, &parsed)
		if err != nil {
			log.Printf("AJAX Parse failed: %v", err)
			return
		}
		for _, img := range parsed.Data.Images {
			if imgUrl := img.GetURL(); imgUrl != "" {
				out <- imgUrl
			}
		}
	})()
	return out
}

// Given the URL to a post detail page, returns the URLs to download:
func fetchDownloadUrls(detailLink string) chan string {
	out := make(chan string)
	detailUrl := fmt.Sprintf(detail_url_fmt, detailLink)
	doc, err := buildGoQueryDocument(detailUrl)
	if err != nil {
		log.Printf("Failed to read detail URL: %s", detailUrl)
		close(out)
		return out
	}
	_maybeSend := func(s string, exists bool) {
		if exists && s != "" {
			fullUrl := fmt.Sprintf(download_fmt, s)
			out <- fullUrl
		}
	}
	go (func() {
		defer close(out)
		// Albums could have TONS of pics, so use AJAX if too many pics:
		if doc.Find("div.post-image").Length() >= album_cutoff {
			log.Printf("Large album: %s", detailLink)
			for linkz := range fetchAJAXUrls(detailLink) {
				out <- linkz
			}
			return
		}
		// Else, emit a single entry:
		doc.Find("div.post-image").Each(func(_ int, s *goquery.Selection) {
			_maybeSend(s.Find("img").Attr("src"))
			_maybeSend(s.Find("source").Attr("src"))
		})
	})()
	return out
}
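
// Note on fetchDownloadUrls: posts with album_cutoff or more images are routed
// through the AJAX album endpoint rather than scraping every div.post-image out
// of the HTML. The img/source "src" attributes scraped from the page are expected
// to be protocol-relative ("//i.imgur.com/..."), which is why download_fmt
// prepends "https:".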

// Will read from a channel, downloading links until the channel dies:
func imageWorker(urls chan string, workerName string) chan bool {
	out := make(chan bool)
	go (func() {
		defer close(out)
		log.Printf("Starting up worker: %s", workerName)
		for link := range urls {
			log.Printf("%s : Handling %s", workerName, link)
			for downloadMe := range fetchDownloadUrls(link) {
				log.Printf("%s : Found: %s", workerName, downloadMe)
				maybeDownload(downloadMe)
			}
		}
	})()
	return out
}

// Main func parses args, and sets things up:
func main() {
	verbose := flag.Bool("v", false, "Verbosely log what's happening")
	flag.Parse()
	target := flag.Arg(0)
	if target == "" {
		fmt.Fprintf(os.Stderr, "usage: %s [-v] <subreddit>\n", os.Args[0])
		os.Exit(1)
	}
	if !(*verbose) {
		log.SetOutput(ioutil.Discard)
	}
	imageChan := fetchAllImageLinks(target)
	var workers [num_workers]chan bool
	for i := range workers {
		name := fmt.Sprintf("Worker[%d]", i+1)
		workers[i] = imageWorker(imageChan, name)
	}
	// Wait for every worker's done-channel to close:
	for _, w := range workers {
		<-w
	}
	log.Printf("Done.")
}