@raydog
Last active March 29, 2016 18:27
Dumps entire subreddits out of imgur into the PWD.
// rDump -- Dumps images from an Imgur subreddit gallery into the current directory.
// Dependencies:
//   go get github.com/PuerkitoBio/goquery
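//
// Example usage (illustrative only; assumes the source is saved as rdump.go and
// uses the flag/argument handling in main() below):
//
//   go build -o rdump rdump.go
//   ./rdump -v pics     # dump /r/pics images into the current directory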
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"path"
	"strconv"
)

// Magic values go here:
const (
	user_agent     string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36"
	base_url_fmt   string = "https://imgur.com/r/%s"
	next_url_fmt   string = "https://imgur.com/r/%s/new/page/%d/hit?scrolled"
	detail_url_fmt string = "https://imgur.com%s"
	ajax_url_fmt   string = "https://imgur.com/ajaxalbums/getimages/%s/hit.json?all=true"
	ajax_img_fmt   string = "https://i.imgur.com/%s%s"
	download_fmt   string = "https:%s"
	album_cutoff   int    = 8
	num_workers    int    = 5
	max_pages      int    = 10
)
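
// Overview of the scraping pipeline (descriptive comment; see the functions below):
//
//   urlGenerator      -> listing-page URLs (front page plus up to max_pages "new" pages)
//   fetchUrlList      -> post links scraped from each listing page
//   fetchDownloadUrls -> direct image URLs for each post (AJAX endpoint for large albums)
//   maybeDownload     -> writes each image into the current working directory
//
// Listing pages are walked one at a time, while num_workers goroutines consume
// the post links concurrently.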

// Used for parsing the AJAX endpoints:
type PostDetail struct {
	Hash      string `json:"hash"`
	Title     string `json:"title"`
	Desc      string `json:"description"`
	Width     int    `json:"width"`
	Height    int    `json:"height"`
	Size      int    `json:"size"`
	Ext       string `json:"ext"`
	Anim      bool   `json:"animated"`
	PreferVid bool   `json:"prefer_video"`
	Looping   bool   `json:"looping"`
	Timestamp string `json:"datetime"`
}

type ListData struct {
	Count  int          `json:"count"`
	Images []PostDetail `json:"images"`
}

type AJAXResponse struct {
	Data    ListData `json:"data"`
	Success bool     `json:"success"`
	Status  int      `json:"status"`
}
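
// The structs above mirror imgur's hit.json album payload. A minimal example of
// the shape they decode (field names come from the json tags above; the values
// are invented purely for illustration):
//
//   {
//     "data": {
//       "count": 1,
//       "images": [
//         {"hash": "abc1234", "ext": ".jpg", "title": "", "description": "",
//          "width": 800, "height": 600, "size": 123456, "animated": false,
//          "prefer_video": false, "looping": false, "datetime": "2016-03-29 18:27:00"}
//       ]
//     },
//     "success": true,
//     "status": 200
//   }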

func (pd PostDetail) GetURL() string {
	if pd.Hash == "" || pd.Ext == "" {
		return ""
	}
	return fmt.Sprintf(ajax_img_fmt, pd.Hash, pd.Ext)
}

// From a subreddit name, fetches all urls from that subreddit:
func fetchAllImageLinks(subreddit string) chan string {
	// We give this channel a buffer, just so that page changes are less likely to
	// block image workers:
	out := make(chan string, 10)
	go (func() {
		defer close(out)
		for link := range urlGenerator(subreddit) {
			pageNo, linkChannel := fetchUrlList(link)
			log.Printf("Entering Page #%d : %s", pageNo, link)
			for link := range linkChannel {
				out <- link
			}
		}
	})()
	return out
}
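
// Note on fetchAllImageLinks: listing pages are fetched strictly in sequence;
// concurrency only kicks in downstream, where the imageWorker goroutines drain
// this channel.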

// Given a subreddit name, returns a channel of URLs to scrape:
func urlGenerator(seed string) chan string {
	out := make(chan string)
	base := fmt.Sprintf(base_url_fmt, seed)
	go (func() {
		out <- base
		for n := 1; n < max_pages; n++ {
			out <- fmt.Sprintf(next_url_fmt, seed, n)
		}
		close(out)
	})()
	return out
}

// Performs an HTTP GET, with the correct fake headers:
func httpGET(url string) (*http.Response, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("User-Agent", user_agent)
	return http.DefaultClient.Do(request)
}

// A stupid hack so we can manipulate our user-agent when fetching pages:
func buildGoQueryDocument(url string) (*goquery.Document, error) {
	resp, err := httpGET(url)
	if err != nil {
		return nil, err
	}
	return goquery.NewDocumentFromResponse(resp)
}

func extractFilename(link string) (string, error) {
	parsed, err := url.Parse(link)
	if err != nil {
		return "", err
	}
	return path.Base(parsed.Path), nil
}
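
// For example, extractFilename("https://i.imgur.com/abc1234.jpg") yields
// "abc1234.jpg" (hypothetical URL, shown only to illustrate the behaviour).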

// Download a file... Unless we already have it:
func maybeDownload(link string) {
	fname, err := extractFilename(link)
	if err != nil {
		log.Printf("Cannot download [%s] : Bad link. %v", link, err)
		return
	}
	stat, err := os.Stat(fname)
	if err == nil && stat.Size() > 0 {
		log.Printf("Already have '%s'. Skipping.", fname)
		return
	}
	destFile, err := os.Create(fname)
	if err != nil {
		log.Printf("Failed to create '%s': %v", fname, err)
		return
	}
	defer destFile.Close()
	httpResp, err := httpGET(link)
	if err != nil {
		log.Printf("Couldn't download '%s': %v", fname, err)
		return
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode > 299 {
		log.Printf("Download failed for '%s': Status code: %d", fname, httpResp.StatusCode)
		return
	}
	n, err := io.Copy(destFile, httpResp.Body)
	if err != nil {
		log.Printf("Download failed for '%s': %v", fname, err)
		return
	}
	log.Printf("Download successful: '%s' (%d bytes)", fname, n)
}
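
// Note on maybeDownload: because any existing non-empty file is skipped, the tool
// can be re-run against the same subreddit and will only fetch new posts. The flip
// side is that a partially written file left over from an interrupted run is also
// treated as "already have" and never re-downloaded.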

// Parses images and the data-page thing out of the entry lists:
func fetchUrlList(link string) (pageNum int, urls chan string) {
	pageNum, urls = -1, make(chan string)
	doc, err := buildGoQueryDocument(link)
	if err != nil {
		log.Printf("Failed to read URL: %s", link)
		close(urls)
		return
	}
	// We receive a single value on this, which is the page num:
	pageNumSent, pageNumChan := false, make(chan int)
	defer close(pageNumChan)
	go (func() {
		defer close(urls)
		doc.Find("a.image-list-link").Each(func(_ int, s *goquery.Selection) {
			page, pageExists := s.Attr("data-page")
			href, hrefExists := s.Attr("href")
			if pageExists && !pageNumSent {
				pageNo, _ := strconv.ParseInt(page, 10, 32)
				pageNumSent = true
				pageNumChan <- int(pageNo)
			}
			if hrefExists {
				urls <- href
			}
		})
		// If page was malformed, and/or had no usable content, just send back page -1
		if !pageNumSent {
			log.Printf("Page [%s] contained no usable data", link)
			pageNumChan <- -1
		}
	})()
	pageNum = <-pageNumChan
	return
}
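
// Note on fetchUrlList: the caller blocks on pageNumChan until the goroutine has
// either seen a data-page attribute or finished the page, and only then starts
// draining urls. This relies on the listing markup putting data-page on the same
// anchors that carry the hrefs (which imgur's gallery pages appear to do); if an
// href ever arrived before any data-page, the unbuffered urls channel could stall
// the goroutine before the page number is sent.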

func httpAJAX(detailLink string) ([]byte, error) {
	albumId, err := extractFilename(detailLink)
	if err != nil {
		return nil, err
	}
	albumUrl := fmt.Sprintf(ajax_url_fmt, albumId)
	resp, err := httpGET(albumUrl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode > 299 {
		return nil, fmt.Errorf("Bad status code: %d", resp.StatusCode)
	}
	return ioutil.ReadAll(resp.Body)
}

// Will use the AJAX endpoint to pluck all images in an album out:
func fetchAJAXUrls(detailLink string) chan string {
	out := make(chan string)
	data, err := httpAJAX(detailLink)
	if err != nil {
		log.Printf("AJAX fetch failed for [%s]: %v", detailLink, err)
		close(out)
		return out
	}
	go (func() {
		defer close(out)
		parsed := AJAXResponse{}
		err = json.Unmarshal(data, &parsed)
		if err != nil {
			log.Printf("AJAX Parse failed: %v", err)
			return
		}
		for _, img := range parsed.Data.Images {
			if imgUrl := img.GetURL(); imgUrl != "" {
				out <- imgUrl
			}
		}
	})()
	return out
}

// Given the URL to a post detail page, returns the URLs to download:
func fetchDownloadUrls(detailLink string) chan string {
	out := make(chan string)
	detailUrl := fmt.Sprintf(detail_url_fmt, detailLink)
	doc, err := buildGoQueryDocument(detailUrl)
	if err != nil {
		log.Printf("Failed to read detail URL: %s", detailUrl)
		close(out)
		return out
	}
	_maybeSend := func(s string, exists bool) {
		if exists && s != "" {
			fullUrl := fmt.Sprintf(download_fmt, s)
			out <- fullUrl
		}
	}
	go (func() {
		defer close(out)
		// Albums could have TONS of pics, so use AJAX if too many pics:
		if doc.Find("div.post-image").Length() >= album_cutoff {
			log.Printf("Large album: %s", detailLink)
			for linkz := range fetchAJAXUrls(detailLink) {
				out <- linkz
			}
			return
		}
		// Else, emit a single entry:
		doc.Find("div.post-image").Each(func(_ int, s *goquery.Selection) {
			_maybeSend(s.Find("img").Attr("src"))
			_maybeSend(s.Find("source").Attr("src"))
		})
	})()
	return out
}
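
// Note on fetchDownloadUrls: posts with album_cutoff or more images are routed
// through the AJAX album endpoint rather than scraping every div.post-image out
// of the HTML. The img/source "src" attributes scraped from the page are expected
// to be protocol-relative ("//i.imgur.com/..."), which is why download_fmt
// prepends "https:".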

// Will read from a channel, downloading links until the channel dies:
func imageWorker(urls chan string, workerName string) chan bool {
	out := make(chan bool)
	go (func() {
		defer close(out)
		log.Printf("Starting up worker: %s", workerName)
		for link := range urls {
			log.Printf("%s : Handling %s", workerName, link)
			for downloadMe := range fetchDownloadUrls(link) {
				log.Printf("%s : Found: %s", workerName, downloadMe)
				maybeDownload(downloadMe)
			}
		}
	})()
	return out
}

// Main func parses args, and sets things up:
func main() {
	verbose := flag.Bool("v", false, "Verbosely log what's happening")
	flag.Parse()
	target := flag.Arg(0)
	if target == "" {
		fmt.Fprintf(os.Stderr, "usage: %s [-v] <subreddit>\n", os.Args[0])
		os.Exit(1)
	}
	if !(*verbose) {
		log.SetOutput(ioutil.Discard)
	}
	imageChan := fetchAllImageLinks(target)
	var workers [num_workers]chan bool
	for i := range workers {
		name := fmt.Sprintf("Worker[%d]", i+1)
		workers[i] = imageWorker(imageChan, name)
	}
	// Wait for every worker's done-channel to close:
	for _, w := range workers {
		<-w
	}
	log.Printf("Done.")
}