wylee/gist:3783735

## gistfile1.go
/*
This "script" scrapes a collection of images from Wikipedia.

It reads a file containing a list of URLs (that were previously scraped
from the index page of the collection). For each of those URLs, it
retrieves the pointed-to page, locates a particular image URL within that
page (using a regex), and then downloads the image.

The image is written in 1MB chunks; progress is reported as each chunk is
written. Downloaded images are named N.jpg where N corresponds to the line
number in the URL file.

If an image has already been downloaded, it will be skipped. A file is
considered already downloaded if it has a JPEG EOF marker (FF D9) AND it
has the same byte count as the source image; if only one of these
conditions is true, the program will abort (in that case, delete the
offending file and try again).

If an image has been partially downloaded, the next attempt at downloading
it will resume directly after the already downloaded bytes by adding a
Range header to the request.

Command line args:

	-inputFile The file containing the list of URLs, one per line
	-start Image to start from (line # in list of URLs); if not present,
	       start from the first incomplete or missing image
	-outputDir The directory into which images will be downloaded

TODO:

	- Resume after partial download
		- Resume after suspend
		- Retry after network failure
	- Allow downloading of a single file
*/
package main

import (
	"errors"
	"flag"
	"fmt"
	"image/jpeg"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"time"
)

// Tunables and message templates used by main and its helpers.
const (
	// chunkSize is the byte count passed to each io.CopyN call in
	// downloadImg (1 << 20 bytes).
	chunkSize    = 1 << 20
	statMsg      = "\rDownloading %d/%d %s (%d/%d %.2f%%)..." // index, total, name, bytes read, total bytes, percent read
	doneMsg      = "\nDownloaded %d/%d %s in %v."             // index, total, name, time
	// pauseTime is how long main sleeps after handling each image.
	pauseTime    = 2 * time.Second
	// imgUrlRegexp is the pattern extractImgUrl matches against a preview
	// page; its single capture group is the protocol-relative image URL.
	imgUrlRegexp = `<div class="fullImageLink".*><a href="(//upload.wikimedia.org/wikipedia/commons/.+/.+/.+\.jpg)">`
)

// Command-line settings; init binds each one to a flag and main reads them
// after calling flag.Parse.
var (
	start     int    // -start: image to start at (0 lets findStart decide)
	inputFile string // -inputFile: file containing URLs, one per line
	outputDir string // -outputDir: directory the images are written to
)

func init() {
	flag.StringVar(&inputFile, "inputFile", "", "File containing URLs, one per line")
	flag.IntVar(&start, "start", 0, "Image to start at")
	flag.StringVar(&outputDir, "outputDir", "", "Output directory")
}

// main drives the scrape: it parses flags, loads the list of preview page
// URLs, picks the first image to fetch, and then handles each remaining
// image in order, sleeping for pauseTime after every image.
//
// Every failure terminates the program through die. Exit codes used here:
// 1 for a bad input file or start position, 2 for preview page problems,
// 3 for a local file whose size and JPEG EOF marker disagree, 4 for
// download errors and 5 for an unusable HEAD response.
func main() {
	flag.Parse()

	// Get the list of preview page URLs from the specified file
	content, err := ioutil.ReadFile(inputFile)
	if err != nil {
		die(err, 1)
	}
	// Drop one trailing newline so it does not produce an empty last URL.
	if n := len(content); n > 0 && content[n-1] == '\n' {
		content = content[:n-1]
	}
	if len(content) == 0 {
		die(errors.New("Input file contains no URLs."), 1)
	}
	urls := strings.Split(string(content), "\n")

	numImgs := len(urls)

	first, err := findStart(start, outputDir, numImgs)
	if err != nil {
		die(err, 1)
	}
	// A start position beyond the list would panic when slicing below.
	if first < 1 || first > numImgs {
		die(fmt.Errorf("Start image %d is outside the range 1-%d.", first, numImgs), 1)
	}

	urls = urls[first-1:]

	fmt.Printf("Starting from image %d.\n", first)

	for i, pageUrl := range urls {
		startTime := time.Now()

		ordinal := first + i
		fileName := fmt.Sprintf("%d.jpg", ordinal)
		path := filepath.Join(outputDir, fileName)

		page, err := getPreviewPage(pageUrl)
		if err != nil {
			die(err, 2)
		}

		imgUrl, err := extractImgUrl(page)
		if err != nil {
			die(err, 2)
		}

		// Get last part of image URL for stat message; fall back to the
		// whole URL if the expected path segment is absent.
		shortName := imgUrl
		if parts := strings.Split(imgUrl, "wikipedia/commons"); len(parts) > 1 {
			shortName = parts[1]
		}

		// Make HEAD request to get total number of bytes
		headResp, err := http.Head(imgUrl)
		if err != nil {
			die(err, 5)
		}
		headResp.Body.Close()
		if headResp.StatusCode >= 300 {
			die(fmt.Errorf("HEAD request for %s failed: %s", fileName, headResp.Status), 5)
		}

		targetSize, err := strconv.ParseInt(headResp.Header.Get("Content-Length"), 10, 64)
		if err != nil || targetSize <= 0 {
			die(fmt.Errorf("HEAD response for %s has no usable Content-Length.", fileName), 5)
		}
		targetFile := NewTargetFile(path, ordinal, targetSize)

		// stat rewrites the progress line from the file's running byte count.
		stat := func() {
			bytesWritten := targetFile.currentSize
			percentage := float32(bytesWritten) / float32(targetSize) * 100
			fmt.Printf(statMsg, ordinal, numImgs, shortName, bytesWritten, targetSize, percentage)
		}

		stat()

		switch {
		case targetFile.isDownloaded():
			fmt.Printf("\n%s already downloaded; skipping.", path)
		case targetFile.endsWithJpegEofMarker() || targetFile.currentSizeIsTargetSize():
			die(errors.New("File appears to be complete OR byte count matches but not both; too confused to continue."), 3)
		default:
			for {
				err := downloadImg(imgUrl, targetFile, stat)
				// downloadImg reports io.EOF when the body ends inside the
				// final chunk and nil when it ends exactly on a chunk
				// boundary; both can mean success. Anything else is fatal.
				if err != nil && err != io.EOF {
					die(err, 4)
				}
				if targetFile.currentSize == targetSize {
					fmt.Printf(doneMsg, targetFile.ordinal, numImgs, shortName, time.Now().Sub(startTime))
					break
				}
				// The socket closed before all bytes arrived; the next
				// attempt resumes from the bytes already on disk.
				fmt.Printf("\nRetrying %s...\n", fileName)
			}
		}

		targetFile.Close() // XXX: Do this here or somewhere else?

		fmt.Printf("\nPausing for %d seconds...", (pauseTime / time.Second))
		time.Sleep(pauseTime)
	}
}

// TargetFile is the local file an image is downloaded into. It embeds
// *os.File and overrides Write so the number of bytes on disk is tracked
// as the download proceeds.
type TargetFile struct {
	*os.File
	ordinal     int   // position of the image in the URL list, as passed to NewTargetFile
	currentSize int64 // bytes currently in the file; advanced by Write
	targetSize  int64 // expected final size, as passed to NewTargetFile
}

// NewTargetFile opens the file at path for reading and writing, creating
// it if it does not exist, and returns a TargetFile whose currentSize is
// the size of any existing content (0 for a newly created file).
//
// Any failure to open, create or stat the file terminates the program via
// die with exit code 100, so the returned TargetFile always wraps a usable
// file handle.
func NewTargetFile(path string, ordinal int, targetSize int64) *TargetFile {
	var currentSize int64
	f, err := os.OpenFile(path, os.O_RDWR, 0664)
	switch {
	case os.IsNotExist(err):
		if f, err = os.Create(path); err != nil {
			die(err, 100)
		}
	case err != nil:
		die(err, 100)
	default:
		fileInfo, err := f.Stat()
		if err != nil {
			die(err, 100)
		}
		currentSize = fileInfo.Size()
	}
	return &TargetFile{
		File:        f,
		ordinal:     ordinal,
		currentSize: currentSize,
		targetSize:  targetSize,
	}
}

// Write writes b to the underlying file and adds the number of bytes
// actually written to currentSize, even when the write is short or fails.
func (f *TargetFile) Write(b []byte) (n int, err error) {
	written, writeErr := f.File.Write(b)
	f.currentSize += int64(written)
	return written, writeErr
}

// isDownloaded reports whether the file has reached its target size and
// also ends with the JPEG EOF marker.
func (f *TargetFile) isDownloaded() bool {
	if !f.currentSizeIsTargetSize() {
		return false
	}
	return f.endsWithJpegEofMarker()
}

// currentSizeIsTargetSize reports whether the tracked byte count equals
// the expected final size.
func (f *TargetFile) currentSizeIsTargetSize() bool {
	remaining := f.targetSize - f.currentSize
	return remaining == 0
}

// endsWithJpegEofMarker reports whether the embedded file ends with the
// JPEG EOF marker, delegating to the package-level function of the same
// name.
func (f *TargetFile) endsWithJpegEofMarker() bool {
	underlying := f.File
	return endsWithJpegEofMarker(underlying)
}

// isValidJpegFile reports whether f both decodes as a JPEG header (via
// jpeg.DecodeConfig) and ends with the JPEG EOF marker. The marker check
// is skipped when the header cannot be decoded.
func isValidJpegFile(f *os.File) bool {
	if _, err := jpeg.DecodeConfig(f); err != nil {
		return false
	}
	return endsWithJpegEofMarker(f)
}

// isJpegEofMarker reports whether marker is exactly the two bytes FF D9.
func isJpegEofMarker(marker []byte) bool {
	if len(marker) != 2 {
		return false
	}
	first, second := marker[0], marker[1]
	return first == 0xff && second == 0xd9
}

// endsWithJpegEofMarker reports whether the last two bytes of f are the
// JPEG EOF marker. It reads with ReadAt, so the file's current offset is
// not used for the read.
//
// It returns false when the file cannot be stat'ed, holds fewer than two
// bytes, or its final two bytes cannot be read.
func endsWithJpegEofMarker(f *os.File) bool {
	info, err := f.Stat()
	if err != nil {
		return false
	}
	size := info.Size()
	if size < 2 {
		return false
	}
	a := make([]byte, 2)
	// ReadAt always returns a non-nil error when it reads fewer bytes than
	// requested, so checking the count covers every failure.
	if n, _ := f.ReadAt(a, size-2); n < len(a) {
		return false
	}
	return isJpegEofMarker(a)
}

// findStart decides which image to download first.
//
// When start is positive it is returned unchanged. Otherwise outputDir is
// scanned for files named N.jpg, and the lowest ordinal in 1..numImgs
// whose file is missing or does not end with the JPEG EOF marker is
// returned.
//
// An error is returned when outputDir cannot be read, when a .jpg file has
// a non-numeric name, when a .jpg file cannot be opened, or when every
// image in 1..numImgs appears to be present. Files whose ordinal falls
// outside 1..numImgs are ignored.
func findStart(start int, outputDir string, numImgs int) (int, error) {
	if start > 0 {
		return start, nil
	}
	infoList, err := ioutil.ReadDir(outputDir)
	if err != nil {
		return 0, err
	}
	// complete[n] is true once n.jpg is known to end with the EOF marker.
	complete := make([]bool, numImgs+1)
	for _, info := range infoList {
		name := info.Name()
		ext := filepath.Ext(name)
		if ext != ".jpg" {
			continue
		}
		ordinal, err := strconv.Atoi(name[:len(name)-len(ext)])
		if err != nil {
			return 0, err
		}
		// Ordinals outside the URL list cannot be indexed; skip them.
		if ordinal < 1 || ordinal > numImgs {
			continue
		}
		f, err := os.Open(filepath.Join(outputDir, name))
		if err != nil {
			return 0, err
		}
		complete[ordinal] = endsWithJpegEofMarker(f)
		f.Close()
	}
	for i := 1; i <= numImgs; i++ {
		if !complete[i] {
			return i, nil
		}
	}
	return 0, errors.New("All images appear to be present")
}

// getPreviewPage fetches pageUrl with an HTTP GET and returns the full
// response body.
//
// It returns an error when the request fails, when the response status is
// 300 or above, or when the body cannot be read. The response body is
// closed before returning whenever a response was received.
func getPreviewPage(pageUrl string) ([]byte, error) {
	resp, err := http.Get(pageUrl)
	if err != nil {
		// resp is nil here, so there is no body to close.
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return nil, fmt.Errorf("Could not fetch preview page: %s", resp.Status)
	}
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}

// imgUrlRe is imgUrlRegexp compiled once at package initialization.
var imgUrlRe = regexp.MustCompile(imgUrlRegexp)

// extractImgUrl finds the full-size image link in a preview page and
// returns it with "http:" prepended to the captured protocol-relative
// URL. It returns an empty string and an error when the page does not
// match imgUrlRegexp.
func extractImgUrl(page []byte) (imgUrl string, err error) {
	m := imgUrlRe.FindSubmatch(page)
	if len(m) != 2 {
		return "", errors.New("Could not find image URL.")
	}
	return "http:" + string(m[1]), nil
}

// downloadImg requests imgUrl and writes the response body to the end of
// file in chunkSize pieces, calling stat after every chunk. When file
// already holds some but not all of the expected bytes, a Range header
// asks the server for only the remainder.
//
// Return values:
//   - io.EOF when the body ends inside the final chunk. This happens both
//     on a complete transfer and on a prematurely closed connection;
//     callers tell them apart by comparing file.currentSize with
//     file.targetSize.
//   - nil when the body ends exactly on a chunk boundary with all bytes
//     written.
//   - any other error for request failures, a response status of 300 or
//     above, a byte-count mismatch, or a seek/copy failure.
func downloadImg(imgUrl string, file *TargetFile, stat func()) error {
	req, err := http.NewRequest("GET", imgUrl, nil)
	if err != nil {
		return err
	}

	currentSize := file.currentSize
	targetSize := file.targetSize

	// Resume a partial download from the first byte not yet on disk.
	if currentSize > 0 && currentSize < targetSize {
		req.Header.Add("Range", fmt.Sprintf("bytes=%d-", currentSize))
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		// resp is nil here, so there is no body to close.
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the body only enriches the error message.
		b, _ := ioutil.ReadAll(resp.Body)
		return fmt.Errorf("%s\n%s\n", resp.Status, b)
	}

	contentLength := resp.ContentLength

	// The bytes on disk plus the bytes about to arrive must add up to the
	// expected total; otherwise the server did not honor the request as sent.
	if currentSize+contentLength != targetSize {
		return fmt.Errorf("Byte counts don't match: %d (file size + Range) != %d (total)\n", currentSize+contentLength, targetSize)
	}

	// Download the full size image in chunks
	if _, err := file.Seek(0, os.SEEK_END); err != nil {
		return err
	}
	for file.currentSize < targetSize {
		if _, err := io.CopyN(file, resp.Body, chunkSize); err != nil {
			stat()
			return err
		}
		stat()
	}

	return nil
}

// die writes err's message to standard error and exits the process with
// errCode.
//
// When extra arguments are supplied, the message is treated as a format
// string and a supplies its operands. With no extra arguments the message
// is printed verbatim, so any '%' it contains is left intact.
func die(err error, errCode int, a ...interface{}) {
	msg := err.Error()
	if len(a) > 0 {
		msg = fmt.Sprintf(msg, a...)
	}
	fmt.Fprintln(os.Stderr, msg)
	os.Exit(errCode)
}
	/*
	This "script" scrapes a collection of images from Wikipedia.

	It reads a file containing a list of URLs (that were previously scraped
	from the index page of the collection). For each of those URLs, it
	retrieves the pointed-to page, locates a particular image URL within that
	page (using a regex), and then downloads the image.

	The image is written in 1MB chunks; progress is reported as each chunk is
	written. Downloaded images are named N.jpg where N corresponds to the line
	number in the URL file.

	If an image has already been downloaded, it will be skipped. A file is
	considered already downloaded if it has a JPEG EOF marker (FF D9) AND it
	has the same byte count as the source image; if only one of these
	conditions is true, the program will abort (in that case, delete the
	offending file and try again).

	If an image has been partially downloaded, the next attempt at downloading
	it will resume directly after the already downloaded bytes by adding a
	Range header to the request.

	Command line args:

	-inputFile The file containing the list of URLs, one per line
	-start Image to start from (line # in list of URLs); if not present,
	start from the first incomplete or missing image
	-outputDir The directory into which images will be downloaded

	TODO:

	- Resume after partial download
	- Resume after suspend
	- Retry after network failure
	- Allow downloading of a single file
	*/
	package main

	import (
	"errors"
	"flag"
	"fmt"
	"image/jpeg"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"time"
	)

	// Tunables and message templates used by main and its helpers.
	const (
	// chunkSize is the byte count passed to each io.CopyN call in
	// downloadImg (1 << 20 bytes).
	chunkSize = 1 << 20
	statMsg = "\rDownloading %d/%d %s (%d/%d %.2f%%)..." // index, total, name, bytes read, total bytes, percent read
	doneMsg = "\nDownloaded %d/%d %s in %v." // index, total, name, time
	// pauseTime is how long main sleeps after handling each image.
	pauseTime = 2 * time.Second
	// imgUrlRegexp is the pattern extractImgUrl matches against a preview
	// page; its single capture group is the protocol-relative image URL.
	imgUrlRegexp = `<div class="fullImageLink".*><a href="(//upload.wikimedia.org/wikipedia/commons/.+/.+/.+\.jpg)">`
	)

	// Command-line settings; init binds each one to a flag and main reads
	// them after calling flag.Parse.
	var (
	start int // -start: image to start at (0 lets findStart decide)
	inputFile string // -inputFile: file containing URLs, one per line
	outputDir string // -outputDir: directory the images are written to
	)

	func init() {
	flag.StringVar(&inputFile, "inputFile", "", "File containing URLs, one per line")
	flag.IntVar(&start, "start", 0, "Image to start at")
	flag.StringVar(&outputDir, "outputDir", "", "Output directory")
	}

	func main() {
	flag.Parse()

	// Get the list of preview page URLs from the specified file
	content, _ := ioutil.ReadFile(inputFile)
	if content[len(content)-1] == '\n' {
	content = content[:len(content)-1]
	}
	urls := strings.Split(string(content), "\n")

	numImgs := len(urls)

	start, err := findStart(start, outputDir, numImgs)
	if err != nil {
	die(err, 1)
	}

	urls = urls[start-1:]

	fmt.Printf("Starting from image %d.\n", start)

	for i, pageUrl := range urls {
	startTime := time.Now()

	ordinal := start + i
	fileName := fmt.Sprintf("%d.jpg", ordinal)
	path := filepath.Join(outputDir, fileName)

	page, err := getPreviewPage(pageUrl)
	if err != nil {
	die(err, 2)
	}

	imgUrl, err := extractImgUrl(page)
	if err != nil {
	die(err, 2)
	}

	// Get last part of image URL for stat message
	shortName := strings.Split(imgUrl, "wikipedia/commons")[1]

	// Make HEAD request to get total number of bytes
	headResp, _ := http.Head(imgUrl)
	headResp.Body.Close()

	targetSize, _ := strconv.ParseInt(headResp.Header["Content-Length"][0], 10, 64)
	targetFile := NewTargetFile(path, ordinal, targetSize)

	stat := func() {
	bytesWritten := targetFile.currentSize
	percentage := float32(bytesWritten) / float32(targetSize) * 100
	fmt.Printf(statMsg, ordinal, numImgs, shortName, bytesWritten, targetSize, percentage)
	}

	stat()

	if targetFile.isDownloaded() {
	fmt.Printf("\n%s already downloaded; skipping.", path)
	} else if targetFile.endsWithJpegEofMarker() \|\| targetFile.currentSizeIsTargetSize() {
	die(errors.New("File appears to be complete OR byte count matches but not both; too confused to continue."), 3)
	} else {
	for {
	err := downloadImg(imgUrl, targetFile, stat)
	// If we get an EOF here, it's from the TCP socket.
	// This will happen when the image is fully transferred.
	// It will also happen if the socket gets closed prematurely.
	// In the latter case, retry.
	if err != nil {
	if err == io.EOF {
	if targetFile.currentSize == targetSize {
	fmt.Printf(doneMsg, targetFile.ordinal, numImgs, shortName, time.Now().Sub(startTime))
	break
	}
	fmt.Printf("\nRetrying %s...\n", fileName)
	} else {
	die(err, 4)
	}
	}
	}
	}

	targetFile.Close() // XXX: Do this here or somewhere else?

	fmt.Printf("\nPausing for %d seconds...", (pauseTime / time.Second))
	time.Sleep(pauseTime)
	}
	}

	// TargetFile is the local file an image is downloaded into. It embeds
	// *os.File and overrides Write so the number of bytes on disk is tracked
	// as the download proceeds.
	type TargetFile struct {
	*os.File
	ordinal int // position of the image in the URL list, as passed to NewTargetFile
	currentSize int64 // bytes currently in the file; advanced by Write
	targetSize int64 // expected final size, as passed to NewTargetFile
	}

	// NewTargetFile opens the file at path for reading and writing, creating
	// it if it does not exist, and returns a TargetFile whose currentSize is
	// the size of any existing content (0 for a newly created file).
	//
	// Any failure to open, create or stat the file terminates the program via
	// die with exit code 100, so the returned TargetFile always wraps a usable
	// file handle.
	func NewTargetFile(path string, ordinal int, targetSize int64) *TargetFile {
		var currentSize int64
		f, err := os.OpenFile(path, os.O_RDWR, 0664)
		switch {
		case os.IsNotExist(err):
			if f, err = os.Create(path); err != nil {
				die(err, 100)
			}
		case err != nil:
			die(err, 100)
		default:
			fileInfo, err := f.Stat()
			if err != nil {
				die(err, 100)
			}
			currentSize = fileInfo.Size()
		}
		return &TargetFile{
			File:        f,
			ordinal:     ordinal,
			currentSize: currentSize,
			targetSize:  targetSize,
		}
	}

	// Write writes b to the underlying file and adds the number of bytes
	// actually written to currentSize, even when the write is short or fails.
	func (f *TargetFile) Write(b []byte) (n int, err error) {
		written, writeErr := f.File.Write(b)
		f.currentSize += int64(written)
		return written, writeErr
	}

	// isDownloaded reports whether the file has reached its target size and
	// also ends with the JPEG EOF marker.
	func (f *TargetFile) isDownloaded() bool {
		if !f.currentSizeIsTargetSize() {
			return false
		}
		return f.endsWithJpegEofMarker()
	}

	// currentSizeIsTargetSize reports whether the tracked byte count equals
	// the expected final size.
	func (f *TargetFile) currentSizeIsTargetSize() bool {
		remaining := f.targetSize - f.currentSize
		return remaining == 0
	}

	// endsWithJpegEofMarker reports whether the embedded file ends with the
	// JPEG EOF marker, delegating to the package-level function of the same
	// name.
	func (f *TargetFile) endsWithJpegEofMarker() bool {
		underlying := f.File
		return endsWithJpegEofMarker(underlying)
	}

	// isValidJpegFile reports whether f both decodes as a JPEG header (via
	// jpeg.DecodeConfig) and ends with the JPEG EOF marker. The marker check
	// is skipped when the header cannot be decoded.
	func isValidJpegFile(f *os.File) bool {
		if _, err := jpeg.DecodeConfig(f); err != nil {
			return false
		}
		return endsWithJpegEofMarker(f)
	}

	// isJpegEofMarker reports whether marker is exactly the two bytes FF D9.
	func isJpegEofMarker(marker []byte) bool {
		if len(marker) != 2 {
			return false
		}
		first, second := marker[0], marker[1]
		return first == 0xff && second == 0xd9
	}

	// endsWithJpegEofMarker reports whether the last two bytes of f are the
	// JPEG EOF marker. It reads with ReadAt, so the file's current offset is
	// not used for the read.
	//
	// It returns false when the file cannot be stat'ed, holds fewer than two
	// bytes, or its final two bytes cannot be read.
	func endsWithJpegEofMarker(f *os.File) bool {
		info, err := f.Stat()
		if err != nil {
			return false
		}
		size := info.Size()
		if size < 2 {
			return false
		}
		a := make([]byte, 2)
		// ReadAt always returns a non-nil error when it reads fewer bytes than
		// requested, so checking the count covers every failure.
		if n, _ := f.ReadAt(a, size-2); n < len(a) {
			return false
		}
		return isJpegEofMarker(a)
	}

	// findStart decides which image to download first.
	//
	// When start is positive it is returned unchanged. Otherwise outputDir is
	// scanned for files named N.jpg, and the lowest ordinal in 1..numImgs
	// whose file is missing or does not end with the JPEG EOF marker is
	// returned.
	//
	// An error is returned when outputDir cannot be read, when a .jpg file has
	// a non-numeric name, when a .jpg file cannot be opened, or when every
	// image in 1..numImgs appears to be present. Files whose ordinal falls
	// outside 1..numImgs are ignored.
	func findStart(start int, outputDir string, numImgs int) (int, error) {
		if start > 0 {
			return start, nil
		}
		infoList, err := ioutil.ReadDir(outputDir)
		if err != nil {
			return 0, err
		}
		// complete[n] is true once n.jpg is known to end with the EOF marker.
		complete := make([]bool, numImgs+1)
		for _, info := range infoList {
			name := info.Name()
			ext := filepath.Ext(name)
			if ext != ".jpg" {
				continue
			}
			ordinal, err := strconv.Atoi(name[:len(name)-len(ext)])
			if err != nil {
				return 0, err
			}
			// Ordinals outside the URL list cannot be indexed; skip them.
			if ordinal < 1 || ordinal > numImgs {
				continue
			}
			f, err := os.Open(filepath.Join(outputDir, name))
			if err != nil {
				return 0, err
			}
			complete[ordinal] = endsWithJpegEofMarker(f)
			f.Close()
		}
		for i := 1; i <= numImgs; i++ {
			if !complete[i] {
				return i, nil
			}
		}
		return 0, errors.New("All images appear to be present")
	}

	// getPreviewPage fetches pageUrl with an HTTP GET and returns the full
	// response body.
	//
	// It returns an error when the request fails, when the response status is
	// 300 or above, or when the body cannot be read. The response body is
	// closed before returning whenever a response was received.
	func getPreviewPage(pageUrl string) ([]byte, error) {
		resp, err := http.Get(pageUrl)
		if err != nil {
			// resp is nil here, so there is no body to close.
			return nil, err
		}
		defer resp.Body.Close()
		if resp.StatusCode >= 300 {
			return nil, fmt.Errorf("Could not fetch preview page: %s", resp.Status)
		}
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			return nil, err
		}
		return body, nil
	}

	func extractImgUrl(page []byte) (imgUrl string, err error) {
	re, _ := regexp.Compile(imgUrlRegexp)
	m := re.FindSubmatch(page)
	if m == nil \|\| len(m) != 2 {
	err = errors.New("Could not find image URL.")
	}
	return fmt.Sprintf("http:%s", m[1]), err
	}

	// downloadImg requests imgUrl and writes the response body to the end of
	// file in chunkSize pieces, calling stat after every chunk. When file
	// already holds some but not all of the expected bytes, a Range header
	// asks the server for only the remainder.
	//
	// Return values:
	//   - io.EOF when the body ends inside the final chunk. This happens both
	//     on a complete transfer and on a prematurely closed connection;
	//     callers tell them apart by comparing file.currentSize with
	//     file.targetSize.
	//   - nil when the body ends exactly on a chunk boundary with all bytes
	//     written.
	//   - any other error for request failures, a response status of 300 or
	//     above, a byte-count mismatch, or a seek/copy failure.
	func downloadImg(imgUrl string, file *TargetFile, stat func()) error {
		req, err := http.NewRequest("GET", imgUrl, nil)
		if err != nil {
			return err
		}

		currentSize := file.currentSize
		targetSize := file.targetSize

		// Resume a partial download from the first byte not yet on disk.
		if currentSize > 0 && currentSize < targetSize {
			req.Header.Add("Range", fmt.Sprintf("bytes=%d-", currentSize))
		}

		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			// resp is nil here, so there is no body to close.
			return err
		}
		defer resp.Body.Close()

		if resp.StatusCode >= 300 {
			// Best effort: the body only enriches the error message.
			b, _ := ioutil.ReadAll(resp.Body)
			return fmt.Errorf("%s\n%s\n", resp.Status, b)
		}

		contentLength := resp.ContentLength

		// The bytes on disk plus the bytes about to arrive must add up to the
		// expected total; otherwise the server did not honor the request as sent.
		if currentSize+contentLength != targetSize {
			return fmt.Errorf("Byte counts don't match: %d (file size + Range) != %d (total)\n", currentSize+contentLength, targetSize)
		}

		// Download the full size image in chunks
		if _, err := file.Seek(0, os.SEEK_END); err != nil {
			return err
		}
		for file.currentSize < targetSize {
			if _, err := io.CopyN(file, resp.Body, chunkSize); err != nil {
				stat()
				return err
			}
			stat()
		}

		return nil
	}

	// die writes err's message to standard error and exits the process with
	// errCode.
	//
	// When extra arguments are supplied, the message is treated as a format
	// string and a supplies its operands. With no extra arguments the message
	// is printed verbatim, so any '%' it contains is left intact.
	func die(err error, errCode int, a ...interface{}) {
		msg := err.Error()
		if len(a) > 0 {
			msg = fmt.Sprintf(msg, a...)
		}
		fmt.Fprintln(os.Stderr, msg)
		os.Exit(errCode)
	}