Skip to content

Instantly share code, notes, and snippets.

@wylee
Created September 25, 2012 18:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wylee/3783735 to your computer and use it in GitHub Desktop.
Save wylee/3783735 to your computer and use it in GitHub Desktop.
My first Go program
/*
This "script" scrapes a collection of images from Wikipedia.
It reads a file containing a list of URLs (that were previously scraped
from the index page of the collection). For each of those URLs, it
retrieves the pointed-to page, locates a particular image URL within that
page (using a regex), and then downloads the image.
The image is written in 1MB chunks; progress is reported as each chunk is
written. Downloaded images are named N.jpg where N corresponds to the line
number in the URL file.
If an image has already been downloaded, it will be skipped. A file is
considered already downloaded if it has a JPEG EOF marker (FF D9) AND it
has the same byte count as the source image; if only one of these
conditions is true, the program will abort (in that case, delete the
offending file and try again).
If an image has been partially downloaded, the next attempt at downloading
it will resume directly after the already downloaded bytes by adding a
Range header to the request.
Command line args:
-inputFile The file containing the list of URLs, one per line
-start Image to start from (line # in list of URLs); if not present,
start from the first incomplete or missing image
-outputDir The directory into which images will be downloaded
TODO:
- Resume after partial download
- Resume after suspend
- Retry after network failure
- Allow downloading of a single file
*/
package main
import (
"errors"
"flag"
"fmt"
"image/jpeg"
"io"
"io/ioutil"
"net/http"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"time"
)
// Tunables and message templates used by the download loop.
const (
	// imgUrlRegexp captures the protocol-relative URL of the full-size JPEG
	// inside the "fullImageLink" div of a preview page.
	imgUrlRegexp = `<div class="fullImageLink".*><a href="(//upload.wikimedia.org/wikipedia/commons/.+/.+/.+\.jpg)">`

	// chunkSize is the number of bytes copied between progress reports (1 MiB).
	chunkSize = 1024 * 1024

	// pauseTime is how long main sleeps after handling each image.
	pauseTime = time.Second * 2

	// statMsg arguments: index, total, name, bytes read, total bytes, percent read.
	statMsg = "\rDownloading %d/%d %s (%d/%d %.2f%%)..."

	// doneMsg arguments: index, total, name, elapsed time.
	doneMsg = "\nDownloaded %d/%d %s in %v."
)
// Command-line settings. They are bound to flags in init and filled in when
// main calls flag.Parse.
var (
	inputFile string // -inputFile: file listing the page URLs, one per line
	outputDir string // -outputDir: directory the images are written to
	start     int    // -start: image to begin with; 0 lets findStart pick one
)
func init() {
flag.StringVar(&inputFile, "inputFile", "", "File containing URLs, one per line")
flag.IntVar(&start, "start", 0, "Image to start at")
flag.StringVar(&outputDir, "outputDir", "", "Output directory")
}
// main reads the list of preview-page URLs, determines which image to start
// from, and then processes every remaining image in order: fetch the preview
// page, extract the image URL, learn the image size with a HEAD request, and
// download (or resume, or skip) the image file.
//
// Exit codes passed to die: 1 for problems with the URL list or the start
// position, 2 for failures locating an image or its size, 3 for an existing
// file that is inconsistent with the remote image, 4 for download failures.
func main() {
	flag.Parse()

	// Get the list of preview page URLs from the specified file.
	content, err := ioutil.ReadFile(inputFile)
	if err != nil {
		die(err, 1)
	}
	// Drop one trailing newline so the split does not produce an empty URL.
	list := strings.TrimSuffix(string(content), "\n")
	if list == "" {
		die(errors.New("Input file contains no URLs."), 1)
	}
	urls := strings.Split(list, "\n")
	numImgs := len(urls)

	first, err := findStart(start, outputDir, numImgs)
	if err != nil {
		die(err, 1)
	}
	// An explicit -start value comes back from findStart unchecked; make sure
	// it indexes into the URL list before slicing.
	if first < 1 || first > numImgs {
		die(fmt.Errorf("Start image %d is out of range (1-%d).", first, numImgs), 1)
	}
	fmt.Printf("Starting from image %d.\n", first)

	for i, pageUrl := range urls[first-1:] {
		startTime := time.Now()
		ordinal := first + i
		fileName := fmt.Sprintf("%d.jpg", ordinal)
		path := filepath.Join(outputDir, fileName)

		page, err := getPreviewPage(pageUrl)
		if err != nil {
			die(err, 2)
		}
		imgUrl, err := extractImgUrl(page)
		if err != nil {
			die(err, 2)
		}

		// Get last part of image URL for stat message; fall back to the whole
		// URL if it does not have the expected shape.
		shortName := imgUrl
		if parts := strings.Split(imgUrl, "wikipedia/commons"); len(parts) > 1 {
			shortName = parts[1]
		}

		// Make HEAD request to get total number of bytes.
		headResp, err := http.Head(imgUrl)
		if err != nil {
			die(err, 2)
		}
		headResp.Body.Close()
		if headResp.StatusCode >= 300 {
			die(fmt.Errorf("HEAD request for %s failed: %s", fileName, headResp.Status), 2)
		}
		targetSize, err := strconv.ParseInt(headResp.Header.Get("Content-Length"), 10, 64)
		if err != nil {
			die(fmt.Errorf("HEAD request for %s returned no usable Content-Length: %v", fileName, err), 2)
		}

		targetFile := NewTargetFile(path, ordinal, targetSize)
		stat := func() {
			bytesWritten := targetFile.currentSize
			percentage := float32(bytesWritten) / float32(targetSize) * 100
			fmt.Printf(statMsg, ordinal, numImgs, shortName, bytesWritten, targetSize, percentage)
		}
		stat()

		switch {
		case targetFile.isDownloaded():
			fmt.Printf("\n%s already downloaded; skipping.", path)
		case targetFile.endsWithJpegEofMarker() || targetFile.currentSizeIsTargetSize():
			die(errors.New("File appears to be complete OR byte count matches but not both; too confused to continue."), 3)
		default:
			for {
				err := downloadImg(imgUrl, targetFile, stat)
				// io.EOF / io.ErrUnexpectedEOF mean the response body ended,
				// either because the image is complete or because the
				// connection closed early. Anything else is fatal.
				if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
					die(err, 4)
				}
				// A nil error also ends up here: downloadImg returns nil when
				// the last chunk was exactly chunkSize bytes long.
				if targetFile.currentSize == targetSize {
					fmt.Printf(doneMsg, targetFile.ordinal, numImgs, shortName, time.Since(startTime))
					break
				}
				fmt.Printf("\nRetrying %s...\n", fileName)
				// Wait before reconnecting so a flaky server is not hammered.
				time.Sleep(pauseTime)
			}
		}

		if err := targetFile.Close(); err != nil {
			die(err, 4)
		}
		fmt.Printf("\nPausing for %d seconds...", (pauseTime / time.Second))
		time.Sleep(pauseTime)
	}
}
// TargetFile is the local file an image is written to, plus the byte counts
// used to report progress and to decide whether the download is complete.
// The field order is relied on by positional literals; do not reorder.
type TargetFile struct {
	*os.File // the open file on disk

	ordinal                 int   // number of the image (N in N.jpg)
	currentSize, targetSize int64 // bytes on disk so far / bytes expected in total
}
// NewTargetFile opens the file at path for reading and writing, creating it
// if it does not exist, and returns a TargetFile whose currentSize is the
// file's present length (0 for a new file). An existing file is never
// truncated, so a partial download can be resumed.
//
// Any failure to open or stat the file terminates the program through die
// with exit code 100.
func NewTargetFile(path string, ordinal int, targetSize int64) *TargetFile {
	// O_CREATE without O_TRUNC covers both the "missing" and the "already
	// there" case in a single call; 0666 is the mode os.Create would use.
	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		die(err, 100)
	}
	info, err := f.Stat()
	if err != nil {
		f.Close()
		die(err, 100)
	}
	return &TargetFile{f, ordinal, info.Size(), targetSize}
}
// Write writes b to the underlying file and adds the number of bytes that
// were actually written to currentSize, even when the write reports an error.
func (f *TargetFile) Write(b []byte) (n int, err error) {
	written, writeErr := f.File.Write(b)
	f.currentSize += int64(written)
	return written, writeErr
}
// isDownloaded reports whether the file has the expected byte count and also
// ends with the JPEG end marker. The file is only inspected when the sizes
// already agree.
func (f *TargetFile) isDownloaded() bool {
	if !f.currentSizeIsTargetSize() {
		return false
	}
	return f.endsWithJpegEofMarker()
}
// currentSizeIsTargetSize reports whether the tracked byte count equals the
// expected total size.
func (f *TargetFile) currentSizeIsTargetSize() bool {
	sizesMatch := f.targetSize == f.currentSize
	return sizesMatch
}
// endsWithJpegEofMarker reports whether the underlying file ends with the
// JPEG end marker; it delegates to the package-level function of the same
// name.
func (f *TargetFile) endsWithJpegEofMarker() bool {
	underlying := f.File
	return endsWithJpegEofMarker(underlying)
}
// isValidJpegFile reports whether f both decodes as a JPEG header and ends
// with the JPEG EOF marker. The header is read from f's current offset, so
// the read position of f is advanced by this call.
func isValidJpegFile(f *os.File) bool {
	if _, err := jpeg.DecodeConfig(f); err != nil {
		return false
	}
	return endsWithJpegEofMarker(f)
}
// isJpegEofMarker reports whether marker is exactly the two bytes FF D9.
func isJpegEofMarker(marker []byte) bool {
	if len(marker) != 2 {
		return false
	}
	first, second := marker[0], marker[1]
	return first == 0xff && second == 0xd9
}
// endsWithJpegEofMarker reports whether the last two bytes of f are the JPEG
// EOF marker (FF D9). It reads with ReadAt, so f's current offset is left
// alone.
//
// It returns false when the file is shorter than two bytes, and also when
// the file cannot be stat'ed or read: a file that cannot be inspected is
// treated as not complete.
func endsWithJpegEofMarker(f *os.File) bool {
	info, err := f.Stat()
	if err != nil {
		return false
	}
	size := info.Size()
	if size < 2 {
		return false
	}
	tail := make([]byte, 2)
	// ReadAt returns a non-nil error whenever it reads fewer bytes than
	// requested, so a nil error guarantees both bytes are valid.
	if _, err := f.ReadAt(tail, size-2); err != nil {
		return false
	}
	return isJpegEofMarker(tail)
}
// findStart decides which image to start downloading from.
//
// If start is greater than zero it is returned as-is (it is not checked
// against numImgs). Otherwise outputDir is scanned for files named N.jpg and
// the result is the smallest ordinal in 1..numImgs whose file is either
// missing or does not end with the JPEG EOF marker.
//
// An error is returned when outputDir cannot be read, when a .jpg file has a
// non-numeric name, when a .jpg file cannot be opened, or when every image
// in 1..numImgs is already present and complete. Files whose number lies
// outside 1..numImgs are ignored.
func findStart(start int, outputDir string, numImgs int) (int, error) {
	if start > 0 {
		return start, nil
	}
	infoList, err := ioutil.ReadDir(outputDir)
	if err != nil {
		return 0, err
	}

	// complete[n] is true once n.jpg has been seen with an EOF marker.
	// Index 0 is unused because ordinals are 1-based.
	complete := make([]bool, numImgs+1)
	for _, info := range infoList {
		name := info.Name()
		ext := filepath.Ext(name)
		if ext != ".jpg" {
			continue
		}
		ordinal, err := strconv.Atoi(name[:len(name)-len(ext)])
		if err != nil {
			return 0, err
		}
		// A stray file such as 0.jpg or 9999.jpg does not correspond to any
		// URL in the list; skip it rather than index out of range.
		if ordinal < 1 || ordinal > numImgs {
			continue
		}
		f, err := os.Open(filepath.Join(outputDir, name))
		if err != nil {
			return 0, err
		}
		finished := endsWithJpegEofMarker(f)
		f.Close()
		if finished {
			complete[ordinal] = true
		}
	}

	for ordinal := 1; ordinal <= numImgs; ordinal++ {
		if !complete[ordinal] {
			return ordinal, nil
		}
	}
	return 0, errors.New("All images appear to be present")
}
// getPreviewPage fetches pageUrl with an HTTP GET and returns the response
// body.
//
// It returns a nil slice and an error when the request fails, when the final
// response status is 300 or above, or when the body cannot be read in full.
func getPreviewPage(pageUrl string) ([]byte, error) {
	resp, err := http.Get(pageUrl)
	if err != nil {
		// resp is nil on error, so it must not be touched (or closed) here.
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return nil, fmt.Errorf("unexpected status %s for %s", resp.Status, pageUrl)
	}
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
// imgUrlPattern is imgUrlRegexp compiled once at package initialization.
var imgUrlPattern = regexp.MustCompile(imgUrlRegexp)

// extractImgUrl searches a preview page for the full-size image link and
// returns its URL with "http:" prepended to the protocol-relative match.
//
// When the page contains no such link it returns an empty string and an
// error.
func extractImgUrl(page []byte) (imgUrl string, err error) {
	m := imgUrlPattern.FindSubmatch(page)
	// m is nil when nothing matched; otherwise it holds the whole match
	// followed by the single capture group.
	if len(m) != 2 {
		return "", errors.New("Could not find image URL.")
	}
	return fmt.Sprintf("http:%s", m[1]), nil
}
// downloadImg requests imgUrl and writes the response body to the end of
// file in chunkSize pieces, calling stat after every chunk.
//
// If file already holds some, but not all, of the image, a Range header asks
// the server for only the remaining bytes. The response's Content-Length
// plus the bytes already on disk must equal file.targetSize; otherwise an
// error is returned before anything is written.
//
// Return value: the error from io.CopyN that ended the copy, which is io.EOF
// whenever the body ends before a full chunk was read (the usual case for
// the final chunk). nil is returned only if the loop finishes because
// file.currentSize reached targetSize on a chunk boundary. Callers decide
// completeness by comparing file.currentSize with the target size. Request,
// status (>= 300), size-mismatch, and seek failures are returned as errors.
func downloadImg(imgUrl string, file *TargetFile, stat func()) error {
	req, err := http.NewRequest("GET", imgUrl, nil)
	if err != nil {
		return err
	}
	currentSize := file.currentSize
	targetSize := file.targetSize
	if currentSize > 0 && currentSize < targetSize {
		req.Header.Add("Range", fmt.Sprintf("bytes=%d-", currentSize))
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		// resp is nil on error; there is no body to close.
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the body only enriches the message, so a read error
		// here is deliberately ignored.
		b, _ := ioutil.ReadAll(resp.Body)
		return fmt.Errorf("%s\n%s\n", resp.Status, b)
	}

	contentLength := resp.ContentLength
	if currentSize+contentLength != targetSize {
		return fmt.Errorf("Byte counts don't match: %d (file size + Range) != %d (total)\n", currentSize+contentLength, targetSize)
	}

	// Download the full size image in chunks, appending after whatever is
	// already on disk.
	if _, err := file.Seek(0, os.SEEK_END); err != nil {
		return err
	}
	for file.currentSize < targetSize {
		_, err := io.CopyN(file, resp.Body, chunkSize)
		stat()
		if err != nil {
			return err
		}
	}
	return nil
}
// die writes an error message to standard error and terminates the program
// with errCode.
//
// When extra arguments are supplied, err's text is used as a fmt format
// string for them. Without extra arguments the text is printed verbatim, so
// messages that happen to contain '%' (for example URL escapes) are not
// mangled.
func die(err error, errCode int, a ...interface{}) {
	msg := err.Error()
	if len(a) > 0 {
		msg = fmt.Sprintf(msg, a...)
	}
	fmt.Fprintln(os.Stderr, msg)
	os.Exit(errCode)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment