@dustin
Created October 4, 2012 02:25
wikipedia file fetcher/uploader
package main

import (
	"crypto/md5"
	"encoding/hex"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"time"

	"code.google.com/p/dsallings-couch-go"
	"github.com/dustin/go-wikiparse"
)
var fileRE, nowikiRE *regexp.Regexp

var cbfsUrl = flag.String("cbfs", "http://localhost:8484/", "URL to cbfs")

// change is one row of a CouchDB changes feed, with the document body
// included (include_docs=true).
type change struct {
	Seq int64
	Id  string
	Doc struct {
		Text string
	}
}

func init() {
	nowikiRE = regexp.MustCompile(`(?ms)<nowiki>.*</nowiki>`)
	fileRE = regexp.MustCompile(`\[File:([^\|\]]+)`)
	// Disable keep-alives so long-running transfers don't accumulate
	// idle connections.
	http.DefaultTransport = &http.Transport{
		Proxy:             http.ProxyFromEnvironment,
		DisableKeepAlives: true,
	}
}
// maybefatal logs a fatal error and exits if err is non-nil.
func maybefatal(err error, msg string, args ...interface{}) {
	if err != nil {
		log.Fatalf(msg, args...)
	}
}

// cbfsUrlFor builds the cbfs destination URL for a file name, sharding
// the path by the first two hex digits of the name's MD5.
func cbfsUrlFor(name string) string {
	outu, err := url.Parse(*cbfsUrl)
	if err != nil {
		panic(err)
	}
	h := md5.New()
	h.Write([]byte(name))
	hash := hex.EncodeToString(h.Sum(nil))
	outu.Path = "/wikipedia/files/" + hash[0:1] + "/" + hash[1:2] + "/" + name
	return outu.String()
}
// exists reports whether cbfs already has the named file.
func exists(name string) bool {
	resp, err := http.Head(cbfsUrlFor(name))
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode == 200
}

// copyImage streams the file at inu into cbfs under the given name.
func copyImage(name, inu string) error {
	inresp, err := http.Get(inu)
	if err != nil {
		return err
	}
	defer inresp.Body.Close()
	if inresp.StatusCode != 200 {
		return fmt.Errorf("HTTP error in source (%v): %v",
			name, inresp.Status)
	}
	outreq, err := http.NewRequest("PUT", cbfsUrlFor(name), inresp.Body)
	if err != nil {
		return err
	}
	resp, err := http.DefaultClient.Do(outreq)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 201 {
		return fmt.Errorf("HTTP error writing content: %v", resp.Status)
	}
	return nil
}
// fetch pulls every [File:...] reference out of a change's document text
// and uploads any file cbfs doesn't already have.
func fetch(c change) error {
	log.Printf("Fetching stuff from %v (%v, %v bytes)",
		c.Id, c.Seq, len(c.Doc.Text))
	for _, img := range wikiparse.FindFiles(c.Doc.Text) {
		if exists(img) {
			log.Printf("Already have %v, skipping", img)
			continue
		}
		iu := wikiparse.URLForFile(img)
		var err error
		// Retry each copy up to three times, pausing briefly between
		// attempts.
		for i := 0; i < 3; i++ {
			err = copyImage(img, iu)
			if err == nil {
				break
			}
			time.Sleep(time.Second)
		}
		if err != nil {
			log.Printf("Error copying %v - %v", img, err)
		}
	}
	return nil
}

// fetcher consumes changes from ch, fetching the files each references.
func fetcher(ch <-chan change) {
	for c := range ch {
		err := fetch(c)
		if err != nil {
			log.Printf("Error fetching images from %v: %v", c.Id, err)
		}
	}
}
// feedBody decodes change entries from a changes feed response and sends
// them to results, returning the largest sequence number seen.
func feedBody(r io.Reader, results chan<- change) int64 {
	largest := int64(0)
	d := json.NewDecoder(r)
	for {
		thing := change{}
		err := d.Decode(&thing)
		if err != nil {
			// The continuous feed ends mid-stream, so treat EOF as
			// the end of this batch rather than a failure.
			if err == io.EOF || err == io.ErrUnexpectedEOF {
				return largest
			}
			log.Fatalf("Error decoding stuff: %#v", err)
		}
		results <- thing
		largest = thing.Seq
	}
}
func main() {
	startNum := flag.Int64("start", 0, "Starting change ID")
	numWorkers := flag.Int("workers", 4, "Number of workers")
	flag.Parse()

	db, err := couch.Connect(flag.Arg(0))
	maybefatal(err, "Error connecting: %v", err)

	// A pool of workers pulls changes off the channel and copies files.
	ch := make(chan change)
	for i := 0; i < *numWorkers; i++ {
		go fetcher(ch)
	}

	// Follow the CouchDB changes feed continuously, handing each change
	// (with its document included) to the workers.
	err = db.Changes(func(r io.Reader) int64 {
		return feedBody(r, ch)
	},
		map[string]interface{}{
			"since":        *startNum,
			"feed":         "continuous",
			"include_docs": true,
		})
	maybefatal(err, "Error following changes feed: %v", err)
}
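
A minimal sketch of how this might be run, assuming the CouchDB database of parsed wiki pages is passed as the first positional argument and cbfs is listening on its default port (the binary name and database URL below are illustrative, not part of the gist):

    go build -o wikifetch .
    ./wikifetch -workers=8 -start=0 http://localhost:5984/wikipedia

The -start flag lets a restarted run resume from a known sequence number, since it is passed as "since" to the continuous changes feed, and -cbfs can point the uploader at a cbfs node other than http://localhost:8484/.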