Created
October 4, 2012 02:25
-
-
Save dustin/3831133 to your computer and use it in GitHub Desktop.
wikipedia file fetcher/uploader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"crypto/md5" | |
"encoding/hex" | |
"encoding/json" | |
"flag" | |
"fmt" | |
"io" | |
"log" | |
"net/http" | |
"net/url" | |
"regexp" | |
"time" | |
"code.google.com/p/dsallings-couch-go" | |
"github.com/dustin/go-wikiparse" | |
) | |
// Package-level state shared by the functions in this file.
var (
	// fileRE and nowikiRE hold the regular expressions compiled in init.
	fileRE, nowikiRE *regexp.Regexp

	// cbfsUrl is the value of the -cbfs flag: the base URL of the cbfs
	// server that cbfsUrlFor builds file URLs against.
	cbfsUrl = flag.String("cbfs", "http://localhost:8484/", "URL to cbfs")
)
// change is one entry decoded from the JSON stream that feedBody reads.
type change struct {
	// Seq is the entry's sequence number; feedBody returns the last one it saw.
	Seq int64
	// Id identifies the entry; it is only used in log messages here.
	Id string
	// Doc carries the part of the entry's document this program uses.
	Doc struct {
		// Text is the document text that fetch scans for file references.
		Text string
	}
}
func init() { | |
nowikiRE = regexp.MustCompile(`(?ms)<nowiki>.*</nowiki>`) | |
fileRE = regexp.MustCompile(`\[File:([^\|\]]+)`) | |
http.DefaultTransport = &http.Transport{ | |
Proxy: http.ProxyFromEnvironment, | |
DisableKeepAlives: true, | |
} | |
} | |
// maybefatal exits the program through log.Fatalf(msg, args...) when err is
// non-nil and does nothing otherwise. err is only tested, never printed, so
// callers that want it in the message must also pass it in args.
func maybefatal(err error, msg string, args ...interface{}) {
	if err == nil {
		return
	}
	log.Fatalf(msg, args...)
}
func cbfsUrlFor(name string) string { | |
outu, err := url.Parse(*cbfsUrl) | |
if err != nil { | |
panic(err) | |
} | |
h := md5.New() | |
h.Write([]byte(name)) | |
hash := hex.EncodeToString(h.Sum([]byte{})) | |
outu.Path = "/wikipedia/files/" + hash[0:1] + "/" + hash[1:2] + "/" + name | |
return outu.String() | |
} | |
func exists(name string) bool { | |
resp, err := http.Head(cbfsUrlFor(name)) | |
if err != nil { | |
return false | |
} | |
defer resp.Body.Close() | |
return resp.StatusCode == 200 | |
} | |
func copyImage(name, inu string) error { | |
inreq, err := http.Get(inu) | |
if err != nil { | |
return err | |
} | |
defer inreq.Body.Close() | |
if inreq.StatusCode != 200 { | |
return fmt.Errorf("HTTP error in source (%v): %v", | |
name, inreq.Status) | |
} | |
outreq, err := http.NewRequest("PUT", cbfsUrlFor(name), inreq.Body) | |
if err != nil { | |
return err | |
} | |
resp, err := http.DefaultClient.Do(outreq) | |
if err != nil { | |
return err | |
} | |
defer resp.Body.Close() | |
if resp.StatusCode != 201 { | |
return fmt.Errorf("HTTP error writing content: %v", resp.Status) | |
} | |
return nil | |
} | |
func fetch(c change) (rv error) { | |
log.Printf("Fetching stuff from %v (%v, %v bytes)", | |
c.Id, c.Seq, len(c.Doc.Text)) | |
for _, img := range wikiparse.FindFiles(c.Doc.Text) { | |
if exists(img) { | |
log.Printf("Already have %v, skipping", img) | |
continue | |
} | |
iu := wikiparse.URLForFile(img) | |
var err error | |
for i := 0; i < 3; i++ { | |
err = copyImage(img, iu) | |
if err == nil { | |
break | |
} | |
time.Sleep(time.Second) | |
} | |
if err != nil { | |
log.Printf("Error copying %v - %v", img, err) | |
} | |
} | |
return nil | |
} | |
func fetcher(ch <-chan change) { | |
for c := range ch { | |
err := fetch(c) | |
if err != nil { | |
log.Printf("Error fetching images from %v", c.Id) | |
} | |
} | |
} | |
func feedBody(r io.Reader, results chan<- change) int64 { | |
largest := int64(0) | |
d := json.NewDecoder(r) | |
for { | |
thing := change{} | |
err := d.Decode(&thing) | |
if err != nil { | |
if err.Error() == "unexpected EOF" { | |
return largest | |
} else { | |
log.Fatalf("Error decoding stuff: %#v", err) | |
} | |
} | |
results <- thing | |
largest = thing.Seq | |
} | |
return largest | |
} | |
func main() { | |
startNum := flag.Int64("start", 0, "Starting change ID") | |
numWorkers := flag.Int("workers", 4, "Number of workers") | |
flag.Parse() | |
db, err := couch.Connect(flag.Arg(0)) | |
maybefatal(err, "Error connecting: %v", err) | |
ch := make(chan change) | |
for i := 0; i < *numWorkers; i++ { | |
go fetcher(ch) | |
} | |
err = db.Changes(func(r io.Reader) int64 { | |
return feedBody(r, ch) | |
}, | |
map[string]interface{}{ | |
"since": *startNum, | |
"feed": "continuous", | |
"include_docs": true, | |
}) | |
maybefatal(err, "Error changesing: %v", err) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment