Skip to content

Instantly share code, notes, and snippets.

@atomotic
Last active August 29, 2015 14:26
Show Gist options
  • Save atomotic/2c85a5971d28fb2f4e80 to your computer and use it in GitHub Desktop.
Save atomotic/2c85a5971d28fb2f4e80 to your computer and use it in GitHub Desktop.
warc-formats
package main
import (
"bufio"
"bytes"
"fmt"
"github.com/richardlehane/siegfried"
"github.com/slyrz/warc"
"io/ioutil"
"log"
"net/http"
"os"
)
// main identifies the file format of every HTTP response payload stored in a
// WARC file and prints one tab-separated line per identification:
//
//	<warc-record-id>\t<warc-target-uri>\t<identification>
//
// Usage: warc-formats <warc-file> [signature-file]
//
// The optional second argument overrides the siegfried signature file; when
// it is omitted the original hard-coded path is used, so existing invocations
// keep working. Diagnostics go to the standard logger, results go to stdout.
func main() {
	// Signature file used when none is supplied on the command line.
	const defaultSignatures = "/Users/raffaele/siegfried/pronom.sig"

	if len(os.Args) < 2 {
		log.Fatal("usage: warc-formats <warc-file> [signature-file]")
	}
	warcPath := os.Args[1]
	signatures := defaultSignatures
	if len(os.Args) > 2 {
		signatures = os.Args[2]
	}

	sf, err := siegfried.Load(signatures)
	if err != nil {
		log.Fatal(err)
	}

	f, err := os.Open(warcPath)
	if err != nil {
		// Previously this error was overwritten by the next assignment and
		// a nil file was handed to the WARC reader.
		log.Fatal(err)
	}
	defer f.Close()

	reader, err := warc.NewReader(f)
	if err != nil {
		log.Fatal(err)
	}
	defer reader.Close()

	for {
		record, err := reader.ReadRecord()
		if err != nil {
			// Any read error, including reaching the end of the input,
			// ends the scan.
			break
		}
		if record.Header["warc-type"] != "response" {
			continue
		}
		recordID := record.Header["warc-record-id"]
		targetURI := record.Header["warc-target-uri"]

		response, err := http.ReadResponse(bufio.NewReader(record.Content), nil)
		if err != nil {
			// Not every "response" record carries an HTTP message; report
			// it and move on instead of silently abandoning the rest of
			// the file.
			// NOTE(review): relies on ReadRecord skipping whatever part of
			// the previous record was left unread — confirm against the
			// warc package.
			log.Printf("skipping %s (%s): %v", recordID, targetURI, err)
			continue
		}

		// Close the body right after reading it: a defer here would keep
		// every body open until main returns.
		body, err := ioutil.ReadAll(response.Body)
		response.Body.Close()
		if err != nil {
			// Identification can still succeed on the leading bytes, so
			// surface the problem but keep the partial payload.
			log.Printf("incomplete body for %s (%s): %v", recordID, targetURI, err)
		}

		c, err := sf.Identify(recordID, bytes.NewBuffer(body))
		if err != nil {
			log.Fatal(err)
		}
		for id := range c {
			fmt.Printf("%v\t%v\t%v\n", recordID, targetURI, id)
		}
	}
}
@atomotic
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment