Skip to content

Instantly share code, notes, and snippets.

@djinn
Created June 2, 2014 12:56
Show Gist options
  • Save djinn/14e7bd77ce05d49297cf to your computer and use it in GitHub Desktop.
Save djinn/14e7bd77ce05d49297cf to your computer and use it in GitHub Desktop.
XML parsing in golang
package main
import (
	"encoding/xml"
	"io"
	"log"
	"os"
	"runtime/debug"
	"strings"
)
// Article holds the fields decoded from a single <doc> element of the
// XML input.
type Article struct {
	// Title is the text of the <title> child element.
	Title string `xml:"title"`
	// Url is the text of the <url> child element.
	Url string `xml:"url"`
	// Abstract is the text of the <abstract> child element.
	Abstract string `xml:"abstract"`
}
// CanonicalizeTitle lower-cases title and removes everything up to and
// including the first ":" (presumably the "Wikipedia:" prefix carried by
// titles in the abstract dump -- confirm against the input data).
//
// Everything after the first colon is returned unchanged, including any
// leading whitespace and any further colons that belong to the title itself.
// A title that contains no colon is returned lower-cased in full instead of
// panicking on a missing second segment.
func CanonicalizeTitle(title string) string {
	can := strings.ToLower(title)
	//can = strings.Replace(can, " ", "_", -1)
	//can = url.QueryEscape(can)
	if i := strings.Index(can, ":"); i >= 0 {
		return can[i+1:]
	}
	return can
}
// readAbstract streams the XML file at filename and, for every <doc> start
// element it encounters, decodes the element into an Article, canonicalizes
// its title with CanonicalizeTitle and sends it on article.
//
// The channel is closed before readAbstract returns (on success or failure),
// so a receiver ranging over it terminates. Errors are logged, not returned:
// a failure to open the file or a tokenizer error ends the read, while a
// <doc> element that fails to decode is logged and skipped.
func readAbstract(filename string, article chan Article) {
	defer close(article)

	xmlFile, err := os.Open(filename)
	if err != nil {
		log.Printf("Error opening file: %v", err)
		return
	}
	defer xmlFile.Close()

	decoder := xml.NewDecoder(xmlFile)
	for {
		t, err := decoder.Token()
		if err == io.EOF {
			// Normal end of input. Continuing here would spin forever,
			// because Token keeps reporting io.EOF once the stream is done.
			return
		}
		if err != nil {
			// Stop instead of retrying: there is no guarantee the decoder
			// can make progress after a failure.
			log.Printf("Error while parsing -> %s", err)
			debug.PrintStack()
			return
		}

		// Only <doc> start elements are of interest; skip every other token.
		se, ok := t.(xml.StartElement)
		if !ok || se.Name.Local != "doc" {
			continue
		}

		// Decode the whole <doc> element (through its matching end tag)
		// into p.
		var p Article
		if err := decoder.DecodeElement(&p, &se); err != nil {
			log.Printf("Error while decoding <doc> element -> %s", err)
			continue
		}
		p.Title = CanonicalizeTitle(p.Title)
		article <- p
	}
}
// main runs readAbstract in a background goroutine against the hard-coded
// abstract dump file, logs the title of every Article received on the
// channel, and logs a final message once the channel has been closed.
func main() {
	articles := make(chan Article, 400)
	go readAbstract("enwiki-20140502-abstract.xml", articles)

	for {
		a, ok := <-articles
		if !ok {
			// Channel closed by the reader: nothing more to receive.
			break
		}
		log.Printf("%s", a.Title)
	}

	log.Printf("Reached here")
}
@djinn
Copy link
Author

djinn commented Oct 15, 2014

Parsing Wikimedia Abstract.xml in golang

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment