Last active
December 19, 2015 10:49
-
-
Save jagregory/5942951 to your computer and use it in GitHub Desktop.
Parsing wikipedia dumps in Go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"compress/bzip2" | |
"encoding/xml" | |
"fmt" | |
"io" | |
"log" | |
"os" | |
"strings" | |
) | |
// main streams the bzip2-compressed Wikipedia XML dump named by the first
// command-line argument and prints, one per line, the title of every page
// whose text contains "{{Infobox musical artist".
//
// It exits with a non-zero status (via log.Fatal) when the argument is
// missing, the file cannot be opened, or the dump cannot be processed.
func main() {
	// Guard the index below: without it a missing argument panics instead of
	// telling the user what to pass.
	if len(os.Args) < 2 {
		log.Fatal("Usage: pass the path to a bzip2-compressed Wikipedia XML dump as the first argument")
	}
	filename := os.Args[1]

	f, err := os.Open(filename)
	if err != nil {
		// log.Fatal formats like fmt.Print, which adds no space next to a
		// string operand, so the separator must be part of the message.
		log.Fatal("Unable to read zip: ", err)
	}
	defer f.Close()

	if err := printMusicianTitles(bzip2.NewReader(f), os.Stdout); err != nil {
		log.Fatal(err)
	}
	log.Println("End of file.")
}

// printMusicianTitles decodes the XML document read from r token by token and
// writes to out the most recently seen <title> text each time a <text>
// element's character data contains the musical-artist infobox marker.
//
// It returns nil once r is exhausted, or an error describing the first
// decoding or write failure.
func printMusicianTitles(r io.Reader, out io.Writer) error {
	const marker = "{{Infobox musical artist"

	dec := xml.NewDecoder(r)
	currentElement := "" // local name of the element whose start tag was seen last
	currentTitle := ""   // title of the page currently being scanned

	for {
		token, err := dec.Token()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return fmt.Errorf("reading zip: %w", err)
		}

		switch tok := token.(type) {
		case xml.StartElement:
			currentElement = tok.Name.Local
			if currentElement == "page" {
				// A new page begins; forget the previous page's title.
				currentTitle = ""
			}
		case xml.EndElement:
			// Character data that follows a closing tag belongs to no tracked element.
			currentElement = ""
		case xml.CharData:
			switch currentElement {
			case "title":
				// The decoder reuses the token's bytes on the next Token call,
				// so the title has to be copied into a string here.
				currentTitle = string(tok)
			case "text":
				if strings.Contains(string(tok), marker) {
					if _, err := fmt.Fprintln(out, currentTitle); err != nil {
						return fmt.Errorf("writing title: %w", err)
					}
				}
			}
		}
	}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This program streams through the 9 GB Wikipedia dump, decompressing it on the fly, and prints the title of every page about a musician.
Runtime: 112 minutes on a 1.8 GHz MacBook Air.
Memory usage: average 15 MB, maximum 24 MB.
Go is proving to be quite an interesting language; it gives results similar to C without the pesky issues you get with writing pure C. Still obviously less expressive than @kornysietsma's Clojure implementation, though. 😄