Skip to content

Instantly share code, notes, and snippets.

@allisonmorgan
Last active September 13, 2017 02:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allisonmorgan/8a973a86e9bd353e57e44377e1bcc3c1 to your computer and use it in GitHub Desktop.
Save allisonmorgan/8a973a86e9bd353e57e44377e1bcc3c1 to your computer and use it in GitHub Desktop.
Parse DBLP XML data in Go
package main
import (
"compress/gzip"
"encoding/xml"
"fmt"
"golang.org/x/net/html/charset"
"log"
"os"
"strings"
)
// InProceedings is the top-level value the DBLP XML document is unmarshalled
// into. It collects only the "inproceedings" publications; no other child
// element of the document has a matching field here.
type InProceedings struct {
	// Papers holds one entry per <inproceedings> element.
	Papers []Paper `xml:"inproceedings"`
}
// Paper carries the publication fields this program unmarshals from a DBLP
// record. It covers common features only and is not a complete set of the
// attributes a record may have.
type Paper struct {
	// Key is read from the element's "key" attribute; every other field
	// is read from a child element of the same name as its tag.
	Key string `xml:"key,attr"`

	// Authors gets one entry per <author> child element.
	Authors []string `xml:"author"`

	Title     string `xml:"title"`
	Year      int    `xml:"year"`
	Booktitle string `xml:"booktitle"`
	EE        string `xml:"ee"`
	Crossref  string `xml:"crossref"`
	Url       string `xml:"url"`
}
// decoder.Decode() errors on the special-character entities that appear in
// the DBLP data, so the tables below are used to build a replacement map
// that converts those foreign-language characters to plain ASCII text.
//
// characters lists the base letters and suffixes lists the entity-name
// endings; every characters x suffixes combination becomes an entity name.
var (
	characters = []string{"a", "e", "o", "u", "i", "c", "b", "s", "y", "n", "ae"}
	suffixes   = []string{"acute", "uml", "ring", "zlig", "slash", "cedil", "grave", "circ", "tilde", "lig"}
)

// build_special_character_list returns a map from XML entity name (without
// the surrounding "&" and ";") to its plain-text replacement.
//
// For each base letter and suffix, both the lower-case name (e.g. "uuml")
// and the name with the base upper-cased (e.g. "Uuml", "AElig") map to the
// lower-case base letter. A handful of unrelated entities are added after.
func build_special_character_list() map[string]string {
	entities := make(map[string]string)

	for _, suffix := range suffixes {
		for _, base := range characters {
			lower := base + suffix
			upper := strings.ToUpper(base) + suffix
			entities[lower] = base
			entities[upper] = base
		}
	}

	// Other random special characters. Assigned after the generated
	// entries so these values win should a name ever coincide.
	extras := [...][2]string{
		{"times", "*"},
		{"reg", "reg"},
		{"eth", "d"},
		{"ETH", "d"},
		{"micro", "u"},
		{"thorn", "p"},
		{"THORN", "p"},
	}
	for _, pair := range extras {
		entities[pair[0]] = pair[1]
	}

	return entities
}
// main extracts the titles of all DBLP "inproceedings" records that belong
// to one conference and writes them, one per line, to
// ../data/<conference>.txt.
//
// The conference key is taken from the first command-line argument and is
// matched as "conf/<conference>/" against each paper's crossref field.
// The input is read from ../data/dblp.xml.gz (download it from
// http://dblp.uni-trier.de/xml/ and adjust the path below if needed).
// Any failure is reported through log.Fatal.
func main() {
	// Validate the argument up front: indexing os.Args without a check
	// panics, and doing so only after the slow decode wastes minutes.
	if len(os.Args) < 2 || os.Args[1] == "" {
		log.Fatal("usage: pass the DBLP conference key (e.g. \"kdd\") as the first argument")
	}
	conference := os.Args[1]

	// Download data from http://dblp.uni-trier.de/xml/,
	// change the file path below to point to it
	log.Println("Reading in DBLP gzipped file")
	in, err := os.Open("../data/dblp.xml.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer in.Close()

	// Golang can read straight from gzipped files!
	gzf, err := gzip.NewReader(in)
	if err != nil {
		log.Fatal(err)
	}
	defer gzf.Close()

	// Start a new XML decoder instance and ask it to
	// find the encoding specified in the file's header.
	// Specify the special character replacements.
	decoder := xml.NewDecoder(gzf)
	decoder.CharsetReader = charset.NewReaderLabel
	decoder.Entity = build_special_character_list()

	// This step takes a while (~2 minutes)
	log.Println("Decoding all proceedings from file")
	var papers InProceedings
	if err := decoder.Decode(&papers); err != nil {
		log.Fatal(err)
	}
	log.Println("Done decoding")
	// log.Printf("Example paper: %+v\n", papers.Papers[0])

	// Now grab all the conference related publications. The crossref
	// pattern is the same for every paper, so build it once.
	matchOn := fmt.Sprintf("conf/%s/", conference)
	titles := make([]string, 0, len(papers.Papers))
	totalLength := 0
	for i := range papers.Papers {
		// Index instead of ranging by value to avoid copying each Paper.
		paper := &papers.Papers[i]
		if len(paper.Title) == 0 || !strings.Contains(paper.Crossref, matchOn) {
			continue
		}
		title := strings.TrimRight(paper.Title, ".")
		titles = append(titles, title)
		totalLength += len(title) // length in bytes, as before
	}

	// Guard the division so an unmatched conference reports 0, not NaN.
	var avgLength float64
	if len(titles) > 0 {
		avgLength = float64(totalLength) / float64(len(titles))
	}
	log.Printf("Number of %s titles: %v\tAverage title length: %v\n", strings.ToUpper(conference), len(titles), avgLength)

	// Write all the papers to a new text file
	out, err := os.Create(fmt.Sprintf("../data/%s.txt", conference))
	if err != nil {
		log.Fatal(err)
	}
	for _, title := range titles {
		if _, err := out.WriteString(strings.TrimSpace(title) + "\n"); err != nil {
			log.Fatal(err)
		}
	}
	// Close explicitly and check the error: a failed close can mean the
	// data never reached disk, which a deferred Close would hide.
	if err := out.Close(); err != nil {
		log.Fatal(err)
	}
	log.Printf("Output data to %s.txt\n", conference)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment