Last active
September 13, 2017 02:02
-
-
Save allisonmorgan/8a973a86e9bd353e57e44377e1bcc3c1 to your computer and use it in GitHub Desktop.
Parse DBLP XML data in Go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"compress/gzip" | |
"encoding/xml" | |
"fmt" | |
"golang.org/x/net/html/charset" | |
"log" | |
"os" | |
"strings" | |
) | |
// Structs to unmarshal the DBLP XML data into | |
// Only unpacks "inproceedings" publications | |
type InProceedings struct { | |
Papers []Paper `xml:"inproceedings"` | |
} | |
// Paper holds the publication fields this program reads from a DBLP record.
// It is deliberately not a complete list of the attributes a record can
// carry.
type Paper struct {
	// Key comes from the element's "key" attribute.
	Key string `xml:"key,attr"`
	// Authors has one entry per <author> child element.
	Authors []string `xml:"author"`
	// Title is the text of the <title> child element.
	Title string `xml:"title"`
	// Year is the <year> child element parsed as an integer.
	Year int `xml:"year"`
	// Booktitle is the text of the <booktitle> child element.
	Booktitle string `xml:"booktitle"`
	// EE is the text of the <ee> child element.
	EE string `xml:"ee"`
	// Crossref is the text of the <crossref> child element; main matches
	// it against "conf/<conference>/" to select papers.
	Crossref string `xml:"crossref"`
	// Url is the text of the <url> child element.
	Url string `xml:"url"`
}
// decoder.Decode() returns an error on special-character entities it has no
// replacement for. The two lists below are the building blocks of the
// replacement map: build_special_character_list joins every entry of
// characters (as written and upper-cased) with every entry of suffixes to
// form an entity name.
var (
	// characters are the plain letters an entity name can start with; each
	// one is also the replacement text for the entities built from it.
	characters = []string{"a", "e", "o", "u", "i", "c", "b", "s", "y", "n", "ae"}
	// suffixes are the entity-name endings appended to each character.
	suffixes = []string{"acute", "uml", "ring", "zlig", "slash", "cedil", "grave", "circ", "tilde", "lig"}
)
func build_special_character_list() map[string]string { | |
var special_characters = make(map[string]string) | |
for _, character := range characters { | |
for _, suffix := range suffixes { | |
special_characters[character+suffix] = character | |
special_characters[strings.ToUpper(character)+suffix] = character | |
} | |
} | |
// Other random special characters | |
special_characters["times"] = "*" | |
special_characters["reg"] = "reg" | |
special_characters["eth"] = "d" | |
special_characters["ETH"] = "d" | |
special_characters["micro"] = "u" | |
special_characters["thorn"] = "p" | |
special_characters["THORN"] = "p" | |
return special_characters | |
} | |
func main() { | |
// Download data from http://dblp.uni-trier.de/xml/, | |
// change the file path below to point to it | |
log.Println("Reading in DBLP gzipped file") | |
f, err := os.Open("../data/dblp.xml.gz") | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer f.Close() | |
// Golang can read straight from gzipped files! | |
gzf, err := gzip.NewReader(f) | |
if err != nil { | |
log.Fatal(err) | |
} | |
// Start a new XML decoder instance and ask it to | |
// find the encoding specified in the file's header. | |
// Specify the special character replacements. | |
decoder := xml.NewDecoder(gzf) | |
decoder.CharsetReader = charset.NewReaderLabel | |
decoder.Entity = build_special_character_list() | |
// This step takes while (~2 minutes) | |
log.Println("Decoding all proceedings from file") | |
var papers InProceedings | |
err = decoder.Decode(&papers) | |
if err != nil { | |
log.Fatal(err) | |
} | |
log.Println("Done decoding") | |
// log.Printf("Example paper: %+v\n", papers.Papers[0]) | |
conference := os.Args[1] | |
// Now grab all the conference related publications | |
titles := make([]string, 0, len(papers.Papers)) | |
avg_length := 0 | |
for _, paper := range papers.Papers { | |
match_on := fmt.Sprintf("conf/%s/", conference) | |
if strings.Contains(paper.Crossref, match_on) && len(paper.Title) > 0 { | |
var new_title = strings.TrimRight(paper.Title, ".") | |
titles = append(titles, new_title) | |
avg_length += len(new_title) | |
} | |
} | |
log.Printf("Number of %s titles: %v\tAverage title length: %v\n", strings.ToUpper(conference), len(titles), float64(avg_length)/float64(len(titles))) | |
// Write all the papers to a new text file | |
f, err = os.Create(fmt.Sprintf("../data/%s.txt", conference)) | |
if err != nil{ | |
log.Fatal(err) | |
} | |
defer f.Close() | |
for _, title := range titles { | |
_, err := f.WriteString(strings.TrimSpace(title) + "\n") | |
if err != nil { | |
log.Fatal(err) | |
} | |
} | |
log.Printf("Output data to %s.txt\n", conference) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment