Skip to content

Instantly share code, notes, and snippets.

@bunyk
Last active March 16, 2018 18:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bunyk/9c06015b2f99eaf7c6a8fc887c823c48 to your computer and use it in GitHub Desktop.
Save bunyk/9c06015b2f99eaf7c6a8fc887c823c48 to your computer and use it in GitHub Desktop.
dump fun
package main
import (
"fmt"
"log"
"os"
"sort"
)
// RuneFreq pairs a rune with the number of times it was counted.
type RuneFreq struct {
// R is the rune (Unicode code point) being counted.
R rune
// Freq is the number of occurrences recorded for R.
Freq uint
}
// main counts how often each rune occurs in the text of every non-redirect
// page produced by Read for the file named by the first command-line
// argument. It then prints one line per rune, ordered from least to most
// frequent (so the most frequent rune is printed last, with rank 1), followed
// by the total and distinct rune counts.
//
// Progress is written to stderr; the report is written to stdout.
func main() {
	if len(os.Args) < 2 {
		log.Fatal("Please specify a file to read")
	}

	charcount := make(map[rune]uint)
	count := 0
	for page := range Read(os.Args[1]) {
		count++
		// Throttle progress output; "\r" rewinds to the start of the line so
		// the status is overwritten in place instead of scrolling.
		if count%123 == 0 {
			fmt.Fprintf(os.Stderr, "\rPages: %d\t\tRunes: %d", count, len(charcount))
		}
		if page.Redirect != nil {
			continue // skip redirects
		}
		// Ranging over the string yields the same runes as []rune(page.Text)
		// without copying the whole text first.
		for _, r := range page.Text {
			charcount[r]++
		}
	}

	top := make([]RuneFreq, 0, len(charcount))
	var sum uint
	for r, freq := range charcount {
		top = append(top, RuneFreq{R: r, Freq: freq})
		sum += freq
	}
	sort.Slice(top, func(i, j int) bool {
		if top[i].Freq != top[j].Freq {
			return top[i].Freq < top[j].Freq
		}
		// Map iteration order is random; break ties on the rune so repeated
		// runs produce identical output.
		return top[i].R < top[j].R
	})

	// Each rune is emitted as an HTML numeric character reference (&#N;).
	for i, rf := range top {
		fmt.Printf("%d) &#%d;: %d\n", len(top)-i, rf.R, rf.Freq)
	}
	fmt.Printf("Total characters: %d\n", sum)
	fmt.Printf("Different characters: %d\n", len(charcount))
}
package main
import (
"compress/bzip2"
"encoding/xml"
"io"
"log"
"os"
"strings"
)
// Page holds the fields decoded from one <page> XML element.
type Page struct {
// Title is taken from the <title> child element.
Title string `xml:"title"`
// Namespace is taken from the <ns> child element.
Namespace int `xml:"ns"`
// Id is taken from the <id> child element.
Id int `xml:"id"`
// Text is taken from the <text> element nested inside <revision>.
Text string `xml:"revision>text"`
// Redirect is decoded from the <redirect> child element; callers treat a
// non-nil value as marking a redirect page.
Redirect *Redirect `xml:"redirect"`
}
// Redirect holds the data decoded from a page's <redirect> element.
type Redirect struct {
// To is taken from the element's "title" attribute.
To string `xml:"title,attr"`
}
// String returns a one-line summary of the page: its title, the redirect
// target prefixed with "-> " when the page is a redirect, then ": " and a
// preview of the text that is always exactly 50 runes long (truncated or
// right-padded with spaces as needed).
func (p Page) String() string {
	const previewLen = 50

	redirect := ""
	if p.Redirect != nil {
		redirect = "-> " + p.Redirect.To
	}

	// Find the byte offset at which the preview ends. Counting runes rather
	// than bytes keeps multi-byte UTF-8 characters intact and avoids copying
	// the whole (possibly very large) text just to take a short prefix.
	runes, end := 0, len(p.Text)
	for i := range p.Text {
		if runes == previewLen {
			end = i
			break
		}
		runes++
	}

	padding := strings.Repeat(" ", previewLen-runes)
	return p.Title + redirect + ": " + p.Text[:end] + padding
}
// Read opens the bzip2-compressed XML file at filename and streams every
// <page> element it contains, decoded into a Page, over the returned channel.
//
// Decoding runs in a background goroutine; the channel is buffered (100
// pages) so decoding can run ahead of the consumer, and it is closed once the
// end of the input is reached. Any error while opening, decompressing or
// decoding is fatal: it is passed to stopOnError, which terminates the program.
func Read(filename string) <-chan Page {
	out := make(chan Page, 100)
	go func() {
		// Closing the channel is what ends the consumer's range loop.
		defer close(out)

		f, err := os.Open(filename)
		stopOnError(err)
		// Release the file handle once the whole dump has been consumed.
		defer f.Close()

		decoder := xml.NewDecoder(bzip2.NewReader(f))
		for {
			tok, err := decoder.Token()
			if err == io.EOF {
				return
			}
			stopOnError(err)

			// Only <page> start tags are of interest; every other token is
			// skipped.
			start, ok := tok.(xml.StartElement)
			if !ok || start.Name.Local != "page" {
				continue
			}

			var page Page
			stopOnError(decoder.DecodeElement(&page, &start))
			out <- page
		}
	}()
	return out
}
// stopOnError is a no-op for a nil error; for any other error it hands the
// error to log.Fatal, which logs it and exits the program.
func stopOnError(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment