Skip to content

Instantly share code, notes, and snippets.

@xeoncross
Last active August 18, 2022 18:13
Show Gist options
  • Save xeoncross/b8a244cd056ae67e54f97d6c745f692e to your computer and use it in GitHub Desktop.
Save xeoncross/b8a244cd056ae67e54f97d6c745f692e to your computer and use it in GitHub Desktop.
Simple script to parse the lines in the Wikipedia page title dump file and output a deduplicated list of all the English page titles. See https://dumps.wikimedia.org/enwiki/20220801/ for more files.
package wikipediatitles
import (
_ "embed"
"sort"
"strings"
"unicode"
)
// allTitlesString holds the entire contents of the file named in the
// go:embed directive below; the file is embedded when the package is built,
// so it must exist next to this source file at compile time.
//
// Source archive (presumably decompressed before embedding; verify locally):
// https://dumps.wikimedia.org/enwiki/20220801/enwiki-20220801-all-titles-in-ns0.gz
//
//go:embed enwiki-20220801-all-titles-in-ns0
var allTitlesString string
// Parse splits the embedded title dump into normalized, deduplicated terms
// and returns them sorted in ascending byte order.
//
// A term is a maximal run of letters, digits, '_', '-' and ' '; every other
// rune (newlines, punctuation, invalid UTF-8 bytes, ...) acts as a separator.
// Each term has its underscores replaced by spaces, is lowercased, and has
// leading and trailing '-' and ' ' removed. A term is kept only when both the
// raw run and the normalized text are at least minSize bytes long (bytes, not
// runes). Empty terms are never returned, even when minSize <= 0. The result
// is nil when no term qualifies.
func Parse(minSize int) []string {
	return parseTitles(allTitlesString, minSize)
}

// isTermRune reports whether r may appear inside a term. Apostrophes are
// currently treated as separators.
func isTermRune(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '-' || r == ' '
}

// parseTitles implements Parse for an arbitrary input string.
func parseTitles(s string, minSize int) []string {
	seen := map[string]struct{}{}

	// add normalizes one raw run of term runes and records it when it is
	// long enough.
	add := func(raw string) {
		// The raw length is checked first: it skips the allocations below for
		// short runs, and it is part of the contract because lowercasing can
		// change the byte length of a string.
		if len(raw) < minSize {
			return
		}
		text := strings.ToLower(strings.ReplaceAll(raw, "_", " "))
		text = strings.Trim(text, "- ")
		if text == "" || len(text) < minSize {
			return
		}
		seen[text] = struct{}{}
	}

	// start is the byte offset of the run in progress, or -1 between runs.
	// Tracking the run start directly means the encoded width of a separator
	// is never needed; for an invalid byte, range yields U+FFFD but advances
	// only one byte, so deriving the width from the rune would be wrong.
	start := -1
	for i, r := range s {
		if isTermRune(r) {
			if start < 0 {
				start = i
			}
			continue
		}
		if start >= 0 {
			add(s[start:i])
			start = -1
		}
	}
	// Flush the final run when the input does not end with a separator.
	if start >= 0 {
		add(s[start:])
	}

	if len(seen) == 0 {
		return nil
	}
	results := make([]string, 0, len(seen))
	for term := range seen {
		results = append(results, term)
	}
	sort.Strings(results)
	return results
}
package wikipediatitles
import (
"fmt"
"os"
"strings"
"testing"
)
// TestParse runs Parse with a minimum size of 5, prints up to the first 100
// results to standard output, and writes the full list (one entry per line)
// to titles.txt in the current working directory.
func TestParse(t *testing.T) {
	titles := Parse(5)

	// Print only a sample, and never index past the end when fewer than 100
	// entries were produced.
	preview := titles
	if len(preview) > 100 {
		preview = preview[:100]
	}
	for _, title := range preview {
		fmt.Println(title)
	}

	// os.WriteFile creates or truncates the file and closes it on every
	// path, so the handle is not leaked when the write fails.
	data := []byte(strings.Join(titles, "\n"))
	if err := os.WriteFile("titles.txt", data, 0o666); err != nil {
		t.Fatal(err)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment