Skip to content

Instantly share code, notes, and snippets.

@jojomi
Last active February 11, 2021 18:54
Show Gist options
  • Save jojomi/30afa4ed4a1c5238af35353d4da03153 to your computer and use it in GitHub Desktop.
Save jojomi/30afa4ed4a1c5238af35353d4da03153 to your computer and use it in GitHub Desktop.
JSON-Datei mit deutschen KFZ-Kennzeichen erstellen (aus Wikipedia)
package main
import (
"bufio"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
"os"
"regexp"
"strings"
)
// Kennzeichen is one entry of the parsed registration-plate list. The JSON
// tags are ASCII transliterations of the (German) field names.
type Kennzeichen struct {
// Kürzel is the plate code; parse fills it from the 1–3 upper-case letters
// captured by getLetterColumn.
Kürzel string `json:"kuerzel"`
// Bundesland is filled by parse from the third "| " cell after the code cell
// (presumably the federal state — confirm against the wiki table layout).
Bundesland string `json:"bundesland"`
// Bezeichnung is filled by parse from the first "| " cell after the code
// cell; String labels it "offiziell" (official).
Bezeichnung string `json:"bezeichnung"`
// Erklärung is filled by parse from the second "| " cell after the code cell
// (presumably the derivation of the code — confirm against the wiki table layout).
Erklärung string `json:"erklaerung"`
}
// String renders the entry on a single line: the code left-aligned and padded
// to three characters, followed by the explanation, the official designation
// and the federal state.
func (k Kennzeichen) String() string {
	const layout = "%-3s – %s (offiziell: %s) – %s"
	parts := []interface{}{k.Kürzel, k.Erklärung, k.Bezeichnung, k.Bundesland}
	return fmt.Sprintf(layout, parts...)
}
func main() {
// get
var input io.Reader
cacheFilename := "cache"
if _, err := os.Stat(cacheFilename); os.IsNotExist(err) {
// fetch online
sourceURL := "https://de.wikipedia.org/w/index.php?title=Liste_der_Kfz-Kennzeichen_in_Deutschland&action=raw"
resp, err := http.Get(sourceURL)
if err != nil {
panic(err)
}
defer resp.Body.Close()
out, err := os.Create(cacheFilename)
if err != nil {
panic(err)
}
defer out.Close()
io.Copy(out, resp.Body)
input = bufio.NewReader(resp.Body)
} else {
// use local cache file
out, err := os.Open(cacheFilename)
if err != nil {
panic(err)
}
input = out
defer out.Close()
}
// parse
kennzeichen := parse(input)
kennzeichenMap := make(map[string]*Kennzeichen)
for _, k := range kennzeichen {
kennzeichenMap[k.Kürzel] = k
}
// write to JSON file
data := struct {
Kennzeichen map[string]*Kennzeichen `json:"kennzeichen"`
Tree map[string]map[string]map[string]bool `json:"tree"` // tree output is optional here, if you dont need it, comment this line and the one a few lines later
}{
Kennzeichen: kennzeichenMap,
Tree: getTree(kennzeichen),
}
output, err := json.Marshal(data)
// output, err := json.MarshalIndent(data, "", " ") // print with indention instead
if err != nil {
panic(err)
}
ioutil.WriteFile("data.json", output, 0644)
}
// getTree builds a three-level lookup keyed by the first, second and third
// character of every plate code. Codes shorter than three characters use the
// empty string as key on the missing levels; each leaf is set to true.
func getTree(kennzeichen []*Kennzeichen) map[string]map[string]map[string]bool {
	tree := map[string]map[string]map[string]bool{}
	for _, entry := range kennzeichen {
		// Work on runes so that umlauts count as a single character.
		runes := []rune(entry.Kürzel)

		first := string(runes[0])
		second, third := "", ""
		if len(runes) >= 2 {
			second = string(runes[1])
		}
		if len(runes) >= 3 {
			third = string(runes[2])
		}

		secondLevel, found := tree[first]
		if !found {
			secondLevel = map[string]map[string]bool{}
			tree[first] = secondLevel
		}
		thirdLevel, found := secondLevel[second]
		if !found {
			thirdLevel = map[string]bool{}
			secondLevel[second] = thirdLevel
		}
		thirdLevel[third] = true
	}
	return tree
}
// parse reads wiki text line by line from input and returns one Kennzeichen
// per line that getLetterColumn recognises, in input order.
//
// It is a small state machine:
//   - active: set by the first letter section heading; cleared again by any
//     section heading that is not a letter heading. Lines are ignored while
//     it is false.
//   - blockActive / blockLine: after a code cell was found, the following
//     lines starting with "| " are assigned to Bezeichnung (2), Erklärung (3)
//     and Bundesland (4) of the current entry.
//
// NOTE(review): scanner.Err() is never checked, so a read error or an
// over-long line ends the loop silently with a partial result.
func parse(input io.Reader) []*Kennzeichen {
result := make([]*Kennzeichen, 0, 100)
active := false
blockActive := false
blockLine := 0
scanner := bufio.NewScanner(input)
var (
text string
letter string
ok bool
currentKennzeichen *Kennzeichen
)
for scanner.Scan() {
text = scanner.Text()
// Outside the letter sections: only look for the heading that starts one.
if !active {
if isLetterSectionHead(text) {
active = true
}
continue
}
// A heading that is not a letter heading ends the region of interest.
if isSectionHead(text) && !isLetterSectionHead(text) {
active = false
continue
}
if blockActive {
// Within an entry, lines that do not start with "| " are skipped
// entirely (they are not tested as a new code cell either).
if !strings.HasPrefix(text, "| ") {
continue
}
switch blockLine {
case 2:
currentKennzeichen.Bezeichnung = stripMediaWiki(text)
case 3:
currentKennzeichen.Erklärung = stripMediaWiki(text)
case 4:
currentKennzeichen.Bundesland = stripMediaWiki(text)
}
// The third cell completes the entry.
if blockLine == 4 {
blockActive = false
blockLine = 0
continue
}
blockLine++
// No continue here: a cell consumed as Bezeichnung or Erklärung is still
// tested by getLetterColumn below and may start a new entry.
}
// A code cell starts a new entry; its remaining cells are expected on the
// following "| " lines.
if letter, ok = getLetterColumn(text); ok {
currentKennzeichen = &Kennzeichen{
Kürzel: letter,
}
blockLine = 2
blockActive = true
result = append(result, currentKennzeichen)
}
}
return result
}
// sectionRegexp finds a level-3 wiki heading whose title is a single
// upper-case letter (including Ö, Ü, Ä) anywhere in a line.
var sectionRegexp = regexp.MustCompile("=== [A-ZÖÜÄ] ===")

// isSectionHead reports whether line begins with '=', i.e. looks like a wiki
// heading of any level.
func isSectionHead(line string) bool {
	return len(line) > 0 && line[0] == '='
}

// isLetterSectionHead reports whether line both starts and ends with "===" and
// contains a single-letter heading as described by sectionRegexp.
func isLetterSectionHead(line string) bool {
	const marker = "==="
	if !strings.HasPrefix(line, marker) || !strings.HasSuffix(line, marker) {
		return false
	}
	return sectionRegexp.MatchString(line)
}
// letterColumnRegexp matches a table cell that consists solely of a bold plate
// code of one to three upper-case letters, optionally wrapped in a piped wiki
// link and optionally preceded by a rowspan attribute. The rowspan value may
// have any number of digits, matching rowspanRegexp.
var letterColumnRegexp = regexp.MustCompile(`^\|(?: rowspan="\d+" \|)? '''(?:\[\[[^|]*\|)?([A-ZÖÜÄ]{1,3})(?:\]\])?'''$`)

// getLetterColumn extracts the plate code from line. It returns the code and
// true if line is a code cell, or "" and false otherwise.
func getLetterColumn(line string) (string, bool) {
	match := letterColumnRegexp.FindStringSubmatch(line)
	// A successful match holds the full match plus one capture group.
	if len(match) < 2 {
		return "", false
	}
	return match[1], match[1] != ""
}
// Helpers used by stripMediaWiki, applied in the order link → bracket →
// rowspan → replacer → word.
var (
	// stripReplacer deletes leftover markup characters and turns '_' into ' '.
	stripReplacer = strings.NewReplacer("|", "", "[", "", "]", "", "_", " ", "(", "", ")", "", "*", "", "'", "", "&nbsp;", "")
	// bracketRegexp matches parenthesised remarks and angle-bracket markup.
	bracketRegexp = regexp.MustCompile(`\([^\)]*\)|<[^\)]*>`)
	// rowspanRegexp matches a rowspan attribute.
	rowspanRegexp = regexp.MustCompile(`rowspan="(\d+)"`)
	// linkRegexp matches a piped wiki link; group 1 is the part before the pipe.
	linkRegexp = regexp.MustCompile(`\[\[(.*?)\|(.*?)\]\]`)
	// wordRegexp matches a run of word characters including German umlauts.
	wordRegexp = regexp.MustCompile(`([\wäöüÄÖÜ])([\wäöüÄÖÜ]*)`)
)

// stripMediaWiki reduces one wiki table cell to plain text: piped links are
// replaced by the part before the pipe, bracketed remarks and rowspan
// attributes are removed, markup characters are stripped, every word keeps its
// first character and has the rest lower-cased, and surrounding whitespace is
// trimmed.
func stripMediaWiki(line string) string {
	// remove links
	line = linkRegexp.ReplaceAllString(line, `$1`)
	// remove further explanations in brackets
	line = bracketRegexp.ReplaceAllString(line, "")
	// remove rowspans
	line = rowspanRegexp.ReplaceAllString(line, "")
	// remove unwanted chars
	line = stripReplacer.Replace(line)
	// fix casing
	line = wordRegexp.ReplaceAllStringFunc(line, func(m string) string {
		// Split on the rune boundary, not the byte boundary: a leading Ä/Ö/Ü is
		// two bytes long and would otherwise be torn apart and corrupted.
		runes := []rune(m)
		return string(runes[:1]) + strings.ToLower(string(runes[1:]))
	})
	// trim result
	return strings.TrimSpace(line)
}
@jojomi
Copy link
Author

jojomi commented Feb 11, 2021

Das liegt wahrscheinlich an Änderungen im HTML-Output von Wikipedia. Der Editor hat sich seit dem Skript mindestens einmal verändert. Ist das immer noch interessant für dich?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment