Last active
February 11, 2021 18:54
-
-
Save jojomi/30afa4ed4a1c5238af35353d4da03153 to your computer and use it in GitHub Desktop.
JSON-Datei mit deutschen KFZ-Kennzeichen erstellen (aus Wikipedia)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"encoding/json" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"net/http" | |
"os" | |
"regexp" | |
"strings" | |
) | |
// Kennzeichen holds one vehicle registration prefix and the three text
// columns recorded for it. The JSON keys are ASCII transliterations of the
// German field names.
type Kennzeichen struct {
	// Kürzel is the registration prefix itself.
	Kürzel string `json:"kuerzel"`
	// Bundesland is the federal state column.
	Bundesland string `json:"bundesland"`
	// Bezeichnung is the value printed as "offiziell" by String.
	Bezeichnung string `json:"bezeichnung"`
	// Erklärung is the explanatory text printed right after the prefix.
	Erklärung string `json:"erklaerung"`
}

// String renders the entry on a single line: the prefix left-aligned in a
// field of width three, then the explanation, the official designation in
// parentheses, and finally the federal state.
func (k Kennzeichen) String() string {
	const layout = "%-3s – %s (offiziell: %s) – %s"
	return fmt.Sprintf(
		layout,
		k.Kürzel,
		k.Erklärung,
		k.Bezeichnung,
		k.Bundesland,
	)
}
func main() { | |
// get | |
var input io.Reader | |
cacheFilename := "cache" | |
if _, err := os.Stat(cacheFilename); os.IsNotExist(err) { | |
// fetch online | |
sourceURL := "https://de.wikipedia.org/w/index.php?title=Liste_der_Kfz-Kennzeichen_in_Deutschland&action=raw" | |
resp, err := http.Get(sourceURL) | |
if err != nil { | |
panic(err) | |
} | |
defer resp.Body.Close() | |
out, err := os.Create(cacheFilename) | |
if err != nil { | |
panic(err) | |
} | |
defer out.Close() | |
io.Copy(out, resp.Body) | |
input = bufio.NewReader(resp.Body) | |
} else { | |
// use local cache file | |
out, err := os.Open(cacheFilename) | |
if err != nil { | |
panic(err) | |
} | |
input = out | |
defer out.Close() | |
} | |
// parse | |
kennzeichen := parse(input) | |
kennzeichenMap := make(map[string]*Kennzeichen) | |
for _, k := range kennzeichen { | |
kennzeichenMap[k.Kürzel] = k | |
} | |
// write to JSON file | |
data := struct { | |
Kennzeichen map[string]*Kennzeichen `json:"kennzeichen"` | |
Tree map[string]map[string]map[string]bool `json:"tree"` // tree output is optional here, if you dont need it, comment this line and the one a few lines later | |
}{ | |
Kennzeichen: kennzeichenMap, | |
Tree: getTree(kennzeichen), | |
} | |
output, err := json.Marshal(data) | |
// output, err := json.MarshalIndent(data, "", " ") // print with indention instead | |
if err != nil { | |
panic(err) | |
} | |
ioutil.WriteFile("data.json", output, 0644) | |
} | |
func getTree(kennzeichen []*Kennzeichen) map[string]map[string]map[string]bool { | |
result := make(map[string]map[string]map[string]bool) | |
var ( | |
c1 string | |
c2 string | |
c3 string | |
) | |
for _, k := range kennzeichen { | |
c := []rune(k.Kürzel) | |
c1 = string(c[0]) | |
if _, ok := result[c1]; !ok { | |
result[c1] = make(map[string]map[string]bool) | |
} | |
c2 = "" | |
if len(c) > 1 { | |
c2 = string(c[1]) | |
} | |
if _, ok := result[c1][c2]; !ok { | |
result[c1][c2] = make(map[string]bool) | |
} | |
c3 = "" | |
if len(c) > 2 { | |
c3 = string(c[2]) | |
} | |
result[c1][c2][c3] = true | |
} | |
return result | |
} | |
func parse(input io.Reader) []*Kennzeichen { | |
result := make([]*Kennzeichen, 0, 100) | |
active := false | |
blockActive := false | |
blockLine := 0 | |
scanner := bufio.NewScanner(input) | |
var ( | |
text string | |
letter string | |
ok bool | |
currentKennzeichen *Kennzeichen | |
) | |
for scanner.Scan() { | |
text = scanner.Text() | |
if !active { | |
if isLetterSectionHead(text) { | |
active = true | |
} | |
continue | |
} | |
if isSectionHead(text) && !isLetterSectionHead(text) { | |
active = false | |
continue | |
} | |
if blockActive { | |
if !strings.HasPrefix(text, "| ") { | |
continue | |
} | |
switch blockLine { | |
case 2: | |
currentKennzeichen.Bezeichnung = stripMediaWiki(text) | |
case 3: | |
currentKennzeichen.Erklärung = stripMediaWiki(text) | |
case 4: | |
currentKennzeichen.Bundesland = stripMediaWiki(text) | |
} | |
if blockLine == 4 { | |
blockActive = false | |
blockLine = 0 | |
continue | |
} | |
blockLine++ | |
} | |
if letter, ok = getLetterColumn(text); ok { | |
currentKennzeichen = &Kennzeichen{ | |
Kürzel: letter, | |
} | |
blockLine = 2 | |
blockActive = true | |
result = append(result, currentKennzeichen) | |
} | |
} | |
return result | |
} | |
// sectionRegexp matches a line that consists solely of a level-3 heading
// whose title is a single uppercase letter (A-Z, Ö, Ü or Ä). It is anchored
// at both ends so that deeper headings such as "==== A ====" or lines that
// merely contain such a heading are not accepted.
var sectionRegexp = regexp.MustCompile(`^=== [A-ZÖÜÄ] ===$`)

// isSectionHead reports whether line starts with "=", i.e. looks like a
// wikitext heading of any level.
func isSectionHead(line string) bool {
	return strings.HasPrefix(line, "=")
}

// isLetterSectionHead reports whether line is exactly a single-letter
// level-3 heading such as "=== A ===".
func isLetterSectionHead(line string) bool {
	return sectionRegexp.MatchString(line)
}
// letterColumnRegexp matches a table cell line holding a bold prefix of one
// to three uppercase letters (A-Z, Ö, Ü, Ä). The cell may carry a rowspan
// attribute, and the prefix may be the label of a wiki link. The prefix is
// the only capture group.
//
// The rowspan value accepts any number of digits, consistent with the
// rowspan handling used when stripping markup.
var letterColumnRegexp = regexp.MustCompile(`^\|(?: rowspan="\d+" \|)? '''(?:\[\[[^|]*\|)?([A-ZÖÜÄ]{1,3})(?:\]\])?'''$`)

// getLetterColumn returns the prefix contained in line and true when line is
// a prefix cell, or "" and false otherwise.
func getLetterColumn(line string) (string, bool) {
	match := letterColumnRegexp.FindStringSubmatch(line)
	if match == nil {
		return "", false
	}
	// The capture group requires at least one letter, so it is never empty
	// on a successful match.
	return match[1], true
}
var stripReplacer = strings.NewReplacer("|", "", "[", "", "]", "", "_", " ", "(", "", ")", "", "*", "", "'", "", " ", "") | |
var bracketRegexp = regexp.MustCompile(`\([^\)]*\)|<[^\)]*>`) | |
var rowspanRegexp = regexp.MustCompile(`rowspan="(\d+)"`) | |
var linkRegexp = regexp.MustCompile(`\[\[(.*?)\|(.*?)\]\]`) | |
var wordRegexp = regexp.MustCompile(`([\wäöüÄÖÜ])([\wäöüÄÖÜ]*)`) | |
func stripMediaWiki(line string) string { | |
// remove links | |
line = linkRegexp.ReplaceAllString(line, `$1`) | |
// remove further explanations in brackets | |
line = bracketRegexp.ReplaceAllString(line, "") | |
// remove rowspans | |
line = rowspanRegexp.ReplaceAllString(line, "") | |
// remove unwanted chars | |
line = stripReplacer.Replace(line) | |
// fix casing | |
line = wordRegexp.ReplaceAllStringFunc(line, func(m string) string { | |
return m[:1] + strings.ToLower(m[1:]) | |
}) | |
// trim result | |
return strings.TrimSpace(line) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Das liegt wahrscheinlich an Änderungen im HTML-Output von Wikipedia. Der Editor hat sich seit dem Skript mindestens einmal verändert. Ist das immer noch interessant für dich?