Skip to content

Instantly share code, notes, and snippets.

@veer66
Created April 17, 2014 17:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save veer66/11001209 to your computer and use it in GitHub Desktop.
Save veer66/11001209 to your computer and use it in GitHub Desktop.
There are still so many errors.
package main
// Based on https://github.com/dps/go-xml-parse/blob/master/go-xml-parse.go
import (
	"encoding/json"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"os"
	"regexp"
	"strings"
)
// Command-line flags.
var (
	// inputFile is the path of the XML dump to read (-infile).
	inputFile = flag.String("infile", "thwiktionary-20140406-pages-articles-multistream.xml",
		"Input file path")
	// outDictPath is the path of the JSON-lines dictionary to write (-outfile).
	outDictPath = flag.String("outfile", "dict.json", "Output dictionary file")
)

// Patterns used to pick apart wiki markup. The character range
// U+0E00-U+0EFF used below spans the Unicode Thai and Lao blocks.
var (
	// langLinkRe matches "*{{xx}}: rest" and captures the two-character
	// code and the remainder of the line.
	langLinkRe = regexp.MustCompile("\\*{{(\\w\\w)}}: (.+)")
	// linkRe matches a "[[...]]" wiki link and captures its contents.
	linkRe = regexp.MustCompile("\\[\\[([^\\[]+)\\]\\]")
	// langSymbolRe matches a two-character "{{xx}}" template.
	langSymbolRe = regexp.MustCompile("\\{\\{\\w\\w\\}\\}")
	// transHeaderRe matches the level-3 "คำแปล" heading.
	transHeaderRe = regexp.MustCompile("===\\s*คำแปล\\s*===")
	// latinRe matches non-empty strings made only of U+0000-U+00FF.
	latinRe = regexp.MustCompile("^[\u0000-\u00FF]+$")
	// sectionRe matches any "==...==" heading; it is used as a section
	// terminator.
	sectionRe = regexp.MustCompile("==.+==")
	// posSectionRe matches the lines from which a translation search is
	// started. It is unanchored, so its second alternative matches any
	// line containing a character in U+0E00-U+0EFF.
	posSectionRe = regexp.MustCompile("=+\\s*{{หน้าที่-\\w\\w\\|([^=]+)\\s*=+|=*\\s*{*([\u0E00-\u0EFF]+)|=+{*(วลี)\\|.*|=+\\s*{*(ตัว[\u0E00-\u0EFF]+).+|=*\\s*{*ความหมาย|===(Determiner)===")
	// itemRe matches a list item introduced by '*', '#' or ':' and
	// captures everything after the marker.
	itemRe = regexp.MustCompile("\\s*[\\*#:](.+)")
	// thaiPartRe matches a run of characters in U+0E00-U+0EFF anywhere in
	// a string.
	thaiPartRe = regexp.MustCompile("[\u0E00-\u0EFF]+")
	// thaiRe matches strings consisting entirely of such characters.
	thaiRe = regexp.MustCompile("^[\u0E00-\u0EFF]+$")
)
// Redirect holds the target of a page redirect as decoded from XML.
type Redirect struct {
// Title is read from the element's "title" attribute.
Title string `xml:"title,attr"`
}
// Page is the subset of a <page> element that this program decodes.
type Page struct {
// Title is the text of the <title> child element.
Title string `xml:"title"`
// Redir is decoded from the <redirect> child element; its Title stays
// empty when the page has no such element.
Redir Redirect `xml:"redirect"`
// Text is the text of the <text> element nested inside <revision>.
Text string `xml:"revision>text"`
}
// Gross pairs a language code with a piece of link text, as produced by
// ExtractGrossListDetail.
// NOTE(review): the name is presumably a misspelling of "Gloss" — confirm
// before renaming, since it is part of the exported API.
type Gross struct {
// Lang is the two-character code captured from a "*{{xx}}: ..." line.
Lang string
// Text is the link text associated with that language.
Text string
}
// ExtractGrossListDetail turns one langLinkRe match into a list of Gross
// values.
//
// matchInfo must be a submatch slice in which element 1 is the language code
// and element 2 is the remainder of the line. Every "[[...]]" link found in
// the remainder contributes entries tagged with that language:
//
//   - a link without '|' contributes its contents as is;
//   - a link with '|' whose first alternative contains ':' contributes only
//     the segment between the first and second ':' of that alternative;
//   - any other link with '|' contributes each '|'-separated alternative.
//
// The returned slice is never nil.
func ExtractGrossListDetail(matchInfo []string) []Gross {
	lang, rest := matchInfo[1], matchInfo[2]
	out := []Gross{}
	for _, sub := range linkRe.FindAllStringSubmatch(rest, -1) {
		target := sub[1]
		if !strings.Contains(target, "|") {
			out = append(out, Gross{lang, target})
			continue
		}

		variants := strings.Split(target, "|")
		head := variants[0]
		if !strings.Contains(head, ":") {
			for _, v := range variants {
				out = append(out, Gross{lang, v})
			}
			continue
		}

		// head is known to contain ':', so Split yields at least two parts.
		out = append(out, Gross{lang, strings.Split(head, ":")[1]})
	}
	return out
}
// ExtractGrossList scans lines for "*{{xx}}: ..." entries (langLinkRe) and
// concatenates the Gross values that ExtractGrossListDetail derives from each
// matching line, in input order. Lines that do not match are ignored. The
// returned slice is never nil.
func ExtractGrossList(lines []string) []Gross {
	result := []Gross{}
	for i := range lines {
		parts := langLinkRe.FindStringSubmatch(lines[i])
		if len(parts) != 3 {
			continue
		}
		result = append(result, ExtractGrossListDetail(parts)...)
	}
	return result
}
// SeekInTransSection collects candidate translations from lines, reading only
// up to (and excluding) the first line that matches sectionRe.
//
// The level argument controls how permissive the search is:
//
//   - at every level, each list item (itemRe) contributes the contents of
//     those "[[...]]" links that satisfy thaiRe;
//   - at level 2 and above, each list item additionally contributes every
//     run matched by thaiPartRe, linked or not;
//   - at level 1 and above, if the list items produced nothing, the lines are
//     scanned again and the qualifying links of the first line that has any
//     are returned; the link text "ภาษาอังกฤษ" is excluded in this pass.
//
// The returned slice is never nil.
func SeekInTransSection(lines []string, level int) []string {
	found := []string{}

	// First pass: list items only.
	for _, ln := range lines {
		if sectionRe.MatchString(ln) {
			break
		}
		item := itemRe.FindStringSubmatch(ln)
		if item == nil {
			continue
		}
		body := item[1]
		for _, sub := range linkRe.FindAllStringSubmatch(body, -1) {
			if target := sub[1]; thaiRe.MatchString(target) {
				found = append(found, target)
			}
		}
		if level <= 1 {
			continue
		}
		for _, run := range thaiPartRe.FindAllString(body, -1) {
			if thaiRe.MatchString(run) {
				found = append(found, run)
			}
		}
	}

	if level <= 0 || len(found) != 0 {
		return found
	}

	// Fallback pass: links on any line, stopping at the first productive line.
	for _, ln := range lines {
		if sectionRe.MatchString(ln) {
			break
		}
		for _, sub := range linkRe.FindAllStringSubmatch(ln, -1) {
			target := sub[1]
			if !thaiRe.MatchString(target) || target == "ภาษาอังกฤษ" {
				continue
			}
			found = append(found, target)
		}
		if len(found) > 0 {
			break
		}
	}
	return found
}
// SeekTrans gathers translations for a page given as a slice of lines.
//
// For each search level from 0 to 3 it visits every line that matches
// posSectionRe and appends what SeekInTransSection finds in the lines that
// follow it. The first level that yields at least one result wins; higher
// levels are not tried. Because every matching line starts its own search,
// the same translation can appear more than once in the result.
//
// The returned slice is never nil.
func SeekTrans(lines []string) []string {
	collected := []string{}
	for level := 0; level <= 3 && len(collected) == 0; level++ {
		for idx := range lines {
			if !posSectionRe.MatchString(lines[idx]) {
				continue
			}
			collected = append(collected, SeekInTransSection(lines[idx+1:], level)...)
		}
	}
	return collected
}
// Entry is one record of the output dictionary. It carries no json tags, so
// encoding/json writes it under the keys "Li" and "Gloss".
type Entry struct {
// Li is the page title the entry was extracted from.
Li string
// Gloss lists the translations found for that title.
Gloss []string
}
// ExtractFromPage writes the dictionary entry for p to o as a single line of
// JSON, provided the page qualifies:
//
//   - the title contains no ':' and satisfies latinRe, and
//   - the page text contains "ภาษาอังกฤษ" or "{{en}}==".
//
// Pages that do not qualify produce no output. An entry is written even when
// SeekTrans finds no translations. The function panics if the entry cannot be
// marshalled; errors from writing to o are not checked.
func ExtractFromPage(p *Page, o *os.File) {
	if strings.Contains(p.Title, ":") || !latinRe.MatchString(p.Title) {
		return
	}
	if !strings.Contains(p.Text, "ภาษาอังกฤษ") && !strings.Contains(p.Text, "{{en}}==") {
		return
	}

	glosses := SeekTrans(strings.Split(p.Text, "\n"))
	encoded, err := json.Marshal(Entry{p.Title, glosses})
	if err != nil {
		fmt.Println(err)
		panic("Cannot convert JSON")
	}
	o.Write(encoded)
	o.WriteString("\n")
}
// main parses the command-line flags and converts the XML dump named by
// -infile into the JSON-lines dictionary named by -outfile. Any failure is
// reported on standard error and the process exits with status 1.
func main() {
	flag.Parse()
	if err := convertDump(*inputFile, *outDictPath); err != nil {
		fmt.Fprintln(os.Stderr, "Error:", err)
		os.Exit(1)
	}
}

// convertDump streams the XML file at inPath token by token, decodes every
// <page> element into a Page, and hands each page that has no redirect title
// to ExtractFromPage, which appends its entry to the file created at outPath.
//
// It returns nil once the input has been consumed up to a clean end of file.
// It returns an error when either file cannot be opened, when the XML cannot
// be tokenized or a page cannot be decoded (for example a truncated dump), or
// when closing the output file fails.
func convertDump(inPath, outPath string) (err error) {
	xmlFile, err := os.Open(inPath)
	if err != nil {
		return fmt.Errorf("opening input file: %v", err)
	}
	defer xmlFile.Close()

	outDictFile, err := os.Create(outPath)
	if err != nil {
		return fmt.Errorf("creating output file: %v", err)
	}
	defer func() {
		// Surface a failed close of the output file, but never mask an
		// earlier, more specific error.
		if cerr := outDictFile.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("closing output file: %v", cerr)
		}
	}()

	decoder := xml.NewDecoder(xmlFile)
	for {
		tok, tokErr := decoder.Token()
		if tokErr == io.EOF {
			// Token reports a clean end of input as (nil, io.EOF).
			return nil
		}
		if tokErr != nil {
			// Anything else is a syntax or read error; stopping silently
			// here would leave a partial dictionary that looks complete.
			return fmt.Errorf("reading XML: %v", tokErr)
		}

		start, isStart := tok.(xml.StartElement)
		if !isStart || start.Name.Local != "page" {
			continue
		}

		var p Page
		if decErr := decoder.DecodeElement(&p, &start); decErr != nil {
			return fmt.Errorf("decoding page: %v", decErr)
		}
		if p.Redir.Title == "" {
			ExtractFromPage(&p, outDictFile)
		}
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment