Created
April 17, 2014 17:54
-
-
Save veer66/11001209 to your computer and use it in GitHub Desktop.
There are still so many errors.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
// Based on https://github.com/dps/go-xml-parse/blob/master/go-xml-parse.go | |
import (
	"encoding/json"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"os"
	"regexp"
	"strings"
)
// Command-line flags: the XML dump to read and the JSON-lines file to write.
var (
	inputFile = flag.String("infile", "thwiktionary-20140406-pages-articles-multistream.xml",
		"Input file path")
	outDictPath = flag.String("outfile", "dict.json", "Output dictionary file")
)

// Regular expressions applied to page titles and to individual wikitext lines.
// All of them are compiled once at package initialisation.
var (
	// langLinkRe matches "*{{xx}}: rest" and captures the two-character code
	// (group 1) and the non-empty remainder (group 2).
	langLinkRe = regexp.MustCompile("\\*{{(\\w\\w)}}: (.+)")

	// linkRe matches a "[[...]]" link and captures its inner text, which may
	// not contain "[".
	linkRe = regexp.MustCompile("\\[\\[([^\\[]+)\\]\\]")

	// langSymbolRe matches a "{{xx}}" marker with a two-character code.
	// It is not referenced by the code visible in this file.
	langSymbolRe = regexp.MustCompile("\\{\\{\\w\\w\\}\\}")

	// transHeaderRe matches the level-3 heading "=== คำแปล ===" with optional
	// whitespace around the title. It is not referenced by the code visible
	// in this file.
	transHeaderRe = regexp.MustCompile("===\\s*คำแปล\\s*===")

	// latinRe matches a non-empty string made only of code points
	// U+0000 through U+00FF.
	latinRe = regexp.MustCompile("^[\u0000-\u00FF]+$")

	// sectionRe matches any text containing "==", at least one character,
	// and "==" again, i.e. a wikitext heading of any level.
	sectionRe = regexp.MustCompile("==.+==")

	// posSectionRe is an alternation of the heading shapes after which
	// translations are searched for.
	// NOTE(review): the second alternative allows zero "=" and zero "{", so it
	// matches any line containing a character in U+0E00-U+0EFF, not only
	// headings — confirm whether that is intended.
	posSectionRe = regexp.MustCompile("=+\\s*{{หน้าที่-\\w\\w\\|([^=]+)\\s*=+|=*\\s*{*([\u0E00-\u0EFF]+)|=+{*(วลี)\\|.*|=+\\s*{*(ตัว[\u0E00-\u0EFF]+).+|=*\\s*{*ความหมาย|===(Determiner)===")

	// itemRe matches a list item: optional whitespace, one of "*", "#" or ":",
	// then the non-empty rest of the line (group 1).
	itemRe = regexp.MustCompile("\\s*[\\*#:](.+)")

	// thaiPartRe matches a run of characters in U+0E00-U+0EFF anywhere in a
	// string (the range spans the Thai and Lao Unicode blocks).
	thaiPartRe = regexp.MustCompile("[\u0E00-\u0EFF]+")

	// thaiRe matches a non-empty string made only of characters in
	// U+0E00-U+0EFF.
	thaiRe = regexp.MustCompile("^[\u0E00-\u0EFF]+$")
)
// Redirect is decoded from a <redirect> element; Title is read from that
// element's "title" attribute. main treats a page with a non-empty Title
// here as a redirect and skips it.
type Redirect struct {
	Title string `xml:"title,attr"`
}
type Page struct { | |
Title string `xml:"title"` | |
Redir Redirect `xml:"redirect"` | |
Text string `xml:"revision>text"` | |
} | |
// Gross pairs a language code with one piece of link text extracted from a
// "*{{xx}}: ..." line.
// NOTE(review): the name is presumably a misspelling of "gloss"; it is kept
// because callers refer to it.
type Gross struct {
	Lang string
	Text string
}
func ExtractGrossListDetail(matchInfo []string) []Gross { | |
var lang = matchInfo[1] | |
var rest = matchInfo[2] | |
var grossList = []Gross{} | |
for _, link := range linkRe.FindAllString(rest, -1) { | |
var linkMatchInfo = linkRe.FindStringSubmatch(link) | |
var text = linkMatchInfo[1] | |
if strings.Contains(text, "|") { | |
var alters = strings.Split(text, "|") | |
if len(alters) > 0 { | |
if strings.Contains(alters[0], ":") { | |
var toks = strings.Split(alters[0], ":") | |
if len(toks) >= 2 { | |
grossList = append(grossList, Gross{lang, toks[1]}) | |
} | |
} else { | |
for _, alter := range alters { | |
grossList = append(grossList, Gross{lang, alter}) | |
} | |
} | |
} | |
} else { | |
grossList = append(grossList, Gross{lang, text}) | |
} | |
} | |
return grossList | |
} | |
func ExtractGrossList(lines []string) []Gross { | |
var grossList = []Gross{} | |
for _, line := range lines { | |
var m = langLinkRe.FindStringSubmatch(line) | |
if len(m) == 3 { | |
grossList = append(grossList, ExtractGrossListDetail(m)...) | |
} | |
} | |
return grossList | |
} | |
func SeekInTransSection(lines []string, level int) []string { | |
trans := []string{} | |
for _, line := range lines { | |
if sectionRe.MatchString(line) { | |
break | |
} | |
var m = itemRe.FindStringSubmatch(line) | |
if len(m) > 0 { | |
var rest = m[1] | |
var links = linkRe.FindAllStringSubmatch(rest, -1); | |
for _, link := range links { | |
if thaiRe.MatchString(link[1]) { | |
trans = append(trans, link[1]) | |
} | |
} | |
if level > 1 { | |
var links_ = thaiPartRe.FindAllStringSubmatch(rest, -1) | |
//fmt.Println("#REST:", rest, " === #LINKS: ", links_) | |
for _, link := range links_ { | |
if thaiRe.MatchString(link[0]) { | |
trans = append(trans, link[0]) | |
} | |
} | |
} | |
} | |
} | |
if level > 0 { | |
if len(trans) == 0 { | |
for _, line := range lines { | |
if sectionRe.MatchString(line) { | |
break | |
} | |
for _, link := range linkRe.FindAllStringSubmatch(line, -1) { | |
if thaiRe.MatchString(link[1]) && link[1] != "ภาษาอังกฤษ" { | |
trans = append(trans, link[1]) | |
} | |
} | |
if len(trans) > 0 { | |
break | |
} | |
} | |
} | |
} | |
return trans | |
} | |
func SeekTrans(lines []string) []string { | |
trans := []string{} | |
var level int | |
for level = 0; level <= 3; level++ { | |
for i, line := range lines { | |
var m = posSectionRe.FindAllStringSubmatch(line, -1) | |
if len(m) > 0 { | |
trans = append(trans, SeekInTransSection(lines[i + 1:], level)...) | |
} | |
} | |
if len(trans) > 0 { | |
break | |
} | |
} | |
return trans | |
} | |
// Entry is one output record. ExtractFromPage fills Li with the page title
// and Gloss with the translations found for it. The struct has no JSON tags,
// so it is marshalled with the keys "Li" and "Gloss".
type Entry struct {
	Li    string
	Gloss []string
}
func ExtractFromPage(p *Page, o *os.File) { | |
if strings.Index(p.Title, ":") < 0 { | |
if latinRe.MatchString(p.Title) { | |
if strings.Contains(p.Text, "ภาษาอังกฤษ") || strings.Contains(p.Text, "{{en}}==") { | |
var lines = strings.Split(p.Text, "\n") | |
var trans = SeekTrans(lines) | |
entry := Entry{p.Title, trans} | |
b, err := json.Marshal(entry) | |
if err != nil { | |
fmt.Println(err) | |
panic("Cannot convert JSON") | |
} | |
o.Write(b) | |
o.WriteString("\n") | |
} | |
} | |
} | |
} | |
func main() { | |
flag.Parse() | |
xmlFile, err := os.Open(*inputFile) | |
if err != nil { | |
fmt.Println("Error opening file:", err) | |
return | |
} | |
defer xmlFile.Close() | |
outDictFile, err := os.Create(*outDictPath) | |
if err != nil { | |
fmt.Println("Error opening file:", err) | |
return | |
} | |
defer outDictFile.Close() | |
decoder := xml.NewDecoder(xmlFile) | |
var inElement string | |
for { | |
t, _ := decoder.Token() | |
if t == nil { | |
break | |
} | |
switch se := t.(type) { | |
case xml.StartElement: | |
inElement = se.Name.Local | |
if inElement == "page" { | |
var p Page | |
decoder.DecodeElement(&p, &se) | |
if p.Redir.Title == "" { | |
ExtractFromPage(&p, outDictFile) | |
} | |
} | |
default: | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment