Created
July 20, 2019 14:04
-
-
Save shinderuman/11f4e4bfb8baf41fcb376f03c709481f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"strings" | |
"unicode" | |
"github.com/ikawaha/kagome/tokenizer" | |
) | |
func main() { | |
fmt.Println(gienkun("我は劉備の夢を叶えたい")) | |
fmt.Println(gienkun("これが異世界です")) | |
} | |
func gienkun(s string) string { | |
t := tokenizer.New() | |
text := strings.TrimSpace(s) | |
if text == "" { | |
return "" | |
} | |
tokens := t.Tokenize(text) | |
var items []string | |
kanaConv := unicode.SpecialCase{ | |
// ひらがなをカタカナに変換 | |
unicode.CaseRange{ | |
0x3041, // Lo: ぁ | |
0x3093, // Hi: ん | |
[unicode.MaxCase]rune{ | |
0x30a1 - 0x3041, // UpperCase でカタカナに変換 | |
0, // LowerCase では変換しない | |
0x30a1 - 0x3041, // TitleCase でカタカナに変換 | |
}, | |
}, | |
// カタカナをひらがなに変換 | |
unicode.CaseRange{ | |
0x30a1, // Lo: ァ | |
0x30f3, // Hi: ン | |
[unicode.MaxCase]rune{ | |
0, // UpperCase では変換しない | |
0x3041 - 0x30a1, // LowerCase でひらがなに変換 | |
0, // TitleCase では変換しない | |
}, | |
}, | |
} | |
for _, token := range tokens { | |
features := token.Features() | |
if token.Class != tokenizer.UNKNOWN { | |
if len(features) == 0 || token.Surface == "BOS" || token.Surface == "EOS" { | |
continue | |
} | |
if features[0] == "助詞" || features[6] == "*" { | |
continue | |
} | |
} | |
items = append(items, strings.ToUpperSpecial(kanaConv, token.Surface)) | |
} | |
return strings.Join(items, "…") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment