Skip to content

Instantly share code, notes, and snippets.

@bumcru
Created November 20, 2018 11:51
Show Gist options
  • Save bumcru/729632c7587f16c69d40a878c0bde750 to your computer and use it in GitHub Desktop.
Save bumcru/729632c7587f16c69d40a878c0bde750 to your computer and use it in GitHub Desktop.
goで禁則処理
func splitByBlock(text string) (words []string) {
if text == "" {
return []string{}
}
rs := []rune(text)
var tmp string
for _, r := range rs {
// 空白文字の前後は無条件に分割
if isEmptyString(string(r)) {
if tmp != "" {
words = append(words, tmp)
tmp = ""
}
words = append(words, string(r))
continue
}
// 1バイト文字の場合、もしくは、行末禁止文字の場合は分割しない
if len(string(r)) == 1 || !isValidAsEnd(string(r)) {
tmp += string(r)
continue
}
tmp += string(r)
// 1文字目が行頭禁止文字の場合、1つ前の単語の末尾に付加する
if !isValidAsBullet(string([]rune(tmp)[0])) {
if len(words) == 0 {
words = []string{tmp}
} else {
words[len(words)-1] += tmp
}
} else {
words = append(words, tmp)
}
tmp = ""
}
if tmp != "" {
words = append(words, tmp)
}
return
}
// emptyStrings is the set of whitespace characters recognized by
// isEmptyString. Membership is what matters: lookups test key presence.
//
// The ideographic space is written as an escape so that it cannot be folded
// into the ASCII space by Unicode normalization, which would produce a
// duplicate map key (a compile error).
var emptyStrings = map[string]bool{
	" ":      true, // ASCII space (U+0020)
	"\u3000": true, // ideographic (full-width) space
	"\t":     true,
	"\n":     true,
}
// isEmptyString reports whether s is one of the keys of emptyStrings.
// Only key presence is checked; the value stored under the key is ignored.
func isEmptyString(s string) (res bool) {
	if _, listed := emptyStrings[s]; listed {
		return true
	}
	return false
}
// invalidBullets is the set of characters that must not appear at the start
// of a line (line-start prohibited characters). Membership is what matters:
// isValidAsBullet tests key presence.
//
// Half-width katakana and full-width ASCII variants are written as \u escapes
// so that they cannot be folded into their counterparts by Unicode
// normalization, which would produce duplicate map keys (a compile error).
//
// NOTE(review): ".", ":", ";" and "-" are listed in one width only; confirm
// whether their other-width forms should be listed as well.
var invalidBullets = map[string]bool{
	// Sentence punctuation.
	"、": true, "。": true, ".": true,
	// Small hiragana.
	"ぁ": true, "ぃ": true, "ぅ": true, "ぇ": true, "ぉ": true,
	"っ": true, "ゃ": true, "ゅ": true, "ょ": true,
	// Small katakana, full-width.
	"ァ": true, "ィ": true, "ゥ": true, "ェ": true, "ォ": true,
	"ッ": true, "ャ": true, "ュ": true, "ョ": true,
	// Small katakana, half-width (same order as the full-width rows above).
	"\uff67": true, "\uff68": true, "\uff69": true, "\uff6a": true, "\uff6b": true,
	"\uff6f": true, "\uff6c": true, "\uff6d": true, "\uff6e": true,
	// Closing brackets, ASCII.
	")": true, "}": true, "]": true, ">": true,
	// Closing brackets, CJK and full-width; the escapes are the full-width
	// forms of ")", "}", ">" and "]" respectively.
	"」": true, "』": true, "\uff09": true, "\uff5d": true, "】": true, "\uff1e": true, "≫": true, "\uff3d": true,
	// Middle dot, prolonged sound mark, dashes.
	"・": true, "ー": true, "―": true, "-": true,
	// Colon, semicolon, and slash (ASCII and full-width).
	":": true, ";": true, "/": true, "\uff0f": true,
	// Iteration marks, then "!" and "?" (ASCII and full-width).
	"ゝ": true, "々": true, "!": true, "?": true, "\uff01": true, "\uff1f": true,
}
// isValidAsBullet reports whether s may appear at the start of a line, i.e.
// whether s is absent from invalidBullets. Only key presence is checked; the
// value stored under the key is ignored.
func isValidAsBullet(s string) bool {
	if _, prohibited := invalidBullets[s]; prohibited {
		return false
	}
	return true
}
// invalidEndChars is the set of characters that must not appear at the end of
// a line (line-end prohibited characters): opening brackets. Membership is
// what matters: isValidAsEnd tests key presence.
//
// Full-width ASCII variants are written as \u escapes so that they cannot be
// folded into their ASCII counterparts by Unicode normalization, which would
// produce duplicate map keys (a compile error).
var invalidEndChars = map[string]bool{
	// Opening brackets, ASCII.
	"(": true, "{": true, "[": true, "<": true,
	// Opening brackets, CJK and full-width; the escapes are the full-width
	// forms of "(", "{", "<" and "[" respectively.
	"「": true, "『": true, "\uff08": true, "\uff5b": true, "【": true, "\uff1c": true, "≪": true, "\uff3b": true,
}
// isValidAsEnd reports whether s may appear at the end of a line, i.e.
// whether s is absent from invalidEndChars. Only key presence is checked; the
// value stored under the key is ignored.
func isValidAsEnd(s string) bool {
	if _, prohibited := invalidEndChars[s]; prohibited {
		return false
	}
	return true
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment