Skip to content

Instantly share code, notes, and snippets.

@rokujyouhitoma
Created June 5, 2018 02:22
Show Gist options
  • Save rokujyouhitoma/0c533b9e2aa70f8b6972dbc8488691ef to your computer and use it in GitHub Desktop.
Save rokujyouhitoma/0c533b9e2aa70f8b6972dbc8488691ef to your computer and use it in GitHub Desktop.
Convert Wikipedia titles to MeCab
import codecs
import re
import sys
import unicodedata
# Matches a parenthesized "_(...)" segment (non-greedy), e.g. the "_(YYY)"
# in "XXX_(YYY)".
REGEXP_ALIAS = re.compile(r'_\(.*?\)')
# Matches a string of one or more ASCII digits anchored at both ends,
# e.g. "1234". Note that `$` also matches just before a trailing newline.
REGEXP_NUMBER = re.compile(r'^[0-9]+$')
def IsSyllabaryType(char, syllabaryType):
    """Return True if the Unicode name of *char* starts with *syllabaryType*.

    Args:
        char: A single-character string.
        syllabaryType: Unicode-name prefix to test for, e.g. 'HIRAGANA'
            or 'LATIN'.

    Returns:
        True when the character's Unicode name begins with the prefix.
        Characters that have no Unicode name (for example control
        characters) yield False for any non-empty prefix.
    """
    # unicodedata.name() raises ValueError for unnamed code points unless a
    # default is supplied; fall back to an empty name instead of crashing.
    return unicodedata.name(char, '').startswith(syllabaryType)
def IsOneChar(word):
    """Return True if *word* is exactly one character long."""
    return len(word) == 1
def IsTwoHiraganaChars(word):
    """Return True if *word* has exactly two characters and passes IsHiraganaChars.

    Words of any other length return False without being inspected further.
    """
    if len(word) != 2:
        return False
    return IsHiraganaChars(word)
def IsHiraganaChars(word):
    """Return True if every character of *word* is hiragana.

    A character counts as hiragana when IsSyllabaryType(char, 'HIRAGANA')
    is true. An empty string yields True, since it contains no
    non-hiragana character.
    """
    # The previous loop returned False as soon as it *found* a hiragana
    # character, which inverted the meaning of the function.
    return all(IsSyllabaryType(char, 'HIRAGANA') for char in word)
def HasAlias(word):
    """Return True if *word* contains a "_(...)" segment, e.g. XXX_(YYY)."""
    # Identity comparison is the idiomatic None test for a match result.
    return REGEXP_ALIAS.search(word) is not None
def IsAllNumber(word):
    """Return True if *word* consists only of ASCII digits, e.g. 1234."""
    # Identity comparison is the idiomatic None test for a match result.
    return REGEXP_NUMBER.search(word) is not None
def HasSpecialChars(word):
    """Return True if *word* contains a character outside the allowed scripts.

    A character is ordinary when its Unicode name starts with LATIN, DIGIT,
    CJK, HIRAGANA or KATAKANA (as tested by IsSyllabaryType); any other
    character is "special". An empty string has no special characters and
    yields False.
    """
    allowed_prefixes = ('LATIN', 'DIGIT', 'CJK', 'HIRAGANA', 'KATAKANA')
    for char in word:
        # The previous version returned False here and True at the end,
        # i.e. the opposite of what the function name promises.
        if not any(IsSyllabaryType(char, prefix) for prefix in allowed_prefixes):
            return True
    return False
if __name__ == '__main__':
    # Disabled MeCab CSV output, kept for reference:
    #mecab_line_tmpl = u'%s,-1,-1,%d,名詞,一般,*,*,*,*,*,*,*,wikipedia\n'
    #cost = int(max(-36000, -400 * len(word)**1.5))
    #print(mecab_line_tmpl % (word, cost), end='')

    # The titles file may be passed as the first command-line argument; the
    # previously hard-coded file name remains the default.
    titles_path = (sys.argv[1] if len(sys.argv) > 1
                   else 'jawiki-20180520-all-titles-in-ns0')

    # Built-in open() with an explicit encoding replaces codecs.open(),
    # which is deprecated in recent Python releases.
    with open(titles_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.rstrip()
            if not word:
                # Blank line: there is no title to emit.
                continue
            # Skip titles rejected by any of the filters (same order as
            # before, so short-circuiting is unchanged).
            if (IsOneChar(word) or
                    HasAlias(word) or
                    IsAllNumber(word) or
                    IsTwoHiraganaChars(word) or
                    HasSpecialChars(word)):
                continue
            # NOTE: the extra '' argument makes print() emit a trailing
            # space after the word; kept so the output stays unchanged.
            print(word, '')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment