Skip to content

Instantly share code, notes, and snippets.

@hayakawa
Created November 30, 2017 05:02
Show Gist options
  • Save hayakawa/a3c71bc64a1bdf426b58e91f087d219a to your computer and use it in GitHub Desktop.
Save hayakawa/a3c71bc64a1bdf426b58e91f087d219a to your computer and use it in GitHub Desktop.
Normalizer for neologd (this is a modified version; you can use it as a filter).
# encoding: utf8
from __future__ import unicode_literals
import re
import unicodedata
import sys
################################################################
# This is a modified version.
#
# If you need the original source code, see below:
# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp
#
# (Keyword for search: 'normalize_neologd.py')
################################################################
def unicode_normalize(cls, s):
    """Apply NFKC normalization to the runs of ``s`` made of ``cls`` characters.

    Args:
        cls: Body of a regex character class, without the surrounding
            brackets (e.g. ``'0-9a-z'``).  It is interpolated into the
            pattern verbatim, so it must be non-empty and valid inside
            ``[...]``.
        s: Text to normalize.

    Returns:
        ``s`` with every maximal run of ``cls`` characters replaced by its
        NFKC form, and with every remaining U+FF0D (FULLWIDTH HYPHEN-MINUS)
        replaced by an ASCII ``-``.
    """
    pt = re.compile('([{}]+)'.format(cls))

    def norm(chunk):
        # Pieces between matches never start with a class character, so
        # match() succeeds only on the captured runs themselves.
        if pt.match(chunk):
            return unicodedata.normalize('NFKC', chunk)
        return chunk

    # The capturing group makes split() return the matched runs as well as
    # the text between them, so joining the pieces rebuilds the whole string.
    s = ''.join(norm(piece) for piece in pt.split(s))
    # Fold FULLWIDTH HYPHEN-MINUS to ASCII even when it is outside ``cls``.
    # Spelled as an escape: the two hyphens are easy to confuse visually and
    # a literal ASCII '-' here would turn this step into a no-op.
    s = s.replace('\uff0d', '-')
    return s
def remove_extra_spaces(s):
    """Collapse runs of spaces and drop single spaces that touch CJK text.

    Runs of ASCII spaces and U+3000 IDEOGRAPHIC SPACE are first collapsed to
    one ASCII space.  A remaining space is then removed when it sits between
    two characters of the CJK/kana/fullwidth blocks listed below, or between
    one such character and a Basic Latin character (either order).  A space
    between two Basic Latin characters is kept.

    Args:
        s: Text to clean up.

    Returns:
        The text with the redundant spaces removed.
    """
    # U+3000 is written as an escape so it cannot be confused with (or
    # silently replaced by) an ordinary ASCII space in the source file.
    s = re.sub('[ \u3000]+', ' ', s)
    blocks = ''.join((
        '\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
        '\u3040-\u309F',  # HIRAGANA
        '\u30A0-\u30FF',  # KATAKANA
        '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
        '\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
    ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        # A single sub() pass skips overlapping matches: in "A B C" the match
        # "A B" consumes B, so " C" is only caught on the next pass.
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s
def normalize_neologd(s):
    """Normalize Japanese text following the mecab-ipadic-NEologd rules.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. NFKC-normalize fullwidth digits/letters and halfwidth katakana.
      3. Collapse hyphen look-alikes to ``-`` and long-vowel look-alikes to
         U+30FC; remove tilde variants.
      4. Temporarily map ASCII punctuation to fullwidth forms, remove
         redundant spaces, then NFKC-normalize the fullwidth punctuation
         again, except U+FF1D which is left out of that class.
      5. Replace U+2019 / U+201D with ASCII ``'`` / ``"``.

    All non-ASCII code points are written as ``\\uXXXX`` escapes: several of
    them differ from an ASCII character only by width, and a literal ASCII
    ``-`` or ``]`` inside these character classes would change (or break)
    the regular expressions.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text.
    """
    s = s.strip()
    # U+FF10-FF19 digits, U+FF21-FF3A / U+FF41-FF5A letters,
    # U+FF61-FF9F halfwidth CJK punctuation and katakana.
    s = unicode_normalize(
        '\uff10-\uff19\uff21-\uff3a\uff41-\uff5a\uff61-\uff9f', s)

    def maketrans(f, t):
        return {ord(x): ord(y) for x, y in zip(f, t)}

    # normalize hyphens
    s = re.sub(
        '[\u02d7\u058a\u2010\u2011\u2012\u2013\u2043\u207b\u208b\u2212]+',
        '-', s)
    # normalize choonpus (long-vowel marks and dash/box-drawing look-alikes)
    s = re.sub(
        '[\ufe63\uff0d\uff70\u2014\u2015\u2500\u2501\u30fc]+', '\u30fc', s)
    # remove tildes
    s = re.sub('[~\u223c\u223e\u301c\u3030\uff5e]', '', s)

    # Position-for-position translation table.
    ascii_punct = '!"#$%&\'()*+,-./:;<=>?@[\u00a5]^_`{|}~'
    fullwidth_punct = (
        '\uff01\u201d\uff03\uff04\uff05\uff06\u2019\uff08\uff09\uff0a'
        '\uff0b\uff0c\uff0d\uff0e\uff0f\uff1a\uff1b\uff1c\uff1d\uff1e'
        '\uff1f\uff20\uff3b\uffe5\uff3d\uff3e\uff3f\uff40\uff5b\uff5c'
        '\uff5d\u301c')
    halfwidth_cjk_punct = '\uff61\uff64\uff65\uff62\uff63'
    fullwidth_cjk_punct = '\u3002\u3001\u30fb\u300c\u300d'
    s = s.translate(
        maketrans(ascii_punct + halfwidth_cjk_punct,
                  fullwidth_punct + fullwidth_cjk_punct))

    s = remove_extra_spaces(s)
    # Normalize the fullwidth punctuation again; U+FF1D is dropped from the
    # class so it stays fullwidth, and the CJK punctuation is not in the
    # class at all.
    s = unicode_normalize(fullwidth_punct.replace('\uff1d', ''), s)
    s = re.sub('[\u2019]', '\'', s)
    s = re.sub('[\u201d]', '"', s)
    return s
if __name__ == "__main__":
    # Filter mode: normalize each line read from standard input and write the
    # result to standard output, one line per input line.
    for raw_line in sys.stdin:
        normalized = normalize_neologd(raw_line)
        print(normalized)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment