# Picklable tokenizer factory built on janome, by @tuxedocat (last active August 2, 2018).
# https://gist.github.com/tuxedocat/f0ec90858d6b54b94f0b80c66802ce3f
from typing import Callable, List, Optional, Sequence
import pickle

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import UnicodeNormalizeCharFilter
from janome.tokenfilter import CompoundNounFilter, POSStopFilter, LowerCaseFilter


class TokenizerFactory:
    """Builds picklable tokenizer functions backed by a janome Analyzer.

    The Analyzer itself is not picklable, so ``__reduce_ex__`` rebuilds the
    factory from its constructor arguments on unpickling.
    """

    def __init__(self,
                 token_filters: Sequence[str] = ('compound', 'pos', 'lowercase'),
                 exclude_pos: Optional[Sequence[str]] = ('記号', '助詞')):
        # Keep the constructor arguments so the factory can be rebuilt on unpickling.
        self.token_filters = tuple(token_filters)
        self.exclude_pos = tuple(exclude_pos) if exclude_pos is not None else None
        char_filters = [UnicodeNormalizeCharFilter()]
        tokenizer = Tokenizer(mmap=False)
        filters = []
        if 'compound' in self.token_filters:
            # Merge runs of consecutive nouns into single compound-noun tokens.
            filters.append(CompoundNounFilter())
        if 'pos' in self.token_filters and self.exclude_pos is not None:
            # Drop tokens whose part of speech matches exclude_pos
            # (defaults: 記号 = symbols, 助詞 = particles).
            filters.append(POSStopFilter(list(self.exclude_pos)))
        if 'lowercase' in self.token_filters:
            filters.append(LowerCaseFilter())
        self.analyzer_ = Analyzer(char_filters=char_filters,
                                  tokenizer=tokenizer,
                                  token_filters=filters)
        self.tokenizer_ = None

    def _base(self, s: str) -> List[str]:
        # Dictionary base forms (lemmas).
        return [t.base_form for t in self.analyzer_.analyze(s)]

    def _surface(self, s: str) -> List[str]:
        # Surface forms as they appear in the input.
        return [t.surface for t in self.analyzer_.analyze(s)]

    def _yomi(self, s: str) -> List[str]:
        # Katakana readings.
        return [t.reading for t in self.analyzer_.analyze(s)]

    def _phonetic(self, s: str) -> List[str]:
        # Phonetic transcriptions.
        return [t.phonetic for t in self.analyzer_.analyze(s)]

    def _base_with_pos(self, s: str) -> List[str]:
        # Base forms joined with their part-of-speech tags.
        return [f'{t.base_form}/{t.part_of_speech}' for t in self.analyzer_.analyze(s)]

    def _surface_with_pos(self, s: str) -> List[str]:
        return [f'{t.surface}/{t.part_of_speech}' for t in self.analyzer_.analyze(s)]

    def __call__(self, tokenization_type: str = 'surface') -> Callable[[str], List[str]]:
        # Return the tokenizer function for the requested output type.
        dispatch = {
            'base': self._base,
            'surface': self._surface,
            'yomi': self._yomi,
            'phonetic': self._phonetic,
            'base_with_pos': self._base_with_pos,
            'surface_with_pos': self._surface_with_pos,
        }
        if tokenization_type not in dispatch:
            raise NotImplementedError(
                f'Tokenization type {tokenization_type} is not supported.')
        self.tokenizer_ = dispatch[tokenization_type]
        return self.tokenizer_

    def __reduce_ex__(self, protocol):
        # The underlying Analyzer cannot be pickled, so pickle only the
        # constructor arguments and rebuild the factory on unpickling.
        # (The original returned ('',), which silently dropped all filters.)
        return type(self), (self.token_filters, self.exclude_pos)


# Build a surface-form tokenizer function and check that it survives a pickle
# round trip, which a raw janome Tokenizer/Analyzer would not.
tokenizer_func = TokenizerFactory()()
unpickled = pickle.loads(pickle.dumps(tokenizer_func))
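

# A minimal usage sketch. The sample sentence is illustrative, and the exact
# tokens depend on the dictionary bundled with janome (IPADIC), so the
# expected output noted below is approximate.
if __name__ == '__main__':
    tokenize = TokenizerFactory()('base')
    # With the default filters, particles (助詞) and symbols (記号) are dropped
    # and verbs are reduced to their base forms: roughly
    # ['寿司', '食べる', 'たい'] for the sentence below.
    print(tokenize('寿司が食べたい。'))

    # The unpickled function is rebuilt with the same configuration,
    # so it should produce the same tokens as the original.
    restored = pickle.loads(pickle.dumps(tokenize))
    assert restored('寿司が食べたい。') == tokenize('寿司が食べたい。')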