Describing rules in an easy-to-understand way
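The gist below is a small extraction pipeline split across four modules plus an entry script: an `ExtractorWorker` chains preprocessors, extractors, and postprocessors. The file labels follow the imports in the entry script.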
extractor.py:

```python
class Candidate:
    """A single extraction result, tagged with the algorithm that produced it."""

    def __init__(self, name, algorithm):
        self.name = name
        self.algorithm = algorithm

    def as_dict(self):
        return {
            "name": self.name,
            "algorithm": self.algorithm
        }

    def __repr__(self):
        return f"<Candidate(name={self.name})>"


class DummyExtractor:
    """A placeholder extractor that always returns the same two candidates."""

    def extract(self, sentence):
        return [
            Candidate(name="HOGE", algorithm="dummy"),
            Candidate(name="FUGA", algorithm="dummy"),
        ]

    def __repr__(self):
        return self.__class__.__name__
```
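`DummyExtractor` hard-codes its results, but it shows the contract an extractor must satisfy: any object with an `extract(sentence)` method returning `Candidate` objects can be registered on the worker. As a sketch of what a real step could look like (this `RegexExtractor` is hypothetical, not part of the gist):

```python
# Hypothetical example, not in the original gist: a pattern-based extractor
# that satisfies the same extract(sentence) contract as DummyExtractor.
import re

from extractor import Candidate


class RegexExtractor:
    """Extracts every match of a regular expression as a Candidate."""

    def __init__(self, pattern):
        self.pattern = re.compile(pattern)

    def extract(self, sentence):
        return [Candidate(name=match, algorithm="regex")
                for match in self.pattern.findall(sentence)]

    def __repr__(self):
        return self.__class__.__name__
```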
postprocessor.py:

```python
from abc import ABC, abstractmethod


class Postprocessor(ABC):
    """Base class for steps that filter or transform extracted candidates."""

    @abstractmethod
    def process(self, candidates):
        pass

    def __repr__(self):
        return self.__class__.__name__


class StopwordsFilter(Postprocessor):
    """Drops candidates whose name is in the stopword set."""

    def __init__(self):
        self.stop_words = {"HOGE"}

    def process(self, candidates):
        return [cand for cand in candidates if cand.name not in self.stop_words]
```
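Because `Postprocessor` is an ABC, further filters plug in without touching the worker. A sketch of a second filter (hypothetical, assuming a candidate's `name` identifies duplicates):

```python
# Hypothetical example: a postprocessor that removes duplicate candidates,
# keeping the first occurrence of each name.
from postprocessor import Postprocessor


class DeduplicationFilter(Postprocessor):
    """Keeps only the first candidate seen for each distinct name."""

    def process(self, candidates):
        seen = set()
        unique = []
        for cand in candidates:
            if cand.name not in seen:
                seen.add(cand.name)
                unique.append(cand)
        return unique
```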
preprocessor.py:

```python
import unicodedata
from abc import ABC, abstractmethod


class Preprocessor(ABC):
    """Base class for steps that transform the sentences before extraction."""

    @abstractmethod
    def process(self, sentences):
        pass

    def __repr__(self):
        return self.__class__.__name__


class UnicodeNormalizer(Preprocessor):
    """Applies NFKC normalization (e.g. full-width/half-width folding) to each sentence."""

    def process(self, sentences):
        return [unicodedata.normalize("NFKC", sentence) for sentence in sentences]
```
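Preprocessors receive and return a list of sentences, so they compose freely in the order they are added. A minimal additional step might look like this (hypothetical, not in the gist):

```python
# Hypothetical example: strips surrounding whitespace from every sentence
# before it reaches the extractors.
from preprocessor import Preprocessor


class WhitespaceStripper(Preprocessor):
    """Removes leading and trailing whitespace from each sentence."""

    def process(self, sentences):
        return [sentence.strip() for sentence in sentences]
```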
Entry point:

```python
from logzero import logger

from extractor import DummyExtractor
from postprocessor import StopwordsFilter
from preprocessor import UnicodeNormalizer
from worker import ExtractorWorker

if __name__ == "__main__":
    extractor = ExtractorWorker(description="sample")
    extractor.add_preprocessor(UnicodeNormalizer())
    extractor.add_extractor(DummyExtractor())
    extractor.add_postprocessor(StopwordsFilter())
    logger.info(extractor.get_steps())
    logger.info(extractor.extract("これはテストです。"))
```
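Tracing through the code: `get_steps()` reports `{"Description": "sample", "Preprocessing": ["UnicodeNormalizer"], "Extractors": ["DummyExtractor"], "Postprocessing": ["StopwordsFilter"]}`, and `extract` should return only `<Candidate(name=FUGA)>`, since `DummyExtractor` always yields HOGE and FUGA and `StopwordsFilter` drops HOGE.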
worker.py:

```python
import json

from logzero import logger


class ExtractorWorker:
    """Runs the registered preprocessors, extractors, and postprocessors in order."""

    def __init__(self, description=""):
        self.preprocessors = []
        self.postprocessors = []
        self.extractors = []
        self._description = description

    def add_preprocessor(self, preprocessor):
        self.preprocessors.append(preprocessor)

    def add_postprocessor(self, postprocessor):
        self.postprocessors.append(postprocessor)

    def add_extractor(self, extractor):
        self.extractors.append(extractor)

    def extract(self, text):
        # Preprocess the input, run every extractor, then filter the candidates.
        sentences = self._preprocess([text])
        candidates = self._extract(sentences)
        logger.debug("Candidates:" + json.dumps([cand.as_dict() for cand in candidates], ensure_ascii=False))
        return self._post_process(candidates)

    def _preprocess(self, sentences):
        logger.debug(json.dumps(sentences, ensure_ascii=False))
        for processor in self.preprocessors:
            sentences = processor.process(sentences)
            logger.debug(str(processor) + ":" + json.dumps(sentences, ensure_ascii=False))
        return sentences

    def _extract(self, sentences):
        extracted = []
        for extractor in self.extractors:
            for sentence in sentences:
                extracted.extend(extractor.extract(sentence))
        return extracted

    def _post_process(self, candidates):
        for postprocessor in self.postprocessors:
            candidates = postprocessor.process(candidates)
            logger.debug(str(postprocessor) + ":" + json.dumps([cand.as_dict() for cand in candidates], ensure_ascii=False))
        return candidates

    def get_steps(self):
        return {
            "Description": self._description,
            "Preprocessing": [str(step) for step in self.preprocessors],
            "Extractors": [str(step) for step in self.extractors],
            "Postprocessing": [str(step) for step in self.postprocessors]
        }
```
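The worker knows nothing about concrete steps; it only calls `process` and `extract`, and relies on each step's `__repr__` for the readable `get_steps()` output, which is what keeps the rule descriptions easy to follow. The end-to-end behavior can be checked with a minimal sketch (a hypothetical test, not part of the gist, assuming the modules above are importable):

```python
# Hypothetical test: verifies that the stopword "HOGE" is filtered out
# and only "FUGA" survives the pipeline.
from extractor import DummyExtractor
from postprocessor import StopwordsFilter
from worker import ExtractorWorker


def test_pipeline_filters_stopwords():
    worker = ExtractorWorker(description="test")
    worker.add_extractor(DummyExtractor())
    worker.add_postprocessor(StopwordsFilter())
    result = worker.extract("any input")
    assert [cand.name for cand in result] == ["FUGA"]
```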