わかりやすい規則の記述
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Candidate:
    """A value object pairing a candidate name with the algorithm that produced it."""

    def __init__(self, name, algorithm):
        """Remember the candidate's name and originating algorithm.

        Args:
            name: Stored unchanged on ``self.name``.
            algorithm: Stored unchanged on ``self.algorithm``.
        """
        self.name, self.algorithm = name, algorithm

    def as_dict(self):
        """Return a new dict with the keys ``"name"`` and ``"algorithm"``."""
        return dict(name=self.name, algorithm=self.algorithm)

    def __repr__(self):
        """Return a short debug string that shows only the name."""
        shown = self.name
        return f"<Candidate(name={shown})>"
class DummyExtractor:
    """Placeholder extractor that ignores its input and returns fixed candidates."""

    def __init__(self):
        """Nothing to configure; the extractor keeps no state."""

    def extract(self, sentence):
        """Return two fixed candidates, ``HOGE`` and ``FUGA``, tagged ``dummy``.

        Args:
            sentence: Accepted so the call shape matches other extractors;
                never read.

        Returns:
            A freshly built two-element list of ``Candidate`` objects.
        """
        fixed_names = ("HOGE", "FUGA")
        return [Candidate(name=label, algorithm="dummy") for label in fixed_names]

    def __repr__(self):
        """Return the name of the concrete class."""
        return type(self).__name__
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from abc import ABC, abstractmethod | |
class Postprocessor(ABC):
    """Abstract base class for steps that post-process extracted candidates."""

    @abstractmethod
    def process(self, candidates):
        """Process ``candidates``; concrete subclasses must override this.

        The base implementation has no body of its own and returns ``None``.
        """

    def __repr__(self):
        """Return the name of the concrete class."""
        return type(self).__name__
class StopwordsFilter(Postprocessor):
    """Postprocessor that drops candidates whose ``name`` is a stop word."""

    # Stop words used when the caller does not supply any (the original
    # hard-coded value).
    DEFAULT_STOP_WORDS = frozenset({"HOGE"})

    def __init__(self, stop_words=None):
        """Set up the stop-word set.

        Args:
            stop_words: Optional iterable of names to filter out. ``None``
                (the default) keeps the built-in set ``{"HOGE"}``, so
                existing ``StopwordsFilter()`` callers are unaffected. A
                single string is treated as one stop word rather than as an
                iterable of characters.
        """
        if stop_words is None:
            stop_words = self.DEFAULT_STOP_WORDS
        elif isinstance(stop_words, str):
            # set("HOGE") would yield {"H", "O", "G", "E"}; wrap it instead.
            stop_words = (stop_words,)
        # Copy into a fresh mutable set so instances never share state.
        self.stop_words = set(stop_words)

    def process(self, candidates):
        """Return a new list of the candidates whose name is not a stop word.

        Args:
            candidates: Iterable of objects exposing a ``name`` attribute.

        Returns:
            The surviving candidates, in their original order.
        """
        return [cand for cand in candidates if cand.name not in self.stop_words]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unicodedata | |
from abc import ABC, abstractmethod | |
class Preprocessor(ABC):
    """Abstract base class for steps that run on the input before extraction."""

    @abstractmethod
    def process(self, text):
        """Process ``text``; concrete subclasses must override this.

        The base implementation has no body of its own and returns ``None``.
        """

    def __repr__(self):
        """Return the name of the concrete class."""
        return type(self).__name__
class UnicodeNormalizer(Preprocessor):
    """Preprocessor that applies Unicode NFKC normalization to every sentence."""

    def process(self, sentences):
        """Return a new list holding the NFKC-normalized form of each item.

        Args:
            sentences: Iterable of strings to normalize.

        Returns:
            A list with one normalized string per input item, in input order.
        """

        def to_nfkc(sentence):
            return unicodedata.normalize("NFKC", sentence)

        return list(map(to_nfkc, sentences))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from logzero import logger | |
from extractor import DummyExtractor | |
from preprocessor import UnicodeNormalizer | |
from postprocessor import StopwordsFilter | |
from worker import ExtractorWorker | |
def main():
    """Assemble the sample pipeline, then log its steps and one extraction result."""
    worker = ExtractorWorker(description="sample")

    # One step per stage: normalize, extract, then filter.
    worker.add_preprocessor(UnicodeNormalizer())
    worker.add_extractor(DummyExtractor())
    worker.add_postprocessor(StopwordsFilter())

    logger.info(worker.get_steps())
    logger.info(worker.extract("これはテストです。"))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from logzero import logger | |
class ExtractorWorker:
    """Three-stage pipeline: preprocess the text, extract candidates, postprocess them.

    Steps are kept in plain lists and run in the order they were added.
    """

    def __init__(self, description=""):
        """Create an empty pipeline.

        Args:
            description: Free-form label reported by ``get_steps``.
        """
        self._description = description
        self.preprocessors = []
        self.extractors = []
        self.postprocessors = []

    def add_preprocessor(self, preprocessor):
        """Append ``preprocessor`` to the end of the preprocessing stage."""
        self.preprocessors.append(preprocessor)

    def add_postprocessor(self, postprocessor):
        """Append ``postprocessor`` to the end of the postprocessing stage."""
        self.postprocessors.append(postprocessor)

    def add_extractor(self, extractor):
        """Append ``extractor`` to the end of the extraction stage."""
        self.extractors.append(extractor)

    def extract(self, text):
        """Run the whole pipeline on one piece of text.

        Args:
            text: The input, wrapped in a one-element list before it is
                handed to the preprocessors.

        Returns:
            Whatever the last postprocessor returns (the raw extracted list
            when no postprocessor is registered).
        """
        prepared = self._preprocess([text])
        found = self._extract(prepared)
        serialized = json.dumps([item.as_dict() for item in found], ensure_ascii=False)
        logger.debug("Candidates:" + serialized)
        return self._post_process(found)

    def _preprocess(self, sentences):
        """Feed ``sentences`` through each preprocessor in turn, logging each result."""
        logger.debug(json.dumps(sentences, ensure_ascii=False))
        current = sentences
        for step in self.preprocessors:
            current = step.process(current)
            label = str(step)
            logger.debug(label + ":" + json.dumps(current, ensure_ascii=False))
        return current

    def _extract(self , sentences):
        """Collect candidates from every extractor applied to every sentence.

        Results are grouped by extractor first, then by sentence.
        """
        return [
            candidate
            for extractor in self.extractors
            for sentence in sentences
            for candidate in extractor.extract(sentence)
        ]

    def _post_process(self, candidates):
        """Feed ``candidates`` through each postprocessor in turn, logging each result."""
        remaining = candidates
        for step in self.postprocessors:
            remaining = step.process(remaining)
            label = str(step)
            dumped = json.dumps([item.as_dict() for item in remaining], ensure_ascii=False)
            logger.debug(label + ":" + dumped)
        return remaining

    def get_steps(self):
        """Return the description plus the ``str()`` of every registered step, by stage."""
        summary = {"Description": self._description}
        summary["Preprocessing"] = list(map(str, self.preprocessors))
        summary["Extractors"] = list(map(str, self.extractors))
        summary["Postprocessing"] = list(map(str, self.postprocessors))
        return summary
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment