Skip to content

Instantly share code, notes, and snippets.

@kanjirz50 kanjirz50/extractor.py Secret

Created Feb 12, 2020
Embed
What would you like to do?
わかりやすい規則の記述
class Candidate:
    """A named extraction result tagged with the algorithm that produced it."""

    def __init__(self, name, algorithm):
        """Store the two fields unchanged.

        Args:
            name: Value kept as ``self.name``.
            algorithm: Value kept as ``self.algorithm``.
        """
        self.name, self.algorithm = name, algorithm

    def as_dict(self):
        """Return a new dict with the ``"name"`` and ``"algorithm"`` entries."""
        return dict(name=self.name, algorithm=self.algorithm)

    def __repr__(self):
        """Return ``<Candidate(name=...)>``; the algorithm is not included."""
        return "<Candidate(name={})>".format(self.name)
class DummyExtractor:
    """Placeholder extractor that ignores its input and returns fixed candidates."""

    def __init__(self):
        """Take no configuration; nothing is initialised."""

    def extract(self, sentence):
        """Return two freshly built candidates, "HOGE" followed by "FUGA".

        Args:
            sentence: Accepted but never read.

        Returns:
            list: Two new ``Candidate`` objects, both with algorithm "dummy".
        """
        fixed_names = ("HOGE", "FUGA")
        return [Candidate(name=label, algorithm="dummy") for label in fixed_names]

    def __repr__(self):
        """Return the name of the instance's class."""
        return type(self).__name__
from abc import ABC, abstractmethod
class Postprocessor(ABC):
    """Abstract base class for steps applied to a collection of candidates."""

    @abstractmethod
    def process(self, candidates):
        """Process ``candidates``; concrete subclasses must override this.

        The base implementation does nothing and returns ``None``.
        """

    def __repr__(self):
        """Return the name of the concrete class."""
        return type(self).__name__
class StopwordsFilter(Postprocessor):
    """Postprocessor that drops candidates whose name is a stop word."""

    def __init__(self, stop_words=None):
        """Set up the stop-word set.

        Args:
            stop_words: Optional iterable of names to drop. Pass a
                collection of strings, not a single string (a bare string
                would be split into characters). When omitted (``None``),
                the built-in set ``{"HOGE"}`` is used, so existing
                ``StopwordsFilter()`` callers behave as before.
        """
        # Copy into a fresh set so membership tests are hash lookups and the
        # filter never shares a container owned by the caller.
        self.stop_words = {"HOGE"} if stop_words is None else set(stop_words)

    def process(self, candidates):
        """Return the candidates whose ``name`` is not in ``self.stop_words``.

        Args:
            candidates: Iterable of objects exposing a ``name`` attribute.

        Returns:
            list: The surviving candidates, in their original order.
        """
        return [cand for cand in candidates if cand.name not in self.stop_words]
import unicodedata
from abc import ABC, abstractmethod
class Preprocessor(ABC):
    """Abstract base class for steps applied to the input text."""

    @abstractmethod
    def process(self, text):
        """Process ``text``; concrete subclasses must override this.

        The base implementation does nothing and returns ``None``.
        """

    def __repr__(self):
        """Return the name of the concrete class."""
        return type(self).__name__
class UnicodeNormalizer(Preprocessor):
    """Preprocessor that applies Unicode NFKC normalization to each sentence."""

    def process(self, sentences):
        """Return a new list holding the NFKC-normalized form of every item.

        Args:
            sentences: Iterable of strings.

        Returns:
            list: One normalized string per input item, in input order.
        """
        normalized = []
        for raw in sentences:
            normalized.append(unicodedata.normalize("NFKC", raw))
        return normalized
from logzero import logger
from extractor import DummyExtractor
from preprocessor import UnicodeNormalizer
from postprocessor import StopwordsFilter
from worker import ExtractorWorker
if __name__ == "__main__":
    # Assemble a sample pipeline with one step registered per stage.
    extractor = ExtractorWorker(description="sample")
    extractor.add_preprocessor(UnicodeNormalizer())
    extractor.add_extractor(DummyExtractor())
    extractor.add_postprocessor(StopwordsFilter())

    # Log the configured steps first, then the result of one extraction run.
    configured_steps = extractor.get_steps()
    logger.info(configured_steps)
    extraction_result = extractor.extract("これはテストです。")
    logger.info(extraction_result)
import json
from logzero import logger
class ExtractorWorker:
    """Runs text through registered preprocessors, extractors and postprocessors."""

    def __init__(self, description=""):
        """Create a worker with no registered steps.

        Args:
            description: Free-form label reported by ``get_steps``.
        """
        self._description = description
        self.preprocessors = []
        self.extractors = []
        self.postprocessors = []

    def add_preprocessor(self, preprocessor):
        """Register ``preprocessor`` after any already registered."""
        self.preprocessors.append(preprocessor)

    def add_postprocessor(self, postprocessor):
        """Register ``postprocessor`` after any already registered."""
        self.postprocessors.append(postprocessor)

    def add_extractor(self, extractor):
        """Register ``extractor`` after any already registered."""
        self.extractors.append(extractor)

    def extract(self, text):
        """Run the whole pipeline on a single text.

        The text is wrapped in a one-element list, preprocessed, handed to
        every extractor, and the collected candidates are postprocessed.

        Args:
            text: Input passed to the preprocessors inside a list.

        Returns:
            Whatever the last postprocessor returns, or the raw candidate
            list when no postprocessor is registered.
        """
        prepared = self._preprocess([text])
        found = self._extract(prepared)
        serialized = json.dumps([item.as_dict() for item in found], ensure_ascii=False)
        logger.debug("Candidates:" + serialized)
        return self._post_process(found)

    def _preprocess(self, sentences):
        """Apply each preprocessor in turn, debug-logging every intermediate value."""
        logger.debug(json.dumps(sentences, ensure_ascii=False))
        for step in self.preprocessors:
            sentences = step.process(sentences)
            logger.debug(":".join((str(step), json.dumps(sentences, ensure_ascii=False))))
        return sentences

    def _extract(self , sentences):
        """Collect the output of every extractor over every sentence.

        Results are grouped by extractor first, then by sentence.
        """
        return [
            candidate
            for extractor in self.extractors
            for sentence in sentences
            for candidate in extractor.extract(sentence)
        ]

    def _post_process(self, candidates):
        """Apply each postprocessor in turn, debug-logging the survivors."""
        for step in self.postprocessors:
            candidates = step.process(candidates)
            dumped = json.dumps([item.as_dict() for item in candidates], ensure_ascii=False)
            logger.debug(str(step) + ":" + dumped)
        return candidates

    def get_steps(self):
        """Return the description plus the ``str()`` of every registered step."""
        summary = {"Description": self._description}
        summary["Preprocessing"] = list(map(str, self.preprocessors))
        summary["Extractors"] = list(map(str, self.extractors))
        summary["Postprocessing"] = list(map(str, self.postprocessors))
        return summary
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.