Skip to content

Instantly share code, notes, and snippets.

@Aunsiels
Created July 17, 2019 13:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Aunsiels/5791c747582b878cf85aec3ce3260504 to your computer and use it in GitHub Desktop.
Save Aunsiels/5791c747582b878cf85aec3ce3260504 to your computer and use it in GitHub Desktop.
import logging
from quasimodo.content_comparator import ContentComparator
from quasimodo.only_subject_submodule import get_subjects_in_all_forms
def get_parsing_tree(subjects):
root = Node()
for subject in subjects:
root.add(subject)
return root
class SentenceComparator(ContentComparator):
def __init__(self, module_reference, filename):
super().__init__(module_reference)
self.subjects = []
self.variations = dict()
self.parsing_tree = None
self.filename = filename
self.per_subject = dict()
self._name = "Sentence Comparator"
def setup_processing(self, input_interface):
if len(self.filename) == 0:
return
self.per_subject = dict()
logging.info("Constructing parsing tree...")
self.subjects, self.variations = get_subjects_in_all_forms(input_interface)
self.parsing_tree = get_parsing_tree(self.subjects)
logging.info("Reading sentence file...")
with open(self.filename) as f:
for line in f:
self.add_line_to_corresponding_subjects(line)
logging.info("End of preprocessing")
def add_line_to_corresponding_subjects(self, line):
current_nodes = [self.parsing_tree]
line = line.strip()
for character in line:
next_nodes = self.process_character_in_line_and_add_subject_if_necessary(character, current_nodes, line)
current_nodes = next_nodes
def process_character_in_line_and_add_subject_if_necessary(self, character, current_nodes, line):
next_nodes = []
for node in current_nodes:
next_node = node.get_next_node(character)
if next_node is not None:
next_nodes.append(next_node)
if next_node.is_terminal:
self.process_terminal_node(line, next_node)
if character == " ":
next_nodes.append(self.parsing_tree)
return next_nodes
def process_terminal_node(self, line, next_node):
subject = self.variations[next_node.value]
if subject not in self.per_subject:
self.per_subject[subject] = set()
self.per_subject[subject].add(line)
def get_contents(self, subject):
if len(self.filename) == 0:
return []
return self.per_subject.get(subject, [])
class Node:
def __init__(self):
self.nexts = dict()
self.is_terminal = False
self.value = None
def add(self, word):
self._add(word, word)
def _add(self, word, remaining):
if len(remaining) == 0:
self.create_terminal_node(word)
else:
self._create_transition_to_new_node_if_required(remaining)
self._add_to_next_node(remaining, word)
def _add_to_next_node(self, remaining, word):
self.nexts[remaining[0]]._add(word, remaining[1:])
def _create_transition_to_new_node_if_required(self, remaining):
if remaining[0] not in self.nexts:
self._create_transition_to_new_node(remaining[0])
def _create_transition_to_new_node(self, symbol):
new_node = Node()
self.nexts[symbol] = new_node
def create_terminal_node(self, word):
self.is_terminal = True
self.value = word
def get_next_node(self, character):
return self.nexts.get(character, None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment