Created
September 23, 2020 01:31
-
-
Save dirkgr/07771723453012be94a7ca53a409a990 to your computer and use it in GitHub Desktop.
A generator that wraps another generator, but filters out near-duplicate strings as it goes along
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections
import typing

_T = typing.TypeVar('_T')


def filter_near_duplicates(
    items: typing.Iterable[_T],
    key=lambda x: x,
    *,
    threshold: float = 0.9,
) -> typing.Generator[_T, None, None]:
    """Yield the items of ``items``, skipping near-duplicates of earlier ones.

    Each item is mapped through ``key`` to a string, which is broken into
    overlapping character trigrams. An item is dropped when, for some single
    previously *yielded* item, the fraction of the new item's trigrams that
    also occur in that earlier item is at least ``threshold``. Repeated
    trigrams within the new item are counted once per occurrence, both in
    the overlap and in the total.

    Items whose key is shorter than three characters have no trigrams and are
    always passed through unfiltered.

    Args:
        items: The sequence to filter. It is consumed lazily, one item at a
            time.
        key: Function that extracts the string to compare from an item.
            Defaults to the identity, i.e. the items are themselves strings.
        threshold: Overlap fraction at or above which an item counts as a
            near-duplicate. Intended range is ``0 < threshold <= 1``.

    Yields:
        The items that are not near-duplicates of any earlier yielded item,
        in their original order.
    """
    # Maps each trigram to the indices of the *kept* items that contain it.
    # Dropped items are never indexed, so items are only ever compared
    # against what was actually emitted.
    trigram_index = collections.defaultdict(set)

    for sentence_index, item in enumerate(items):
        sentence = key(item)
        # A string of length n has n - 2 trigrams; stopping there avoids a
        # trailing 2-character fragment that would dilute the overlap ratio.
        trigrams = [sentence[i:i + 3] for i in range(len(sentence) - 2)]
        if not trigrams:
            yield item
            continue

        # For every earlier kept item, count how many of this item's
        # trigrams it shares.
        overlap_counts = collections.Counter()
        for trigram in trigrams:
            # .get() rather than [] so that lookups do not insert empty sets
            # into the index for trigrams we have never stored.
            overlap_counts.update(trigram_index.get(trigram, ()))

        max_overlap = max(overlap_counts.values(), default=0)
        if max_overlap / len(trigrams) >= threshold:
            continue

        yield item
        for trigram in trigrams:
            trigram_index[trigram].add(sentence_index)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment