Skip to content

Instantly share code, notes, and snippets.

@dirkgr
Created September 23, 2020 01:31
Show Gist options
  • Save dirkgr/07771723453012be94a7ca53a409a990 to your computer and use it in GitHub Desktop.
Save dirkgr/07771723453012be94a7ca53a409a990 to your computer and use it in GitHub Desktop.
A generator that wraps another generator, but filters out near-duplicate strings as it goes along
import collections
import typing
_T = typing.TypeVar('_T')


def filter_near_duplicates(
    items: typing.Iterable[_T],
    key: typing.Callable[[_T], typing.Any] = lambda x: x,
    *,
    ngram_size: int = 3,
    threshold: float = 0.9,
) -> typing.Generator[_T, None, None]:
    """Lazily yield items, dropping those that nearly duplicate an earlier one.

    Each item is mapped through ``key`` to a sliceable sequence (typically a
    string) which is broken into overlapping n-grams.  An item is dropped when
    the share of its n-grams that also occur in a single previously *yielded*
    item reaches ``threshold``.  Dropped items are never added to the index,
    so later items are only ever compared against items that were emitted.

    The comparison is asymmetric: it measures how much of the *current* item
    is covered by an earlier one, so an item contained in an earlier, longer
    item is dropped as well.

    Args:
        items: The items to filter.  Consumed lazily, in order.
        key: Maps an item to the sequence that is compared.  The sequence must
            support ``len()`` and slicing, and its slices must be hashable
            (``str``, ``bytes`` and ``tuple`` all qualify).
        ngram_size: Length of the n-grams used for comparison.
        threshold: Fraction of an item's n-grams that must be shared with one
            earlier item for the item to be dropped.

    Yields:
        The items that are not near-duplicates of an earlier yielded item, in
        their original order.  Items whose key is empty are always yielded.

    Raises:
        ValueError: If ``ngram_size`` is smaller than 1 or ``threshold`` is
            not positive.  As this is a generator, the error surfaces when
            iteration starts.
    """
    if ngram_size < 1:
        raise ValueError("ngram_size must be at least 1")
    if threshold <= 0:
        # A non-positive threshold would drop every item, including the first.
        raise ValueError("threshold must be greater than 0")

    # Maps each n-gram to the indices of the yielded items that contain it.
    ngram_to_item_indices = collections.defaultdict(set)

    for item_index, item in enumerate(items):
        sequence = key(item)
        length = len(sequence)
        if length == 0:
            # Nothing to compare on, so the item can't be called a duplicate.
            yield item
            continue

        # Only full-length n-grams are produced.  A sequence shorter than
        # ngram_size is treated as one single gram so that exact duplicates
        # of short items are still caught.
        ngram_count = max(length - ngram_size + 1, 1)
        ngrams = [sequence[i:i + ngram_size] for i in range(ngram_count)]

        # For every earlier item, count how many of our n-grams it contains.
        overlap_counts = collections.Counter()
        for ngram in ngrams:
            # .get() rather than indexing: indexing a defaultdict would insert
            # an empty set for every n-gram we merely look up.
            for seen_index in ngram_to_item_indices.get(ngram, ()):
                overlap_counts[seen_index] += 1

        max_overlap = max(overlap_counts.values(), default=0)
        if max_overlap / len(ngrams) >= threshold:
            continue

        yield item
        for ngram in ngrams:
            ngram_to_item_indices[ngram].add(item_index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment