Skip to content

Instantly share code, notes, and snippets.

@kuk
Created June 28, 2019 05:13
Show Gist options
  • Save kuk/75604283c212ad9f89595b78e22b203e to your computer and use it in GitHub Desktop.
Save kuk/75604283c212ad9f89595b78e22b203e to your computer and use it in GitHub Desktop.
# One comparison result: the similarity share plus the two records compared.
Match = namedtuple('Match', ['share', 'a', 'b'])
def group_host(records):
    """Yield lists of consecutive records that share the same ``info.host``.

    Grouping relies on ``itertools.groupby``, which only merges *adjacent*
    items with equal keys; records of one host that are not contiguous in
    ``records`` come out as separate lists.
    """
    def host_of(record):
        return record.info.host

    for _, members in groupby(records, key=host_of):
        yield list(members)
def group_judge(records):
    """Yield one list of records per distinct ``info.judge`` value.

    Records whose judge is falsy (``None``, empty string, ...) are dropped.
    Lists are yielded in the order each judge is first seen, and records
    keep their input order inside each list.
    """
    by_judge = defaultdict(list)
    for item in records:
        name = item.info.judge
        if not name:
            continue
        by_judge[name].append(item)
    yield from by_judge.values()
def space_tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens
def match_pair(a, b):
    """Return the share of text that the documents of two records have in common.

    ``a.doc`` and ``b.doc`` are split on whitespace and the token sequences
    are aligned with ``difflib.SequenceMatcher``.  The score is weighted by
    token length::

        2 * (characters in matched tokens) / (characters in all tokens of a and b)

    It is 1.0 for identical token sequences and 0.0 when no token matches or
    when neither document contains any token.  The result is always a float.

    NOTE(review): SequenceMatcher runs with its default ``autojunk=True``.
    For sequences of 200+ items, difflib ignores items that make up more
    than 1% of the second sequence, which can lower the score on long
    documents -- confirm this is intended.
    """
    tokens_a = space_tokenize(a.doc)
    tokens_b = space_tokenize(b.doc)

    # Denominator: total number of characters over all tokens of both docs.
    total = sum(map(len, tokens_a)) + sum(map(len, tokens_b))
    if not total:
        # Nothing to compare; return a float like every other path does.
        return 0.0

    matcher = SequenceMatcher(a=tokens_a, b=tokens_b)
    # Matching blocks hold equal tokens on both sides, so counting the
    # characters on the ``a`` side is enough.
    matched = sum(
        len(token)
        for block in matcher.get_matching_blocks()
        for token in tokens_a[block.a:block.a + block.size]
    )
    return matched / total * 2
def match_pairs(records, cap=200):
    """Yield ``(share, a, b)`` for every unordered pair among the first ``cap`` records.

    ``records`` must support slicing.  Only its first ``cap`` items are
    compared, which bounds the quadratic number of pairs; per the original
    author's note, 200 covers 99% of judges.
    """
    capped = records[:cap]
    for first, second in combinations(capped, 2):
        yield match_pair(first, second), first, second
def run_match(records):
    """Yield a ``Match`` for every compared pair of records.

    Records are first split into host groups with ``group_host``, each host
    group is split per judge with ``group_judge``, and the pairs produced by
    ``match_pairs`` for each judge group are wrapped in ``Match`` tuples.
    """
    for host_records in group_host(records):
        for judge_records in group_judge(host_records):
            yield from (
                Match(share=share, a=left, b=right)
                for share, left, right in match_pairs(judge_records)
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment