Skip to content

Instantly share code, notes, and snippets.

@Varriount
Created October 3, 2023 20:38
Show Gist options
  • Save Varriount/523461a8ae5b39a32fb4e6e96e453cdf to your computer and use it in GitHub Desktop.
Save Varriount/523461a8ae5b39a32fb4e6e96e453cdf to your computer and use it in GitHub Desktop.
Sort terms based on word and character similarity
import codecs
import sys
from editdistance import eval as distance
from pprint import pprint
import re
def sort_phrases(phrase_list):
    """Chain phrases so each one is followed by its best remaining match.

    The last element of ``phrase_list`` seeds the result. After that, the
    remaining phrase that ``find_best_phrase`` ranks best against the most
    recently placed phrase is moved into the result, until none are left.

    Args:
        phrase_list: Mutable list of phrases. It is consumed: every element
            is popped from it, so it is empty when the function returns.
            Pass a copy if the original list is still needed.

    Returns:
        A new list holding every phrase from ``phrase_list`` in chained
        order, or an empty list when ``phrase_list`` is empty.
    """
    # Popping from an empty list would raise IndexError; nothing to order.
    if not phrase_list:
        return []

    result = [phrase_list.pop()]
    while phrase_list:
        # Only the position is needed here; the score itself is discarded.
        index, _ = find_best_phrase(result[-1], phrase_list)
        result.append(phrase_list.pop(index))
    return result
def find_best_phrase(target_phrase, sample_phrases):
    """Locate the sample phrase with the lowest score against the target.

    Each entry of ``sample_phrases`` is scored with ``score_phrases``; lower
    scores win, and on a tie the earliest entry is kept.

    Args:
        target_phrase: Phrase that every sample is compared against.
        sample_phrases: Iterable of candidate phrases.

    Returns:
        An ``(index, score)`` tuple for the winning candidate, or
        ``(None, None)`` when ``sample_phrases`` yields nothing.
    """
    winner = (None, None)
    for position, candidate in enumerate(sample_phrases):
        candidate_score = score_phrases(target_phrase, candidate)
        # Keep the current winner unless this candidate is strictly better.
        if winner[1] is not None and not (candidate_score < winner[1]):
            continue
        winner = (position, candidate_score)
    return winner
def score_phrases(left_phrase, right_phrase):
    """Return the lowest pairwise score between two phrases' variations.

    Every variation of ``left_phrase`` is paired with every variation of
    ``right_phrase``. A pair scores ``(100 - shared_words, edit_distance)``,
    so tuples compare first on shared word count (more shared words gives a
    smaller first element) and then on edit distance.

    Args:
        left_phrase: Iterable of string variations of the first phrase.
        right_phrase: Iterable of string variations of the second phrase.

    Returns:
        The smallest ``(100 - shared_words, edit_distance)`` tuple found.

    Raises:
        ValueError: If there are no variation pairs to score (``min`` of an
            empty collection).
    """
    pair_scores = []
    for left_variation in left_phrase:
        for right_variation in right_phrase:
            shared = intersecting_word_count(left_variation, right_variation)
            edits = distance(left_variation, right_variation)
            pair_scores.append((100 - shared, edits))
    return min(pair_scores)
def intersecting_word_count(left, right):
    """Count the distinct words that appear in both strings.

    A word is a maximal run of ``\\w`` characters. Matching is case-sensitive
    and each shared word counts once regardless of how often it occurs.

    Args:
        left: First string.
        right: Second string.

    Returns:
        The number of distinct words common to ``left`` and ``right``.
    """
    # findall(r"\w+") extracts only real words. Splitting on r"\W+" instead
    # yields an empty-string token for empty input or for text that starts
    # or ends with a non-word character, and that "" would then be counted
    # as a shared word.
    left_words = set(re.findall(r"\w+", left))
    right_words = set(re.findall(r"\w+", right))
    return len(left_words & right_words)
if __name__ == '__main__':
    # The first line of stdin is a regular expression used to split every
    # following line into variations. Strip only the line terminator:
    # slicing off the last character would eat a real character when the
    # line is not newline-terminated.
    delimiter = sys.stdin.readline().rstrip('\n')
    if delimiter == '':
        # "(?!x)x" can never match, so each phrase stays a single variation.
        delimiter = '(?!x)x'
    splitter = re.compile(delimiter)

    # Every remaining line is one phrase, with surrounding whitespace removed.
    phrases = [line.strip() for line in sys.stdin]

    # Map each tuple of variations back to the original line for printing.
    # Lines that split into the same tuple collapse into one entry.
    phrases_map = {
        tuple(splitter.split(phrase)): phrase
        for phrase in phrases
    }

    # With no phrases there is nothing to order, so skip the sort entirely
    # rather than handing sort_phrases an empty list.
    keys = sort_phrases(list(phrases_map)) if phrases_map else []
    for key in keys:
        print(phrases_map[key])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment