Skip to content

Instantly share code, notes, and snippets.

@Deepayan137
Created September 16, 2019 06:56
Show Gist options
  • Save Deepayan137/b0301274edc8e0eb11181ce743ec0544 to your computer and use it in GitHub Desktop.
Save Deepayan137/b0301274edc8e0eb11181ce743ec0544 to your computer and use it in GitHub Desktop.
word prediction and ground truth alignment
from collections import Counter, defaultdict
from textdistance import levenshtein as lev
import numpy as np
import pdb
from tqdm import *
def CharMajVoting(words):
    """Build a consensus word by per-position character majority vote.

    The length of the result is the most common length among ``words``.
    Each position holds the character that occurs most often at that index,
    counted over every word long enough to have that index. Ties (for the
    length or for a character) go to the value encountered first.

    Args:
        words: Iterable of strings to vote over.

    Returns:
        The consensus string, or ``''`` when ``words`` is empty.
    """
    def most_frequent(items):
        # Counter orders equal counts by first occurrence, so ties resolve
        # to the earliest value seen.
        return Counter(items).most_common(1)[0][0]

    # Materialise once: the input is traversed several times below.
    words = list(words)
    if not words:
        # Nothing to vote on; indexing most_common() would raise IndexError.
        return ''

    common_length = most_frequent([len(word) for word in words])

    # Only positions below common_length can appear in the result, and at
    # least one word has exactly that length, so every vote list is non-empty.
    return ''.join(
        most_frequent([word[i] for word in words if len(word) > i])
        for i in range(common_length)
    )
def similarity(word1, word2):
    """Score how far apart two words are using normalized Levenshtein distance.

    NOTE: despite the name, the value returned is a *distance* (the result of
    ``lev.normalized_distance``), so callers should treat the smallest value
    as the best match.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        The value of ``lev.normalized_distance(word1, word2)``.
    """
    distance = lev.normalized_distance(word1, word2)
    return distance
def text_align(prWords, gtWords):
    """Pair every predicted word with its closest ground-truth word.

    Closeness is measured with ``similarity``, which returns a distance, so
    the ground-truth word with the smallest score is selected. When several
    ground-truth words share the smallest score, the one that comes first in
    ``gtWords`` is chosen.

    Args:
        prWords: Sequence of predicted words.
        gtWords: Sequence of ground-truth words to match against.

    Returns:
        A list of ``(predicted_word, nearest_ground_truth_word)`` tuples, one
        per entry of ``prWords`` and in the same order.

    Raises:
        IndexError: If ``prWords`` is non-empty but ``gtWords`` is empty.
    """
    if len(prWords) > 0 and len(gtWords) == 0:
        # Same exception type the previous implementation ended up raising
        # (from indexing an empty neighbour list), but reported up front
        # with an explicit message.
        raise IndexError("gtWords is empty; there is nothing to align against")

    pr_aligned = []
    # trange keeps the progress bar over the predicted words.
    for i in trange(len(prWords)):
        predicted = prWords[i]
        # Only the single nearest neighbour is needed, so a linear scan for
        # the minimum replaces building a distance matrix and sorting a row.
        nearest = min(
            gtWords,
            key=lambda truth, predicted=predicted: similarity(predicted, truth),
        )
        pr_aligned.append((predicted, nearest))
    return pr_aligned
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment