Skip to content

Instantly share code, notes, and snippets.

@kingjr
Created September 8, 2021 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kingjr/4fb5f6ac0f04a71651e8d8f3f358044e to your computer and use it in GitHub Desktop.
Save kingjr/4fb5f6ac0f04a71651e8d8f3f358044e to your computer and use it in GitHub Desktop.
match_list.py
import numpy as np
from Levenshtein import editops
def match_list(A, B, on_replace="delete"):
    """Match two lists of different sizes and return corresponding indices.

    Every distinct value found in ``A`` and ``B`` is mapped to a single
    character, the two resulting strings are aligned with Levenshtein edit
    operations, and the positions that survive the alignment are returned.

    Parameters
    ----------
    A : list | array, shape (n,)
        The values of the first list.
    B : list | array, shape (m,)
        The values of the second list.
    on_replace : {"delete", "keep"}
        How to treat a pair of positions that the alignment reports as a
        substitution (i.e. neither an insertion nor a deletion).
        ``"delete"`` drops the pair from both outputs; ``"keep"`` leaves the
        pair in the outputs.

    Returns
    -------
    A_idx : array of int
        The indices of the A list that match those of the B.
    B_idx : array of int
        The indices of the B list that match those of the A. Same length as
        ``A_idx``.

    Raises
    ------
    NotImplementedError
        If ``on_replace`` is neither ``"delete"`` nor ``"keep"``. The check
        is done up front so that an invalid option fails regardless of
        whether the data happens to contain a substitution.
    """
    if on_replace not in ("delete", "keep"):
        raise NotImplementedError(
            f"on_replace must be 'delete' or 'keep', got {on_replace!r}"
        )

    n_a, n_b = len(A), len(B)

    # Encode each value as the rank of that value among the sorted unique
    # values of A and B together. `return_inverse` yields these ranks
    # directly, which avoids a per-element dict lookup on the raw inputs.
    _, codes = np.unique(np.r_[A, B], return_inverse=True)
    codes = codes.ravel().tolist()
    text_a = "".join(map(chr, codes[:n_a]))
    text_b = "".join(map(chr, codes[n_a:]))

    # Boolean masks of the positions that are still considered matched.
    keep_a = np.ones(n_a, dtype=bool)
    keep_b = np.ones(n_b, dtype=bool)
    drop_replaced = on_replace == "delete"

    # Each edit operation is (kind, position in A, position in B).
    for kind, pos_a, pos_b in editops(text_a, text_b):
        if kind == "insert":
            # Element present only in B.
            keep_b[pos_b] = False
        elif kind == "delete":
            # Element present only in A.
            keep_a[pos_a] = False
        elif drop_replaced:
            # Substitution: the two positions are aligned but differ.
            keep_a[pos_a] = False
            keep_b[pos_b] = False

    A_sel = np.flatnonzero(keep_a).astype(int)
    B_sel = np.flatnonzero(keep_b).astype(int)
    # Internal invariant: both selections must pair up one-to-one.
    assert len(B_sel) == len(A_sel)
    return A_sel, B_sel
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment