Skip to content

Instantly share code, notes, and snippets.

@kingjr
Created September 19, 2022 16:31
Show Gist options
  • Save kingjr/0a3eb140f2d2d123c9bd1c6901a11876 to your computer and use it in GitHub Desktop.
Save kingjr/0a3eb140f2d2d123c9bd1c6901a11876 to your computer and use it in GitHub Desktop.
import numpy as np
from Levenshtein import editops
def match_list(A, B, on_replace="delete"):
    """Match two sequences of different sizes and return corresponding indices.

    The sequences are aligned with Levenshtein edit operations (``editops``).
    Elements that must be inserted or deleted to turn ``A`` into ``B`` are
    dropped, so the two returned index arrays have the same length and pair
    up position by position.

    Parameters
    ----------
    A : str | list | array, shape (n,)
        The values of the first sequence. If ``A`` is a ``str`` both inputs
        are passed to ``editops`` unchanged; otherwise the values of ``A``
        and ``B`` are jointly encoded as one unicode character per unique
        value before alignment.
    B : str | list | array, shape (m,)
        The values of the second sequence.
    on_replace : {"delete", "keep"}
        What to do with positions where the alignment substitutes one value
        for another. ``"delete"`` drops the position from both outputs;
        ``"keep"`` retains it, pairing the two differing elements.

    Returns
    -------
    A_idx : array of int
        The indices of ``A`` that match those of ``B``.
    B_idx : array of int
        The indices of ``B`` that match those of ``A``.

    Raises
    ------
    NotImplementedError
        If ``on_replace`` is neither ``"delete"`` nor ``"keep"``.
    """
    # Validate up front so an unsupported option fails consistently, not
    # only on inputs whose alignment happens to contain a replacement.
    if on_replace not in ("delete", "keep"):
        raise NotImplementedError(
            f"on_replace must be 'delete' or 'keep', got {on_replace!r}"
        )

    if not isinstance(A, str):
        # editops is applied to strings here: map every unique value of
        # A and B to a single character (its rank among the unique values).
        unique = np.unique(np.r_[A, B])
        label_encoder = {value: code for code, value in enumerate(unique)}

        def int_to_unicode(array):
            return "".join(chr(label_encoder[item]) for item in array)

        A = int_to_unicode(A)
        B = int_to_unicode(B)

    # Boolean masks of the positions that survive the alignment.
    keep_a = np.ones(len(A), dtype=bool)
    keep_b = np.ones(len(B), dtype=bool)
    for type_, pos_a, pos_b in editops(A, B):
        if type_ == "insert":
            # Element exists only in B.
            keep_b[pos_b] = False
        elif type_ == "delete":
            # Element exists only in A.
            keep_a[pos_a] = False
        elif on_replace == "delete":
            # Substitution: the elements differ, drop the pair.
            keep_a[pos_a] = False
            keep_b[pos_b] = False
        # on_replace == "keep": leave the substituted pair in place.

    A_sel = np.flatnonzero(keep_a)
    B_sel = np.flatnonzero(keep_b)
    # Internal invariant of an edit script: the same number of positions
    # remains on each side once insertions and deletions are removed.
    assert len(B_sel) == len(A_sel)
    return A_sel.astype(int), B_sel.astype(int)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment