Skip to content

Instantly share code, notes, and snippets.

@sonnyksimon
Last active February 23, 2024 04:18
Show Gist options
  • Save sonnyksimon/bc1c65bdfd28844c395a4d6af751bec3 to your computer and use it in GitHub Desktop.
Save sonnyksimon/bc1c65bdfd28844c395a4d6af751bec3 to your computer and use it in GitHub Desktop.
string similarity
Can we implement all of these algorithms?
Similarity Algorithms:
- Cosine
- Fuzzy Wuzzy
- Jaccard
- Jaro
- Jaro Winkler
- Q-gram
- Sørensen–Dice
Distance Algorithms:
- Damerau Levenshtein
- LCS Edit
- Levenshtein
- OSA (Optimal String Alignment) Damerau Levenshtein
Phonetic Algorithms:
- Cologne
- Soundex
- Metaphone
/path/to/python run.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import math
def _tokenize(text):
    """Split *text* into runs of ASCII letters and digits.

    Every other character acts as a separator. The tokens are matched
    directly instead of splitting on the separators, so leading, trailing,
    or separator-only input never produces empty-string tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the alphanumeric runs in their original order and case;
        an empty list when *text* contains none.
    """
    return re.findall(r'[A-Za-z0-9]+', text)
def _get_tokens(text):
    """Return the lower-cased tokens of *text*, leaving out numeric ones.

    Tokens come from ``_tokenize``; any token for which
    ``str.isnumeric()`` is true is dropped before lower-casing.
    """
    kept = []
    for raw in _tokenize(text):
        if raw.isnumeric():
            continue
        kept.append(raw.lower())
    return kept
def _compute_frequency(arr, commons):
    """Count how often each entry of *commons* occurs in *arr*.

    Returns:
        A list parallel to *commons*: position ``i`` holds
        ``arr.count(commons[i])``.
    """
    return list(map(arr.count, commons))
def _compute_vector_ab(v1, v2):
    """Return the dot product of *v1* and *v2*.

    Components are paired with ``zip``, so any surplus entries of the
    longer vector are ignored.
    """
    total = 0
    for left, right in zip(v1, v2):
        total += left * right
    return total
def _abs_vector(v):
    """Return the Euclidean length of vector *v* as a float."""
    squared_sum = 0
    for component in v:
        squared_sum += component * component
    return math.sqrt(squared_sum)
def _vector_similarity(vAB, a, b):
    """Return the cosine of the angle between two vectors.

    Args:
        vAB: Dot product of the two vectors.
        a: Length (norm) of the first vector.
        b: Length (norm) of the second vector.

    Returns:
        ``vAB / (a * b)``, or ``0.0`` when the product of the norms is
        zero. A zero-length vector has no direction, so it is treated as
        having no similarity instead of raising ``ZeroDivisionError``.
    """
    denominator = a * b
    if not denominator:
        return 0.0
    return vAB / denominator
def cosine_similarity(string1, string2):
    """Return the cosine similarity of the token-frequency vectors of two strings.

    Both strings are tokenized with ``_get_tokens``. Each is then turned
    into a term-frequency vector over the combined, de-duplicated
    vocabulary of both strings, so tokens that occur in only one string
    still contribute to that string's vector length and the result does
    not depend on argument order.

    Args:
        string1: First string to compare.
        string2: Second string to compare.

    Returns:
        A float between 0.0 and 1.0 (up to floating-point rounding).
        ``0.0`` is returned when either string yields no tokens.
    """
    arr1 = _get_tokens(string1)
    arr2 = _get_tokens(string2)
    # dict.fromkeys drops duplicates while keeping first-seen order, giving
    # exactly one vector dimension per distinct token of either string.
    vocabulary = list(dict.fromkeys(arr1 + arr2))
    v1 = _compute_frequency(arr1, vocabulary)
    v2 = _compute_frequency(arr2, vocabulary)
    a = _abs_vector(v1)
    b = _abs_vector(v2)
    # A string without tokens has a zero-length vector; report that as
    # "no similarity" rather than dividing by zero.
    if a == 0 or b == 0:
        return 0.0
    return _vector_similarity(_compute_vector_ab(v1, v2), a, b)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import datetime
# Building blocks of an ANSI SGR escape sequence: ESC[<colour>;<weight>m
color_prefix = "\x1b["
color_separator = ";"
color_suffix = "m"
# Second SGR parameter for the non-bold and bold variants.
# NOTE(review): in ECMA-48 normal intensity is 22 and faint is 2; "20" is
# kept unchanged here - confirm it is the intended value.
color_thin = "20"
color_bold = "1"


def color(_color, bold):
    """Build the ANSI escape sequence for a colour code.

    Args:
        _color: The SGR colour parameter as a string (e.g. "31").
        bold: When truthy the sequence uses ``color_bold`` as its second
            parameter, otherwise ``color_thin``.

    Returns:
        The string ``ESC[<_color>;<weight>m``.
    """
    weight = color_bold if bold else color_thin
    return f"{color_prefix}{_color}{color_separator}{weight}{color_suffix}"


# Ready-made escape sequences for the colours used by the log helpers.
color_red = color("31", bold=False)
color_bold_red = color("31", bold=True)
color_green = color("32", bold=False)
color_bold_green = color("32", bold=True)
color_yellow = color("33", bold=False)
color_bold_yellow = color("33", bold=True)
color_blue = color("34", bold=False)
color_bold_blue = color("34", bold=True)
color_grey = color("38", bold=False)
# Resets all attributes back to the terminal default.
color_reset = "\x1b[0m"
def log_timestamped(msgs, _level, _color):
    """Print one log line: a blue timestamp, a coloured level, then the messages.

    Args:
        msgs: Iterable of objects; each is converted with ``str`` and the
            results are joined with single spaces.
        _level: Level label printed after the timestamp (e.g. "INFO").
        _color: Escape sequence emitted immediately before the level label.
    """
    timestamp = datetime.datetime.now()
    text = ' '.join(str(m) for m in msgs)
    print(f"{color_bold_blue}[{timestamp}]{color_reset} {_color}{_level}{color_reset} {text}")


def log_info(*msgs):
    """Log *msgs* with the label "INFO" in bold green."""
    log_timestamped(msgs=msgs, _level="INFO", _color=color_bold_green)


def log_error(*msgs):
    """Log *msgs* with the label "ERROR" in bold red."""
    log_timestamped(msgs=msgs, _level="ERROR", _color=color_bold_red)


def log_debug(*msgs):
    """Log *msgs* with the label "DEBUG" in bold yellow."""
    log_timestamped(msgs=msgs, _level="DEBUG", _color=color_bold_yellow)
# Matches a single non-word character (pattern carries inline "u" and "i" flags).
_normalize_regex = re.compile(r'(?ui)\W')
# Matches a run of one or more whitespace characters.
_whitespace_regex = re.compile(r'\s+')


def normalize_text(text):
    """Return a lower-cased, space-separated form of *text*.

    ``None`` becomes the empty string. Any other value is converted with
    ``str``; every non-word character is replaced by a space, surrounding
    whitespace is stripped, the result is lower-cased, and each remaining
    run of whitespace is collapsed to a single space.
    """
    if text is None:
        return ""
    spaced = _normalize_regex.sub(' ', str(text))
    lowered = spaced.strip().lower()
    return _whitespace_regex.sub(' ', lowered)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import cossim
from helpers import normalize_text, log_info, log_error, log_debug
# Demo inputs: two hard-coded names to compare.
raw_first = "ELON MUSK"
raw_second = "COLON MUSK"
log_debug("string1", raw_first)
log_debug("string2", raw_second)

# Normalize both inputs before scoring and show the normalized forms.
clean_first, clean_second = normalize_text(raw_first), normalize_text(raw_second)
log_debug("string1-normalized", clean_first)
log_debug("string2-normalized", clean_second)

try:
    score = cossim.cosine_similarity(clean_first, clean_second)
except Exception as exc:
    # Any failure while scoring is reported as a similarity of 0.0 and
    # the error text is logged.
    score = 0.0
    log_error("compute-error", str(exc))

log_info("cosine-similarity", score)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment