@selimslab
Last active October 30, 2020 15:53
from typing import List

from fuzzywuzzy import process, fuzz


def match_by_fuzzy_string_search(
    possible_matches: List[str], string_to_be_searched: str
) -> str:
    # Score every candidate against n-grams of the searched string and return
    # the best match if it clears the 80 threshold; see the usage sketch after
    # generate_ngrams below.
    scores = dict()
    for candidate in possible_matches:
        n = len(candidate.split())
        n_grams = generate_ngrams(string_to_be_searched, n)
        for n_gram in n_grams:
            possible_match, score = process.extractOne(
                n_gram, possible_matches, scorer=fuzz.ratio
            )
            old_score = scores.get(possible_match, 0)
            if score > old_score:
                scores[possible_match] = score
    if scores:
        most_possible_match = max(scores, key=scores.get)
        most_score = scores.get(most_possible_match)
        if most_score > 80:
            return most_possible_match
    return ""
import re


def tr_chars_to_eng(tr_str):
    # Replace Turkish-specific characters with their closest ASCII letters
    pairs = [("ş", "s"), ("ğ", "g"), ("ç", "c"), ("ı", "i"), ("ö", "o"), ("ü", "u")]
    for pair in pairs:
        tr_str = tr_str.replace(pair[0], pair[1])
    return tr_str


def create_eng_char_name_map(d, eng_map):
    # Walk a possibly nested dict and record the ASCII form of every leaf key and value
    for k, v in d.items():
        if isinstance(v, dict):
            create_eng_char_name_map(v, eng_map)
        else:
            eng_map[k] = tr_chars_to_eng(k)
            eng_map[v] = tr_chars_to_eng(v)
    return eng_map
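
# A minimal usage sketch with made-up data: transliterate a nested dict of
# Turkish names into an ASCII lookup table. Note that keys of nested dicts are
# recursed into rather than mapped themselves.
sample = {"içecek": "süt", "atıştırmalık": {"çikolata": "gofret"}}
print(tr_chars_to_eng("şeker"))  # 'seker'
print(create_eng_char_name_map(sample, {}))
# {'içecek': 'icecek', 'süt': 'sut', 'çikolata': 'cikolata', 'gofret': 'gofret'}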
def generate_ngrams(s, n):
    # Convert to lowercase
    s = s.lower()
    # Replace all non-alphanumeric characters with spaces
    s = re.sub(r"[^a-zA-Z0-9\s]", " ", s)
    # Break the sentence into tokens, dropping empty ones
    tokens = [token for token in s.split(" ") if token != ""]
    # Generate n-grams; trailing windows may be shorter than n
    n_grams = list()
    for i in range(len(tokens)):
        n_gram = " ".join(tokens[i : i + n])
        n_grams.append(n_gram)
    return n_grams
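
# Usage sketch with made-up inputs: build n-grams of a longer string, then let
# match_by_fuzzy_string_search (defined above) pick the best-scoring candidate.
print(generate_ngrams("Sutas 1L Tam Yagli Sut", 2))
# ['sutas 1l', '1l tam', 'tam yagli', 'yagli sut', 'sut']
print(match_by_fuzzy_string_search(["tam yagli sut", "yarim yagli sut"],
                                   "Sutas 1L Tam Yagli Sut 6'li"))
# 'tam yagli sut'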
import unicodedata


def normalize(s: str):
    return unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
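
# Quick check: accented characters are decomposed and the combining marks dropped.
# Dotless ı has no NFKD decomposition, so it is removed entirely, which is why
# tr_chars_to_eng above handles Turkish characters explicitly.
print(normalize("crème brûlée"))  # 'creme brulee'
print(normalize("ılık"))          # 'lk'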
def string_sliding_windows(s: str):
    # ("a b c" -> ["a", "a b", "a b c", "b", "b c", "c"])
    tokens = s.split()
    windows = []
    for i in range(len(tokens)):
        for j in range(i + 1, len(tokens) + 1):
            windows.append(" ".join(tokens[i:j]))
    return windows
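
# Example: every contiguous token window of a short phrase.
print(string_sliding_windows("tam yagli sut"))
# ['tam', 'tam yagli', 'tam yagli sut', 'yagli', 'yagli sut', 'sut']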
def string_to_extending_windows(s: str, end: int = None) -> list:
    # ("a b c" -> ["a", "a b", "a b c"])
    tokens = s.split()
    if not end:
        end = len(tokens)
    return [" ".join(tokens[:i]) for i in range(1, end + 1)]
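
# Example: prefixes that extend one token at a time, optionally capped by `end`.
print(string_to_extending_windows("tam yagli sut"))         # ['tam', 'tam yagli', 'tam yagli sut']
print(string_to_extending_windows("tam yagli sut", end=2))  # ['tam', 'tam yagli']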
import bs4
import requests


def get_soup(url):
    # verify=False disables SSL certificate verification
    r = requests.get(url, verify=False)
    soup = bs4.BeautifulSoup(r.content, features="lxml")
    return soup
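
# Usage sketch (hypothetical URL): fetch a page and pull its <title>.
# Requires the lxml parser to be installed; verify=False will also cause
# urllib3 to emit an InsecureRequestWarning.
soup = get_soup("https://example.com")
print(soup.title.text)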