@selimslab
Last active October 30, 2020 15:53
from typing import List

from fuzzywuzzy import process, fuzz


def match_by_fuzzy_string_search(
    possible_matches: List[str], string_to_be_searched: str
) -> str:
    # Score every candidate against n-grams of the searched string and return
    # the best match if it clears the 80 threshold; see the usage sketch after
    # generate_ngrams below.
    scores = dict()
    for candidate in possible_matches:
        n = len(candidate.split())
        n_grams = generate_ngrams(string_to_be_searched, n)
        for n_gram in n_grams:
            possible_match, score = process.extractOne(
                n_gram, possible_matches, scorer=fuzz.ratio
            )
            old_score = scores.get(possible_match, 0)
            if score > old_score:
                scores[possible_match] = score
    if scores:
        most_possible_match = max(scores, key=scores.get)
        most_score = scores.get(most_possible_match)
        if most_score > 80:
            return most_possible_match
    return ""
import re


def tr_chars_to_eng(tr_str):
    # Replace Turkish-specific characters with their closest ASCII letters
    pairs = [("ş", "s"), ("ğ", "g"), ("ç", "c"), ("ı", "i"), ("ö", "o"), ("ü", "u")]
    for pair in pairs:
        tr_str = tr_str.replace(pair[0], pair[1])
    return tr_str


def create_eng_char_name_map(d, eng_map):
    # Walk a possibly nested dict and record the ASCII form of every leaf key and value
    for k, v in d.items():
        if isinstance(v, dict):
            create_eng_char_name_map(v, eng_map)
        else:
            eng_map[k] = tr_chars_to_eng(k)
            eng_map[v] = tr_chars_to_eng(v)
    return eng_map
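
# A minimal usage sketch with made-up data: transliterate a nested dict of
# Turkish names into an ASCII lookup table. Note that keys of nested dicts are
# recursed into rather than mapped themselves.
sample = {"içecek": "süt", "atıştırmalık": {"çikolata": "gofret"}}
print(tr_chars_to_eng("şeker"))  # 'seker'
print(create_eng_char_name_map(sample, {}))
# {'içecek': 'icecek', 'süt': 'sut', 'çikolata': 'cikolata', 'gofret': 'gofret'}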
def generate_ngrams(s, n):
    # Convert to lowercase
    s = s.lower()
    # Replace all non-alphanumeric characters with spaces
    s = re.sub(r"[^a-zA-Z0-9\s]", " ", s)
    # Break the sentence into tokens, dropping empty ones
    tokens = [token for token in s.split(" ") if token != ""]
    # Generate n-grams; trailing windows may be shorter than n
    n_grams = list()
    for i in range(len(tokens)):
        n_gram = " ".join(tokens[i : i + n])
        n_grams.append(n_gram)
    return n_grams
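
# Usage sketch with made-up inputs: build n-grams of a longer string, then let
# match_by_fuzzy_string_search (defined above) pick the best-scoring candidate.
print(generate_ngrams("Sutas 1L Tam Yagli Sut", 2))
# ['sutas 1l', '1l tam', 'tam yagli', 'yagli sut', 'sut']
print(match_by_fuzzy_string_search(["tam yagli sut", "yarim yagli sut"],
                                   "Sutas 1L Tam Yagli Sut 6'li"))
# 'tam yagli sut'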
import unicodedata


def normalize(s: str):
    return unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
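
# Quick check: accented characters are decomposed and the combining marks dropped.
# Dotless ı has no NFKD decomposition, so it is removed entirely, which is why
# tr_chars_to_eng above handles Turkish characters explicitly.
print(normalize("crème brûlée"))  # 'creme brulee'
print(normalize("ılık"))          # 'lk'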
def string_sliding_windows(s: str):
    # ("a b c" -> ["a", "a b", "a b c", "b", "b c", "c"])
    tokens = s.split()
    windows = []
    for i in range(len(tokens)):
        for j in range(i + 1, len(tokens) + 1):
            windows.append(" ".join(tokens[i:j]))
    return windows
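
# Example: every contiguous token window of a short phrase.
print(string_sliding_windows("tam yagli sut"))
# ['tam', 'tam yagli', 'tam yagli sut', 'yagli', 'yagli sut', 'sut']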
def string_to_extending_windows(s: str, end: int = None) -> list:
    # ("a b c" -> ["a", "a b", "a b c"])
    tokens = s.split()
    if not end:
        end = len(tokens)
    return [" ".join(tokens[:i]) for i in range(1, end + 1)]
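
# Example: prefixes that extend one token at a time, optionally capped by `end`.
print(string_to_extending_windows("tam yagli sut"))         # ['tam', 'tam yagli', 'tam yagli sut']
print(string_to_extending_windows("tam yagli sut", end=2))  # ['tam', 'tam yagli']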
import bs4
import requests


def get_soup(url):
    # verify=False disables SSL certificate verification
    r = requests.get(url, verify=False)
    soup = bs4.BeautifulSoup(r.content, features="lxml")
    return soup
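
# Usage sketch (hypothetical URL): fetch a page and pull its <title>.
# Requires the lxml parser to be installed; verify=False will also cause
# urllib3 to emit an InsecureRequestWarning.
soup = get_soup("https://example.com")
print(soup.title.text)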