num_display_rows = 5  # number of rows to display in Jupyter
avg_fuzzy_score_lte = 50  # lower scores mean a weaker match between full_name and the email handle
min_bad_lev_dist_pct_total = .6  # fraction of the max levenshtein_distance to grab at or above (e.g. if the max distance is 30, .6 of 30 = 18)
# get riskiest lev distances
percent = min_bad_lev_dist_pct_total
which_col = 'levenshtein_distance'
nunique_lev = df[which_col].nunique()  # count of distinct distances
max_lev = df[which_col].max()          # largest observed distance (renamed from nunique_max, which was misleading)
pcnt_of_max_lev = int(round(percent * max_lev, 0))  # cutoff: distances >= this are flagged
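# A minimal sketch of applying the cutoff computed above; assumes df already
# carries the levenshtein_distance column. riskiest_df is an illustrative name,
# not from the original gist.
riskiest_df = df[df[which_col] >= pcnt_of_max_lev]
print(f'{len(riskiest_df)} of {len(df)} rows at or above the cutoff of {pcnt_of_max_lev}')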
import pandas as pd
from fuzzywuzzy import fuzz
import Levenshtein
import re
import math

df = df.fillna('nostring')  # placeholder string for missing values
df['full_name'] = df['full_name'].str.replace(' ', 'none')      # placeholder for embedded spaces
df['email_handle'] = df['email_handle'].str.replace(' ', 'none')
# partial_ratio returns 0-100; jaro_winkler returns 0-1, so scale it to match
df['fuzzy_score'] = df[['full_name', 'email_handle']].apply(lambda x: fuzz.partial_ratio(*x), axis=1)
df['jaro_score'] = df[['full_name', 'email_handle']].apply(lambda x: Levenshtein.jaro_winkler(*x), axis=1) * 100
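# Quick sanity check of the two scores on a toy frame; the names and handles
# below are illustrative, not from the original gist.
toy = pd.DataFrame({'full_name': ['maxbade', 'johnsmith'],
                    'email_handle': ['mbade', 'xkqzvw']})
toy['fuzzy_score'] = toy[['full_name', 'email_handle']].apply(lambda x: fuzz.partial_ratio(*x), axis=1)
toy['jaro_score'] = toy[['full_name', 'email_handle']].apply(lambda x: Levenshtein.jaro_winkler(*x), axis=1) * 100
print(toy)  # the real name/handle pair should score far higher than the gibberish one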
-- SQL counterpart of the Python character cleanup (fragment of a SELECT list);
-- the Python r'' raw-string prefix is dropped, since it is not valid SQL:
regexp_replace(z.full_name, '[^a-zA-Z]', '') AS full_name,
regexp_replace(z.email_handle, '[^a-zA-Z]', '') AS email_handle
# have to install this first!
# !pip install git+https://github.com/casics/nostril.git
from nostril import ng
from nostril import nonsense_detector as nd

def is_nonsense_text(text):
    # nonsense() returns True when the string looks machine-generated;
    # note that nostril raises ValueError on very short inputs
    if nd.nonsense(text):
        return 1
    else:
        return 0
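# Illustrative check; the strings are made up for the example, and the
# expected outputs are likely rather than guaranteed.
print(is_nonsense_text('asdfqwrzxcvpoiu'))  # likely 1 (gibberish)
print(is_nonsense_text('maxwellbade'))      # likely 0 (real-looking handle)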
from collections import Counter

gibberish_threshold = 3.0  # ASSUMED default cutoff, not from the original gist; tune on real data

def is_gibberish(input_string, threshold=gibberish_threshold):
    clean_string = re.sub(r'[^a-zA-Z]', '', input_string).lower()
    entropy = calculate_entropy(clean_string)
    if entropy < threshold:
        return 1
    else:
        return 0

def calculate_entropy(s):
    # Calculate the probability of each character, then Shannon entropy: -sum(p * log2(p))
    if not s:
        return 0.0
    probabilities = [count / len(s) for count in Counter(s).values()]
    return -sum(p * math.log2(p) for p in probabilities)
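# Rough sanity check; the entropy values shown are approximate and the
# threshold above is an assumption.
print(calculate_entropy('aaaaaaaa'))     # 0.0 -- a single repeated character
print(calculate_entropy('maxwellbade'))  # roughly 2.9 for a mixed string
print(is_gibberish('aaaaaaaa'))          # 1 under the assumed threshold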
from datetime import datetime

def ml_df(which_df, which_feature_col_start, which_target_col, threshold,
          test_size, agg, count_agg, group_by_date):
    start_time = datetime.now()
    print('ML start time:', start_time)
    from ast import literal_eval
    from difflib import SequenceMatcher
    from collections import Counter
    from itertools import groupby
    from sklearn import model_selection
    from sklearn.model_selection import train_test_split
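# A minimal sketch of the split ml_df likely builds toward; the target column
# name, the 80/20 split, and split_example itself are illustrative assumptions.
from sklearn.model_selection import train_test_split

def split_example(df, target_col='target', test_size=0.2):
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)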
import numpy as np

def auto_features(which_df, feature_fraud_rate, new_rules_fraud_rate_col, cols_to_move,
                  attr_col, features_start_col, cols_to_drop):
    #create target col
    which_df['target'] = np.where(which_df['is_risky'] == 'risky', 1, 0)
    test_df = new_rules.copy()  # new_rules is assumed to be a module-level DataFrame
    # display(test_df.head())
    #filter highest risk
    df_features = test_df[
        (test_df[new_rules_fraud_rate_col] >= feature_fraud_rate)
        # & (test_df['pct_caught'] <= .1)
    ]
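# Toy illustration of the fraud-rate filter pattern used above; the rule names
# and rates are made up for the example.
rules = pd.DataFrame({'rule': ['a', 'b', 'c'], 'fraud_rate': [0.05, 0.40, 0.75]})
risky_rules = rules[rules['fraud_rate'] >= 0.3]  # same pattern as df_features
print(risky_rules)  # keeps rules b and c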
def fraud_metrics(df, cols, num_cols, count_risky, minrate, total, agg):
    start_time = datetime.today()
    print('Start Time: ', start_time)
    pd.set_option('display.max_rows', None)     # show all rows in notebook output
    pd.set_option('display.max_columns', None)  # show all columns
    multiple_defect_df = multiple_defect(df=df
                                         ,agg=agg
                                         ,cols=cols
                                         ,n=num_cols)  # multiple_defect is a helper assumed to be defined elsewhere
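# multiple_defect is not included in this gist; a purely hypothetical stand-in,
# so the call above has something to resolve against, might keep rows where at
# least n of the given columns carry a risky flag (agg is ignored here):
def multiple_defect(df, agg, cols, n):
    flagged = (df[cols] == 'risky').sum(axis=1)  # assumed risky-flag encoding
    return df[flagged >= n]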
def fraud_rate(df, agg, cols=None, threshold=None, limit=None, days=None, minrate=None, maxrate=None):
    # normalize cols into a list of group-by columns
    if isinstance(cols, str):
        groupcols = [cols]
    elif cols is None:
        groupcols = []
    else:
        try:
            groupcols = list(cols)
        except TypeError:
            raise TypeError('Unable to convert cols to a list.')
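# A minimal sketch of the per-group rate this function presumably computes;
# the 'target' column name and the toy data are assumptions, not from the gist.
toy_rates = pd.DataFrame({'country': ['US', 'US', 'BR', 'BR'], 'target': [0, 1, 1, 1]})
rates = toy_rates.groupby('country')['target'].mean()  # fraud rate per group
print(rates)  # BR 1.0, US 0.5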