num_display_rows = 5  # number of rows to display in Jupyter
avg_fuzzy_score_lte = 50  # lower scores mean a weaker match between full_name and the email handle
min_bad_lev_dist_pct_total = .6  # fraction of the max levenshtein_distance to grab at or above (e.g. if the max distance is 30, .6 of 30 = 18)
# get riskiest lev distances
percent = min_bad_lev_dist_pct_total
which_col = 'levenshtein_distance'
nunique_lev = df[which_col].nunique()  # count of distinct distances
max_lev = df[which_col].max()          # largest observed distance (renamed from nunique_max, which was misleading)
pcnt_of_max_lev = int(round(percent * max_lev, 0))  # cutoff: distances >= this are flagged
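# A minimal sketch of applying the cutoff computed above; assumes df already
# carries the levenshtein_distance column. riskiest_df is an illustrative name,
# not from the original gist.
riskiest_df = df[df[which_col] >= pcnt_of_max_lev]
print(f'{len(riskiest_df)} of {len(df)} rows at or above the cutoff of {pcnt_of_max_lev}')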
import pandas as pd
from fuzzywuzzy import fuzz
import Levenshtein
import re
import math

df = df.fillna('nostring')  # placeholder string for missing values
df['full_name'] = df['full_name'].str.replace(' ', 'none')      # placeholder for embedded spaces
df['email_handle'] = df['email_handle'].str.replace(' ', 'none')
# partial_ratio returns 0-100; jaro_winkler returns 0-1, so scale it to match
df['fuzzy_score'] = df[['full_name', 'email_handle']].apply(lambda x: fuzz.partial_ratio(*x), axis=1)
df['jaro_score'] = df[['full_name', 'email_handle']].apply(lambda x: Levenshtein.jaro_winkler(*x), axis=1) * 100
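# Quick sanity check of the two scores on a toy frame; the names and handles
# below are illustrative, not from the original gist.
toy = pd.DataFrame({'full_name': ['maxbade', 'johnsmith'],
                    'email_handle': ['mbade', 'xkqzvw']})
toy['fuzzy_score'] = toy[['full_name', 'email_handle']].apply(lambda x: fuzz.partial_ratio(*x), axis=1)
toy['jaro_score'] = toy[['full_name', 'email_handle']].apply(lambda x: Levenshtein.jaro_winkler(*x), axis=1) * 100
print(toy)  # the real name/handle pair should score far higher than the gibberish one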
-- SQL counterpart of the Python character cleanup (fragment of a SELECT list);
-- the Python r'' raw-string prefix is dropped, since it is not valid SQL:
regexp_replace(z.full_name, '[^a-zA-Z]', '') AS full_name,
regexp_replace(z.email_handle, '[^a-zA-Z]', '') AS email_handle
# have to install this first!
# !pip install git+https://github.com/casics/nostril.git
from nostril import ng
from nostril import nonsense_detector as nd

def is_nonsense_text(text):
    # nonsense() returns True when the string looks machine-generated;
    # note that nostril raises ValueError on very short inputs
    if nd.nonsense(text):
        return 1
    else:
        return 0
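# Illustrative check; the strings are made up for the example, and the
# expected outputs are likely rather than guaranteed.
print(is_nonsense_text('asdfqwrzxcvpoiu'))  # likely 1 (gibberish)
print(is_nonsense_text('maxwellbade'))      # likely 0 (real-looking handle)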
from collections import Counter

gibberish_threshold = 3.0  # ASSUMED default cutoff, not from the original gist; tune on real data

def is_gibberish(input_string, threshold=gibberish_threshold):
    clean_string = re.sub(r'[^a-zA-Z]', '', input_string).lower()
    entropy = calculate_entropy(clean_string)
    if entropy < threshold:
        return 1
    else:
        return 0

def calculate_entropy(s):
    # Calculate the probability of each character, then Shannon entropy: -sum(p * log2(p))
    if not s:
        return 0.0
    probabilities = [count / len(s) for count in Counter(s).values()]
    return -sum(p * math.log2(p) for p in probabilities)
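# Rough sanity check; the entropy values shown are approximate and the
# threshold above is an assumption.
print(calculate_entropy('aaaaaaaa'))     # 0.0 -- a single repeated character
print(calculate_entropy('maxwellbade'))  # roughly 2.9 for a mixed string
print(is_gibberish('aaaaaaaa'))          # 1 under the assumed threshold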
from datetime import datetime

def ml_df(which_df, which_feature_col_start, which_target_col, threshold,
          test_size, agg, count_agg, group_by_date):
    start_time = datetime.now()
    print('ML start time:', start_time)
    from ast import literal_eval
    from difflib import SequenceMatcher
    from collections import Counter
    from itertools import groupby
    from sklearn import model_selection
    from sklearn.model_selection import train_test_split
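# A minimal sketch of the split ml_df likely builds toward; the target column
# name, the 80/20 split, and split_example itself are illustrative assumptions.
from sklearn.model_selection import train_test_split

def split_example(df, target_col='target', test_size=0.2):
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)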
import numpy as np

def auto_features(which_df, feature_fraud_rate, new_rules_fraud_rate_col, cols_to_move,
                  attr_col, features_start_col, cols_to_drop):
    #create target col
    which_df['target'] = np.where(which_df['is_risky'] == 'risky', 1, 0)
    test_df = new_rules.copy()  # new_rules is assumed to be a module-level DataFrame
    # display(test_df.head())
    #filter highest risk
    df_features = test_df[
        (test_df[new_rules_fraud_rate_col] >= feature_fraud_rate)
        # & (test_df['pct_caught'] <= .1)
    ]
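# Toy illustration of the fraud-rate filter pattern used above; the rule names
# and rates are made up for the example.
rules = pd.DataFrame({'rule': ['a', 'b', 'c'], 'fraud_rate': [0.05, 0.40, 0.75]})
risky_rules = rules[rules['fraud_rate'] >= 0.3]  # same pattern as df_features
print(risky_rules)  # keeps rules b and c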
def fraud_metrics(df, cols, num_cols, count_risky, minrate, total, agg):
    start_time = datetime.today()
    print('Start Time: ', start_time)
    pd.set_option('display.max_rows', None)     # show all rows in notebook output
    pd.set_option('display.max_columns', None)  # show all columns
    multiple_defect_df = multiple_defect(df=df
                                         ,agg=agg
                                         ,cols=cols
                                         ,n=num_cols)  # multiple_defect is a helper assumed to be defined elsewhere
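# multiple_defect is not included in this gist; a purely hypothetical stand-in,
# so the call above has something to resolve against, might keep rows where at
# least n of the given columns carry a risky flag (agg is ignored here):
def multiple_defect(df, agg, cols, n):
    flagged = (df[cols] == 'risky').sum(axis=1)  # assumed risky-flag encoding
    return df[flagged >= n]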
def fraud_rate(df, agg, cols=None, threshold=None, limit=None, days=None, minrate=None, maxrate=None):
    # normalize cols into a list of group-by columns
    if isinstance(cols, str):
        groupcols = [cols]
    elif cols is None:
        groupcols = []
    else:
        try:
            groupcols = list(cols)
        except TypeError:
            raise TypeError('Unable to convert cols to a list.')
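# A minimal sketch of the per-group rate this function presumably computes;
# the 'target' column name and the toy data are assumptions, not from the gist.
toy_rates = pd.DataFrame({'country': ['US', 'US', 'BR', 'BR'], 'target': [0, 1, 1, 1]})
rates = toy_rates.groupby('country')['target'].mean()  # fraud rate per group
print(rates)  # BR 1.0, US 0.5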