This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Tunable settings -------------------------------------------------------
num_display_rows = 5  # number of rows to display in Jupyter
avg_fuzzy_score_lte = 50  # lower scores mean a weaker match between full name and email handle
# Fraction used to pick the "risky" Levenshtein-distance cutoff: distances at
# or above this fraction of the scale are grabbed (e.g. if the scale tops out
# at 30, .6 * 30 = 18).
min_bad_lev_dist_pct_total = .6

# --- Get the riskiest Levenshtein distances ----------------------------------
which_col = 'levenshtein_distance'
percent = min_bad_lev_dist_pct_total
nunique_lev = df[which_col].nunique()
# NOTE(review): despite its name, this holds the column's maximum value, not a
# count of unique values -- confirm which one the cutoff is meant to scale.
nunique_max = df[which_col].max()
pcnt_of_max_lev = int(round(percent * nunique_max, 0))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from fuzzywuzzy import fuzz | |
| import Levenshtein | |
| import re | |
| import math | |
# Fill every missing value in the frame with the sentinel string 'nostring'.
df = df.fillna('nostring')

# NOTE(review): this swaps each space character for the literal text 'none'
# (e.g. 'a b' -> 'anoneb'); confirm that is intended rather than replacing
# blank/empty values.
df['full_name'] = df['full_name'].str.replace(' ', 'none')
df['email_handle'] = df['email_handle'].str.replace(' ', 'none')

# Row-wise similarity between the full name and the email handle.
df['fuzzy_score'] = df[['full_name', 'email_handle']].apply(
    lambda pair: fuzz.partial_ratio(pair['full_name'], pair['email_handle']),
    axis=1,
)
# The Jaro-Winkler result is multiplied by 100 before being stored.
df['jaro_score'] = df[['full_name', 'email_handle']].apply(
    lambda pair: Levenshtein.jaro_winkler(pair['full_name'], pair['email_handle']),
    axis=1,
) * 100
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| clean_string = re.sub(r'[^a-zA-Z]', '', input_string).lower() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| regexp_replace(z.full_name, r'[^a-zA-Z]', '') full_name | |
| regexp_replace(z.email_handle, r'[^a-zA-Z]', '') email_handle |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Have to install this first! | |
| # !pip install git+https://github.com/casics/nostril.git | |
| from nostril import ng | |
| from nostril import nonsense_detector as nd | |
def is_nonsense_text(text):
    """Return 1 if Nostril's detector flags *text* as nonsense, otherwise 0.

    The verdict comes from ``nd.nonsense(text)``; its truthiness is mapped to
    an integer flag.

    NOTE(review): Nostril's documentation indicates ``nonsense()`` raises an
    exception for very short strings -- confirm callers filter those out,
    since nothing is caught here.
    """
    return 1 if nd.nonsense(text) else 0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def is_gibberish(input_string, threshold=gibberish_threshold):
    """Flag *input_string* based on the entropy of its letters.

    Every character that is not an ASCII letter is stripped, the remainder is
    lower-cased, and ``calculate_entropy`` is applied to the result.

    Args:
        input_string: Text to evaluate.
        threshold: Entropy cutoff; defaults to the module-level
            ``gibberish_threshold`` captured when the function is defined.

    Returns:
        1 when the entropy is strictly below ``threshold``, otherwise 0.
    """
    letters_only = re.sub(r'[^a-zA-Z]', '', input_string).lower()
    return 1 if calculate_entropy(letters_only) < threshold else 0
| def calculate_entropy(s): | |
| # Calculate the probability of each character |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def ml_df(which_df,which_feature_col_start,which_target_col,threshold,test_size,agg,count_agg,group_by_date): | |
| start_time = datetime.now() | |
| print('ML start time:', start_time) | |
| from ast import literal_eval | |
| from difflib import SequenceMatcher | |
| from collections import Counter | |
| from itertools import groupby | |
| from sklearn import model_selection | |
| from sklearn.model_selection import train_test_split |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def auto_features(which_df,feature_fraud_rate,new_rules_fraud_rate_col,cols_to_move,attr_col,features_start_col,cols_to_drop): | |
| #create target col | |
| which_df['target'] = np.where(which_df['is_risky'] == 'risky',1,0) | |
| test_df = new_rules.copy() | |
| # display(test_df.head()) | |
| #filter highest risk | |
| df_features = test_df[ | |
| (test_df[new_rules_fraud_rate_col] >= feature_fraud_rate) | |
| # & (test_df['pct_caught'] <= .1) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def fraud_metrics(df,cols,num_cols,count_risky,minrate,total,agg): | |
| start_time = datetime.today() | |
| print('Start Time: ', start_time) | |
| pd.set_option('display.max_rows', None) | |
| pd.set_option('display.max_columns', None) | |
| multiple_defect_df = multiple_defect(df=df | |
| ,agg=agg | |
| ,cols=cols | |
| ,n=num_cols |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def fraud_rate(df, agg, cols=None, threshold=None, limit=None, days=None, minrate=None, maxrate=None): | |
| if isinstance(cols, str): | |
| groupcols = [cols] | |
| elif cols is None: | |
| groupcols = [] | |
| else: | |
| try: | |
| groupcols = list(cols) | |
| except: | |
| raise TypeError('Unable to convert cols to a list.') |
NewerOlder