import pandas as pd

def fixed_width_cut(df, feature, labels=['Low', 'Medium', 'High']):
    # Bin a numeric feature into equal-width intervals, one per label.
    feature_slice, retbins = pd.cut(df[feature], len(labels), retbins=True, labels=labels)
    retbins = ['%.2f' % elem for elem in retbins]    # format bin edges for display
    return feature_slice, retbins

def quartile_cut(df, feature, labels=['Low', 'Medium', 'High']):
    # Bin a numeric feature into equal-frequency (quantile) intervals.
    feature_slice, retbins = pd.qcut(df[feature], q=len(labels), retbins=True, labels=labels)
    retbins = ['%.2f' % elem for elem in retbins]    # format bin edges for display
    return feature_slice, retbins
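
A quick usage sketch for the two binning helpers above, on a hypothetical DataFrame with a numeric 'price' column:

import pandas as pd

df = pd.DataFrame({'price': [10, 25, 40, 55, 70, 85, 100]})    # hypothetical data
width_bins, width_edges = fixed_width_cut(df, 'price')     # equal-width bins
quart_bins, quart_edges = quartile_cut(df, 'price')        # equal-frequency bins
print(width_edges)    # e.g. ['9.91', '40.00', '70.00', '100.00']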
import tensorflow as tf

# vocab_size, embedding_dim, and max_length are assumed to be defined earlier.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),    # average the embeddings across the sequence
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')    # single-unit binary output
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
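
A minimal training sketch for this model, assuming vocab_size=1000, embedding_dim=16, and max_length=5 were set before it was built; the data here is a random stand-in:

import numpy as np

training_padded = np.random.randint(1, 1000, size=(100, 5))    # hypothetical padded sequences
training_labels = np.random.randint(0, 2, size=(100,))         # hypothetical binary labels
model.fit(training_padded, training_labels, epochs=10, verbose=0)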
from tensorflow.keras.preprocessing.sequence import pad_sequences

# `tokenizer` and `sentences` are the fitted Tokenizer and corpus from the next snippet.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=5)    # pre-pads (and truncates) every sequence to length 5
print("\nThe Word Index = ", word_index)
print("\nThe Sequences = ", sequences)
print("\nThe Padded Sequences:")
print(padded)
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'I eat chicken',
    'I do not eat fish',
    'Did you eat fish?'
]
# Keep the 100 most frequent words and map anything unseen to the <OOV> token.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
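
With this corpus, the padding snippet above should print output along these lines: indices are assigned by descending word frequency (1 is reserved for <OOV>), and pad_sequences pre-pads with zeros by default.

The Word Index =  {'<OOV>': 1, 'eat': 2, 'i': 3, 'fish': 4, 'chicken': 5, 'do': 6, 'not': 7, 'did': 8, 'you': 9}
The Sequences =  [[3, 2, 5], [3, 6, 7, 2, 4], [8, 9, 2, 4]]
The Padded Sequences:
[[0 0 3 2 5]
 [3 6 7 2 4]
 [0 8 9 2 4]]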
import pandas as pd
from sklearn.metrics import accuracy_score, log_loss

def generate_logs_from_classifiers(classifiers):
    # Fit each classifier and log test accuracy and log loss; X_train/y_train/X_test/y_test are assumed globals.
    log_cols = ["Classifier", "Accuracy", "Log Loss"]
    log = pd.DataFrame(columns=log_cols)
    for clf in classifiers:
        name = clf.__class__.__name__
        print('Processing {} classifier'.format(name))
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        loss = log_loss(y_test, clf.predict_proba(X_test))
        log.loc[len(log)] = [name, acc, loss]
    return log
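
A hedged usage sketch, with two stock scikit-learn classifiers standing in for whatever models are being compared:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Hypothetical usage; relies on the X_train/y_train/X_test/y_test globals.
log = generate_logs_from_classifiers([LogisticRegression(solver='lbfgs'),
                                      RandomForestClassifier(n_estimators=100)])
print(log.sort_values('Accuracy', ascending=False))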
import matplotlib.pyplot as plt
from sklearn import metrics

def take_roc_curve(X_test, model):
    # Plot the ROC curve of a fitted probabilistic model; y_test is assumed global.
    y_preds = model.predict_proba(X_test)
    preds = y_preds[:, 1]    # probability of the positive class
    fpr, tpr, _ = metrics.roc_curve(y_test, preds)
    precision, recall, _ = metrics.precision_recall_curve(y_test, preds)
    auc_score = metrics.auc(fpr, tpr)
    plt.figure(figsize=(10, 5))
    plt.plot(fpr, tpr, label='ROC curve (AUC = {:.3f})'.format(auc_score))
    plt.legend()
    plt.show()
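
Usage sketch, assuming the train/test globals from earlier and any estimator that exposes predict_proba:

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)    # hypothetical model
take_roc_curve(X_test, rf)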
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm_list, target_names, title_list, cmap=None, normalize=True, float_format_str='{:,.2f}'):
    # One heatmap per model; the original gist is truncated here, so the plotting body is a sketch.
    plt.figure(figsize=(10, 5))
    print('{}_count={:d}\n{}_count={:d}'.format(target_names[0], cm_list[0][0].sum(), target_names[1], cm_list[0][1].sum()))
    stats_list = []
    for i in range(len(cm_list)):
        model_name = title_list[i]
        cm = cm_list[i]
        actual_phishy = cm[0]    # first row: counts for the actual positive class
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, None]
        plt.subplot(1, len(cm_list), i + 1)
        plt.imshow(cm, cmap=cmap or plt.cm.Blues)
        plt.title(model_name)
    plt.show()
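
A hedged usage sketch; logreg_model and rf_model are hypothetical fitted classifiers, and the label names follow the phishing theme suggested by actual_phishy:

from sklearn.metrics import confusion_matrix

cm_list = [confusion_matrix(y_test, m.predict(X_test)) for m in (logreg_model, rf_model)]
plot_confusion_matrix(cm_list, target_names=['phishy', 'legit'],
                      title_list=['Logistic Regression', 'Random Forest'])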
import pydotplus
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image, display

def create_and_visualize_tree(X_train, y_train, max_depth=3):
    # Fit a shallow decision tree and render it inline via graphviz.
    decision_tree = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=1, random_state=1)
    decision_tree = decision_tree.fit(X_train, y_train)
    tree_str = export_graphviz(decision_tree, feature_names=X_train.columns,
                               filled=True, out_file=None)
    graph = pydotplus.graph_from_dot_data(tree_str)
    graph.write_png('dt.png')
    display(Image('dt.png'))
    return decision_tree
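
Usage sketch, assuming the train/test globals from earlier:

tree_model = create_and_visualize_tree(X_train, y_train, max_depth=4)
tree_predictions = tree_model.predict(X_test)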
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

def create_logistic_regressions(X_train, y_train, figsize=(10, 10)):
    # Fit a logistic regression and plot each feature's coefficient.
    logreg = LogisticRegression(solver='lbfgs')
    logreg.fit(X_train, y_train)
    coefficients = logreg.coef_
    intercept = logreg.intercept_
    df_logreg = pd.DataFrame({'Feature': X_train.columns, 'Coef': coefficients[0]})
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x="Coef", y="Feature", data=df_logreg, ax=ax)
    return logreg
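
Usage sketch; a positive coefficient pushes a row toward the positive class, a negative one away from it:

logreg_model = create_logistic_regressions(X_train, y_train, figsize=(8, 6))
print('Intercept:', logreg_model.intercept_)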
from sklearn.model_selection import train_test_split

def split_train_test(df, target_feature, test_size=0.2):
    # Keep only numeric columns, separate out the target, and split 80/20 by default.
    filtered_list = list(df._get_numeric_data().columns)
    filtered_list.remove(target_feature)
    X = df[filtered_list]
    y = df[target_feature]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print('''X_train size : {}
y_train size : {}
X_test size : {}
y_test size : {}'''.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    return X_train, X_test, y_train, y_test
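
Usage sketch; the df and its 'is_phishing' target column here are hypothetical:

X_train, X_test, y_train, y_test = split_train_test(df, target_feature='is_phishing')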