Skip to content

Instantly share code, notes, and snippets.

View Navjotbians's full-sized avatar

Nav Navjotbians

  • University of Ottawa
  • Ottawa
View GitHub Profile
@Navjotbians
Navjotbians / j_score.py
Last active May 22, 2021 20:21
Jaccard Score
def j_score(y_true, y_pred):
    """Return the mean per-sample Jaccard score, expressed as a percentage.

    For every row the score is ``sum(min(true, pred)) / sum(max(true, pred))``;
    for 0/1 label indicators that is intersection over union of the label sets.

    Args:
        y_true: 2-D array-like (rows = samples, columns = labels) of ground
            truth values.
        y_pred: 2-D array-like of predictions with the same shape as ``y_true``.

    Returns:
        The mean of the per-row scores multiplied by 100. Rows whose union is
        empty (0/0) are left out of the mean; if every row is undefined (or
        there are no rows) the result is NaN.
    """
    # Compare strictly by position. Handing DataFrames straight to
    # np.minimum/np.maximum makes pandas align them on index labels first, so
    # a shuffled test-split index would be matched against the wrong rows of
    # a freshly built prediction frame.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    intersection = np.minimum(true_arr, pred_arr).sum(axis=1)
    union = np.maximum(true_arr, pred_arr).sum(axis=1)

    # 0/0 rows are undefined; skip them for every input type instead of
    # letting a single NaN poison the mean for ndarray input.
    defined = union != 0
    if not defined.any():
        return float("nan")
    return (intersection[defined] / union[defined]).mean() * 100
def print_score(y_pred, y_test, clf):
    """Print the classifier's class name, its Jaccard score and macro F1.

    Args:
        y_pred: predicted labels.
        y_test: ground-truth labels.
        clf: the classifier object; only its class name is reported.
    """
    model_name = clf.__class__.__name__
    print("Clf: ", model_name)

    # j_score receives both label sets wrapped in DataFrames, truth first.
    truth_frame = pd.DataFrame(y_test)
    pred_frame = pd.DataFrame(y_pred)
    jaccard_value = j_score(truth_frame, pred_frame)
    print("Jaccard score: {}".format(jaccard_value))

    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print("F1 Score : {}".format(macro_f1))
@Navjotbians
Navjotbians / train.py
Created May 22, 2021 20:11
train function
from sklearn.multiclass import OneVsRestClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.metrics import multilabel_confusion_matrix
### OneVsRestClassifier
# Training entry point taking a classifier, features X and labels y.
# Only the beginning of the body is visible in this excerpt, so how
# `max_feature` and `embedding` are consumed is not shown here.
# NOTE(review): presumably both are forwarded to get_embeddings -- confirm.
# Also note the default here is 'bow' while get_embeddings defaults to "tfidf".
def train_model(classifier,X, y, max_feature = 1000, embedding= 'bow' ):
# Step 1: split the data into train and test parts
#Train-test split
print("... Performing train test split")
@Navjotbians
Navjotbians / word_embeddings.py
Last active May 22, 2021 20:03
Text to vector conversion
from sklearn import feature_extraction
# Turn raw text into feature matrices for the train and test splits.
#   X_train, X_test: text collections handed to the vectorizer.
#   max_feature: forwarded as CountVectorizer(max_features=...).
#   embedding_type: selects the vectorizer; defaults to "tfidf".
# Only the "bow" branch is visible in this excerpt; the remaining branches and
# the return statement are not shown.
def get_embeddings(X_train, X_test, max_feature = 1000, embedding_type = "tfidf"):
if embedding_type == "bow":
# Bag-of-words term counts, vocabulary limited by max_features
vectorizer = feature_extraction.text.CountVectorizer(max_features= max_feature)
# NOTE(review): the dense array produced here is discarded; the call only
# serves to fit the vectorizer. `vectorizer.fit(X_train)` would avoid
# materialising the throw-away matrix -- confirm before changing.
vectorizer.fit_transform(X_train).toarray()
# Both splits are transformed by the vectorizer fitted on X_train only
train_feat = vectorizer.transform(X_train).toarray()
test_feat = vectorizer.transform(X_test).toarray()
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Pre-process a piece of text: the visible part runs it through clean();
# a tokenization step follows but is cut off in this excerpt.
#   input: the raw text. NOTE(review): this name shadows the builtin `input`;
#          it is part of the signature, so renaming would affect keyword callers.
#   stemm, lemm: flags whose use is not visible here -- presumably they toggle
#          stemming and lemmatization; confirm against the full function.
def process_txt(input, stemm = False,lemm = True):
### Clean input data
processed_text = clean(input)
### Tokenization
@Navjotbians
Navjotbians / clean_comments.py
Last active May 22, 2021 19:34
Clean comment
import re
import string
# Normalise a comment string: lower-case it, then apply every
# (pattern, replacement) pair from SUBSTITUTIONS with re.sub.
# The pairs are applied sequentially, so each one sees the output of the
# previous one and their order in SUBSTITUTIONS matters.
# The rest of the function (punctuation removal onwards) is cut off in this excerpt.
def clean(input_str):
# Lower-casing first means the patterns only need to match lower-case text
input_str = input_str.lower()
for sub in SUBSTITUTIONS:
# sub[0] is the regex pattern, sub[1] its replacement
input_str = re.sub(sub[0], sub[1], input_str)
# Eliminate punctuation
@Navjotbians
Navjotbians / patterns.py
Created May 22, 2021 16:50
Patterns present in the dataset
# Ordered (regex pattern, replacement) pairs. clean() applies them one after
# another with re.sub on already lower-cased text, so earlier entries change
# what later entries get to see.
SUBSTITUTIONS = [
(r'\d+', ''), # Delete digits
(r"n't", " not "), # Replace pattern n't -> not
# NOTE(review): the n't rule above runs first, so "can't" has already been
# turned into "ca not " by the time this entry is tried and it can never
# match. Consider moving this entry above the n't rule.
(r"can't", "cannot "), # Replace pattern can't -> cannot
(r"what's", "what is "), # Replace pattern what's -> what is
(r"\'s", " "), # Delete pattern 's
(r"\'ve", " have "), # Replace pattern 've -> have
(r"\'re", " are "), # Replace pattern 're -> are
(r"\'d", " would "), # Replace pattern 'd -> would
(r"\'ll", " will "), # Replace pattern 'll -> will