This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the training split of the Jigsaw toxic-comment data (unzipped into
# the working directory by the earlier `!unzip` cell) and preview it.
df_train = pd.read_csv('./train.csv')
print('Shape=>', df_train.shape)
# Notebook cell: the bare `.head()` expression renders the first 5 rows.
df_train.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip | |
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For handling data | |
import pandas as pd | |
# For numerical computing | |
import numpy as np | |
# For supressing FutureWarning | |
import warnings | |
warnings.simplefilter(action='ignore', category=FutureWarning) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo: rank the corpus documents against a multi-term query.
ranking_ir('nba championship michael jordan')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo: rank the corpus documents against a short entity query.
ranking_ir('michael jordan')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ranking_ir(query): | |
# pre-process Query | |
query=query.lower() | |
query=expand_contractions(query) | |
query=clean_text(query) | |
query=re.sub(' +',' ',query) | |
# generating vector | |
vector=get_embedding_w2v(query.split()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics.pairwise import cosine_similarity | |
# Function for calculating average precision for a query | |
def average_precision(qid,qvector): | |
# Getting the ground truth and document vectors | |
qresult=testing_result.loc[testing_result['qid']==qid,['docid','rel']] | |
qcorpus=testing_corpus.loc[testing_corpus['docid'].isin(qresult['docid']),['docid','vector']] | |
qresult=pd.merge(qresult,qcorpus,on='docid') | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Function returning vector reperesentation of a document | |
def get_embedding_w2v(doc_tokens): | |
embeddings = [] | |
if len(doc_tokens)<1: | |
return np.zeros(300) | |
else: | |
for tok in doc_tokens: | |
if tok in w2v_model.wv.vocab: | |
embeddings.append(w2v_model.wv.word_vec(tok)) | |
else: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report the number of distinct tokens the word2vec model learned.
# NOTE: `wv.vocab` is the gensim < 4.0 API; gensim >= 4 replaced it with
# `wv.key_to_index` (using it here would raise AttributeError on gensim 4+).
print('Vocabulary size:', len(w2v_model.wv.vocab))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.models import Word2Vec

# Tokenize each pre-processed document by whitespace for model training.
train_data = [doc.split() for doc in combined_training]

# Train a skip-gram (sg=1) word2vec model: 300-dim vectors, window of 5,
# dropping tokens that occur fewer than 2 times, using 4 worker threads.
# NOTE: `size=` is the gensim < 4.0 keyword; gensim >= 4 renamed it to
# `vector_size=` — this call fails with a TypeError on gensim 4+.
w2v_model = Word2Vec(train_data, size=300, min_count=2, window=5, sg=1, workers=4)