Skip to content

Instantly share code, notes, and snippets.

View abhishek-shrm's full-sized avatar

ABHISHEK SHARMA abhishek-shrm

  • ZS Associates
  • New Delhi, India
View GitHub Profile
# Load the training split from disk and preview the first rows.
df_train = pd.read_csv('./train.csv')
print('Shape=>', df_train.shape)
df_train.head()
# IPython/Kaggle shell escapes (only valid in a Jupyter-style environment):
# unpack the Jigsaw toxic-comment train/test CSV zips into the working dir.
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
# --- Environment setup ---
# For handling data
import pandas as pd
# For numerical computing
import numpy as np
# For suppressing FutureWarning
import warnings
# Silence FutureWarnings (e.g. from pandas/numpy) so notebook output stays readable.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Demo queries for the retrieval function.
# NOTE(review): these cells call ranking_ir, which is defined in a LATER
# cell of this gist -- run the definition cell first.
ranking_ir('nba championship michael jordan')
ranking_ir('michael jordan')
# Rank documents for a free-text query (visible part: query pre-processing
# and embedding only).
# NOTE(review): indentation was lost in this paste and the function is
# truncated here -- the similarity/ranking step that presumably follows is
# not visible in this chunk; do not restyle without the full body.
def ranking_ir(query):
# pre-process Query
# Lower-case, expand contractions, clean noise, collapse repeated spaces.
query=query.lower()
query=expand_contractions(query)
query=clean_text(query)
query=re.sub(' +',' ',query)
# generating vector
# Average word2vec embedding of the query tokens (see get_embedding_w2v).
vector=get_embedding_w2v(query.split())
from sklearn.metrics.pairwise import cosine_similarity
# Function for calculating average precision for a query
# NOTE(review): indentation was lost in this paste and the function is
# truncated -- the actual average-precision computation is not visible in
# this chunk; only the data-preparation step survives.
def average_precision(qid,qvector):
# Getting the ground truth and document vectors
# Relevance judgements (docid, rel) for this query id.
qresult=testing_result.loc[testing_result['qid']==qid,['docid','rel']]
# Embeddings only for the documents judged for this query.
qcorpus=testing_corpus.loc[testing_corpus['docid'].isin(qresult['docid']),['docid','vector']]
# Attach each judged document's vector to its relevance label.
qresult=pd.merge(qresult,qcorpus,on='docid')
# Function returning vector representation of a document
# NOTE(review): indentation was lost in this paste; the else-branch body
# (handling out-of-vocabulary tokens) and the final aggregation/return are
# truncated in this chunk -- do not restyle without the full body.
def get_embedding_w2v(doc_tokens):
embeddings = []
# Empty token lists map to the 300-dim zero vector.
if len(doc_tokens)<1:
return np.zeros(300)
else:
# Collect the word2vec vector for every in-vocabulary token.
for tok in doc_tokens:
if tok in w2v_model.wv.vocab:
embeddings.append(w2v_model.wv.word_vec(tok))
else:
# Vocabulary size
# Report how many distinct tokens the trained word2vec model retained.
# NOTE(review): `wv.vocab` is the gensim<4 API -- confirm the pinned version.
print('Vocabulary size:', len(w2v_model.wv.vocab))
from gensim.models import Word2Vec
# Creating data for the model training: tokenize each pre-processed
# document on whitespace to get a list of token lists.
train_data = [doc.split() for doc in combined_training]

# Training a word2vec model from the given data set: skip-gram (sg=1),
# 300-dim vectors, 5-word context window, tokens seen <2 times dropped.
# NOTE(review): `size=` is the gensim<4 keyword argument; gensim>=4
# renamed it to `vector_size` -- confirm the installed gensim version.
w2v_model = Word2Vec(train_data, size=300, min_count=2, window=5, sg=1, workers=4)