ABHISHEK SHARMA (abhishek-shrm) · Analytics Vidhya · New Delhi, India
IR-W2V-28.py
ranking_ir('nba championship michael jordan')
IR-W2V-26.py

import re

def ranking_ir(query):
    # Pre-process the query the same way as the corpus
    query = query.lower()
    query = expand_contractions(query)
    query = clean_text(query)
    query = re.sub(' +', ' ', query)

    # Generating the query vector
    vector = get_embedding_w2v(query.split())

    # Ranking documents by cosine similarity with the query vector
    # (completion sketch past the truncated preview; assumes testing_corpus
    # holds a precomputed 'vector' column; top-10 is an arbitrary cut-off)
    documents = testing_corpus[['docid']].copy()
    documents['similarity'] = cosine_similarity(
        np.vstack(testing_corpus['vector'].values), vector.reshape(1, -1)).ravel()
    return documents.sort_values('similarity', ascending=False).head(10)
IR-W2V-25.py

from sklearn.metrics.pairwise import cosine_similarity

# Function for calculating average precision for a query
def average_precision(qid, qvector):
    # Getting the ground truth and document vectors
    qresult = testing_result.loc[testing_result['qid'] == qid, ['docid', 'rel']]
    qcorpus = testing_corpus.loc[testing_corpus['docid'].isin(qresult['docid']), ['docid', 'vector']]
    qresult = pd.merge(qresult, qcorpus, on='docid')
    # Ranking by similarity, then averaging precision@k over the relevant ranks
    # (completion sketch; the preview cuts off after the merge)
    qresult['similarity'] = cosine_similarity(np.vstack(qresult['vector'].values), qvector.reshape(1, -1)).ravel()
    rels = qresult.sort_values('similarity', ascending=False)['rel'].values
    precisions = [(rels[:k + 1] > 0).mean() for k in range(len(rels)) if rels[k] > 0]
    return np.mean(precisions) if precisions else 0.0
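The gist that aggregates these per-query scores over the full test set is not shown in this preview. A plausible sketch of the usual next step, Mean Average Precision, assuming testing_queries carries a 'qid' column and a precomputed query 'vector' column (both assumptions, not the author's shown code):

# Hypothetical MAP aggregation (not from the original gists):
# the mean of per-query average precision scores the whole retrieval system.
testing_queries['AP'] = testing_queries.apply(
    lambda row: average_precision(row['qid'], row['vector']), axis=1)
print('Mean Average Precision:', testing_queries['AP'].mean())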
IR-W2V-24.py

# Function returning the vector representation of a document
def get_embedding_w2v(doc_tokens):
    embeddings = []
    if len(doc_tokens) < 1:
        return np.zeros(300)
    else:
        for tok in doc_tokens:
            if tok in w2v_model.wv.vocab:
                embeddings.append(w2v_model.wv.word_vec(tok))
            else:
                # Out-of-vocabulary token: fall back to a random vector
                # (one common choice; the preview cuts off at this branch)
                embeddings.append(np.random.rand(300))
        # Averaging the token vectors gives the document vector
        return np.mean(embeddings, axis=0)
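The step that fills the 'vector' columns consumed by average_precision and ranking_ir is not included in this preview. A plausible sketch, assuming the 'lemmatized' corpus and 'cleaned' queries built in IR-W2V-19/20:

# Hypothetical vectorization step (the gist that does this is not shown here)
testing_corpus['vector'] = testing_corpus['lemmatized'].apply(lambda x: get_embedding_w2v(x.split()))
testing_queries['vector'] = testing_queries['cleaned'].apply(lambda x: get_embedding_w2v(x.split()))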
IR-W2V-23.py
# Vocabulary size
print('Vocabulary size:', len(w2v_model.wv.vocab))
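A quick optional sanity check on the trained embeddings; the query term and its neighbours here are illustrative, not output from the original run:

# Inspect the nearest neighbours of a term in the embedding space (gensim 3.x API)
print(w2v_model.wv.most_similar('basketball', topn=5))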
IR-W2V-22.py

from gensim.models import Word2Vec

# Creating data for the model training: one token list per document/query
train_data = [text.split() for text in combined_training]

# Training a skip-gram (sg=1) word2vec model on the combined data
# (gensim 3.x signature; in gensim 4.x, 'size' became 'vector_size')
w2v_model = Word2Vec(train_data, size=300, min_count=2, window=5, sg=1, workers=4)
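Training on a large combined corpus is slow, so persisting the model is worth doing; a minimal sketch using gensim's standard save/load (the filename is an arbitrary choice, not from the original gists):

# Persist the trained model and reload it later without retraining
w2v_model.save('w2v-ir.model')
# w2v_model = Word2Vec.load('w2v-ir.model')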
IR-W2V-21.py

# Combining corpus and queries into one shuffled text series for training
combined_training = pd.concat([
    training_corpus.rename(columns={'lemmatized': 'text'})['text'],
    training_queries.rename(columns={'cleaned': 'text'})['text']
]).sample(frac=1).reset_index(drop=True)
IR-W2V-20.py

# Lowercasing the text
training_queries['cleaned'] = training_queries['query'].apply(lambda x: x.lower())
testing_queries['cleaned'] = testing_queries['query'].apply(lambda x: x.lower())

# Expanding contractions
training_queries['cleaned'] = training_queries['cleaned'].apply(lambda x: expand_contractions(x))
testing_queries['cleaned'] = testing_queries['cleaned'].apply(lambda x: expand_contractions(x))

# Cleaning queries using RegEx
training_queries['cleaned'] = training_queries['cleaned'].apply(lambda x: clean_text(x))
testing_queries['cleaned'] = testing_queries['cleaned'].apply(lambda x: clean_text(x))  # mirrors the truncated training line
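expand_contractions and clean_text are applied throughout but defined in a gist not shown in this preview. A minimal sketch of what such helpers commonly look like, assuming a simple mapping-based expansion and an alphanumeric-only regex (both illustrative, not the author's code):

import re

# Illustrative contraction map; the real helper likely uses a fuller dictionary
CONTRACTION_MAP = {"can't": 'cannot', "won't": 'will not', "it's": 'it is', "'re": ' are', "n't": ' not'}

def expand_contractions(text):
    # Replace each mapped contraction with its expansion
    for contraction, expansion in CONTRACTION_MAP.items():
        text = text.replace(contraction, expansion)
    return text

def clean_text(text):
    # Keep lowercase letters and digits; collapse everything else to spaces (assumed behaviour)
    return re.sub('[^a-z0-9]', ' ', text)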
IR-W2V-19.py

# Stopwords removal & lemmatization using spaCy
import spacy
from tqdm import tqdm
tqdm.pandas()  # enables .progress_apply on pandas objects

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
nlp.max_length = 5000000

# Removing stopwords and lemmatizing tokens
training_corpus['lemmatized'] = training_corpus['cleaned'].progress_apply(
    lambda x: ' '.join(token.lemma_ for token in nlp(x) if not token.is_stop))
testing_corpus['lemmatized'] = testing_corpus['cleaned'].progress_apply(
    lambda x: ' '.join(token.lemma_ for token in nlp(x) if not token.is_stop))
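For a sense of what this step produces, a tiny standalone example (the exact lemmas depend on the spaCy model version):

# Stopwords dropped, remaining tokens lemmatized
print(' '.join(token.lemma_ for token in nlp('the cats are running faster') if not token.is_stop))
# expected output along the lines of: cat run fast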