# Combining corpus and queries for training
import pandas as pd

combined_training = pd.concat([
    training_corpus.rename(columns={'lemmatized': 'text'})['text'],
    training_queries.rename(columns={'cleaned': 'text'})['text']
]).sample(frac=1).reset_index(drop=True)
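The gist does not show what combined_training feeds into. If the shuffled text is meant to train word embeddings, a minimal sketch assuming gensim's Word2Vec (the model choice and every hyperparameter below are my assumptions, not from the source):

# Assumption: combined_training trains word embeddings (not shown in the gist)
from gensim.models import Word2Vec

sentences = [text.split() for text in combined_training]
w2v_model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=2, workers=4)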
# Lowercasing the text
training_queries['cleaned']=training_queries['query'].apply(lambda x:x.lower())
testing_queries['cleaned']=testing_queries['query'].apply(lambda x:x.lower())
# Expanding contractions
training_queries['cleaned']=training_queries['cleaned'].apply(lambda x:expand_contractions(x))
testing_queries['cleaned']=testing_queries['cleaned'].apply(lambda x:expand_contractions(x))
# Cleaning queries using RegEx
training_queries['cleaned']=training_queries['cleaned'].apply(lambda x: clean_text(x))
testing_queries['cleaned']=testing_queries['cleaned'].apply(lambda x: clean_text(x))
# Stopwords removal & Lemmatizing tokens using SpaCy
import spacy
nlp = spacy.load('en_core_web_sm',disable=['ner','parser'])
nlp.max_length=5000000
# Removing Stopwords and Lemmatizing words
from tqdm import tqdm
tqdm.pandas()  # registers progress_apply on pandas objects
training_corpus['lemmatized']=training_corpus['cleaned'].progress_apply(lambda x: ' '.join(token.lemma_ for token in nlp(x) if not token.is_stop))
testing_corpus['lemmatized']=testing_corpus['cleaned'].progress_apply(lambda x: ' '.join(token.lemma_ for token in nlp(x) if not token.is_stop))
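Row-by-row nlp() calls are slow on a corpus this size; the same step can be batched through spaCy's nlp.pipe. A sketch of that alternative (batch_size is an arbitrary choice, not from the gist):

# Alternative sketch: stream documents through spaCy in batches
training_corpus['lemmatized'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_stop)
    for doc in nlp.pipe(training_corpus['cleaned'], batch_size=64)
]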
# Removing extra spaces
training_corpus['cleaned']=training_corpus['cleaned'].apply(lambda x: re.sub(' +',' ',x))
testing_corpus['cleaned']=testing_corpus['cleaned'].apply(lambda x: re.sub(' +',' ',x))
# Function for Cleaning Text
def clean_text(text):
    text = re.sub(r'\w*\d\w*', '', text)  # drop words containing digits
    text = re.sub(r'\n', ' ', text)       # replace newlines with spaces
    text = re.sub(r'http\S+', '', text)   # strip URLs
    text = re.sub(r'[^a-z]', ' ', text)   # keep lowercase letters only
    return text
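For illustration, what clean_text does to a made-up input (the sample string is mine, not from the gist):

sample = 'Visit https://example.com\nRoom 42b is open!'
print(clean_text(sample.lower()))
# => 'visit  room  is open '  (the extra spaces are removed in a later step)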
# Cleaning corpus using RegEx
training_corpus['cleaned']=training_corpus['cleaned'].apply(lambda x: clean_text(x))
testing_corpus['cleaned']=testing_corpus['cleaned'].apply(lambda x: clean_text(x))
import re
# Lowercasing the text
training_corpus['cleaned']=training_corpus['body'].apply(lambda x:x.lower())
testing_corpus['cleaned']=testing_corpus['body'].apply(lambda x:x.lower())
# Dictionary of English contractions (truncated in this gist view)
contractions_dict = {"ain't": "are not", "'s": " is", "aren't": "are not", "can't": "can not",
                     "can't've": "cannot have", "'cause": "because", "could've": "could have",
                     "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
                     "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                     "hadn't've": "had not have"}  # ...remaining entries omitted in this view
for i, v in enumerate(training_queries['query'].sample(10)):
    print(i, '=>', v)
temp_doc=training_corpus.sample(1)
print('Title=>',temp_doc.title.values)
print('Body:\n',temp_doc.body.values)
testing_corpus=create_corpus(testing_result)
testing_corpus.head()
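create_corpus is also not defined in this gist. A hypothetical sketch of what such a helper might do, assuming the result frame carries the docids of relevant documents and df is the MS MARCO document table from the next snippet (all column names here are guesses):

# Hypothetical sketch: not the author's actual implementation
def create_corpus(result):
    docids = result['docid'].unique()
    subset = df[df['docid'].isin(docids)]           # df is the dask frame below
    return subset.compute().reset_index(drop=True)  # materialize as pandas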
import dask.dataframe as dd

df = dd.read_table('msmarco-docs.tsv', blocksize=100e6, header=None)
df.columns = ['docid', 'url', 'title', 'body']
df.head()
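dd.read_table is lazy: the ~100 MB blocksize only defines partitions, and nothing is read until a computation runs. A quick sanity check (illustrative only):

print(df.npartitions)  # number of ~100 MB partitions
print(len(df))         # triggers a full pass over the file to count rows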