# Combining corpus and queries for training
combined_training = pd.concat([
    training_corpus.rename(columns={'lemmatized': 'text'})['text'],
    training_queries.rename(columns={'cleaned': 'text'})['text']
]).sample(frac=1).reset_index(drop=True)

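What the combined, shuffled series feeds into is not shown in these snippets; a common next step is to fit a word-embedding model on it. A minimal sketch, assuming gensim's Word2Vec and hypothetical hyperparameters:

from gensim.models import Word2Vec

# Whitespace tokenization is enough here: the texts are already cleaned
# and lemmatized by the preprocessing steps in this section
sentences = [text.split() for text in combined_training]

# Hypothetical hyperparameters; the original values are not shown
w2v_model = Word2Vec(sentences, vector_size=300, window=5, min_count=2, workers=4)
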
# Lowercasing the text
training_queries['cleaned'] = training_queries['query'].str.lower()
testing_queries['cleaned'] = testing_queries['query'].str.lower()

# Expanding contractions
training_queries['cleaned'] = training_queries['cleaned'].apply(expand_contractions)
testing_queries['cleaned'] = testing_queries['cleaned'].apply(expand_contractions)

# Cleaning queries using RegEx
training_queries['cleaned'] = training_queries['cleaned'].apply(clean_text)
testing_queries['cleaned'] = testing_queries['cleaned'].apply(clean_text)

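End to end, a made-up query passes through the same three steps (illustrative only; expand_contractions and clean_text are defined in later snippets):

# Illustrative walk-through on a hypothetical query
q = "who's the ceo of apple?"
q = q.lower()               # lowercase
q = expand_contractions(q)  # -> "who is the ceo of apple?"
q = clean_text(q)           # non-letters become spaces -> "who is the ceo of apple "
print(q)
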
# Stopword removal & lemmatization using spaCy
import spacy
from tqdm import tqdm

tqdm.pandas()  # required for .progress_apply below

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
nlp.max_length = 5000000

# Remove stopwords and lemmatize the remaining tokens
training_corpus['lemmatized'] = training_corpus['cleaned'].progress_apply(
    lambda x: ' '.join(token.lemma_ for token in nlp(x) if not token.is_stop))
testing_corpus['lemmatized'] = testing_corpus['cleaned'].progress_apply(
    lambda x: ' '.join(token.lemma_ for token in nlp(x) if not token.is_stop))

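Calling nlp() once per row is the slowest part of this pipeline. spaCy's nlp.pipe streams texts in batches and is usually considerably faster; a sketch of an equivalent batched version:

# Same output as the .progress_apply version above, but batched
training_corpus['lemmatized'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_stop)
    for doc in nlp.pipe(training_corpus['cleaned'], batch_size=64)
]
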
# Removing extra spaces
training_corpus['cleaned'] = training_corpus['cleaned'].apply(lambda x: re.sub(' +', ' ', x))
testing_corpus['cleaned'] = testing_corpus['cleaned'].apply(lambda x: re.sub(' +', ' ', x))

# Function for cleaning text
def clean_text(text):
    text = re.sub(r'\w*\d\w*', '', text)   # drop tokens that contain digits
    text = re.sub(r'\n', ' ', text)        # replace newlines with spaces
    text = re.sub(r'http\S+', '', text)    # strip URLs
    text = re.sub(r'[^a-z]', ' ', text)    # keep lowercase letters only
    return text

# Cleaning corpus using RegEx
training_corpus['cleaned'] = training_corpus['cleaned'].apply(clean_text)
testing_corpus['cleaned'] = testing_corpus['cleaned'].apply(clean_text)

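A quick sanity check on a made-up string shows what each pattern removes:

# Illustrative example: digit-words, the URL, and punctuation are stripped
sample = "iphone7 deals at https://example.com\nbuy now!"
print(clean_text(sample))
# -> ' deals at  buy now '  (the extra-spaces step shown above then
#    collapses the leftover runs of spaces)
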
import re

# Lowercasing the text
training_corpus['cleaned'] = training_corpus['body'].str.lower()
testing_corpus['cleaned'] = testing_corpus['body'].str.lower()

# Dictionary of English contractions (excerpt; the full mapping is
# truncated in this snippet)
contractions_dict = {
    "ain't": "are not", "'s": " is", "aren't": "are not",
    "can't": "can not", "can't've": "cannot have",
    "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hadn't've": "had not have",
    # ... remaining entries omitted in the original snippet
}

for i, v in enumerate(training_queries['query'].sample(10)):
    print(i, '=>', v)

temp_doc = training_corpus.sample(1)
print('Title=>', temp_doc.title.values)
print('Body:\n', temp_doc.body.values)

testing_corpus = create_corpus(testing_result)
testing_corpus.head()

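create_corpus is not defined anywhere in these snippets. A plausible sketch, assuming it gathers the documents whose docids appear in a result frame out of the Dask table df loaded in the next snippet (the column name comes from that snippet; the rest is a guess):

# Hypothetical reconstruction: collect the documents referenced by `result`
def create_corpus(result):
    doc_ids = list(result['docid'].unique())
    return df[df['docid'].isin(doc_ids)].compute().reset_index(drop=True)
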
import dask.dataframe as dd

# Load the MS MARCO document collection lazily with Dask
df = dd.read_table('msmarco-docs.tsv', blocksize=100e6, header=None)
df.columns = ['docid', 'url', 'title', 'body']
df.head()
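msmarco-docs.tsv is on the order of 20 GB, which is why it is read lazily with Dask in ~100 MB blocks rather than loaded into memory with pandas. Single partitions can still be materialized for inspection:

# Materialize just the first ~100 MB partition as a pandas DataFrame
first_block = df.get_partition(0).compute()
print(first_block[['docid', 'title']].head())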