Last active
May 30, 2021 10:04
-
-
Save vivek1240/259473cbc64c7a70c14391d0a6afb7f2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# tokenize - break down each sentence into a list of words
import gensim
import spacy
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Stop-word collection consulted by remove_stopwords().
stop_words = STOPWORDS

# The pipeline below only reads token.pos_ and token.lemma_, so the
# dependency parser and the named-entity recognizer are switched off.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def sent_to_words(sentence):
    """Tokenize *sentence* and yield the resulting token list.

    This is a generator that yields exactly once: a single list holding
    the tokens produced by ``gensim.utils.simple_preprocess``. The input
    is coerced with ``str()`` first, so non-string values are accepted.
    """
    text = str(sentence)
    # deacc=True asks gensim to strip accent marks from the tokens;
    # punctuation is already discarded by the tokenizer itself.
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    yield tokens
def remove_stopwords(text):
    """Tokenize *text* and return the tokens that are not stop words.

    *text* is coerced with ``str()`` and tokenized by
    ``simple_preprocess``; tokens found in the module-level
    ``stop_words`` collection are dropped. Token order is preserved.
    """
    kept = []
    for token in simple_preprocess(str(text)):
        if token in stop_words:
            continue
        kept.append(token)
    return kept
def lemmatization(sent, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Return the lemmas of the words in *sent* whose POS tag is allowed.

    Args:
        sent: An iterable of word strings, or a single string of text.
        allowed_postags: Coarse part-of-speech tags (``token.pos_``) to
            keep. Any container supporting ``in`` works; the default is
            an immutable tuple so it cannot be mutated between calls.

    Returns:
        A list with ``token.lemma_`` for every kept token, in document
        order.
    """
    # A bare string is already text; joining it would put a space
    # between every character and destroy the words.
    text = sent if isinstance(sent, str) else " ".join(sent)
    doc = nlp(text)
    return [token.lemma_ for token in doc if token.pos_ in allowed_postags]
def cleaning_new_text(data):
    """Run the full cleaning pipeline on one piece of raw text.

    Steps: tokenize (``sent_to_words``), drop stop words
    (``remove_stopwords``), then lemmatize and keep only nouns,
    adjectives, verbs and adverbs (``lemmatization``).

    Args:
        data: The raw text to clean.

    Returns:
        A list of lemma strings.
    """
    # sent_to_words is a generator of token lists; flatten whatever it
    # yields into one flat list of tokens.
    tokens = [token for words in sent_to_words(data) for token in words]
    # Hand remove_stopwords plain space-separated text. Passing the
    # nested list would only work because str() of the list gets
    # re-tokenized, which depends on the list's repr formatting.
    content_words = remove_stopwords(" ".join(tokens))
    return lemmatization(
        content_words,
        allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
    )
# Demo input: a sample customer-support message. The literal is kept
# exactly as found; the odd character runs inside it look like
# mis-decoded non-breaking spaces -- TODO confirm against the source data.
data = 'I have made a flight booking on Yatra. My PNR No are S6ZF2Y . I had booked a return flight for both the PNR. The outbound flight i.eรย "BOM -TRV"รย had been cancelled by the airways and the full amount has been credited back to my account.รย Now the Inbound flight i.e."รย TRV -รย BOM " has also been cancelled by the airways and the full amount has been processed to the yatra account on 24th Feb 2020. So I want you to refund me the amount ASAP. When would I get the refund back?Waiting for your reply'

# Clean the message into a list of lemmas ...
data_words = cleaning_new_text(data)

# ... then glue the lemmas back into one string. Note that this rebinds
# `data`, replacing the raw message with the cleaned text.
data = TreebankWordDetokenizer().detokenize(data_words)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment