Created
September 5, 2020 01:08
-
-
Save josht-jpg/70a0e82255645a4a6fce19b416487072 to your computer and use it in GitHub Desktop.
Cleaning Labelled
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean_labelled(sentence, stop_words):
    """Lower-case a sentence, split it into word tokens and drop stop words.

    Args:
        sentence: The raw sentence text (a ``str``).
        stop_words: Container of lower-case words to remove. Membership is
            tested with ``in``, so a ``set`` gives the fastest lookups.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    import re

    # Same tokens as nltk's RegexpTokenizer(r'\w+').tokenize(), which simply
    # runs findall with the pattern when it is not used in "gaps" mode.
    tokens = re.findall(r'\w+', sentence.lower())

    # The original also wrapped the tokens in a throw-away DataFrame that was
    # overwritten on the very next line; that dead work has been removed.
    return [token for token in tokens if token not in stop_words]
# Replace each raw sentence with its cleaned token list in both labelled sets.
# apply() forwards the extra positional argument to clean_labelled.
labelled_train['sentence'] = labelled_train['sentence'].apply(
    clean_labelled, args=(stop_words_context,)
)
labelled_test['sentence'] = labelled_test['sentence'].apply(
    clean_labelled, args=(stop_words_context,)
)
def nrc_sentence(sentence):
    """Sum the per-word ``nrc_classify`` results over a tokenised sentence.

    Args:
        sentence: Iterable of word tokens.

    Returns:
        The element-wise sum of every non-empty ``nrc_classify(word)`` result,
        starting from a zero vector of length 10. An input with no words
        yields that zero vector unchanged.
    """
    scores = np.zeros(10)
    for token in sentence:
        token_scores = nrc_classify(token)
        # Words for which the classifier returns nothing contribute nothing.
        if len(token_scores) == 0:
            continue
        scores = np.add(scores, token_scores)
    return scores
# Score every tokenised sentence and store the resulting vector per row.
labelled_train['classification'] = (
    labelled_train['sentence'].apply(nrc_sentence)
)
labelled_test['classification'] = (
    labelled_test['sentence'].apply(nrc_sentence)
)
def labelled_adjust_class(labelled):
    """Expand the ``classification`` column into one column per sentiment.

    Args:
        labelled: DataFrame with a ``classification`` column holding one
            score sequence per row.

    Returns:
        A new DataFrame with the ``classification`` column removed and the
        expanded score columns, named after ``NRC_sentiments``, joined on
        the index.
    """
    # Keyed by row label, so the frame is built column-wise and then flipped
    # to get one row per original row.
    scores_by_row = dict(labelled['classification'])
    sentiment_scores = pd.DataFrame.from_dict(scores_by_row).T
    sentiment_scores.columns = NRC_sentiments

    return labelled.join(sentiment_scores).drop(['classification'], axis=1)
# Swap the packed score vectors for one column per NRC sentiment in both sets.
labelled_train, labelled_test = (
    labelled_adjust_class(labelled_train),
    labelled_adjust_class(labelled_test),
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment