josht-jpg/AdaBoost_setup

## AdaBoost_setup
def _tokens_text_sentences(raw_text):
    """Return (tokens, space-joined tokens, "."-split pieces) for one text."""
    tokens = nltk.word_tokenize(raw_text)
    joined = " ".join(tokens)
    # str.split cuts at every "." character, so periods inside abbreviations
    # or ellipses also create boundaries.
    return tokens, joined, joined.split(".")


def _seeded_draw(pool, seed, count):
    """Reseed NumPy's global RNG, then pick `count` rows of `pool` by position.

    Positions come from np.random.randint, so the same row may be picked more
    than once.
    """
    np.random.seed(seed)
    return pool.iloc[np.random.randint(0, len(pool), count)]


# NOTE(review): notes, crime, idiot, possessed and brothers are presumably the
# raw book texts loaded elsewhere -- confirm.
notes_tokens, notes_str, notes_sentences = _tokens_text_sentences(notes)
crime_tokens, crime_str, crime_sentences = _tokens_text_sentences(crime)
idiot_tokens, idiot_str, idiot_sentences = _tokens_text_sentences(idiot)
possessed_tokens, possessed_str, possessed_sentences = _tokens_text_sentences(possessed)
brothers_tokens, brothers_str, brothers_sentences = _tokens_text_sentences(brothers)

# One Series holding the pieces of all five texts, in book order.
sentences = pd.Series(
    notes_sentences
    + crime_sentences
    + idiot_sentences
    + possessed_sentences
    + brothers_sentences
)

# Discard pieces that are empty or a single character long.
sentences = sentences[sentences.map(len) > 1]

# 100 pieces drawn with seed 2020 and 30 drawn with seed 42. The two draws are
# independent of each other, so a piece can appear in both.
to_label_train = _seeded_draw(sentences, 2020, 100)
to_label_test = _seeded_draw(sentences, 42, 30)
# In hindsight, there are many cleaner alternatives to the seeded sampling above.
# One of them would be to use Sklearn's train_test_split function.
	def _tokens_text_sentences(raw_text):
		"""Return (tokens, space-joined tokens, "."-split pieces) for one text."""
		tokens = nltk.word_tokenize(raw_text)
		joined = " ".join(tokens)
		# str.split cuts at every "." character, so periods inside abbreviations
		# or ellipses also create boundaries.
		return tokens, joined, joined.split(".")


	def _seeded_draw(pool, seed, count):
		"""Reseed NumPy's global RNG, then pick `count` rows of `pool` by position.

		Positions come from np.random.randint, so the same row may be picked more
		than once.
		"""
		np.random.seed(seed)
		return pool.iloc[np.random.randint(0, len(pool), count)]


	# NOTE(review): notes, crime, idiot, possessed and brothers are presumably the
	# raw book texts loaded elsewhere -- confirm.
	notes_tokens, notes_str, notes_sentences = _tokens_text_sentences(notes)
	crime_tokens, crime_str, crime_sentences = _tokens_text_sentences(crime)
	idiot_tokens, idiot_str, idiot_sentences = _tokens_text_sentences(idiot)
	possessed_tokens, possessed_str, possessed_sentences = _tokens_text_sentences(possessed)
	brothers_tokens, brothers_str, brothers_sentences = _tokens_text_sentences(brothers)

	# One Series holding the pieces of all five texts, in book order.
	sentences = pd.Series(
		notes_sentences
		+ crime_sentences
		+ idiot_sentences
		+ possessed_sentences
		+ brothers_sentences
	)

	# Discard pieces that are empty or a single character long.
	sentences = sentences[sentences.map(len) > 1]

	# 100 pieces drawn with seed 2020 and 30 drawn with seed 42. The two draws are
	# independent of each other, so a piece can appear in both.
	to_label_train = _seeded_draw(sentences, 2020, 100)
	to_label_test = _seeded_draw(sentences, 42, 30)
	# In hindsight, there are many cleaner alternatives to the seeded sampling above.
	# One of them would be to use Sklearn's train_test_split function.