Created
March 5, 2022 19:58
-
-
Save tezansahu/04d41f33c99d68b4c95e850eb7b3f1b0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the raw data. The file is read without a header row, so the two
# columns are named explicitly; the bytes are decoded as ISO-8859-1.
df = pd.read_csv(
    "all-data.csv",
    header=None,
    names=["sentiment", "headline"],
    encoding="ISO-8859-1",
)

# Inspect the distribution of the number of words in the headlines
# to figure out the max number of tokens to be used by the tokenizer
num_words = df["headline"].apply(lambda headline: len(headline.split()))
plt.hist(num_words)

# Create integer class labels from the sentiment strings
labels = {
    "negative": 0,
    "neutral": 1,
    "positive": 2,
}

# Build the modelling frame: lower-cased headline text plus its integer label.
# The direct dict lookup is deliberate: a sentiment value that is not one of
# the three keys above raises KeyError instead of silently becoming NaN.
df_new = pd.DataFrame({
    "text": df["headline"].apply(str.lower),
    "labels": df["sentiment"].apply(lambda sentiment: labels[sentiment]),
})
df_new.head()

# Split the data into training & evaluation sets (80% / 20%).
# A fixed random_state makes the shuffle, and therefore the split,
# reproducible across runs.
train_df, eval_df = train_test_split(df_new, test_size=0.2, random_state=42)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment