# tezansahu/mlf1_datasetExploration.py

## mlf1_datasetExploration.py
# NOTE(review): `pd`, `plt` and `train_test_split` are not imported in this
# snippet; they are expected to be in scope already (presumably pandas,
# matplotlib.pyplot and scikit-learn's train_test_split -- confirm).

# Load the raw data. The CSV is read without a header row, so the two columns
# are named explicitly; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    "all-data.csv",
    header=None,
    names=["sentiment", "headline"],
    encoding="ISO-8859-1",
)

# Inspect the distribution of the number of words in the headlines
# to figure out the max number of tokens to be used by the tokenizer
num_words = df["headline"].apply(lambda x: len(x.split()))
plt.hist(num_words)

# Create labels from the sentiment values
labels = {
    "negative": 0,
    "neutral": 1,
    "positive": 2,
}

# Build the frame used downstream: lower-cased headline text plus the integer
# class id. The dict lookup is deliberately strict: a sentiment value that is
# not a key of `labels` raises KeyError instead of being mapped silently.
df_new = pd.DataFrame({
    "text": df["headline"].apply(lambda x: x.lower()),
    "labels": df["sentiment"].apply(lambda x: labels[x]),
})

# Preview the first rows (a bare expression: only displayed when run in an
# interactive session such as a notebook).
df_new.head()

# Split the data into training & evaluation sets
# NOTE(review): no seed is passed, so the split presumably differs from run to
# run; pass a fixed `random_state` if reproducibility is needed -- confirm
# against the train_test_split implementation in use.
train_df, eval_df = train_test_split(df_new, test_size=0.2)