Samir Sheriff samiriff

## letter_count_value_counts.py
# Bar chart of the 40 most frequent per-message letter counts.
letter_count_value_counts = messages_df['Letter_Count'].value_counts()
top_40_letter_count_value_counts = letter_count_value_counts.iloc[:40]
plt.figure(figsize=(15, 2))  # wide, short figure opened before the bars are drawn
letter_count_axes = top_40_letter_count_value_counts.plot.bar()
letter_count_axes.set_xlabel('Letter count')
letter_count_axes.set_ylabel('Frequency')

## word_count_value_counts.py
# Bar chart of the 40 most frequent per-message word counts.
word_count_value_counts = messages_df['Word_Count'].value_counts()
top_40_word_count_value_counts = word_count_value_counts.iloc[:40]
# Sized so that the bar plot fits in the output cell of a Jupyter notebook.
plt.figure(figsize=(15, 2))
word_count_axes = top_40_word_count_value_counts.plot.bar()
word_count_axes.set_xlabel('Word Count')
word_count_axes.set_ylabel('Frequency')

## letter_count_word_count.py
# Grand totals across every message, evaluated as a (letters, words) pair.
total_letter_count = messages_df['Letter_Count'].sum()
total_word_count = messages_df['Word_Count'].sum()
total_letter_count, total_word_count

## letter_count.py
# Horizontal bar chart of the 10 authors with the largest summed Letter_Count.
total_letter_count_grouped_by_author = messages_df.groupby('Author')[['Letter_Count']].sum()
sorted_total_letter_count_grouped_by_author = total_letter_count_grouped_by_author.sort_values(
    by='Letter_Count',
    ascending=False,
)
top_10_sorted_total_letter_count_grouped_by_author = sorted_total_letter_count_grouped_by_author.iloc[:10]
letters_by_author_axes = top_10_sorted_total_letter_count_grouped_by_author.plot.barh()
letters_by_author_axes.set_xlabel('Number of Letters')
letters_by_author_axes.set_ylabel('Authors')

## word_count.py
# Horizontal bar chart of the 10 authors with the largest summed Word_Count.
total_word_count_grouped_by_author = messages_df.groupby('Author')[['Word_Count']].sum()
sorted_total_word_count_grouped_by_author = total_word_count_grouped_by_author.sort_values(
    by='Word_Count',
    ascending=False,
)
top_10_sorted_total_word_count_grouped_by_author = sorted_total_word_count_grouped_by_author.iloc[:10]
words_by_author_axes = top_10_sorted_total_word_count_grouped_by_author.plot.barh()
words_by_author_axes.set_xlabel('Number of Words')
words_by_author_axes.set_ylabel('Authors')

## describe_aug.py
# Summary statistics for two groups of columns of messages_df.
# Each selector is a flat list of column labels. A nested list such as
# [['Date', ...]] is treated by newer pandas releases as one tuple-like key,
# which is not a column and raises KeyError.
discrete_columns = ['Date', 'Time', 'Author', 'Message']
messages_df[discrete_columns].describe()

continuous_columns = ['Letter_Count', 'Word_Count']
messages_df[continuous_columns].describe()

## messages_df.py
# Derive per-message size columns from the message text.
# Letter_Count is the full length of the message string, so spaces and
# punctuation are included in the count.
messages_df['Letter_Count'] = messages_df['Message'].apply(len)
# Word_Count splits on runs of whitespace. split() with no separator ignores
# leading, trailing and repeated spaces, treats newlines as word breaks, and
# yields 0 for an empty message; split(' ') counted empty strings as words.
messages_df['Word_Count'] = messages_df['Message'].apply(lambda message: len(message.split()))

## cleaning.py
# Build the cleaned frame in two steps: first remove the rows listed in
# null_authors_df, then the rows listed in media_messages_df.
without_null_authors_df = df.drop(index=null_authors_df.index)
messages_df = without_null_authors_df.drop(index=media_messages_df.index)
messages_df.head()

## media_messages.py
# Rows whose Message is exactly the '<Media omitted>' placeholder text.
is_media_message = df['Message'] == '<Media omitted>'
media_messages_df = df[is_media_message]
print(media_messages_df.head())

# Horizontal bar chart of the 10 authors with the most such rows.
author_media_messages_value_counts = media_messages_df['Author'].value_counts()
top_10_author_media_messages_value_counts = author_media_messages_value_counts.iloc[:10]
top_10_author_media_messages_value_counts.plot.barh()


## null_authors.py
# Rows of df whose Author value is missing.
has_no_author = df['Author'].isna()
null_authors_df = df[has_no_author]
null_authors_df.head()
	# Bar chart of the 40 most frequent per-message letter counts.
	letter_count_value_counts = messages_df['Letter_Count'].value_counts()
	top_40_letter_count_value_counts = letter_count_value_counts.iloc[:40]
	plt.figure(figsize=(15, 2))  # wide, short figure opened before the bars are drawn
	letter_count_axes = top_40_letter_count_value_counts.plot.bar()
	letter_count_axes.set_xlabel('Letter count')
	letter_count_axes.set_ylabel('Frequency')
	# Bar chart of the 40 most frequent per-message word counts.
	word_count_value_counts = messages_df['Word_Count'].value_counts()
	top_40_word_count_value_counts = word_count_value_counts.iloc[:40]
	# Sized so that the bar plot fits in the output cell of a Jupyter notebook.
	plt.figure(figsize=(15, 2))
	word_count_axes = top_40_word_count_value_counts.plot.bar()
	word_count_axes.set_xlabel('Word Count')
	word_count_axes.set_ylabel('Frequency')
	# Horizontal bar chart of the 10 authors with the largest summed Letter_Count.
	total_letter_count_grouped_by_author = messages_df.groupby('Author')[['Letter_Count']].sum()
	sorted_total_letter_count_grouped_by_author = total_letter_count_grouped_by_author.sort_values(
		by='Letter_Count',
		ascending=False,
	)
	top_10_sorted_total_letter_count_grouped_by_author = sorted_total_letter_count_grouped_by_author.iloc[:10]
	letters_by_author_axes = top_10_sorted_total_letter_count_grouped_by_author.plot.barh()
	letters_by_author_axes.set_xlabel('Number of Letters')
	letters_by_author_axes.set_ylabel('Authors')
	# Horizontal bar chart of the 10 authors with the largest summed Word_Count.
	total_word_count_grouped_by_author = messages_df.groupby('Author')[['Word_Count']].sum()
	sorted_total_word_count_grouped_by_author = total_word_count_grouped_by_author.sort_values(
		by='Word_Count',
		ascending=False,
	)
	top_10_sorted_total_word_count_grouped_by_author = sorted_total_word_count_grouped_by_author.iloc[:10]
	words_by_author_axes = top_10_sorted_total_word_count_grouped_by_author.plot.barh()
	words_by_author_axes.set_xlabel('Number of Words')
	words_by_author_axes.set_ylabel('Authors')
	# Summary statistics for two groups of columns of messages_df.
	# Each selector is a flat list of column labels. A nested list such as
	# [['Date', ...]] is treated by newer pandas releases as one tuple-like key,
	# which is not a column and raises KeyError.
	discrete_columns = ['Date', 'Time', 'Author', 'Message']
	messages_df[discrete_columns].describe()

	continuous_columns = ['Letter_Count', 'Word_Count']
	messages_df[continuous_columns].describe()
	# Derive per-message size columns from the message text.
	# Letter_Count is the full length of the message string, so spaces and
	# punctuation are included in the count.
	messages_df['Letter_Count'] = messages_df['Message'].apply(len)
	# Word_Count splits on runs of whitespace. split() with no separator ignores
	# leading, trailing and repeated spaces, treats newlines as word breaks, and
	# yields 0 for an empty message; split(' ') counted empty strings as words.
	messages_df['Word_Count'] = messages_df['Message'].apply(lambda message: len(message.split()))
	# Build the cleaned frame in two steps: first remove the rows listed in
	# null_authors_df, then the rows listed in media_messages_df.
	without_null_authors_df = df.drop(index=null_authors_df.index)
	messages_df = without_null_authors_df.drop(index=media_messages_df.index)
	messages_df.head()
	# Rows whose Message is exactly the '<Media omitted>' placeholder text.
	is_media_message = df['Message'] == '<Media omitted>'
	media_messages_df = df[is_media_message]
	print(media_messages_df.head())

	# Horizontal bar chart of the 10 authors with the most such rows.
	author_media_messages_value_counts = media_messages_df['Author'].value_counts()
	top_10_author_media_messages_value_counts = author_media_messages_value_counts.iloc[:10]
	top_10_author_media_messages_value_counts.plot.barh()
	# Rows of df whose Author value is missing.
	has_no_author = df['Author'].isna()
	null_authors_df = df[has_no_author]
	null_authors_df.head()