# sidsvash26/Word Cloud - Whatsapp chats

## Word Cloud - Whatsapp chats
# -*- coding: utf-8 -*-
"""
Created on Mon May  2 00:24:14 2016

@author: sidvash
"""

import pandas as pd

# Load the exported chat: each line is split at the first "<digit> -" into a
# timestamp part (`time`) and everything after it (`message`).
# A regex separator is only handled by the pure-Python parser, so ask for it
# explicitly instead of relying on pandas' fallback (which emits a ParserWarning).
# NOTE: the separator match includes one digit, so the last digit of the
# timestamp is consumed along with " -".
dataframe = pd.read_csv(
    'whatsapp_chats.txt',
    sep=r'[0-9] -',
    names=['time', 'message'],
    engine='python',
)

# Use this instead if your phone has the am/pm time format:
# dataframe = pd.read_csv('/home/sidvash/whatsapp/etms/etms.txt', sep=r'[ap]m -',
#                         names=['time', 'message'], engine='python')

# Split "<name>: <text>" at the first colon only (n=1), so colons inside the
# text stay in the message. Column 0 is the sender, column 1 the text.
df2 = dataframe['message'].str.split(":", expand=True, n=1)
if 1 not in df2.columns:
    # No line contained a colon: keep the 'message' column present downstream.
    df2[1] = None

# Final layout: time | name | message (the unsplit text is not kept).
df_all = pd.concat(
    [dataframe[['time']], df2.rename(columns={0: 'name', 1: 'message'})],
    axis=1,
)

# Pre-processing to clean the data
# Lines that did not match the separator end up entirely in `time` with an
# empty `message`; copy that text into `message`.
# This only touches `dataframe`: df_all was already built above, and the same
# rows are removed from df_all by the time filter below.
letters_in_time = dataframe.time.str.contains(r'[a-zA-Z]', na=False)
dataframe.loc[letters_in_time, 'message'] = dataframe.loc[letters_in_time, 'time']

# Missing names/messages become the literal string 'null' so the string
# operations below never see NaN.
df_all.fillna('null', inplace=True)

#******************    Delete rows where  **************

# Time contains alphabetic characters (i.e. the line had no real timestamp)
df_all = df_all[~df_all.time.str.contains(r'[a-zA-Z]', na=False)]

# Name includes an activity on the group (added / changed / created / left)
df_all = df_all[~df_all.name.str.contains("added|changed|created|left", na=False)]


######## WORD CLOUD  #########
from PIL import Image

from os import path

import numpy as np

import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

# One corpus for the whole group: every cleaned message joined by spaces.
text = ' '.join(df_all['message'])

# Extend the library's stop-word set in place (the imported STOPWORDS object is
# mutated, exactly as before). "<media omitted>" is kept for parity with the
# earlier list — presumably it never matches a single token; verify if it matters.
STOPWORDS.update(["media", "omitted", "<media omitted>", "said"])

# The mask image defines the shape of the cloud; close the file once it has
# been converted to an array.
with Image.open("snape.jpg") as mask_image:
    snape_mask = np.array(mask_image)

# Pass the set itself: set.add() returns None, so the previous
# `stopwords=STOPWORDS.add("said")` handed `stopwords=None` to the constructor.
wc = WordCloud(
    background_color="white",
    max_words=2000,
    mask=snape_mask,
    stopwords=STOPWORDS,
)
wc.generate(text)

plt.imshow(wc)

wc.to_file("word_cloud.jpg")

# Classes: the distinct sender names left after cleaning.
all_names = df_all.name.unique()

# Per-sender corpus: all messages of one sender joined by spaces.
corpus_dict = {
    sender: ' '.join(df_all[df_all.name == sender].message)
    for sender in all_names
}

# Generate plots of all people, keyed by the sender's position in all_names.
# wc.generate() returns the same WordCloud object on every call, so each result
# is snapshotted with to_image(); otherwise every entry would alias the layout
# of the last sender.
# NOTE(review): the previous `range(len(all_names))-1` raised TypeError
# (range minus int). Every sender is rendered here, as the comment above says;
# confirm the last entry was not meant to be skipped.
image_dict = {
    i: wc.generate(corpus_dict[sender]).to_image()
    for i, sender in enumerate(all_names)
}
	# -*- coding: utf-8 -*-
	"""
	Created on Mon May 2 00:24:14 2016

	@author: sidvash
	"""

	import pandas as pd

	# Load the exported chat: each line is split at the first "<digit> -" into a
	# timestamp part (`time`) and everything after it (`message`).
	# A regex separator is only handled by the pure-Python parser, so ask for it
	# explicitly instead of relying on pandas' fallback (which emits a ParserWarning).
	# NOTE: the separator match includes one digit, so the last digit of the
	# timestamp is consumed along with " -".
	dataframe = pd.read_csv(
	    'whatsapp_chats.txt',
	    sep=r'[0-9] -',
	    names=['time', 'message'],
	    engine='python',
	)

	# Use this instead if your phone has the am/pm time format:
	# dataframe = pd.read_csv('/home/sidvash/whatsapp/etms/etms.txt', sep=r'[ap]m -',
	#                         names=['time', 'message'], engine='python')

	# Split "<name>: <text>" at the first colon only (n=1), so colons inside the
	# text stay in the message. Column 0 is the sender, column 1 the text.
	df2 = dataframe['message'].str.split(":", expand=True, n=1)
	if 1 not in df2.columns:
		# No line contained a colon: keep the 'message' column present downstream.
		df2[1] = None

	# Final layout: time | name | message (the unsplit text is not kept).
	df_all = pd.concat(
	    [dataframe[['time']], df2.rename(columns={0: 'name', 1: 'message'})],
	    axis=1,
	)

	# Pre-processing to clean the data
	# Lines that did not match the separator end up entirely in `time` with an
	# empty `message`; copy that text into `message`.
	# This only touches `dataframe`: df_all was already built above, and the same
	# rows are removed from df_all by the time filter below.
	letters_in_time = dataframe.time.str.contains(r'[a-zA-Z]', na=False)
	dataframe.loc[letters_in_time, 'message'] = dataframe.loc[letters_in_time, 'time']

	# Missing names/messages become the literal string 'null' so the string
	# operations below never see NaN.
	df_all.fillna('null', inplace=True)

	#******************    Delete rows where  **************

	# Time contains alphabetic characters (i.e. the line had no real timestamp)
	df_all = df_all[~df_all.time.str.contains(r'[a-zA-Z]', na=False)]

	# Name includes an activity on the group (added / changed / created / left).
	# The pipes must be unescaped: "\|" matches a literal "|" character, which
	# turned the four alternatives into one string that never occurs.
	df_all = df_all[~df_all.name.str.contains("added|changed|created|left", na=False)]


	######## WORD CLOUD #########
	from PIL import Image

	from os import path

	import numpy as np

	import matplotlib.pyplot as plt

	from wordcloud import WordCloud, STOPWORDS

	# One corpus for the whole group: every cleaned message joined by spaces.
	text = ' '.join(df_all['message'])

	# Extend the library's stop-word set in place (the imported STOPWORDS object is
	# mutated, exactly as before). "<media omitted>" is kept for parity with the
	# earlier list — presumably it never matches a single token; verify if it matters.
	STOPWORDS.update(["media", "omitted", "<media omitted>", "said"])

	# The mask image defines the shape of the cloud; close the file once it has
	# been converted to an array.
	with Image.open("snape.jpg") as mask_image:
		snape_mask = np.array(mask_image)

	# Pass the set itself: set.add() returns None, so the previous
	# `stopwords=STOPWORDS.add("said")` handed `stopwords=None` to the constructor.
	wc = WordCloud(
	    background_color="white",
	    max_words=2000,
	    mask=snape_mask,
	    stopwords=STOPWORDS,
	)
	wc.generate(text)

	plt.imshow(wc)

	wc.to_file("word_cloud.jpg")

	# Classes: the distinct sender names left after cleaning.
	all_names = df_all.name.unique()

	# Per-sender corpus: all messages of one sender joined by spaces.
	corpus_dict = {
	    sender: ' '.join(df_all[df_all.name == sender].message)
	    for sender in all_names
	}

	# Generate plots of all people, keyed by the sender's position in all_names.
	# wc.generate() returns the same WordCloud object on every call, so each result
	# is snapshotted with to_image(); otherwise every entry would alias the layout
	# of the last sender.
	# NOTE(review): the previous `range(len(all_names))-1` raised TypeError
	# (range minus int). Every sender is rendered here, as the comment above says;
	# confirm the last entry was not meant to be skipped.
	image_dict = {
	    i: wc.generate(corpus_dict[sender]).to_image()
	    for i, sender in enumerate(all_names)
	}