Skip to content

Instantly share code, notes, and snippets.

@Abuton
Last active December 29, 2021 11:40
Show Gist options
  • Save Abuton/8d978aee5daa650aecd2d2a7acb9ccfb to your computer and use it in GitHub Desktop.
Save Abuton/8d978aee5daa650aecd2d2a7acb9ccfb to your computer and use it in GitHub Desktop.
Processes the uploaded chat data by removing unwanted entries and returns a DataFrame.
import re

import pandas as pd
def processChatMsgs(wh_chat: list) -> pd.DataFrame:
    """
    Parse raw WhatsApp chat lines into a dataframe with one row per message.

    A line that starts with digits followed by "/" (the date of a
    "date, time - sender: text" header) opens a new message. Any other
    line is treated as a continuation and is appended to the previous
    message, joined with ". ". Continuation lines that appear before the
    first header have nothing to attach to and are skipped.

    args:
    -----
    wh_chat: whatsapp chat data, as a list of text lines in file order

    Return:
    -------
    a dataframe with the string columns "Date", "Time", "SenderName" and
    "ChatContent". Messages without a "- sender:" part (for example notices
    that carry no sender) get the placeholders 'Empty' and 'Missing Text'.
    Sender and content keep their surrounding whitespace.
    """
    # NOTE(review): clean_chat_data / addNewColumns in this file read the
    # columns 'Sender' and 'Content' - presumably a rename happens between
    # the two steps; confirm against the caller.

    # Raw string avoids the invalid "\A" / "\d" escapes; compiled once
    # because it is applied to every line.
    starts_with_date = re.compile(r"\A\d+/").match

    msgs = []
    for line in wh_chat:
        if starts_with_date(line):
            msgs.append(line)
        elif msgs:
            # multi-line message: fold the extra line into the last entry
            msgs[-1] += ". " + line
        # else: stray text before the first header - skip it

    dates, times, names, contents = [], [], [], []
    for msg in msgs:
        # Split on the FIRST comma only, so commas inside the message text
        # cannot leak into the date/time fields.
        dates.append(msg.split(',', 1)[0])
        after_date = msg.split(',', 1)[-1]

        # The first '-' separates the time from "sender: text"; later
        # hyphens belong to the sender name or the message.
        time_part, dash, body = after_date.partition('-')
        times.append(time_part.strip(' '))

        # The first ':' after the dash ends the sender name; everything
        # behind it is content, including any further colons.
        sender, colon, text = body.partition(':')
        if dash and colon:
            names.append(sender)
            contents.append(text)
        else:
            names.append('Empty')
            contents.append('Missing Text')

    data = {
        "Date": dates,
        "Time": times,
        "SenderName": names,
        "ChatContent": contents,
    }
    return pd.DataFrame(data)
def clean_chat_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop placeholder rows from a chat dataframe and normalise its text.

    Steps, in order: remove rows whose 'Content' is exactly
    ' <Media omitted>', strip whitespace from 'Sender', remove rows whose
    sender is 'Empty', run remove_emoji (defined outside this block) over
    'Content', and finally remove rows whose content is 'Missing Text'.

    args:
    -----
    df: chat dataframe with the string columns 'Sender' and 'Content'

    Return:
    -------
    a new, independent dataframe; the frame passed in is not modified
    """
    # NOTE(review): processChatMsgs in this file names these columns
    # 'SenderName' / 'ChatContent' - presumably they are renamed before
    # this function is called; confirm against the caller.

    # drop the media
    # .copy() makes the filtered frame independent, so the column
    # assignments below do not trigger pandas' SettingWithCopyWarning.
    # NOTE(review): this is an exact match (leading space, no trailing
    # newline) - confirm the content really arrives in that form.
    df = df[df['Content'] != ' <Media omitted>'].copy()

    # normalise sender names, then drop rows that have no sender
    df['Sender'] = df['Sender'].str.strip()
    df = df[df['Sender'] != 'Empty'].copy()

    # remove all emoji
    df['Content'] = df['Content'].apply(remove_emoji)

    # drop unwanted contents; copy again so callers can add columns to the
    # result without a chained-assignment warning
    return df[df['Content'] != 'Missing Text'].copy()
def addNewColumns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add length, word-count and calendar columns to a chat dataframe.

    The frame is modified in place and also returned. 'Date' is replaced
    by its datetime-parsed version; the new columns are 'chat_length',
    'word_count', 'day_name', 'month_name' and 'year'.

    args:
    -----
    df: chat dataframe with the columns 'Content' and 'Date'

    Return:
    -------
    the same dataframe object, with the extra columns
    """
    messages = df['Content']

    # number of characters per message
    df['chat_length'] = messages.apply(len)
    # number of words per message, via word_count (defined outside this block)
    df['word_count'] = messages.apply(word_count)

    # parse the dates so the .dt accessor becomes available
    df['Date'] = pd.to_datetime(df['Date'])
    date_parts = df['Date'].dt

    # calendar breakdown of each message date
    df['day_name'] = date_parts.day_name()
    df['month_name'] = date_parts.month_name()
    df['year'] = date_parts.year

    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment