Last active
December 29, 2021 11:40
-
-
Save Abuton/8d978aee5daa650aecd2d2a7acb9ccfb to your computer and use it in GitHub Desktop.
Processes the uploaded chat data by removing unwanted entries and returns a DataFrame.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import pandas as pd
def processChatMsgs(wh_chat: list) -> pd.DataFrame:
    """
    Process the uploaded chat data by merging multi-line messages
    and splitting every message into its fields.

    A line that starts with digits followed by ``/`` opens a new message;
    any other line is treated as a continuation and is appended to the
    previous message, joined with ``". "``. Continuation lines that appear
    before the first message have nothing to attach to and are skipped.

    The field extraction assumes lines shaped like
    ``<date>, <time> - <sender>: <text>`` (TODO confirm against the
    export format actually fed in).

    args:
    -----
    wh_chat: whatsapp chat data, one string per line of the export

    Return:
    -------
    a dataframe with the columns ``Date``, ``Time``, ``SenderName`` and
    ``ChatContent`` (all strings, one row per message). ``SenderName`` is
    ``'Empty'`` when the line has no ``-`` and ``ChatContent`` is
    ``'Missing Text'`` when the line has fewer than two ``:``.

    NOTE(review): the other helpers in this file read ``Sender`` and
    ``Content`` columns, not ``SenderName`` / ``ChatContent`` -- confirm
    whether the caller renames the columns in between.
    """
    # Anchored at the start of the line by ``match``; compiled once for the loop.
    starts_message = re.compile(r"\d+/").match

    msgs = []
    for line in wh_chat:
        if starts_message(line):
            msgs.append(line)
        elif msgs:
            # Continuation of a multi-line message: fold it into the last one.
            msgs[-1] = msgs[-1] + ". " + line
        # else: continuation with no preceding message -> nothing to merge into.

    dates, times, names, contents = [], [], [], []
    for msg in msgs:
        # extract date: everything before the first comma
        dates.append(msg.split(',')[0])

        # extract time: between the FIRST comma and the next '-', so commas
        # inside the message text cannot shift the field.
        after_date = msg.split(',', 1)[-1]
        times.append(after_date.split('-')[0].strip(' '))

        # extract user: split on the first '-' only, so hyphenated sender
        # names are kept whole.
        _, dash, after_dash = msg.partition('-')
        names.append(after_dash.split(':')[0] if dash else 'Empty')

        # extract msgs (content): the first ':' belongs to the time and the
        # second ends the sender name; everything after it is message text,
        # including any further colons (URLs, times, ...).
        pieces = msg.split(':', 2)
        contents.append(pieces[2] if len(pieces) == 3 else 'Missing Text')

    data = {
        "Date": dates,
        "Time": times,
        "SenderName": names,
        "ChatContent": contents,
    }
    return pd.DataFrame(data)
def clean_chat_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop unwanted rows from a chat dataframe and tidy its text columns.

    Steps, in order: remove rows whose ``Content`` is exactly
    ``' <Media omitted>'``, strip whitespace from ``Sender``, remove rows
    whose sender is ``'Empty'``, pass every ``Content`` value through
    ``remove_emoji``, and finally remove rows whose content is
    ``'Missing Text'``.

    args:
    -----
    df: chat dataframe with ``Sender`` and ``Content`` columns

    Return:
    -------
    a new, cleaned dataframe; the input dataframe is not modified

    NOTE(review): ``processChatMsgs`` in this file emits ``SenderName`` /
    ``ChatContent`` columns rather than ``Sender`` / ``Content`` -- confirm
    the columns are renamed before this function is called.
    """
    # drop the media
    # .copy() makes each filtered frame independent, so the column
    # assignments below do not trigger pandas' SettingWithCopyWarning.
    df = df[df['Content'] != ' <Media omitted>'].copy()
    # normalise sender names, then drop rows without a sender
    df['Sender'] = df['Sender'].str.strip()
    df = df[df['Sender'] != 'Empty'].copy()
    # remove all emoji
    df['Content'] = df['Content'].apply(remove_emoji)
    # drop unwanted contents
    return df[df['Content'] != 'Missing Text'].copy()
def addNewColumns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add length, word-count and calendar columns to a chat dataframe.

    The columns are written onto the dataframe that is passed in, and that
    same dataframe is returned. ``Date`` is replaced by its
    ``pd.to_datetime`` conversion.

    args:
    -----
    df: chat dataframe with ``Content`` and ``Date`` columns

    Return:
    -------
    the same dataframe with ``chat_length``, ``word_count``, ``day_name``,
    ``month_name`` and ``year`` added
    """
    messages = df['Content']
    # size of each message: via len() and via the word_count helper
    df['chat_length'] = messages.apply(len)
    df['word_count'] = messages.apply(word_count)

    # convert date to a datetime obj
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates

    # extract day,month,year
    calendar = parsed_dates.dt
    df['day_name'] = calendar.day_name()
    df['month_name'] = calendar.month_name()
    df['year'] = calendar.year
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment