guillermo-carrasco/whatsapp.py

## whatsapp.py
import re
from collections import Counter
from datetime import datetime

import emoji
import pandas as pd

class WhatsappChat:
    """Load a chat export file into a DataFrame of per-message statistics.

    The file is expected to contain lines shaped like
    ``[<date>, <time>] <contact>: <message>``. Lines without such a header
    are treated as continuations of the previous message. Parsing happens
    in the constructor and the result is stored in ``self.chat``.

    Attributes:
        emojis: ``Counter`` of every emoji character seen by ``emoji_count``.
        chat_path: Path of the chat file that was parsed.
        chat: DataFrame indexed by message date (see ``parse_chat``).
    """

    # strptime layout of the bracketed header: month/day/two-digit year.
    DATE_FORMAT = "[%m/%d/%y, %H:%M:%S]"

    # The optional U+200E (left-to-right mark) prefix is spelled in a normal
    # string so the constant holds the character itself; the rest is raw so
    # the regex escapes are not (invalid) string escapes.
    REGEX_DATE = "^(\u200e){0,1}" r"\[[0-9\/]+(, )[0-9:]+\]"
    # Group 3 is the contact: everything between "]" and the first ": ".
    REGEX_CONTACT = REGEX_DATE + r"(.+?)(: )"
    # Group 5 is the message. The contact part is non-greedy so that a
    # message which itself contains ": " is not cut at its last ": ".
    REGEX_MESSAGE = REGEX_DATE + r"(.+?)(: )(.+)"

    # Plain or asterisk-censored variants, e.g. "fuck", "f**k", "f***ing".
    _F_WORD_PATTERN = re.compile(
        r"\s?f(uck|ucking|(\*+(k|ck|ng|ing)))+", re.IGNORECASE
    )

    def __init__(self, chat_path):
        """Parse the chat file at ``chat_path`` and keep the result in ``self.chat``."""
        self.emojis = Counter()
        self._f_words = set()
        self.chat_path = chat_path
        self.chat = self.parse_chat()

    def emoji_count(self, s):
        """Return the number of emoji characters in ``s``.

        Every emoji found is also tallied in ``self.emojis``. Characters are
        tested one at a time with ``emoji.is_emoji``.
        """
        count = 0
        for c in s:
            if emoji.is_emoji(c):
                count += 1
                self.emojis[c] += 1
        return count

    @staticmethod
    def is_gif(s):
        """Return True if ``s`` contains the "GIF omitted" placeholder."""
        return "GIF omitted" in s

    @staticmethod
    def is_image(s):
        """Return True if ``s`` contains the "image omitted" placeholder."""
        return "image omitted" in s

    @staticmethod
    def is_document(s):
        """Return True if ``s`` contains the "document omitted" placeholder."""
        return "document omitted" in s

    @staticmethod
    def is_sticker(s):
        """Return True if ``s`` contains the "sticker omitted" placeholder."""
        return "sticker omitted" in s

    @staticmethod
    def is_audio(s):
        """Return True if ``s`` contains the "audio omitted" placeholder."""
        return "audio omitted" in s

    @staticmethod
    def is_media(s):
        """Return True if ``s`` contains the word "omitted" (any placeholder)."""
        return "omitted" in s

    def contains_f_word(self, s):
        """Return True if ``s`` contains an f-word, plain or censored with ``*``.

        The matched text is remembered in ``self._f_words``.
        """
        # search(), not match(): the word may appear anywhere in the message,
        # not only at its very beginning.
        match = self._F_WORD_PATTERN.search(s)
        if match:
            self._f_words.add(match.group(0))
            return True
        return False

    def parse_chat(self):
        """Read ``self.chat_path`` and return a DataFrame with one row per message.

        The frame is indexed by the parsed ``date`` and has the columns
        ``contact``, ``message``, ``message_length``, ``exclamation_marks``,
        ``emoji_count``, ``is_gif``, ``is_image``, ``is_document``,
        ``is_sticker``, ``is_audio``, ``drop_emojis``, ``is_media`` and
        ``contains_f_word``.

        Raises:
            ValueError: If a header timestamp does not match ``DATE_FORMAT``.
        """
        date_re = re.compile(self.REGEX_DATE)
        contact_re = re.compile(self.REGEX_CONTACT)
        message_re = re.compile(self.REGEX_MESSAGE)

        dates, contacts, messages = [], [], []

        # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise stop the first header from matching at "^".
        with open(self.chat_path, "r", encoding="utf-8-sig") as f:
            for line in f:
                date_match = date_re.search(line)
                contact_match = contact_re.search(line) if date_match else None
                message_match = message_re.search(line) if date_match else None

                if contact_match and message_match:
                    # Parse the date first so the three lists stay aligned
                    # if strptime raises.
                    stamp = date_match.group(0).replace("\u200e", "")
                    date = datetime.strptime(stamp, self.DATE_FORMAT)
                    dates.append(date)
                    contacts.append(contact_match.group(3))
                    messages.append(message_match.group(5))
                elif messages:
                    # Continuation of a multi-line message. The header regex
                    # never captures the line break, so re-insert it here;
                    # otherwise the first two lines would be fused together.
                    messages[-1] += "\n" + line.rstrip("\n")
                # Lines that precede the first message have nothing to attach
                # to and are skipped.

        df = pd.DataFrame.from_dict(
            {"date": dates, "contact": contacts, "message": messages}
        )
        df = df.set_index("date")

        # Compute some message stats
        df["message_length"] = df.message.apply(len)
        df["exclamation_marks"] = df.message.apply(lambda s: s.count("!"))
        df["emoji_count"] = df.message.apply(self.emoji_count)
        df["is_gif"] = df.message.apply(self.is_gif)
        df["is_image"] = df.message.apply(self.is_image)
        df["is_document"] = df.message.apply(self.is_document)
        df["is_sticker"] = df.message.apply(self.is_sticker)
        df["is_audio"] = df.message.apply(self.is_audio)
        # Assigned on its own: chaining this with "exclamation_marks" would
        # overwrite the "!" counts computed above.
        df["drop_emojis"] = df.message.apply(lambda s: s.count("😅"))
        df["is_media"] = df.message.apply(self.is_media)
        df["contains_f_word"] = df.message.apply(self.contains_f_word)

        return df
	import re
	from collections import Counter
	from datetime import datetime

	import emoji
	import pandas as pd

	class WhatsappChat:
	    """Load a chat export file into a DataFrame of per-message statistics.

	    The file is expected to contain lines shaped like
	    ``[<date>, <time>] <contact>: <message>``. Lines without such a header
	    are treated as continuations of the previous message. Parsing happens
	    in the constructor and the result is stored in ``self.chat``.

	    Attributes:
	        emojis: ``Counter`` of every emoji character seen by ``emoji_count``.
	        chat_path: Path of the chat file that was parsed.
	        chat: DataFrame indexed by message date (see ``parse_chat``).
	    """

	    # strptime layout of the bracketed header: month/day/two-digit year.
	    DATE_FORMAT = "[%m/%d/%y, %H:%M:%S]"

	    # The optional U+200E (left-to-right mark) prefix is spelled in a normal
	    # string so the constant holds the character itself; the rest is raw so
	    # the regex escapes are not (invalid) string escapes.
	    REGEX_DATE = "^(\u200e){0,1}" r"\[[0-9\/]+(, )[0-9:]+\]"
	    # Group 3 is the contact: everything between "]" and the first ": ".
	    REGEX_CONTACT = REGEX_DATE + r"(.+?)(: )"
	    # Group 5 is the message. The contact part is non-greedy so that a
	    # message which itself contains ": " is not cut at its last ": ".
	    REGEX_MESSAGE = REGEX_DATE + r"(.+?)(: )(.+)"

	    # Plain or asterisk-censored variants, e.g. "fuck", "f**k", "f***ing".
	    # The alternation uses bare "|"; an escaped pipe would match a literal
	    # "|" character instead of separating the alternatives.
	    _F_WORD_PATTERN = re.compile(
	        r"\s?f(uck|ucking|(\*+(k|ck|ng|ing)))+", re.IGNORECASE
	    )

	    def __init__(self, chat_path):
	        """Parse the chat file at ``chat_path`` and keep the result in ``self.chat``."""
	        self.emojis = Counter()
	        self._f_words = set()
	        self.chat_path = chat_path
	        self.chat = self.parse_chat()

	    def emoji_count(self, s):
	        """Return the number of emoji characters in ``s``.

	        Every emoji found is also tallied in ``self.emojis``. Characters are
	        tested one at a time with ``emoji.is_emoji``.
	        """
	        count = 0
	        for c in s:
	            if emoji.is_emoji(c):
	                count += 1
	                self.emojis[c] += 1
	        return count

	    @staticmethod
	    def is_gif(s):
	        """Return True if ``s`` contains the "GIF omitted" placeholder."""
	        return "GIF omitted" in s

	    @staticmethod
	    def is_image(s):
	        """Return True if ``s`` contains the "image omitted" placeholder."""
	        return "image omitted" in s

	    @staticmethod
	    def is_document(s):
	        """Return True if ``s`` contains the "document omitted" placeholder."""
	        return "document omitted" in s

	    @staticmethod
	    def is_sticker(s):
	        """Return True if ``s`` contains the "sticker omitted" placeholder."""
	        return "sticker omitted" in s

	    @staticmethod
	    def is_audio(s):
	        """Return True if ``s`` contains the "audio omitted" placeholder."""
	        return "audio omitted" in s

	    @staticmethod
	    def is_media(s):
	        """Return True if ``s`` contains the word "omitted" (any placeholder)."""
	        return "omitted" in s

	    def contains_f_word(self, s):
	        """Return True if ``s`` contains an f-word, plain or censored with ``*``.

	        The matched text is remembered in ``self._f_words``.
	        """
	        # search(), not match(): the word may appear anywhere in the message,
	        # not only at its very beginning.
	        match = self._F_WORD_PATTERN.search(s)
	        if match:
	            self._f_words.add(match.group(0))
	            return True
	        return False

	    def parse_chat(self):
	        """Read ``self.chat_path`` and return a DataFrame with one row per message.

	        The frame is indexed by the parsed ``date`` and has the columns
	        ``contact``, ``message``, ``message_length``, ``exclamation_marks``,
	        ``emoji_count``, ``is_gif``, ``is_image``, ``is_document``,
	        ``is_sticker``, ``is_audio``, ``drop_emojis``, ``is_media`` and
	        ``contains_f_word``.

	        Raises:
	            ValueError: If a header timestamp does not match ``DATE_FORMAT``.
	        """
	        date_re = re.compile(self.REGEX_DATE)
	        contact_re = re.compile(self.REGEX_CONTACT)
	        message_re = re.compile(self.REGEX_MESSAGE)

	        dates, contacts, messages = [], [], []

	        # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
	        # would otherwise stop the first header from matching at "^".
	        with open(self.chat_path, "r", encoding="utf-8-sig") as f:
	            for line in f:
	                date_match = date_re.search(line)
	                contact_match = contact_re.search(line) if date_match else None
	                message_match = message_re.search(line) if date_match else None

	                if contact_match and message_match:
	                    # Parse the date first so the three lists stay aligned
	                    # if strptime raises.
	                    stamp = date_match.group(0).replace("\u200e", "")
	                    date = datetime.strptime(stamp, self.DATE_FORMAT)
	                    dates.append(date)
	                    contacts.append(contact_match.group(3))
	                    messages.append(message_match.group(5))
	                elif messages:
	                    # Continuation of a multi-line message. The header regex
	                    # never captures the line break, so re-insert it here;
	                    # otherwise the first two lines would be fused together.
	                    messages[-1] += "\n" + line.rstrip("\n")
	                # Lines that precede the first message have nothing to attach
	                # to and are skipped.

	        df = pd.DataFrame.from_dict(
	            {"date": dates, "contact": contacts, "message": messages}
	        )
	        df = df.set_index("date")

	        # Compute some message stats
	        df["message_length"] = df.message.apply(len)
	        df["exclamation_marks"] = df.message.apply(lambda s: s.count("!"))
	        df["emoji_count"] = df.message.apply(self.emoji_count)
	        df["is_gif"] = df.message.apply(self.is_gif)
	        df["is_image"] = df.message.apply(self.is_image)
	        df["is_document"] = df.message.apply(self.is_document)
	        df["is_sticker"] = df.message.apply(self.is_sticker)
	        df["is_audio"] = df.message.apply(self.is_audio)
	        # Assigned on its own: chaining this with "exclamation_marks" would
	        # overwrite the "!" counts computed above.
	        df["drop_emojis"] = df.message.apply(lambda s: s.count("😅"))
	        df["is_media"] = df.message.apply(self.is_media)
	        df["contains_f_word"] = df.message.apply(self.contains_f_word)

	        return df