Skip to content

Instantly share code, notes, and snippets.

@Mr-Saxobeat
Forked from kurasaiteja/clean.py
Created October 27, 2020 02:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Mr-Saxobeat/24ae11228e36f55a3eb18d3320d4532a to your computer and use it in GitHub Desktop.
Save Mr-Saxobeat/24ae11228e36f55a3eb18d3320d4532a to your computer and use it in GitHub Desktop.
def startsWithDateAndTime(s):
    """Return True if *s* begins with a WhatsApp-export timestamp prefix.

    The recognised prefix looks like ``10/27/20, 2:47 AM -``: a numeric
    date with a two-digit year, a comma and space, a 12-hour clock time
    with an ``AM``/``PM`` marker, then `` -``.

    Per the original author's note this matches the Android export layout
    only; the iOS export uses a different format and is not recognised.
    """
    # Raw string: '/' needs no escaping in a regex, and the former '\/' was
    # an invalid *string* escape that recent Python versions warn about.
    pattern = r'^([0-9]+)/([0-9]+)/([0-9][0-9]), ([0-9]+):([0-9][0-9]) (AM|PM) -'
    return re.match(pattern, s) is not None
def FindAuthor(s):
    """Return True if *s* starts with a recognised ``author:`` prefix.

    Despite its name this is a predicate: it does not return the author,
    only whether one of the known sender layouts (one to three words, two
    phone-number layouts, or a word followed by emoji) appears at the very
    start of *s* and is followed by a colon.
    """
    # Raw strings keep \w, \s and \d as regex escapes instead of invalid
    # Python string escapes. The re module itself understands \uXXXX and
    # \UXXXXXXXX in a str pattern, so the emoji range is unchanged.
    patterns = [
        r'([\w]+):',                          # First Name
        r'([\w]+[\s]+[\w]+):',                # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',      # First Name + Middle Name + Last Name
        r'([+]\d{2} \d{5} \d{5}):',           # Mobile Number (India)
        r'([+]\d{2} \d{3} \d{3} \d{4}):',     # Mobile Number (US)
        r'([\w]+)[\u263a-\U0001f999]+:',      # Name and Emoji
    ]
    # Group the alternation so the '^' anchor visibly covers every
    # alternative rather than only the first one.
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, s) is not None
def getDataPoint(line):
    """Split one timestamped chat line into ``(date, time, author, message)``.

    *line* is expected to be a line for which ``startsWithDateAndTime``
    returned True, i.e. ``<date>, <time> - <rest>``.  ``author`` is ``None``
    when ``FindAuthor`` does not recognise a sender prefix in ``<rest>``;
    the whole remainder is then returned as the message.

    Raises:
        ValueError: if the timestamp part does not contain exactly one
            ``', '`` separator.
    """
    # Cut at the FIRST ' - ' only, so a message that itself contains ' - '
    # is kept intact instead of having the separator replaced by a space.
    dateTime, _, message = line.partition(' - ')
    if dateTime.endswith(' -'):
        # A line whose body is empty reaches us stripped (the reading loop
        # strips each line), leaving a bare ' -' with no trailing space.
        dateTime = dateTime[:-2]
    date, time = dateTime.split(', ')

    author = None
    if FindAuthor(message):
        # None of the sender layouts accepted by FindAuthor can contain a
        # colon, so the first ':' is the end of the author. Splitting
        # there once preserves any later ': ' inside the message text.
        author, _, message = message.partition(':')
        if message.startswith(' '):
            message = message[1:]
    return date, time, author, message
# Rows of [date, time, author, message], later loaded into a pandas DataFrame.
parsedData = []
# Upload your file here
conversationPath = '/content/WhatsApp Chat with Blabla (1).txt'  # chat file
with open(conversationPath, encoding="utf-8") as fp:
    # Skip the first line of the export; per the original author it carries
    # the end-to-end encryption notice rather than a chat message.
    fp.readline()
    messageBuffer = []  # Pieces of the message currently being assembled
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if startsWithDateAndTime(line):
            # A new timestamp starts a new message: emit the one collected
            # so far, then begin buffering the new one.
            if messageBuffer:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            # No timestamp: continuation of a multi-line message.
            messageBuffer.append(line)
    # Flush the final message. Nothing follows it to trigger the emit in
    # the loop, so without this the last message of the chat is lost.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])  # Initialising a pandas Dataframe.
# NOTE(review): the date format is left for pandas to infer; whether the
# export is month-first or day-first presumably depends on the phone's
# locale - confirm against the actual data.
df["Date"] = pd.to_datetime(df["Date"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment