Skip to content

Instantly share code, notes, and snippets.

@weallwegot
Last active August 25, 2018 18:06
Show Gist options
  • Save weallwegot/ffcb55a6bbdff02b03f05a5d0ddfc84a to your computer and use it in GitHub Desktop.
Save weallwegot/ffcb55a6bbdff02b03f05a5d0ddfc84a to your computer and use it in GitHub Desktop.
Read and parse a text message conversation text file.
import re
from src.convo_objects.TextEquivalent import TextEquivalent
def read_and_parse_text_file(full_ass_path, block_text_threshold_seconds):
    """
    Read a text message conversation file and parse it into TextEquivalent objects.

    Every usable line must start with ``sender:`` and contain a
    ``|YYYY-MM-DD HH:MM:SS`` timestamp; lines missing either part are skipped.
    A text from the same sender as the previously kept text, sent less than
    `block_text_threshold_seconds` away from it, is merged into that previous
    text instead of being added as a new entry.

    :param full_ass_path: path to the file where the text data is
    :type full_ass_path: str
    :param block_text_threshold_seconds: number of seconds between sequential texts for
        them to be considered as "one" text. they will be merged.
        see `merge_sequential_text_equiv()`
    :type block_text_threshold_seconds: int
    :returns: a list of TextEquivalent objects
    :rtype: list
    """
    # compiled once up front; both patterns are loop-invariant.
    # a run of word characters followed by a colon at the start of the line.
    sender_pattern = re.compile(r'^\w+\:')
    # a pipe followed by the timestamp in YYYY-MM-DD HH:MM:SS format.
    timestamp_pattern = re.compile(r'\|\d+\-\d+\-\d+\s?\d+\:\d+\:\d+')

    text_equivs = []
    with open(full_ass_path, 'r') as whole_ass_convo:
        # iterate the handle directly so the whole file is never held in memory.
        for line in whole_ass_convo:
            sender = sender_pattern.search(line)
            timestamp = timestamp_pattern.search(line)
            if not (sender and timestamp):
                # not a recognisable text line; ignore it.
                continue

            # get rid of the identifying colon, last character.
            sender_name = sender.group()[:-1]
            timestamp_string = timestamp.group()

            # cut the message out using the timestamp's actual position rather
            # than counting back from the end of the line, so a final line with
            # no trailing newline does not lose its last character.
            # NOTE(review): the slice starts at the colon, so the message keeps
            # its leading ':' exactly as before - confirm whether
            # TextEquivalent expects that.
            text_msg = line[len(sender_name):timestamp.start()]

            # get rid of the identifier pipe, first character.
            te = TextEquivalent(sender_name, timestamp_string[1:], text_msg)

            # determine if texts should be merged by comparing time difference
            if text_equivs:
                te_prev = text_equivs[-1]
                # assumes TextEquivalent.timestamp is a datetime, so this is a timedelta.
                diff = te.timestamp - te_prev.timestamp
                # total_seconds() covers the whole gap; .seconds drops the days
                # component and would merge texts sent on different days.
                gap_seconds = abs(diff.total_seconds())
                if te_prev.sender == te.sender and gap_seconds < block_text_threshold_seconds:
                    te_prev.merge_sequential_text_equiv(te)
                    continue

            text_equivs.append(te)
    return text_equivs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment