Skip to content

Instantly share code, notes, and snippets.

@KeyWeeUsr
Created October 28, 2017 19:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KeyWeeUsr/0d622bef80a181de910386c99b87d85a to your computer and use it in GitHub Desktop.
Save KeyWeeUsr/0d622bef80a181de910386c99b87d85a to your computer and use it in GitHub Desktop.
# py3
# pip install beautifulsoup4
import os
import sys
from os.path import join, dirname, abspath
from bs4 import BeautifulSoup
# Line terminator used in the generated text files: '\n' for Linux,
# '\r\n' for Windows.
lineend = '\r\n'
# Output directory, created next to this script.
DIR = join(dirname(abspath(__file__)), 'parsed_conversations')

# Literal opening tags used to cut the export apart and to locate the title.
THREAD_OPEN = '<div class="thread">'
MESSAGE_OPEN = '<div class="message">'
DIV_CLOSE = '</div>'


def split_threads(content):
    """Split the raw export into chunks, one per ``<div class="thread">``.

    The first chunk is whatever precedes the first thread (possibly an empty
    string); every following chunk gets the opening tag re-attached, because
    ``str.split`` consumes the separator.

    Args:
        content: Whole HTML document as text.

    Returns:
        List of HTML fragments; index 0 is the preamble.
    """
    head, *rest = content.split(THREAD_OPEN)
    return [head] + [THREAD_OPEN + chunk for chunk in rest]


def extract_thread_title(thread_html):
    """Return the participant text sitting directly inside a thread div.

    The markup looks like ``<div class="thread">names<div class="message">``,
    so the title is the text between the opening tag and the first message.

    Args:
        thread_html: Serialized thread element.

    Returns:
        The title with surrounding whitespace removed.
    """
    title = thread_html
    # Remove the opening tag as a *prefix*.  ``str.strip(chars)`` treats its
    # argument as a set of characters and would also eat leading letters of
    # the title itself (e.g. "david" -> "").
    if title.startswith(THREAD_OPEN):
        title = title[len(THREAD_OPEN):]
    title, found, _ = title.partition(MESSAGE_OPEN)
    if not found:
        # Thread without messages: drop the closing tag instead of slicing
        # with the -1 that ``str.find`` returns on a miss.
        title = title.strip()
        if title.endswith(DIV_CLOSE):
            title = title[:-len(DIV_CLOSE)]
    # always strip \r\n \n or other raw stuff
    return title.strip()


def normalize_newlines(text, newline=lineend):
    """Rewrite CRLF and bare LF line breaks in *text* to *newline*."""
    # Collapse CRLF first so a '\r\n' newline does not yield '\r\r\n'.
    return text.replace('\r\n', '\n').replace('\n', newline)


def format_message(author, timestamp, text, newline=lineend):
    """Render one message as ``author | time``, a dashed rule and the body.

    Args:
        author: Sender name.
        timestamp: Time string as found in the export.
        text: Message body; its line breaks are normalized to *newline*.
        newline: Line terminator to use between the three parts.

    Returns:
        The formatted message without a trailing line terminator.
    """
    header = author + ' | ' + timestamp
    rule = '-' * len(header)
    return newline.join((header, rule, normalize_newlines(text, newline)))


def render_conversation(title, body, newline=lineend):
    """Return the file content: title, an ``=`` rule of equal length, body."""
    return newline.join((title, '=' * len(title), body))


def _node_text(node):
    """Return the stripped text of a parsed node, or '' when it is missing."""
    return '' if node is None else node.text.strip()


def parse_thread(thread, newline=lineend):
    """Turn one parsed thread element into ``(title, formatted messages)``.

    Args:
        thread: BeautifulSoup tag for a ``<div class="thread">``.
        newline: Line terminator for the formatted output.

    Returns:
        Tuple of the thread title and all messages joined by a blank line.
    """
    title = extract_thread_title(str(thread))
    conversation = thread.find_all('div', attrs={'class': 'message'})
    print('Parsing conversation of length:', len(conversation))
    messages = []
    for message in conversation:
        # find_next() searches forward in the document, not only inside the
        # message div.  NOTE(review): presumably needed because the <p> body
        # follows the message div as a sibling - confirm against the export.
        author = _node_text(message.find_next('span', attrs={'class': 'user'}))
        timestamp = _node_text(
            message.find_next('span', attrs={'class': 'meta'})
        )
        text = _node_text(message.find_next('p'))
        messages.append(format_message(author, timestamp, text, newline))
    return title, (newline * 2).join(messages)


def main(argv=None):
    """Split a Facebook messages export into one text file per conversation.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  The first item is the path of the HTML export.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('Usage: {} <messages file>'.format(sys.argv[0]))
    inp = args[0]

    input('Opening {}? Stop with Ctrl + C'.format(inp))
    # Output files are overwritten anyway, so an existing folder is fine.
    os.makedirs(DIR, exist_ok=True)

    # read everything as bytes and decode
    with open(inp, 'rb') as f:
        cont = f.read().decode('utf-8')

    # The export is too big to parse at once, so parse it thread by thread
    # and create a separate file for each chunk.
    for i, part in enumerate(split_threads(cont)):
        soup = BeautifulSoup(part, 'html.parser')
        path = join(DIR, 'conv_{}.txt'.format(str(i).zfill(4)))
        for thread in soup.find_all('div', attrs={'class': 'thread'}):
            title, body = parse_thread(thread)
            # Binary mode keeps the chosen line terminator untouched.
            with open(path, 'wb') as f:
                f.write(render_conversation(title, body).encode('utf-8'))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment