Skip to content

Instantly share code, notes, and snippets.

@Xoma163
Created October 5, 2021 19:12
Show Gist options
  • Save Xoma163/1f93760c1877769b63043153cdb9a432 to your computer and use it in GitHub Desktop.
Save Xoma163/1f93760c1877769b63043153cdb9a432 to your computer and use it in GitHub Desktop.
Simple converter of VK message dumps to JSON format (200k rows in about 10 seconds)
import json
import os
import time
from datetime import datetime
from joblib import Parallel, cpu_count, delayed
from bs4 import BeautifulSoup
class VkMessagesDumpConverter:
    """Convert an exported VK chat (a folder of ``messagesN.html`` pages) to JSON.

    Every message becomes a dict with the keys ``author``, ``datetime``,
    ``text``, ``attachments``, ``edited`` and ``fwd``. ``to_json`` writes the
    list of those dicts to ``<chat folder name>.json`` in the current working
    directory.
    """

    # Russian month abbreviations found in message headers, mapped to the
    # English abbreviations that ``strptime``'s ``%b`` accepts (this assumes
    # the process keeps the default C/English LC_TIME locale).
    MONTH_TRANSLATOR = {
        'янв': 'Jan',
        'фев': 'Feb',
        'мар': 'Mar',
        'апр': 'Apr',
        'мая': 'May',
        'июн': 'Jun',
        'июл': 'Jul',
        'авг': 'Aug',
        'сен': 'Sep',
        'окт': 'Oct',
        'ноя': 'Nov',
        'дек': 'Dec',
    }
    # Author label the dump uses for the account the dump was made from.
    DEFAULT_AUTHOR_NAME = "Вы"
    # Suffix present in the header of an edited message.
    _EDITED_MARKER = ' (ред.)'
    # Attachment descriptions containing this substring set the ``fwd`` flag
    # instead of producing an attachment entry.
    _FORWARD_MARKER = "прикрепл"
    # Known attachment types that are stored without a link.
    _LINKLESS_ATTACHMENT_TYPES = frozenset([
        "Запись на стене", "Стикер", "Аудиозапись",
        "Запрос на денежный перевод", "История", "Комментарий на стене",
    ])
    # Known attachment types whose ``attachment__link`` href is stored.
    _LINKED_ATTACHMENT_TYPES = frozenset([
        "Фотография", "Видеозапись", "Файл", "Ссылка",
    ])

    def __init__(self, path, author_name=None):
        """
        :param path: folder that holds the dump's ``messagesN.html`` pages
        :param author_name: name of the user the dump was made from; it
            replaces ``DEFAULT_AUTHOR_NAME`` in the output. When empty, the
            default label is kept.
        """
        if not author_name:
            author_name = self.DEFAULT_AUTHOR_NAME
        self.input_path = path
        # Skip entries that carry no page number (stray OS/editor files);
        # otherwise a single unrelated file makes the numeric sort raise.
        page_files = [
            name for name in os.listdir(path)
            if os.path.isfile(os.path.join(path, name)) and self._is_dump_page(name)
        ]
        # The attribute keeps its historical "pdf" name; the pages are HTML.
        self.all_pdf_files = sorted(page_files, key=self._get_file_part_number)
        # normpath + basename tolerate a trailing separator and OS-specific
        # separators, unlike splitting on '/'.
        chat_name = os.path.basename(os.path.normpath(path))
        self.output_path = f"{chat_name}.json"  # current folder. {chat_name}.json
        self.author_name = author_name

    @staticmethod
    def _get_file_part_number(filename):
        """
        Return the page number embedded in a file name; used as the sort key.

        :param filename: file name such as ``messages150.html``
        :return: page number as an int
        :raises ValueError: if nothing numeric is left after removing
            ``messages`` and ``.html``
        """
        return int(filename.replace("messages", '').replace('.html', ''))

    @classmethod
    def _is_dump_page(cls, filename):
        """Return True when a page number can be extracted from ``filename``."""
        try:
            cls._get_file_part_number(filename)
        except ValueError:
            return False
        return True

    def to_json(self):
        """Parse the whole dump and write it to ``self.output_path`` as UTF-8 JSON."""
        parsed = self._parse()
        json_dumps = json.dumps(parsed, ensure_ascii=False, indent=2)
        with open(self.output_path, 'w', encoding='utf-8') as file:
            file.write(json_dumps)

    def _parse(self):
        """
        Parse every page through ``parallel`` and return one flat message list.

        Pages are processed in ascending page-number order, the per-page
        lists are concatenated and the result is reversed. The elapsed time
        in seconds is printed.
        """
        start_time = time.time()
        results = parallel(self._parse_file, self.all_pdf_files)
        flat_results = [item for sublist in results for item in sublist]
        flat_results.reverse()
        print(time.time() - start_time)
        return flat_results

    def _parse_file(self, file):
        """
        Parse one page and return the messages found in it, in page order.

        :param file: page file name, relative to ``self.input_path``
        """
        # Read raw bytes so BeautifulSoup picks the encoding from the page
        # itself rather than from the platform's default text encoding.
        with open(os.path.join(self.input_path, file), 'rb') as page:
            soup = BeautifulSoup(page.read(), 'html.parser')
        return [self._parse_item(item) for item in soup.select('.item')]

    def _parse_item(self, item):
        """
        Convert one ``.item`` element into a message dict.

        :param item: BeautifulSoup tag containing a ``div.message__header``
            followed by a sibling ``div`` with the text and attachments
        :return: dict with ``author``, ``datetime``, ``text``,
            ``attachments``, ``edited`` and ``fwd``
        """
        header_div = item.find('div', {
            'class': 'message__header'
        })
        author, datetime_str, flag_edited = self._parse_header(header_div.text)

        body_div = header_div.find_next_sibling('div')
        if body_div is None:
            # No body element at all: keep the message, just without content.
            text, attachments, flag_fwd = '', [], False
        else:
            text = self._extract_text(body_div)
            attachments, flag_fwd = self._parse_attachments(body_div)

        return {
            'author': author,
            'datetime': datetime_str,
            'text': text,
            'attachments': attachments,
            'edited': flag_edited,
            'fwd': flag_fwd
        }

    def _parse_header(self, header_text):
        """
        Split a header such as ``"Name, 5 окт 2021 в 19:12:34 (ред.)"``.

        :return: ``(author, datetime_str, edited)`` where ``datetime_str`` is
            formatted as ``DD.MM.YYYY HH:MM:SS``
        :raises ValueError: if the header does not have the expected shape
        :raises KeyError: if the month abbreviation is unknown
        """
        # Split on the LAST ", " so an author name containing a comma survives.
        author, dt = header_text.strip().rsplit(', ', 1)
        if author == self.DEFAULT_AUTHOR_NAME:
            author = self.author_name

        flag_edited = self._EDITED_MARKER in dt
        if flag_edited:
            dt = dt.replace(self._EDITED_MARKER, '')

        # Expected tokens: day, month, year, a connecting word, time.
        dt_day, dt_month_rus, dt_year, _, dt_time = dt.split()
        dt_month_eng = self.translate_month(dt_month_rus)
        # Explicit %H:%M:%S instead of the locale-dependent %X.
        datetime_obj = datetime.strptime(
            f"{dt_day} {dt_month_eng} {dt_year} {dt_time}", '%d %b %Y %H:%M:%S'
        )
        return author, datetime_obj.strftime("%d.%m.%Y %H:%M:%S"), flag_edited

    @staticmethod
    def _extract_text(body_div):
        """
        Return the message text that precedes the first nested ``div``.

        Text nodes are concatenated, ``<br>`` becomes a newline and other
        inline tags contribute their text, so multi-line messages are kept
        whole. The result is stripped; it is ``''`` when there is no text.
        """
        parts = []
        for node in body_div.contents:
            if isinstance(node, str):  # NavigableString is a str subclass
                parts.append(str(node))
                continue
            tag_name = getattr(node, 'name', None)
            if tag_name == 'div':
                # The first nested div starts the attachments area.
                break
            parts.append('\n' if tag_name == 'br' else node.get_text())
        return ''.join(parts).strip()

    def _parse_attachments(self, body_div):
        """
        Collect attachment entries found anywhere inside ``body_div``.

        :return: ``(attachments, fwd)``. Each attachment is a dict with
            ``type`` and, for linked types that have an href, ``link``.
            ``fwd`` is True when any description contains the forward marker.
        """
        attachments = []
        flag_fwd = False
        for attachment in body_div.find_all('div', {'class': 'attachment'}):
            description_div = attachment.find('div', {
                'class': "attachment__description"
            })
            if description_div is None:
                continue
            att_type = description_div.text
            if self._FORWARD_MARKER in att_type:
                flag_fwd = True
                continue

            att_dict = {'type': att_type}
            if att_type in self._LINKED_ATTACHMENT_TYPES:
                link_tag = attachment.find('a', {
                    'class': 'attachment__link'
                })
                # The link may be missing; keep the entry and omit 'link'.
                att_link = link_tag.get('href') if link_tag is not None else None
                if att_link:
                    att_dict['link'] = att_link
                    if att_link.endswith('.ogg'):
                        att_dict['type'] = "Голосовое сообщение"
            elif att_type not in self._LINKLESS_ATTACHMENT_TYPES:
                # Unknown type: report it, but still keep the entry.
                print(att_type)
            attachments.append(att_dict)
        return attachments, flag_fwd

    def translate_month(self, rus_month):
        """
        Map a Russian month abbreviation to its English ``%b`` counterpart.

        :raises KeyError: if ``rus_month`` is not in ``MONTH_TRANSLATOR``
        """
        return self.MONTH_TRANSLATOR[rus_month]
def parallel(method, data, threads_count=None):
    """
    Run ``method`` once for every item of ``data`` through joblib and
    return the collected results.

    :param method: callable invoked with a single item
    :param data: iterable of items, one call per item
    :param threads_count: worker count passed to joblib as ``n_jobs``;
        defaults to ``cpu_count()`` when None
    :return: whatever ``Parallel(...)`` returns for the submitted tasks
    """
    n_jobs = cpu_count() if threads_count is None else threads_count
    runner = Parallel(n_jobs=n_jobs)
    tasks = (delayed(method)(item) for item in data)
    return runner(tasks)
if __name__ == "__main__":
    # Replace the placeholders with the folder of the exported chat and with
    # the name that should be written in place of the default author label.
    converter = VkMessagesDumpConverter("chats/%chatname%", "first_name second_name")
    converter.to_json()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment