Last active
June 26, 2023 07:52
-
-
Save iwalfy/094b237fbfdc901b8c8c779181707890 to your computer and use it in GitHub Desktop.
Convert Telegram chat export to BAZMAN ready file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# Convert Telegram chat export to BAZMAN-ready file
# Version: 1.1
#
# (c) Catware-Foundation, 2022, 2023
# Mikhail Lebedinets, 2022, 2023
#
# 26.06.2023 - Added support for HTML exports
#
import json | |
import sys | |
import os | |
def error_handler(message):
    """Report a fatal error and terminate the script.

    Writes ``Error: <message>`` to standard error and exits with status 1,
    so shells and calling scripts can tell that the conversion failed.

    Args:
        message: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    print("Error: {}".format(message), file=sys.stderr)
    # sys.exit() instead of the bare exit() helper: exit() is injected by the
    # ``site`` module and is not guaranteed to exist (e.g. under ``python -S``).
    sys.exit(1)
def tg2bm(export_json_obj):
    """Convert a parsed Telegram JSON chat export into BAZMAN text.

    Every entry of ``export_json_obj["messages"]`` whose ``"type"`` is
    ``"message"``, that has no ``"forwarded_from"`` key and whose text is not
    blank, contributes one newline-terminated chunk to the result. Progress
    is printed to standard output while the messages are processed.

    Args:
        export_json_obj: Dict obtained by parsing the export JSON. The keys
            ``"id"``, ``"name"`` and ``"messages"`` are read from it.

    Returns:
        str: The kept message texts, each followed by ``"\\n"``. Empty when
        no message qualifies.

    Raises:
        KeyError: If ``"id"``, ``"name"`` or ``"messages"`` is missing.
    """
    chat_id = export_json_obj["id"]
    name = export_json_obj["name"]
    messages = export_json_obj["messages"]
    total = len(messages)

    print("Chat: {} (id: {})".format(name, chat_id))

    lines = []
    # Count from 1 so the progress line runs "1 of N" .. "N of N".
    for index, msg in enumerate(messages, start=1):
        print("Working on {} of {}...".format(index, total), end="\r")

        if msg.get("type") != "message" or "forwarded_from" in msg:
            continue

        # A missing "text" key is treated like an empty message, not an error.
        msg_text = msg.get("text", "")
        if isinstance(msg_text, list):
            # Formatted messages arrive as a list mixing plain strings and
            # dicts that carry their content under "text".
            text = "".join(
                part if isinstance(part, str) else part.get("text", "")
                for part in msg_text
            )
        elif isinstance(msg_text, str):
            text = msg_text
        else:
            # Neither a string nor a list: nothing usable to emit.
            continue

        if text.strip():
            lines.append("{}\n".format(text))

    print("\nDone!")
    return "".join(lines)
def tg2bm_html(html):
    """Extract message texts from one page of a Telegram HTML export.

    Looks at every ``<div>`` whose class list contains ``body``, skips the
    ones that are (or contain) a ``forwarded`` div, and collects the stripped
    text of the first nested ``<div class="text">``. Blank texts are dropped,
    matching what ``tg2bm`` does for JSON exports.

    Args:
        html: Markup of a single exported HTML page, as a string.

    Returns:
        str: The kept message texts, each followed by ``"\\n"``.

    Raises:
        SystemExit: Via ``error_handler`` when Beautiful Soup 4 (``bs4``) is
            not installed.
    """
    # Only bs4 is tried: the legacy ``BeautifulSoup`` (v3) module is
    # Python 2 only and has neither ``find_all`` nor the "html.parser"
    # argument used below, so it could never have worked here.
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        error_handler("No BeautifulSoup installed! Use `pip install bs4` to install it.")

    parser = BeautifulSoup(html, "html.parser")

    lines = []
    for post in parser.find_all("div", attrs={"class": "body"}):
        # NOTE(review): Telegram appears to wrap forwarded content in a nested
        # <div class="forwarded body">, which this query matches as well;
        # skip it directly so forwarded text does not leak in - confirm
        # against a real export.
        if "forwarded" in (post.get("class") or []):
            continue
        if post.find("div", attrs={"class": "forwarded"}):
            continue

        text_elem = post.find("div", attrs={"class": "text"})
        if text_elem is None:
            continue

        text = text_elem.get_text().strip()
        if text:
            lines.append(text + "\n")

    return "".join(lines)
def main():
    """Command-line entry point.

    Usage::

        ./tg2bm.py <export file> <output file>
        ./tg2bm.py --html <export directory> <output file>

    Without ``--html`` the export file is parsed as JSON and converted with
    ``tg2bm``. With ``--html`` every ``*.html`` file directly inside the
    export directory is converted with ``tg2bm_html`` and the results are
    concatenated. The converted text is written to the output file.

    Raises:
        SystemExit: With status 1 on wrong usage, and via ``error_handler``
            on invalid paths or on read, parse or write failures.
    """
    if len(sys.argv) < 3:
        print("Usage: ./tg2bm.py [--html] <export file> <output file>")
        sys.exit(1)

    # Always bind the flag; it is read below in both modes.
    ishtml = sys.argv[1] == "--html"
    if ishtml:
        if len(sys.argv) < 4:
            print("Usage: ./tg2bm.py --html <export directory> <output file>")
            sys.exit(1)
        export_file, output_file = sys.argv[2], sys.argv[3]
    else:
        export_file, output_file = sys.argv[1], sys.argv[2]

    # Validate the paths before doing any work.
    if not os.path.exists(export_file):
        error_handler("{} not exist!".format(export_file))
    if ishtml:
        if not os.path.isdir(export_file):
            error_handler("{} is not a directory!".format(export_file))
    elif os.path.isdir(export_file):
        error_handler("{} is a directory!".format(export_file))
    if os.path.isdir(output_file):
        error_handler("{} is a directory!".format(output_file))

    if ishtml:
        print("Loading export files...")
        parts = []
        try:
            # os.listdir() order is arbitrary; sort for reproducible output.
            for file in sorted(os.listdir(export_file)):
                if not file.endswith(".html"):
                    continue
                print(f"Reading file {file}...")
                path = os.path.join(export_file, file)
                with open(path, "r", encoding="utf-8") as f:
                    parts.append(tg2bm_html(f.read()))
        except (OSError, UnicodeError):
            error_handler("Failed to read export file!")
        result = "".join(parts)
    else:
        # Explicit UTF-8 so the platform's locale encoding cannot break
        # non-ASCII chats.
        try:
            with open(export_file, "r", encoding="utf-8") as f:
                export_json = f.read()
        except (OSError, UnicodeError):
            error_handler("Failed to read export file!")

        # json.JSONDecodeError is a ValueError subclass.
        try:
            export_json_obj = json.loads(export_json)
        except ValueError:
            error_handler("Failed to parse export file!")

        result = tg2bm(export_json_obj)

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(result)
    except (OSError, UnicodeError):
        error_handler("Failed to save result!")
# Run the converter only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment