muety/whatsapp_to_telegram.py

## whatsapp_to_telegram.py
#!/bin/python

import os
import re
import shutil

# A script to help you migrate your whole WhatsApp chat history with a person to Telegram

# Instructions
# 1. Install "Backup WhatsApp Chats" extension to Chrome
# 2. Buy a license
# 3. Open WhatsApp web and select the chat to export
# 4. Open the plugin, choose export type 'HTML', choose to download unresolved media
# 5. Download and extract the zip file to data/exported
# 6. Open the plugin again and choose export type 'Text'
# 7. Download the text file to data/exported as well
# 8. Edit CHAT_NAME and SELF_NAME constants below
# 9. Create output directory at data/converted
# 10. Run 'python whatsapp_to_telegram.py'
# 11. Copy the contents of data/converted to your phone
# 12. Open your favorite file browser, select all files and share them to Telegram
# -> Telegram should now ask you where to import the messages
# 13. Select the target chat and hit import

# Issues
# - For some reason, the current day is skipped during export
# - For some reason, copying the output files to the phone via USB will cause media files to not be imported. Instead, I uploaded them to Nextcloud, then synced them on the phone using the Nextcloud app and then shared them to Telegram using a file manager. The trick is probably to somehow get this (https://github.com/DrKLO/Telegram/blob/368822d20f879f5ca851e4cbf13506eda4e48bfc/TMessagesProj/src/main/java/org/telegram/ui/LaunchActivity.java#L1391) method to return null, so that the shared media documents are added to documentsUrisArray instead of documentsPathsArray.

# Relevant code in Telegram for Android
# - https://github.com/DrKLO/Telegram/blob/368822d20f879f5ca851e4cbf13506eda4e48bfc/TMessagesProj/src/main/java/org/telegram/messenger/MessagesController.java#L789
# - https://github.com/DrKLO/Telegram/blob/368822d20f879f5ca851e4cbf13506eda4e48bfc/TMessagesProj/src/main/java/org/telegram/ui/LaunchActivity.java#L1365
# - https://github.com/DrKLO/Telegram/blob/368822d20f879f5ca851e4cbf13506eda4e48bfc/TMessagesProj/src/main/java/org/telegram/messenger/SendMessagesHelper.java#L5826

# Constants
CHAT_NAME = 'John Doe'               # change this: name of the chat partner as it appears in the export
SELF_NAME = 'Jane Doe'               # change this: your own name as it appears in the export
INPUT_DIR = './data/exported'        # directory holding the exported text file and the media files
OUTPUT_DIR = './data/converted'      # directory the converted chat file and the media files are written to
# Name of the generated chat file.
# NOTE(review): the German wording presumably mirrors the file name of a German WhatsApp export - confirm.
OUTPUT_FILE = f'WhatsApp Chat mit {CHAT_NAME}.txt'
# Timestamp at the start of an exported message: group 1 is the date (YYYY/MM/DD), group 2 the time (HH:MM:SS).
# Raw string, so that \d reaches the regex engine instead of being an invalid string escape.
DATE_REGEX = r'(\d{4}/\d{2}/\d{2}), (\d{2}:\d{2}:\d{2})'
# First line of a message written by one of the two participants.
# The names are escaped so that regex metacharacters in them (e.g. '+', '(', '.') are matched literally.
LINE_REGEX = f'^{DATE_REGEX} - ({re.escape(CHAT_NAME)}|{re.escape(SELF_NAME)}): .+'
# File names that count as media, selected by extension.
MEDIA_FILE_REGEX = r'.+\.(jpg|jpeg|png|mp4|oga|webp)$'
# Placeholder found in the text export where a media file was sent.
MEDIA_INDICATOR = '<Media omitted>'
# Text appended after the media file name when a placeholder is replaced.
MEDIA_TARGET_SUFFIX = '(Datei angehängt)'

# Methods
def read_messages():
    """Read the exported chat text file and group its lines into messages.

    The file ``{INPUT_DIR}/{CHAT_NAME}.txt`` is read line by line:

    - a line matching LINE_REGEX starts a new message;
    - a line starting with a timestamp that does not match LINE_REGEX
      (i.e. not written by CHAT_NAME or SELF_NAME) is skipped;
    - any other line is a continuation of a multi-line entry. It is appended
      to the current message, or skipped if the entry it belongs to was skipped.

    Returns:
        list[str]: one string per message, line breaks included.
    """
    messages = []
    file_name = f'{INPUT_DIR}/{CHAT_NAME}.txt'
    print(f'reading {file_name}')

    # True while the most recent timestamped line was accepted as a message.
    # Prevents continuation lines of a skipped entry from being glued onto
    # the previous, unrelated message.
    in_kept_message = False

    # Decode explicitly instead of relying on the platform default encoding.
    # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise stop the first line from matching LINE_REGEX.
    # NOTE(review): assumes the export is UTF-8 encoded - confirm.
    with open(file_name, 'r', encoding='utf-8-sig') as f:
        for line in f:
            if re.match(LINE_REGEX, line):
                messages.append(line)
                in_kept_message = True
            elif re.match(DATE_REGEX, line):
                in_kept_message = False
            elif in_kept_message:
                messages[-1] += line

    print(f'read {len(messages)} messages')
    return messages

def find_media():
    """List the media files located in INPUT_DIR.

    Returns:
        list[str]: names (not paths) of all entries in INPUT_DIR whose name
        matches MEDIA_FILE_REGEX case-insensitively, sorted by name.
    """
    # os.listdir yields entries in arbitrary order. Sorting makes the result
    # reproducible, and with it the choice between several files that share
    # the same timestamp prefix.
    return sorted(
        name for name in os.listdir(INPUT_DIR)
        if re.match(MEDIA_FILE_REGEX, name, re.IGNORECASE)
    )

def extract_datetime(message):
    """Split the first timestamp found in *message* into its components.

    Args:
        message: text containing a timestamp in the DATE_REGEX format.

    Returns:
        tuple: ``(date_parts, time_parts)``, two lists of strings obtained by
        splitting the date on '/' and the time on ':'.

    Raises:
        AttributeError: if *message* contains no timestamp (the search result
        is None).
    """
    stamp = re.search(DATE_REGEX, message)
    day, clock = stamp.group(1, 2)
    return day.split('/'), clock.split(':')

def replace_media_refs(messages):
    """Replace media placeholders in messages with actual media file names.

    For every message containing MEDIA_INDICATOR, the message timestamp is
    turned into a file name prefix of the form ``YYYY_MM_DD_HHMMSS``. The first
    not yet used media file starting with that prefix replaces the placeholder,
    followed by MEDIA_TARGET_SUFFIX. Each media file is used at most once.
    Messages without a matching file are left unchanged and a warning is
    printed.

    Args:
        messages: iterable of message strings.

    Returns:
        list[str]: a new list; the input is not modified.
    """
    result = list(messages)
    unused_files = find_media()
    print(f'found {len(unused_files)} media files')

    for idx, text in enumerate(result):
        if MEDIA_INDICATOR not in text:
            continue

        date_parts, time_parts = extract_datetime(text)
        media_prefix = '_'.join(date_parts) + '_' + ''.join(time_parts)

        # First remaining file whose name starts with the timestamp prefix.
        chosen = next((name for name in unused_files if name.startswith(media_prefix)), None)
        if chosen is None:
            print(f'warning: did not find matching media file for message at {media_prefix}')
            continue

        result[idx] = text.replace(MEDIA_INDICATOR, f'{chosen} {MEDIA_TARGET_SUFFIX}')
        # Consume the file so a later message with the same timestamp gets the next one.
        unused_files.remove(chosen)

    return result

def write_messages(messages):
    """Write all messages to ``{OUTPUT_DIR}/{OUTPUT_FILE}``.

    Messages are written back to back without any separator; each message is
    expected to carry its own line breaks. An existing file is overwritten.

    Args:
        messages: iterable of message strings.
    """
    out_file_name = f'{OUTPUT_DIR}/{OUTPUT_FILE}'
    print(f'saving messages to {out_file_name}')
    # Encode explicitly: messages contain non-ASCII text (e.g. the 'ä' in
    # MEDIA_TARGET_SUFFIX) that the platform default encoding may not be able
    # to represent.
    with open(out_file_name, 'w', encoding='utf-8') as f:
        # NOTE: timestamps are written unchanged. A previously disabled variant
        # rewrote them from 'YYYY/MM/DD, HH:MM:SS' to 'DD.MM.YY, HH:MM' via
        # extract_datetime() and re.sub(DATE_REGEX, ...) before writing.
        f.writelines(messages)

def copy_media():
    """Copy every media file found in INPUT_DIR into OUTPUT_DIR.

    Files keep their names. shutil.copy2 is used, which also tries to
    preserve file metadata.
    """
    for media_name in find_media():
        source = f'{INPUT_DIR}/{media_name}'
        target = f'{OUTPUT_DIR}/{media_name}'
        shutil.copy2(source, target)

def main():
    """Run the conversion: read, resolve media references, write, copy media."""
    print('reading messages')
    chat = read_messages()

    print('replacing media references')
    chat = replace_media_refs(chat)

    print('saving output')
    write_messages(chat)

    print('copying media')
    copy_media()

    print('done')


if __name__ == '__main__':
    main()
	#!/bin/python

	import os
	import re
	import shutil

	# A script to help you migrate your whole WhatsApp chat history with a person to Telegram

	# Instructions
	# 1. Install "Backup WhatsApp Chats" extension to Chrome
	# 2. Buy a license
	# 3. Open WhatsApp web and select the chat to export
	# 4. Open the plugin, choose export type 'HTML', choose to download unresolved media
	# 5. Download and extract the zip file to data/exported
	# 6. Open the plugin again and choose export type 'Text'
	# 7. Download the text file to data/exported as well
	# 8. Edit CHAT_NAME and SELF_NAME constants below
	# 9. Create output directory at data/converted
	# 10. Run 'python whatsapp_to_telegram.py'
	# 11. Copy the contents of data/converted to your phone
	# 12. Open your favorite file browser, select all files and share them to Telegram
	# -> Telegram should now ask you where to import the messages
	# 13. Select the target chat and hit import

	# Issues
	# - For some reason, the current day is skipped during export
	# - For some reason, copying the output files to the phone via USB will cause media files to not be imported. Instead, I uploaded them to Nextcloud, then synced them on the phone using the Nextcloud app and then shared them to Telegram using a file manager. The trick is probably to somehow get this (https://github.com/DrKLO/Telegram/blob/368822d20f879f5ca851e4cbf13506eda4e48bfc/TMessagesProj/src/main/java/org/telegram/ui/LaunchActivity.java#L1391) method to return null, so that the shared media documents are added to documentsUrisArray instead of documentsPathsArray.

	# Relevant code in Telegram for Android
	# - https://github.com/DrKLO/Telegram/blob/368822d20f879f5ca851e4cbf13506eda4e48bfc/TMessagesProj/src/main/java/org/telegram/messenger/MessagesController.java#L789
	# - https://github.com/DrKLO/Telegram/blob/368822d20f879f5ca851e4cbf13506eda4e48bfc/TMessagesProj/src/main/java/org/telegram/ui/LaunchActivity.java#L1365
	# - https://github.com/DrKLO/Telegram/blob/368822d20f879f5ca851e4cbf13506eda4e48bfc/TMessagesProj/src/main/java/org/telegram/messenger/SendMessagesHelper.java#L5826

	# Constants
	CHAT_NAME = 'John Doe' # change this: name of the chat partner as it appears in the export
	SELF_NAME = 'Jane Doe' # change this: your own name as it appears in the export
	INPUT_DIR = './data/exported'    # directory holding the exported text file and the media files
	OUTPUT_DIR = './data/converted'  # directory the converted chat file and the media files are written to
	# Name of the generated chat file.
	# NOTE(review): the German wording presumably mirrors the file name of a German WhatsApp export - confirm.
	OUTPUT_FILE = f'WhatsApp Chat mit {CHAT_NAME}.txt'
	# Timestamp at the start of an exported message: group 1 is the date (YYYY/MM/DD), group 2 the time (HH:MM:SS).
	# Raw string, so that \d reaches the regex engine instead of being an invalid string escape.
	DATE_REGEX = r'(\d{4}/\d{2}/\d{2}), (\d{2}:\d{2}:\d{2})'
	# First line of a message written by one of the two participants.
	# The bar must be an unescaped '|' (alternation); an escaped '\|' only matches a literal pipe character.
	# The names are escaped so that regex metacharacters in them are matched literally.
	LINE_REGEX = f'^{DATE_REGEX} - ({re.escape(CHAT_NAME)}|{re.escape(SELF_NAME)}): .+'
	# File names that count as media, selected by extension (unescaped '|' for alternation, see above).
	MEDIA_FILE_REGEX = r'.+\.(jpg|jpeg|png|mp4|oga|webp)$'
	# Placeholder found in the text export where a media file was sent.
	MEDIA_INDICATOR = '<Media omitted>'
	# Text appended after the media file name when a placeholder is replaced.
	MEDIA_TARGET_SUFFIX = '(Datei angehängt)'

	# Methods
	def read_messages():
	    """Read the exported chat text file and group its lines into messages.

	    The file ``{INPUT_DIR}/{CHAT_NAME}.txt`` is read line by line:

	    - a line matching LINE_REGEX starts a new message;
	    - a line starting with a timestamp that does not match LINE_REGEX
	      (i.e. not written by CHAT_NAME or SELF_NAME) is skipped;
	    - any other line is a continuation of a multi-line entry. It is appended
	      to the current message, or skipped if the entry it belongs to was skipped.

	    Returns:
	        list[str]: one string per message, line breaks included.
	    """
	    messages = []
	    file_name = f'{INPUT_DIR}/{CHAT_NAME}.txt'
	    print(f'reading {file_name}')

	    # True while the most recent timestamped line was accepted as a message.
	    # Prevents continuation lines of a skipped entry from being glued onto
	    # the previous, unrelated message.
	    in_kept_message = False

	    # Decode explicitly instead of relying on the platform default encoding.
	    # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM.
	    # NOTE(review): assumes the export is UTF-8 encoded - confirm.
	    with open(file_name, 'r', encoding='utf-8-sig') as f:
	        for line in f:
	            if re.match(LINE_REGEX, line):
	                messages.append(line)
	                in_kept_message = True
	            elif re.match(DATE_REGEX, line):
	                in_kept_message = False
	            elif in_kept_message:
	                messages[-1] += line

	    print(f'read {len(messages)} messages')
	    return messages

	def find_media():
	    """List the media files located in INPUT_DIR.

	    Returns:
	        list[str]: names (not paths) of all entries in INPUT_DIR whose name
	        matches MEDIA_FILE_REGEX case-insensitively, sorted by name.
	    """
	    # os.listdir yields entries in arbitrary order; sorting makes the result
	    # reproducible.
	    return sorted(
	        name for name in os.listdir(INPUT_DIR)
	        if re.match(MEDIA_FILE_REGEX, name, re.IGNORECASE)
	    )

	def extract_datetime(message):
	    """Split the first timestamp found in *message* into its components.

	    Args:
	        message: text containing a timestamp in the DATE_REGEX format.

	    Returns:
	        tuple: ``(date_parts, time_parts)``, two lists of strings obtained by
	        splitting the date on '/' and the time on ':'.

	    Raises:
	        AttributeError: if *message* contains no timestamp (the search result
	        is None).
	    """
	    date_match = re.search(DATE_REGEX, message)
	    date_components = date_match.group(1).split('/')
	    time_components = date_match.group(2).split(':')
	    return date_components, time_components

	def replace_media_refs(messages):
	    """Replace media placeholders in messages with actual media file names.

	    For every message containing MEDIA_INDICATOR, the message timestamp is
	    turned into a file name prefix of the form ``YYYY_MM_DD_HHMMSS``. The first
	    not yet used media file starting with that prefix replaces the placeholder,
	    followed by MEDIA_TARGET_SUFFIX. Each media file is used at most once.
	    Messages without a matching file are left unchanged and a warning is
	    printed.

	    Args:
	        messages: iterable of message strings.

	    Returns:
	        list[str]: a new list; the input is not modified.
	    """
	    new_messages = list(messages)
	    media_files = find_media()
	    print(f'found {len(media_files)} media files')

	    for i, m in enumerate(new_messages):
	        if MEDIA_INDICATOR not in m:
	            continue

	        date_components, time_components = extract_datetime(m)
	        media_prefix = f'{"_".join(date_components)}_{"".join(time_components)}'
	        media_candidates = [f for f in media_files if f.startswith(media_prefix)]

	        if not media_candidates:
	            print(f'warning: did not find matching media file for message at {media_prefix}')
	            continue

	        new_messages[i] = m.replace(MEDIA_INDICATOR, f'{media_candidates[0]} {MEDIA_TARGET_SUFFIX}')
	        # Consume the file so a later message with the same timestamp gets the next one.
	        media_files.remove(media_candidates[0])

	    return new_messages

	def write_messages(messages):
	    """Write all messages to ``{OUTPUT_DIR}/{OUTPUT_FILE}``.

	    Messages are written back to back without any separator; each message is
	    expected to carry its own line breaks. An existing file is overwritten.

	    Args:
	        messages: iterable of message strings.
	    """
	    out_file_name = f'{OUTPUT_DIR}/{OUTPUT_FILE}'
	    print(f'saving messages to {out_file_name}')
	    # Encode explicitly: messages contain non-ASCII text (e.g. the 'ä' in
	    # MEDIA_TARGET_SUFFIX) that the platform default encoding may not be able
	    # to represent.
	    with open(out_file_name, 'w', encoding='utf-8') as f:
	        # NOTE: timestamps are written unchanged. A previously disabled variant
	        # rewrote them from 'YYYY/MM/DD, HH:MM:SS' to 'DD.MM.YY, HH:MM' via
	        # extract_datetime() and re.sub(DATE_REGEX, ...) before writing.
	        f.writelines(messages)

	def copy_media():
	    """Copy every media file found in INPUT_DIR into OUTPUT_DIR.

	    Files keep their names. shutil.copy2 is used, which also tries to
	    preserve file metadata.
	    """
	    for f in find_media():
	        shutil.copy2(f'{INPUT_DIR}/{f}', f'{OUTPUT_DIR}/{f}')

	# Script entry point: read, resolve media references, write, copy media.
	if __name__ == '__main__':
	    print('reading messages')
	    messages = read_messages()

	    print('replacing media references')
	    messages = replace_media_refs(messages)

	    print('saving output')
	    write_messages(messages)

	    print('copying media')
	    copy_media()

	    print('done')