# wincentbalin/telegram_markdown_corpus.py

## telegram_markdown_corpus.py
#!/usr/bin/env python3
"""Convert Telegram channel JSON export to a Markdown text corpus"""

import sys
import json
import argparse
# Separator appended after every converted message (five newline characters).
MESSAGE_SEPARATOR = '\n' * 5

# Markdown templates for chunk types whose rendering needs only the chunk text.
SIMPLE_TEMPLATES = {
    'bold': '**{text}**',
    'italic': '_{text}_',
    'underline': '<u>{text}</u>',
    'strikethrough': '~~{text}~~',
    'code': '`{text}`',
    'link': '[{text}]({text})',
    'phone': '[{text}](tel:{text})',
    'bank_card': '{text}',
    'hashtag': '{text}',
    'spoiler': '=={text}==',
    'custom_emoji': '{text}',
    'bot_command': '`{text}`',
}


class UnknownTextTypeError(TypeError):
    """Raised when a message's 'text' field is neither a string nor a list."""


def render_chunk(chunk):
    """Return the Markdown rendering of one element of a message text list.

    Plain strings are returned unchanged. Dict chunks are rendered according
    to their 'type' key. A chunk of an unrecognised type is printed to stdout
    and contributes an empty string to the output.
    """
    if isinstance(chunk, str):
        return chunk
    chunk_type = chunk['type']
    if chunk_type in SIMPLE_TEMPLATES:
        return SIMPLE_TEMPLATES[chunk_type].format(text=chunk['text'])
    if chunk_type == 'text_link':
        return '[{text}]({link})'.format(text=chunk['text'], link=chunk['href'])
    if chunk_type == 'email':
        # E-mail links use the mailto: scheme; tel: is reserved for phone numbers.
        return '[{email}](mailto:{email})'.format(email=chunk['text'].lower())
    if chunk_type == 'mention':
        # The link target is the mention text without its first character
        # (presumably the leading '@').
        return '[{mention}](https://t.me/{user})'.format(mention=chunk['text'], user=chunk['text'][1:])
    # Unknown chunk type: report it so it can be added, but emit no text for it.
    print(chunk)
    return ''


def render_message(text):
    """Return the Markdown for one message 'text' value, or None to skip it.

    Empty strings and messages consisting of a single link chunk are skipped.

    Raises:
        UnknownTextTypeError: if *text* is neither a string nor a list.
    """
    if isinstance(text, str):
        return text or None
    if isinstance(text, list):
        # A message that is nothing but one link carries no corpus text.
        # The isinstance guard keeps a lone plain-string chunk from crashing
        # the subscript.
        if len(text) == 1 and isinstance(text[0], dict) and text[0]['type'] in ('link',):
            return None
        return ''.join(render_chunk(chunk) for chunk in text)
    raise UnknownTextTypeError('Unknown type of text: {text}\n'.format(text=text))


def convert_messages(messages):
    """Return the whole Markdown corpus for an iterable of message dicts.

    Each message that is not skipped is followed by MESSAGE_SEPARATOR.
    """
    out_text = []
    for message in messages:
        rendered = render_message(message['text'])
        if rendered is None:
            continue
        out_text.append(rendered)
        out_text.append(MESSAGE_SEPARATOR)
    return ''.join(out_text)


def main():
    """Parse the command line, convert the export and write the corpus.

    Exits with status 3 when a message has a 'text' value of unknown type.
    """
    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
    parser.add_argument('export_json', help='Exported JSON file', type=argparse.FileType('r', encoding='utf-8'))
    parser.add_argument('markdown_corpus', help='Markdown text corpus', type=argparse.FileType('w', encoding='utf-8'))
    args = parser.parse_args()

    # Close both files deterministically, including on the error exit below.
    with args.export_json, args.markdown_corpus:
        export_data = json.load(args.export_json)
        try:
            corpus = convert_messages(export_data['messages'])
        except UnknownTextTypeError as err:
            parser.exit(3, str(err))
        args.markdown_corpus.write(corpus)


if __name__ == '__main__':
    main()
	#!/usr/bin/env python3
"""Convert Telegram channel JSON export to a Markdown text corpus"""

import sys
import json
import argparse

# Separator appended after every converted message (five newline characters).
MESSAGE_SEPARATOR = '\n' * 5

# Markdown templates for chunk types whose rendering needs only the chunk text.
SIMPLE_TEMPLATES = {
    'bold': '**{text}**',
    'italic': '_{text}_',
    'underline': '<u>{text}</u>',
    'strikethrough': '~~{text}~~',
    'code': '`{text}`',
    'link': '[{text}]({text})',
    'phone': '[{text}](tel:{text})',
    'bank_card': '{text}',
    'hashtag': '{text}',
    'spoiler': '=={text}==',
    'custom_emoji': '{text}',
    'bot_command': '`{text}`',
}


class UnknownTextTypeError(TypeError):
    """Raised when a message's 'text' field is neither a string nor a list."""


def render_chunk(chunk):
    """Return the Markdown rendering of one element of a message text list.

    Plain strings are returned unchanged. Dict chunks are rendered according
    to their 'type' key. A chunk of an unrecognised type is printed to stdout
    and contributes an empty string to the output.
    """
    if isinstance(chunk, str):
        return chunk
    chunk_type = chunk['type']
    if chunk_type in SIMPLE_TEMPLATES:
        return SIMPLE_TEMPLATES[chunk_type].format(text=chunk['text'])
    if chunk_type == 'text_link':
        return '[{text}]({link})'.format(text=chunk['text'], link=chunk['href'])
    if chunk_type == 'email':
        # E-mail links use the mailto: scheme; tel: is reserved for phone numbers.
        return '[{email}](mailto:{email})'.format(email=chunk['text'].lower())
    if chunk_type == 'mention':
        # The link target is the mention text without its first character
        # (presumably the leading '@').
        return '[{mention}](https://t.me/{user})'.format(mention=chunk['text'], user=chunk['text'][1:])
    # Unknown chunk type: report it so it can be added, but emit no text for it.
    print(chunk)
    return ''


def render_message(text):
    """Return the Markdown for one message 'text' value, or None to skip it.

    Empty strings and messages consisting of a single link chunk are skipped.

    Raises:
        UnknownTextTypeError: if *text* is neither a string nor a list.
    """
    if isinstance(text, str):
        return text or None
    if isinstance(text, list):
        # A message that is nothing but one link carries no corpus text.
        # The isinstance guard keeps a lone plain-string chunk from crashing
        # the subscript.
        if len(text) == 1 and isinstance(text[0], dict) and text[0]['type'] in ('link',):
            return None
        return ''.join(render_chunk(chunk) for chunk in text)
    raise UnknownTextTypeError('Unknown type of text: {text}\n'.format(text=text))


def convert_messages(messages):
    """Return the whole Markdown corpus for an iterable of message dicts.

    Each message that is not skipped is followed by MESSAGE_SEPARATOR.
    """
    out_text = []
    for message in messages:
        rendered = render_message(message['text'])
        if rendered is None:
            continue
        out_text.append(rendered)
        out_text.append(MESSAGE_SEPARATOR)
    return ''.join(out_text)


def main():
    """Parse the command line, convert the export and write the corpus.

    Exits with status 3 when a message has a 'text' value of unknown type.
    """
    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
    parser.add_argument('export_json', help='Exported JSON file', type=argparse.FileType('r', encoding='utf-8'))
    parser.add_argument('markdown_corpus', help='Markdown text corpus', type=argparse.FileType('w', encoding='utf-8'))
    args = parser.parse_args()

    # Close both files deterministically, including on the error exit below.
    with args.export_json, args.markdown_corpus:
        export_data = json.load(args.export_json)
        try:
            corpus = convert_messages(export_data['messages'])
        except UnknownTextTypeError as err:
            parser.exit(3, str(err))
        args.markdown_corpus.write(corpus)


if __name__ == '__main__':
    main()