Skip to content

Instantly share code, notes, and snippets.

@wincentbalin
Created January 31, 2023 10:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wincentbalin/204e62af0e9ebbc11484f458954e6826 to your computer and use it in GitHub Desktop.
Save wincentbalin/204e62af0e9ebbc11484f458954e6826 to your computer and use it in GitHub Desktop.
Convert Telegram JSON export to Markdown
#!/usr/bin/env python3
"""Convert Telegram channel JSON export to a Markdown text corpus"""
import sys
import json
import argparse
# Blank lines written after every converted message to delimit corpus entries.
MESSAGE_SEPARATOR = '\n' * 5

# Entity types whose Markdown form is the entity text wrapped in a fixed
# template. Types that need extra fields or text transformation
# (text_link, email, mention) are handled explicitly in render_chunk().
CHUNK_TEMPLATES = {
    'bold': '**{text}**',
    'italic': '_{text}_',
    'underline': '<u>{text}</u>',
    'strikethrough': '~~{text}~~',
    'code': '`{text}`',
    'link': '[{text}]({text})',
    'phone': '[{text}](tel:{text})',
    'bank_card': '{text}',
    'hashtag': '{text}',
    'spoiler': '=={text}==',
    'custom_emoji': '{text}',
    'bot_command': '`{text}`',
}


class UnknownTextTypeError(TypeError):
    """Raised when a message's ``text`` value is neither a str nor a list."""


def render_chunk(chunk):
    """Render one element of a message's ``text`` list as Markdown.

    Args:
        chunk: Either a plain string, or a dict with at least a ``type`` key
            (and a ``text`` key for every supported type; ``text_link``
            additionally needs ``href``).

    Returns:
        The Markdown fragment, or ``None`` when the entity type is not
        supported (the caller decides how to report that).
    """
    if isinstance(chunk, str):
        return chunk

    chunk_type = chunk['type']
    template = CHUNK_TEMPLATES.get(chunk_type)
    if template is not None:
        return template.format(text=chunk['text'])
    if chunk_type == 'text_link':
        return '[{text}]({link})'.format(text=chunk['text'], link=chunk['href'])
    if chunk_type == 'email':
        # E-mail addresses need the mailto: scheme; tel: is for phone numbers.
        return '[{email}](mailto:{email})'.format(email=chunk['text'].lower())
    if chunk_type == 'mention':
        # Drop the first character of the mention (the leading "@" in
        # "@user") to build the t.me profile URL.
        mention = chunk['text']
        return '[{mention}](https://t.me/{user})'.format(mention=mention, user=mention[1:])
    return None


def render_text(text):
    """Convert the ``text`` value of one message into Markdown fragments.

    Args:
        text: A string, or a list mixing strings and entity dicts.

    Returns:
        A list of Markdown fragments, or ``None`` when the message should be
        skipped entirely (empty string, or a list holding a single ``link``
        entity).

    Raises:
        UnknownTextTypeError: If ``text`` is neither a str nor a list.
    """
    if isinstance(text, str):
        return [text] if text else None
    if not isinstance(text, list):
        raise UnknownTextTypeError('Unknown type of text: {text}'.format(text=text))

    # A message consisting of a single bare link carries no corpus text.
    # The dict check keeps a one-element list holding a plain string from
    # crashing on string indexing.
    if len(text) == 1 and isinstance(text[0], dict) and text[0]['type'] == 'link':
        return None

    fragments = []
    for chunk in text:
        rendered = render_chunk(chunk)
        if rendered is None:
            # Warning flag for unhandled entity types: the chunk is reported
            # and left out of the corpus. Diagnostics go to stderr so they
            # are not mixed with regular output.
            print(chunk, file=sys.stderr)
            continue
        fragments.append(rendered)
    return fragments


def convert_messages(messages):
    """Build the Markdown corpus from an iterable of exported messages.

    Args:
        messages: Iterable of dicts, each with a ``text`` key.

    Returns:
        The corpus as a single string; every kept message is followed by
        ``MESSAGE_SEPARATOR``.

    Raises:
        UnknownTextTypeError: If a message's ``text`` has an unsupported type.
    """
    out_text = []
    for message in messages:
        fragments = render_text(message['text'])
        if fragments is None:
            continue
        out_text.extend(fragments)
        out_text.append(MESSAGE_SEPARATOR)
    return ''.join(out_text)


def main():
    """Parse the command line, convert the export and write the corpus."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('export_json', help='Exported JSON file', type=argparse.FileType('r', encoding='utf-8'))
    parser.add_argument('markdown_corpus', help='Markdown text corpus', type=argparse.FileType('w', encoding='utf-8'))
    args = parser.parse_args()

    # argparse.FileType hands back already-open files; the with-blocks make
    # sure they are closed (and the output flushed) deterministically.
    with args.export_json as export_file:
        export_data = json.load(export_file)
    with args.markdown_corpus as corpus_file:
        try:
            corpus = convert_messages(export_data['messages'])
        except UnknownTextTypeError as err:
            parser.exit(3, '{err}\n'.format(err=err))
        corpus_file.write(corpus)


if __name__ == '__main__':
    main()
@wincentbalin
Copy link
Author

wincentbalin commented Jan 31, 2023

Line 60: this is the warning flag for unhandled cases. If you see any output, you need to write a handler for the cases printed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment