Parse the html export of a Telegram chat
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' The parse_telegram_export function in this gist parses the HTML export of a Telegram chat.
You need to have [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/) and
[dateutil](https://dateutil.readthedocs.io/) installed. The function extracts the sender name,
the message date and time, the message text and the links found in each message.
'''
from bs4 import BeautifulSoup
import dateutil
import dateutil.parser
import dateutil.tz
def parse_telegram_export(html_str, tz_name=None):
    ''' Parses a Telegram HTML export.

    Params:
    - html_str (str): The HTML string containing the Telegram export.
    - tz_name (str|None): The name of the time zone where the export was made
      (e.g. "Europe/Rome"). If None, or if dateutil does not recognise the name,
      no time zone is set on dates that were parsed without one.

    Returns (generator<(str|None, datetime.datetime, str, list<str>)>): A generator
    that yields a (from_name, date, text, links) tuple for each message in the export,
    where from_name is the sender name, date and text are the date and text of the
    message and links is the list of link targets found in the message.

    Notes:
    - A message without its own sender block reuses the sender of the previous
      message; from_name is None if no sender has been seen yet.
    - A message without a text block yields an empty string as text.
    '''
    soup = BeautifulSoup(html_str, 'html.parser')
    # gettz returns None for an unknown zone name, which is handled like "no zone".
    tz = dateutil.tz.gettz(tz_name) if tz_name else None

    # Carried across iterations on purpose: messages lacking a sender block
    # inherit the most recent sender.
    from_name = None

    for message in soup.select("div.message.default"):
        body = message.find('div', class_='body')
        if body is None:
            # Nothing to extract from a message without a body container.
            continue

        sender_tag = body.find('div', class_='from_name')
        if sender_tag is not None:
            # get_text() also works when the sender div contains nested tags,
            # where .string would be None.
            from_name = sender_tag.get_text().strip()

        text_tag = body.find('div', class_='text')
        text = text_tag.get_text().strip() if text_tag is not None else ''

        # href=True skips anchors without a target so the list only holds strings.
        links = [anchor['href'] for anchor in body.find_all('a', href=True)]

        raw_date = body.find('div', class_='date')['title']
        # Export timestamps are written day-first (DD.MM.YYYY); without dayfirst
        # an ambiguous date such as 05.03.2020 would be read month-first.
        parsed_date = dateutil.parser.parse(raw_date, dayfirst=True)

        if tz is None:
            date = parsed_date
        elif parsed_date.tzinfo is None:
            # Attach the zone instead of converting: astimezone() on a naive
            # datetime would first assume the machine's local time zone.
            date = parsed_date.replace(tzinfo=tz)
        else:
            date = parsed_date.astimezone(tz)

        yield (from_name, date, text, links)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment