mrtj/parse_telegram_export.py

## parse_telegram_export.py
''' The parse_telegram_export function in this gist parses the HTML export of a Telegram chat.
It requires [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/) and
[dateutil](https://dateutil.readthedocs.io/) to be installed. For each message it extracts
the sender name, the message date and time, the message text and the links in the message.
'''

from bs4 import BeautifulSoup
import dateutil

def parse_telegram_export(html_str, tz_name=None):
    ''' Parses a Telegram HTML export.

    Params:
      - html_str (str): The HTML string containing the Telegram export.
      - tz_name (str|None): The IANA name of the timezone where the export was made
        (eg. "Europe/Rome"). If None, or if dateutil does not recognise the name,
        the parsed datetime is returned without any timezone adjustment.

    Returns (generator<(str|None, datetime.datetime, str, list<str>)>): A generator object
        that yields a (from_name, date, text, links) tuple for each message in the export,
        where from_name is the sender name, date and text are the date and text of the message
        and links is a list of the link targets found in the message.

        A message without a "from_name" element reuses the sender name of the previous
        message; from_name is None if no sender has been seen yet. text is an empty
        string for messages that have no "text" element.
    '''
    # A bare `import dateutil` does not reliably expose the `parser` and `tz`
    # submodules as attributes, so import them explicitly.
    from dateutil import parser as date_parser, tz as date_tz

    soup = BeautifulSoup(html_str, 'html.parser')
    tz = date_tz.gettz(tz_name) if tz_name else None

    # Sender name carried over between messages; initialised so that a first
    # message without a "from_name" element does not raise UnboundLocalError.
    from_name = None
    for div in soup.select("div.message.default"):
        body = div.find('div', class_='body')

        from_name_div = body.find('div', class_='from_name')
        if from_name_div is not None:
            # get_text() also works when the element contains nested tags,
            # where `.string` would be None.
            from_name = from_name_div.get_text().strip()

        text_div = body.find('div', class_='text')
        text = text_div.get_text().strip() if text_div is not None else ''

        # href=True skips anchors without a target, so the list holds only strings.
        links = [anchor['href'] for anchor in body.find_all('a', href=True)]

        raw_date = body.find('div', class_='date')['title']
        # NOTE(review): assumes the title is written day-first (DD.MM.YYYY HH:MM:SS);
        # without dayfirst, ambiguous dates such as 05.01.2021 are read month-first.
        parsed_date = date_parser.parse(raw_date, dayfirst=True)
        if tz is None:
            date = parsed_date
        elif parsed_date.tzinfo is None:
            # The naive value is already expressed in the export's timezone:
            # attach the zone instead of converting from the system local time.
            date = parsed_date.replace(tzinfo=tz)
        else:
            # The title carried its own UTC offset: convert to the requested zone.
            date = parsed_date.astimezone(tz)

        yield (from_name, date, text, links)
	''' The parse_telegram_export function in this gist parses the html export of a Telegram chat.
	You should have [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/) and
	[dateutil](https://dateutil.readthedocs.io/) installed. It extracts the sender name, the
	message date and time, the message text and the links in the message.
	'''

	from bs4 import BeautifulSoup
	import dateutil

	def parse_telegram_export(html_str, tz_name=None):
	    ''' Parses a Telegram HTML export.

	    Params:
	      - html_str (str): The HTML string containing the Telegram export.
	      - tz_name (str|None): The IANA name of the timezone where the export was made
	        (eg. "Europe/Rome"). If None, or if dateutil does not recognise the name,
	        the parsed datetime is returned without any timezone adjustment.

	    Returns (generator<(str|None, datetime.datetime, str, list<str>)>): A generator object
	        that yields a (from_name, date, text, links) tuple for each message in the export,
	        where from_name is the sender name, date and text are the date and text of the message
	        and links is a list of the link targets found in the message.

	        A message without a "from_name" element reuses the sender name of the previous
	        message; from_name is None if no sender has been seen yet. text is an empty
	        string for messages that have no "text" element.
	    '''
	    # A bare `import dateutil` does not reliably expose the `parser` and `tz`
	    # submodules as attributes, so import them explicitly.
	    from dateutil import parser as date_parser, tz as date_tz

	    soup = BeautifulSoup(html_str, 'html.parser')
	    tz = date_tz.gettz(tz_name) if tz_name else None

	    # Sender name carried over between messages; initialised so that a first
	    # message without a "from_name" element does not raise UnboundLocalError.
	    from_name = None
	    for div in soup.select("div.message.default"):
	        body = div.find('div', class_='body')

	        from_name_div = body.find('div', class_='from_name')
	        if from_name_div is not None:
	            # get_text() also works when the element contains nested tags,
	            # where `.string` would be None.
	            from_name = from_name_div.get_text().strip()

	        text_div = body.find('div', class_='text')
	        text = text_div.get_text().strip() if text_div is not None else ''

	        # href=True skips anchors without a target, so the list holds only strings.
	        links = [anchor['href'] for anchor in body.find_all('a', href=True)]

	        raw_date = body.find('div', class_='date')['title']
	        # NOTE(review): assumes the title is written day-first (DD.MM.YYYY HH:MM:SS);
	        # without dayfirst, ambiguous dates such as 05.01.2021 are read month-first.
	        parsed_date = date_parser.parse(raw_date, dayfirst=True)
	        if tz is None:
	            date = parsed_date
	        elif parsed_date.tzinfo is None:
	            # The naive value is already expressed in the export's timezone:
	            # attach the zone instead of converting from the system local time.
	            date = parsed_date.replace(tzinfo=tz)
	        else:
	            # The title carried its own UTC offset: convert to the requested zone.
	            date = parsed_date.astimezone(tz)

	        yield (from_name, date, text, links)