Skip to content

Instantly share code, notes, and snippets.

@is3ka1
Last active December 11, 2019 03:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save is3ka1/1f36f843b991f5853a66f074feb8ef1b to your computer and use it in GitHub Desktop.
Save is3ka1/1f36f843b991f5853a66f074feb8ef1b to your computer and use it in GitHub Desktop.
A script that transforms messages exported from Telegram into a CSV file, driven by a selector config file.
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
beautifulsoup4 = "*"
pyyaml = "*"
pandas = "*"
lxml = "*"
[requires]
python_version = "3.7"
just make it be the title
# Selector configuration consumed by TGMsgLoader.
# NOTE(review): the nesting appears to have been flattened in this copy. The
# loader reads each_msg[<field>]['selector'] and each_msg[<field>]['value'],
# so date/name/text are presumably nested under each_msg, with selector/value
# nested under each field -- confirm against the original file.

# CSS selector passed to soup.select() to pick the message elements;
# elements carrying the "service" class are excluded.
messages: "div.history .message:not(.service)"
# One entry per output field. "selector" is applied to each message with
# select_one(); "value" is a Python expression evaluated with `v` bound to the
# selected element (it is skipped, and the field is None, when nothing matches).
each_msg:
# Timestamp string taken from the title attribute of the date element.
date:
selector: ".body:not(.forwarded) > .date[title]"
value: "v['title']"
# Sender name, whitespace-stripped.
name:
selector: ".body:not(.forwarded) > .from_name"
value: "v.get_text().strip()"
# Message text, whitespace-stripped.
text:
selector: ".body:not(.forwarded) > .text"
value: "v.get_text().strip()"
from bs4 import BeautifulSoup
import pandas as pd
from glob import glob
from os.path import join
from yaml import load, Loader
import re
class TGMsgLoader:
    """Convert the HTML pages of a Telegram chat export into a CSV file.

    The loader collects the ``messages*.html`` pages found in a directory,
    orders them by page number and extracts one record per message using CSS
    selectors and value expressions read from a YAML config file.

    The config is expected to provide two keys:

    * ``messages`` -- CSS selector matching every message element.
    * ``each_msg`` -- mapping of output field name to a mapping with a
      ``selector`` (CSS selector applied inside the message) and a ``value``
      (Python expression evaluated with ``v`` bound to the selected element).
    """

    # Matches the basename of an export page ("messages.html",
    # "messages2.html", ...) and captures its optional page number.  Anchoring
    # on a path separator keeps directory names out of the match.
    _PAGE_PATTERN = re.compile(r'(?:^|[\\/])messages(\d*)\.html$')

    # Timestamp layout expected in the "date" column.
    _DATE_FORMAT = "%d.%m.%Y %H:%M:%S"

    def __init__(self, msg_dir, config_path):
        """Collect the export pages and load the selector config.

        Args:
            msg_dir: Directory containing the ``messages*.html`` pages.
            config_path: Path of the YAML selector config.

        Raises:
            OSError: If the config file cannot be opened.
        """
        numbered = []
        for file_name in glob(join(msg_dir, 'messages*.html')):
            match = self._PAGE_PATTERN.search(file_name)
            if match is None:
                # The glob also matches names such as "messages_old.html";
                # they carry no page number, so they are skipped instead of
                # crashing the sort.
                continue
            # The first page has no number ("messages.html") and sorts as 0.
            numbered.append((int(match.group(1) or 0), file_name))
        numbered.sort(key=lambda pair: pair[0])
        self.file_names = [file_name for _, file_name in numbered]

        # NOTE(security): PyYAML's full ``Loader`` can construct arbitrary
        # Python objects, and the loaded ``value`` strings are later passed to
        # ``eval``.  Only use config files you trust.
        with open(config_path, encoding='utf-8') as fd:
            self.sel_config: dict = load(fd.read(), Loader=Loader)

    def extract(self):
        """Yield one dict per message found in the export pages.

        Pages are processed in page-number order.  Each yielded dict maps the
        field names configured under ``each_msg`` to the evaluated value, or
        to ``None`` when the field's selector matches nothing in the message.
        """
        for path in self.file_names:
            print('*' * 10, path, '*' * 10)  # debug
            # Read as UTF-8 explicitly so the result does not depend on the
            # platform's default locale encoding.
            with open(path, encoding='utf-8') as fd:
                html_content = fd.read()
            soup = BeautifulSoup(html_content, 'lxml')
            for msg in soup.select(self.sel_config['messages']):
                result = {}
                for var, config in self.sel_config['each_msg'].items():
                    v = msg.select_one(config['selector'])
                    if v is not None:
                        # NOTE(security): the expression comes from the config
                        # file and is evaluated with this method's locals in
                        # scope (``v`` is the selected element).
                        v = eval(config['value'])
                    result[var] = v
                yield result

    def save_to_csv(self, file_name='tmp.csv'):
        """Extract every message, write them to ``file_name`` and return them.

        Args:
            file_name: Path of the CSV file to write.

        Returns:
            The ``pandas.DataFrame`` that was written.  It is empty when no
            message was extracted.
        """
        df = pd.DataFrame(list(self.extract()))
        # The post-processing below is specific to the "name" and "date"
        # fields; it is skipped when a field is absent (custom config or no
        # messages at all) instead of raising KeyError.
        if 'name' in df.columns:
            # Missing names are filled from the previous row -- presumably
            # because follow-up messages from the same sender omit the name
            # element; verify against an actual export.
            df['name'] = df['name'].ffill()
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'], format=self._DATE_FORMAT)
        df.to_csv(file_name)
        return df
if __name__ == '__main__':
    # Command-line entry point: read the export directory, the output file
    # name and the selector config path, then run the conversion.
    from argparse import ArgumentParser

    cli = ArgumentParser(
        description=(
            "A script that transform messages exported from Telegram"
            " to CSV file with some config file."
        ),
    )
    cli.add_argument(
        'dir',
        help=(
            "The directory that message exported from"
            " Telegram in."
        ),
    )
    cli.add_argument(
        '--file',
        dest="file",
        default="tmp.csv",
        help="The name CSV file will be.",
    )
    cli.add_argument(
        '--config',
        dest="config",
        default="text_sels.yaml",
        help=(
            "The configuration that content"
            " on HTML should be"
        ),
    )
    options = cli.parse_args()

    tg_loader = TGMsgLoader(options.dir, options.config)
    df = tg_loader.save_to_csv(options.file)
@is3ka1
Copy link
Author

is3ka1 commented Aug 2, 2019

usage

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment