Skip to content

Instantly share code, notes, and snippets.

@alexeygrigorev
Last active September 19, 2022 08:13
Show Gist options
  • Save alexeygrigorev/ea13d4a70cd7afb1f3ab70aa53094e46 to your computer and use it in GitHub Desktop.
Save alexeygrigorev/ea13d4a70cd7afb1f3ab70aa53094e46 to your computer and use it in GitHub Desktop.
Processing book of the week data in slack dump
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import json\n",
"import yaml\n",
"\n",
"from zipfile import ZipFile\n",
"from datetime import datetime, timedelta\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"import clipboard\n",
"import frontmatter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Some prep work\n",
"\n",
"Defininig some variables"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"dtc_website_dir = Path('c:/Users/alexe/git/datatalksclub.github.io')\n",
"\n",
"dump_zip_file = 'DataTalks.Club Slack export Sep 19 2020 - Sep 19 2022.zip'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Loading emojis:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df_emojis = pd.read_csv(dtc_website_dir / 'scripts' / 'emojis.csv')\n",
"emoji_map = dict(zip(df_emojis.code, df_emojis.emoji))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fidning books without answers:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"dtc_website_dir / '_books'\n",
"book_files = list((dtc_website_dir / '_books').glob('202*.md'))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"no_archive = []\n",
"\n",
"for book_file in book_files:\n",
" if book_file.parts[-1] == '20210315-database-internals.md':\n",
" continue\n",
"\n",
" post = frontmatter.load(book_file)\n",
"\n",
" # looking only at events that finished\n",
" if datetime.today() < post['end']:\n",
" continue\n",
"\n",
" if 'archive' not in post.keys():\n",
" no_archive.append((book_file, post.to_dict()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Some helper functions\n",
"\n",
"We'll need them for parsing slack dump"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def repl_user_callback(match):\n",
" user_id = match.group(1)\n",
" user_name = users[user_id]\n",
" return user_name['name']\n",
"\n",
"\n",
"user_pattern = re.compile(r'<@(.+?)>')\n",
"link_pattern_text = re.compile(r'<(http.+?)\\|(.+?)>')\n",
"link_pattern = re.compile(r'<(http.+?)>')\n",
"emoji_pattern = re.compile(r':([-+0-9_a-z]+):(:[-+0-9_a-z]:)?')\n",
"\n",
"\n",
"def replace_emoji_callback(match):\n",
" code = match.group(1)\n",
" if code in emoji_map:\n",
" return emoji_map[code]\n",
" print('cannot find %s' % code)\n",
" return \":%s:\" % code\n",
"\n",
"\n",
"def prepare_text(text):\n",
" text = text.replace('\\xa0', ' ').replace('ΓÇó', '-').replace('\\n\\n', '\\n')\n",
" text = user_pattern.sub(repl_user_callback, text)\n",
" text = emoji_pattern.sub(replace_emoji_callback, text)\n",
" text = link_pattern_text.sub(r'[\\2](\\1)', text)\n",
" text = link_pattern.sub(r'[\\1](\\1)', text)\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def load_docs_zipfile(zipfile, files):\n",
" all_docs = []\n",
"\n",
" for f in files:\n",
" with zipfile.open(f) as f_in:\n",
" docs = json.load(f_in)\n",
" all_docs.extend(docs)\n",
"\n",
" return all_docs\n",
"\n",
"\n",
"def clean_user(d):\n",
" p = d['profile']\n",
" name = p['display_name']\n",
" if len(name) == 0:\n",
" name = p['real_name']\n",
" return {\n",
" 'name': name,\n",
" 'image': p['image_72']\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading slack data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, load the users:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"dump = ZipFile(dump_zip_file, 'r')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"with dump.open('users.json') as f_in:\n",
" all_users = json.load(f_in)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"users = {d['id']: clean_user(d) for d in all_users}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's find all the json files in the dump of the book-of-the-week channel:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"all_files = []\n",
"\n",
"for filename in dump.namelist():\n",
" if not filename.startswith('book-of-the-week'):\n",
" continue\n",
" if not filename.endswith('.json'):\n",
" continue\n",
" all_files.append(filename)"
]
},
{
"cell_type": "markdown",
"metadata": {
"scrolled": true
},
"source": [
"And load all messages (we'll need them for making threads):"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"all_messages_docs = load_docs_zipfile(dump, all_files)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Processing\n",
"\n",
"Keep running this until run out of books =)\n",
"\n",
"Now let's take one of the books:"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "pop from empty list",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_3032/1075073259.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mbook_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbook\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mno_archive\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbook_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparts\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mIndexError\u001b[0m: pop from empty list"
]
}
],
"source": [
"book_file, book = no_archive.pop()\n",
"print(book_file.parts[-1])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2022-01-09 00:00:00\n",
"2022-01-16 00:00:00\n"
]
},
{
"data": {
"text/plain": [
"['book-of-the-week/2022-01-10.json', 'book-of-the-week/2022-01-16.json']"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"start = book['start'] - timedelta(days=1)\n",
"end = start + timedelta(days=7)\n",
"print(start)\n",
"print(end)\n",
"\n",
"question_files = []\n",
"\n",
"for filename in all_files:\n",
" date = Path(filename).parts[-1]\n",
" date = datetime.strptime(date, '%Y-%m-%d.json')\n",
" if start <= date and date <= end:\n",
" question_files.append(filename)\n",
" \n",
"question_files"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"question_messages_docs = load_docs_zipfile(dump, question_files)\n",
"\n",
"top_messages = [d for d in question_messages_docs if 'parent_user_id' not in d]\n",
"thread_replies = [d for d in all_messages_docs if 'parent_user_id' in d]\n",
"replies_idx = {(d['user'], d['ts']): d for d in all_messages_docs}\n",
"\n",
"top_messages = [d for d in top_messages if d.get('subtype') not in ('thread_broadcast', 'channel_join')]\n",
"\n",
"\n",
"threads = []\n",
"\n",
"for top_message in top_messages:\n",
" user_id = top_message['user']\n",
" if user_id == 'USLACKBOT':\n",
" continue\n",
" \n",
" top_name = users[user_id]['name']\n",
" \n",
" if top_name == 'Francis Terence Amit':\n",
" continue\n",
"\n",
" top_text = prepare_text(top_message['text']).strip()\n",
" \n",
" if 'Hello, everyone!' in top_text and 'The book of this week is' in top_text:\n",
" continue\n",
" \n",
" if 'The lucky winners' in top_text:\n",
" continue\n",
" \n",
" if 'Please send me your emails in DM' in top_text:\n",
" continue\n",
" \n",
" replies = []\n",
"\n",
" for p in top_message.get('replies', []):\n",
" reply_id = (p['user'], p['ts'])\n",
" reply = replies_idx[reply_id]\n",
" name = users[p['user']]['name']\n",
" text = prepare_text(reply['text']).strip()\n",
"\n",
" replies.append({'name': name, 'text': text})\n",
"\n",
" thread = {\n",
" 'name': top_name,\n",
" 'text': top_text,\n",
" 'replies': replies\n",
" }\n",
" \n",
" threads.append(thread)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"yaml_snippet = yaml.dump({'archive': threads}, sort_keys=False)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"archive:\n",
"- name: Alexey Grigorev\n",
" text: I didn't get a confirmation from the author for this week, but we'll have\n",
" something next week!\n",
" replies:\n",
" - name: Varun Nayyar\n",
" text: \"Why not call someone back again? If possible?\\U0001F914 Maybe some people\\\n",
" \\ can get their questions in now i\n"
]
}
],
"source": [
"print(yaml_snippet[:300])"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"clipboard.copy(yaml_snippet)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"!code {book_file}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment