Skip to content

Instantly share code, notes, and snippets.

@nicktimko
Created May 13, 2016 17:46
Show Gist options
  • Save nicktimko/2b7e21c196d9e74c69fbd7f5b79014cb to your computer and use it in GitHub Desktop.
Save nicktimko/2b7e21c196d9e74c69fbd7f5b79014cb to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SE Chat Scraper\n",
"Fetches every message of a Stack Exchange chat room, one page at a time. Motivation: http://meta.stackexchange.com/questions/129374/obtaining-full-chat-transcripts"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import time\n",
"import requests\n",
"\n",
"MAX_MSG_COUNT = 500"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_messages(room_id, before=None, n=MAX_MSG_COUNT, session=None):\n",
"    \"\"\"\n",
"    Fetch one page of messages from the SE Chat events endpoint.\n",
"\n",
"    Parameters:\n",
"        room_id: ID of the chat room on chat.stackexchange.com.\n",
"        before: if not None, sent as the 'before' field of the request;\n",
"            get_all_messages passes the smallest message_id seen so far\n",
"            in order to page backwards through the room.\n",
"        n: number of messages to ask for (the 'msgCount' field).\n",
"        session: requests session to reuse. When omitted, a temporary\n",
"            session is created and closed before returning.\n",
"\n",
"    Returns a (messages, earliest, done) tuple: the list of event dicts,\n",
"    the smallest message_id among them (or `before` unchanged when the\n",
"    page is empty), and a flag that is True when the page did not hold\n",
"    exactly n messages, which callers treat as the end of the history.\n",
"\n",
"    Raises requests.HTTPError if the server answers with an error status.\n",
"    \"\"\"\n",
"    url = 'http://chat.stackexchange.com/chats/{}/events'.format(room_id)\n",
"    params = {\n",
"        'mode': 'Messages',\n",
"        'msgCount': n,\n",
"    }\n",
"    if before is not None:\n",
"        params['before'] = before\n",
"\n",
"    # Only close the session if it was created here; a caller-supplied\n",
"    # session stays open so it can be reused for the next page.\n",
"    owns_session = session is None\n",
"    if owns_session:\n",
"        session = requests.session()\n",
"    try:\n",
"        response = session.post(url, params)\n",
"        # Surface HTTP failures directly rather than as a JSON/KeyError.\n",
"        response.raise_for_status()\n",
"        messages = response.json()['events']\n",
"    finally:\n",
"        if owns_session:\n",
"            session.close()\n",
"\n",
"    # An empty page (empty room, or a history whose length is an exact\n",
"    # multiple of n) would make a bare min() raise ValueError.\n",
"    earliest = min((msg['message_id'] for msg in messages), default=before)\n",
"    done = len(messages) != n\n",
"\n",
"    return messages, earliest, done\n",
"\n",
"\n",
"def get_all_messages(room_id, throttle=1):\n",
"    \"\"\"\n",
"    Get all messages from the provided room ID. Throttle controls the inter-\n",
"    request delay (in seconds, passed to time.sleep) to avoid being too\n",
"    obnoxious.\n",
"\n",
"    Pages are fetched with get_messages, each request asking for messages\n",
"    before the earliest message_id seen so far, until get_messages reports\n",
"    that it is done. Pages are appended in the order they were fetched;\n",
"    the combined list is not re-sorted.\n",
"    \"\"\"\n",
"    messages = []\n",
"    earliest = None  # no 'before' bound on the first request\n",
"\n",
"    # One session serves every request; the with-block closes it even if\n",
"    # a request fails part-way through.\n",
"    with requests.session() as s:\n",
"        while True:\n",
"            new_messages, earliest, done = get_messages(room_id, before=earliest, session=s)\n",
"            messages.extend(new_messages)\n",
"            if done:\n",
"                break\n",
"            # Pause only between requests, never after the final one.\n",
"            time.sleep(throttle)\n",
"\n",
"    return messages"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# room_id = 27364 # esolang/PCG room\n",
"room_id = 25038 # smaller JS\n",
"\n",
"messages = get_all_messages(room_id)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'content': 'Ok, legal!',\n",
" 'event_type': 1,\n",
" 'message_id': 22529004,\n",
" 'room_id': 25038,\n",
" 'time_stamp': 1435857785,\n",
" 'user_id': 144907,\n",
" 'user_name': 'gustavox'}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"messages[-1]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import collections"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Count messages per display name, keep the users with more than 10,\n",
"# and list them as (count, name) pairs from most to least active.\n",
"most_active_users = collections.Counter(msg['user_name'] for msg in messages)\n",
"most_active_users = [\n",
"    (count, name)\n",
"    for name, count in most_active_users.items()\n",
"    if count > 10\n",
"]\n",
"most_active_users.sort(reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(296, 'KaduAmaral'),\n",
" (210, 'Sergio'),\n",
" (143, 'Luis'),\n",
" (89, 'gustavox'),\n",
" (88, 'Renan Rodrigues'),\n",
" (74, 'Maicon Carraro'),\n",
" (48, 'Xeoon'),\n",
" (37, 'Joao Vitor Farias Scheuermann'),\n",
" (24, 'Ricardo Henrique'),\n",
" (19, 'Ricardo'),\n",
" (16, 'stringnome'),\n",
" (16, 'Alexandre C. Caus'),\n",
" (15, 'Mayla Campos'),\n",
" (14, 'ricardo'),\n",
" (14, 'ctgPi'),\n",
" (13, 'Julio Santos'),\n",
" (12, 'Marciano.Andrade')]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"most_active_users # some people changed name, but have same user_id...but not as pretty to display :P"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@Tech-Expert-Wizard
Copy link

How can I get all of the messages, not just 500?

@Anonymous941
Copy link

How can I get all of the messages, not just 500?

I think you'd have to look at the time of the last message, then set before to that.

@Tech-Expert-Wizard
Copy link

How can I get all of the messages, not just 500?

I think you'd have to look at the time of the last message, then set before to that.

@Anonymous941 Do you mean the time of the most recent message?

@Anonymous941
Copy link

@Anonymous941 Do you mean the time of the most recent message?

@Tech-Expert-Wizard I mean set before to the time of the last message (the 500th message) to get the next 500 messages.

@nicktimko
Copy link
Author

How can I get all of the messages, not just 500?

Use get_all_messages; it repeatedly calls get_messages, each time setting before to the earliest message ID of the previous batch of 500, until a batch comes back short.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment