Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
slack history downloader (pub and priv) with rate limit retry :)
#https://gist.github.com/Chandler/fb7a070f52883849de35 SEE HERE
# MIT License
# Copyright (c) 2016 Chandler Abraham
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from slacker import Slacker
import json
import argparse
import os
import sys, time
class RetryError(Exception):
    """Error type reserved for reporting that a retried operation never succeeded.

    Nothing in this script raises it as written.
    """
def retryloop(attempts, timeout, delay=0, backoff=1):
    """Yield a "retry" callback once per attempt until an attempt succeeds.

    Intended use::

        for retry in retryloop(5, timeout=30, delay=1, backoff=2):
            try:
                do_work()
            except SomeError:
                retry()  # ask for another attempt

    An attempt counts as successful unless the loop body calls the yielded
    callback. After a successful attempt the generator finishes immediately.

    Args:
        attempts: maximum number of times the loop body is run.
        timeout: seconds, measured from the first attempt, after which no
            further attempt is started.
        delay: seconds to sleep between a failed attempt and the next one.
            The default of 0 retries immediately.
        backoff: factor the delay is multiplied by after each sleep.

    When the attempts or the timeout run out the generator simply ends; no
    exception is raised, so callers that need to know whether the work ever
    succeeded must track that themselves.
    """
    starttime = time.time()
    # A non-empty set means "the current attempt succeeded". The yielded
    # callback is success.clear, so calling it marks the attempt as failed.
    success = set()
    for attempt in range(attempts):
        success.add(True)
        yield success.clear
        if success:
            return
        if time.time() > starttime + timeout:
            break
        # Pause before the next attempt, but not after the final one.
        if delay and attempt < attempts - 1:
            time.sleep(delay)
            delay *= backoff
# This script finds all channels, private channels and direct messages
# that your user participates in, downloads the complete history for
# those conversations and writes each conversation out to separate json files.
#
# This user centric history gathering is nice because the official slack data exporter
# only exports public channels.
#
# PS, this only works if your slack team has a paid account which allows for unlimited history.
#
# PPS, this use of the API is blessed by Slack.
# https://get.slack.help/hc/en-us/articles/204897248
# " If you want to export the contents of your own private groups and direct messages
# please see our API documentation."
#
# get your slack user token at the bottom of this page
# https://api.slack.com/web
#
# dependencies:
# pip install slacker # https://github.com/os/slacker
#
# usage examples
# python slack_history.py --token='123token'
# python slack_history.py --token='123token' --dryRun
# python slack_history.py --token='123token' --skipDirectMessages
# python slack_history.py --token='123token' --skipDirectMessages --skipPrivateChannels
# fetches the complete message history for a channel/group/im
#
# pageableObject could be:
# slack.channels
# slack.groups
# slack.im
#
# channelId is the id of the channel/group/im you want to download history for.
def getHistory(pageableObject, channelId, pageSize = 100, pause = 0):
    """Download every message of one conversation, page by page.

    Args:
        pageableObject: object exposing ``history(channel=, latest=, oldest=,
            count=)`` whose result has a ``body`` dict (the callers in this
            file pass slack.channels, slack.groups or slack.im).
        channelId: id of the channel/group/im to download.
        pageSize: number of messages requested per call.
        pause: seconds to sleep between two page requests; 0 (the default)
            issues the requests back to back.

    Returns:
        A list with the messages of all pages, in the order received.
    """
    messages = []
    lastTimestamp = None
    while True:
        response = pageableObject.history(
            channel = channelId,
            latest = lastTimestamp,
            oldest = 0,
            count = pageSize
        ).body
        page = response['messages']
        messages.extend(page)
        # Stop on an empty page even if the server claims there is more:
        # without a new timestamp the same request would repeat forever.
        if not page or not response.get('has_more'):
            break
        # The next request is bounded by the last message received so far.
        lastTimestamp = page[-1]['ts']
        if pause:
            time.sleep(pause)
    return messages
def mkdir(directory):
    """Create `directory`, including missing parents, unless the path already exists."""
    if os.path.exists(directory):
        return
    os.makedirs(directory)
# fetch and write history for all public channels
def getChannels(slack, dryRun):
    """List all public channels and, unless dryRun, save each one's history.

    Every channel is written to ``channels/<name>.json`` as a dict with the
    keys 'channel_info' and 'messages'.

    Args:
        slack: Slacker client.
        dryRun: when true, only print the channel names.

    A channel whose download keeps failing until the retry loop gives up is
    reported on stderr together with the last error, instead of being skipped
    silently.
    """
    channels = slack.channels.list().body['channels']
    print("\nfound channels: ")
    for channel in channels:
        print(channel['name'])
    if dryRun:
        return
    parentDir = "channels"
    mkdir(parentDir)
    for channel in channels:
        print("getting history for channel {0}".format(channel['name']))
        fileName = "{parent}/{file}.json".format(parent = parentDir, file = channel['name'])
        saved = False
        lastError = None
        for retry in retryloop(10000, timeout=2):
            try:
                messages = getHistory(slack.channels, channel['id'])
                channelInfo = slack.channels.info(channel['id']).body['channel']
                with open(fileName, 'w') as outFile:
                    print("writing {0} records to {1}".format(len(messages), fileName))
                    json.dump({'channel_info': channelInfo, 'messages': messages }, outFile, indent=4)
                saved = True
            except Exception as error:
                # Keep the error so that giving up can explain itself.
                lastError = error
                retry()
        if not saved:
            sys.stderr.write(
                "WARNING: gave up on channel {0}; last error: {1!r}\n".format(
                    channel['name'], lastError))
# fetch and write history for all direct message conversations
# also known as IMs in the slack API.
def getDirectMessages(slack, ownerId, userIdNameMap, dryRun):
    """List all 1:1 conversations and, unless dryRun, save each one's history.

    Every conversation is written to ``direct_messages/<user name>.json`` as
    a dict with the keys 'channel_info' (the two member ids) and 'messages'.

    Args:
        slack: Slacker client.
        ownerId: user id of the account the token belongs to.
        userIdNameMap: dict of user id -> user name; ids missing from it are
            shown as "<id> (name unknown)".
        dryRun: when true, only print the user names.

    A conversation whose download keeps failing until the retry loop gives up
    is reported on stderr together with the last error, instead of being
    skipped silently.
    """
    dms = slack.im.list().body['ims']
    print("\nfound direct messages (1:1) with the following users:")
    for dm in dms:
        print(userIdNameMap.get(dm['user'], dm['user'] + " (name unknown)"))
    if dryRun:
        return
    parentDir = "direct_messages"
    mkdir(parentDir)
    for dm in dms:
        name = userIdNameMap.get(dm['user'], dm['user'] + " (name unknown)")
        print("getting history for direct messages with {0}".format(name))
        fileName = "{parent}/{file}.json".format(parent = parentDir, file = name)
        saved = False
        lastError = None
        for retry in retryloop(10000, timeout=2):
            try:
                messages = getHistory(slack.im, dm['id'])
                channelInfo = {'members': [dm['user'], ownerId]}
                with open(fileName, 'w') as outFile:
                    print("writing {0} records to {1}".format(len(messages), fileName))
                    json.dump({'channel_info': channelInfo, 'messages': messages}, outFile, indent=4)
                saved = True
            except Exception as error:
                # Keep the error so that giving up can explain itself.
                lastError = error
                retry()
        if not saved:
            sys.stderr.write(
                "WARNING: gave up on direct messages with {0}; last error: {1!r}\n".format(
                    name, lastError))
# fetch and write history for all private channels
# also known as groups in the slack API.
def getPrivateChannels(slack, dryRun):
    """List all private channels and, unless dryRun, save each one's history.

    Every private channel is written to ``private_channels/<name>.json`` as a
    dict with the keys 'channel_info' and 'messages'.

    Args:
        slack: Slacker client.
        dryRun: when true, only print the names and member counts.

    A private channel whose download keeps failing until the retry loop gives
    up is reported on stderr together with the last error, instead of being
    skipped silently.
    """
    groups = slack.groups.list().body['groups']
    print("\nfound private channels:")
    for group in groups:
        print("{0}: ({1} members)".format(group['name'], len(group['members'])))
    if dryRun:
        return
    parentDir = "private_channels"
    mkdir(parentDir)
    for group in groups:
        print("getting history for private channel {0} with id {1}".format(group['name'], group['id']))
        fileName = "{parent}/{file}.json".format(parent = parentDir, file = group['name'])
        saved = False
        lastError = None
        for retry in retryloop(10000, timeout=2):
            try:
                messages = getHistory(slack.groups, group['id'])
                channelInfo = slack.groups.info(group['id']).body['group']
                with open(fileName, 'w') as outFile:
                    print("writing {0} records to {1}".format(len(messages), fileName))
                    json.dump({'channel_info': channelInfo, 'messages': messages}, outFile, indent=4)
                saved = True
            except Exception as error:
                # Keep the error so that giving up can explain itself.
                lastError = error
                retry()
        if not saved:
            sys.stderr.write(
                "WARNING: gave up on private channel {0}; last error: {1!r}\n".format(
                    group['name'], lastError))
# fetch all users for the channel and return a map userId -> userName
def getUserMap(slack):
    """Return a dict of user id -> user name for every member slack.users.list() reports.

    Also prints how many users were found.
    """
    members = slack.users.list().body['members']
    idToName = {member['id']: member['name'] for member in members}
    print("found {0} users ".format(len(members)))
    return idToName
# get basic info about the slack channel to ensure the authentication token works
def doTestAuth(slack):
    """Call slack.auth.test(), print the team and user it reports, and return its body.

    Serves as an early check that the token is accepted before any history is fetched.
    """
    authInfo = slack.auth.test().body
    print("Successfully authenticated for team {0} and user {1} ".format(
        authInfo['team'], authInfo['user']))
    return authInfo
if __name__ == "__main__":
    # Command line entry point: authenticate, write metadata.json, then
    # download each kind of conversation unless its --skip* flag is given.
    parser = argparse.ArgumentParser(description='download slack history')
    # The token is mandatory: without it Slacker would be created with None
    # and the run would only fail later with an API error.
    parser.add_argument(
        '--token',
        required=True,
        help="an api token for a slack user")
    parser.add_argument(
        '--dryRun',
        action='store_true',
        default=False,
        help="if dryRun is true, don't fetch/write history only get channel names")
    parser.add_argument(
        '--skipPrivateChannels',
        action='store_true',
        default=False,
        help="skip fetching history for private channels")
    parser.add_argument(
        '--skipChannels',
        action='store_true',
        default=False,
        help="skip fetching history for channels")
    parser.add_argument(
        '--skipDirectMessages',
        action='store_true',
        default=False,
        help="skip fetching history for directMessages")
    args = parser.parse_args()

    slack = Slacker(args.token)
    # Fails early if the token is not accepted.
    testAuth = doTestAuth(slack)
    userIdNameMap = getUserMap(slack)
    dryRun = args.dryRun

    if not dryRun:
        # Record who ran the export and the id -> name map next to the
        # per-conversation files.
        with open('metadata.json', 'w') as outFile:
            print("writing metadata")
            metadata = {
                'auth_info': testAuth,
                'users': userIdNameMap
            }
            json.dump(metadata, outFile, indent=4)

    if not args.skipChannels:
        getChannels(slack, dryRun)
    if not args.skipPrivateChannels:
        getPrivateChannels(slack, dryRun)
    if not args.skipDirectMessages:
        getDirectMessages(slack, testAuth['user_id'], userIdNameMap, dryRun)
@Benoit99

This comment has been minimized.

Copy link

@Benoit99 Benoit99 commented Feb 2, 2018

Thanks for working out the rate limiting!!

@Benoit99

This comment has been minimized.

Copy link

@Benoit99 Benoit99 commented Feb 2, 2018

I've run this twice and it appears it is skipping some channels altogether. I didn't notice at first, but I realized one of our biggest channels "general" wasn't in the resulting channels file with the other json files. I noticed a few other channels missing too. Is there something I need to change in the script? Thanks!

@Benoit99

This comment has been minimized.

Copy link

@Benoit99 Benoit99 commented Feb 5, 2018

It may not be pretty but I was able to throttle it enough to get all of our channels. The changes I made are in bold.
if not dryRun:
parentDir = "channels"
mkdir(parentDir)
for channel in channels:
print("getting history for channel {0}".format(channel['name']))
fileName = "{parent}/{file}.json".format(parent = parentDir, file = channel['name'])
time.sleep(30)
for retry in retryloop(10000, timeout=300):

I also realized why I didn't get an error when it bypassed the larger channels. At the end of the "def retryloop" I saw that "raise RetryError" was commented out, which resulted in channels getting skipped without me realizing it. I took the leading # off because I want to get the error. I suppose others might not, so to each their own.
We have 138 public channels, almost 300K public messages. Our general channel is over 10K messages. It definitely took a while to run, but the upside was Slack didn't stop the request as it apparently was sufficiently throttled.
I'm still learning, so if there is an easier/better way to throttle this, I'm all ears. In the meantime, I know this works.

@kquinsland

This comment has been minimized.

Copy link

@kquinsland kquinsland commented Mar 14, 2018

@Benoit99, There's a simpler way to prevent the rate-limits from choking you out!

You'd need to add a small sleep() call to the script where it determines if it needs to fetch the next page or not.

Here's what the updated portion of my getHistory function looks like:

    if (response['has_more'] == True):
      lastTimestamp = messages[-1]['ts'] # -1 means last element in a list
      print("Sleeping a second to avoid rate limits....")
      sleep(2)
    else:
      break
  return messages
@escowles

This comment has been minimized.

Copy link

@escowles escowles commented Mar 23, 2018

I'd recommend updating this to note that tokens are now "legacy" but can still be generated at https://api.slack.com/custom-integrations/legacy-tokens

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment