Skip to content

Instantly share code, notes, and snippets.

@nolanlawson
Created December 4, 2017 00:36
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save nolanlawson/52657eb931243c596004adaccc6956fd to your computer and use it in GitHub Desktop.
Script to back up media (images) in a Twitter archive
#!/usr/bin/env python
# Given a Twitter archive, replace all references to Twitter-hosted images with downloaded local copies.
# Usage: place this script in the directory of the Twitter export (at the same level as the index.html file) and run it from there.
# Note: does not work for video. Images that fail to download are skipped.
import errno
import io
import os
import re

import requests
# Running counter of saved media files; used to build unique local file names.
media_num = 0
def mkdir_p(path):
    """Create directory *path*, including missing parents, like ``mkdir -p``.

    An already existing directory is not an error. Any other ``OSError``
    (for example when *path* exists but is not a directory) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only "already exists, and it is a directory" may be ignored.
        already_a_directory = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
# Main script: for every tweet data file in data/js/tweets, download each
# referenced media URL into media_backup/ and rewrite the reference inside the
# data file so that it points at the local copy.
#
# print(...) with a single argument and io.open() are used so the script runs
# on both Python 2 and Python 3.

# Seconds to wait for the server before treating a download as failed, so a
# stalled connection cannot hang the whole run.
request_timeout = 30

# Extension of the last URL path segment, ignoring a trailing ":size" suffix,
# query string or fragment (e.g. "photo.jpg:large" -> ".jpg").
extension_re = re.compile(r'(\.[A-Za-z0-9]+)(?:[:?#].*)?$')

mkdir_p('media_backup')
failed_count = 0
success_count = 0
for tweetjs in os.listdir('data/js/tweets'):
    if not tweetjs.endswith('.js'):
        continue
    print("Processing %s..." % tweetjs)
    tweet_js_filename = os.path.join('data/js/tweets', tweetjs)
    # newline='' keeps the file's line endings untouched on the round trip.
    # NOTE(review): assumes the archive's data files are UTF-8 - confirm.
    with io.open(tweet_js_filename, 'r', encoding='utf-8', newline='') as fd:
        javascript = fd.read()
    urls = (re.findall(r'"media_url" : "([^"]+)"', javascript)
            + re.findall(r'"media_url_https" : "([^"]+)"', javascript))
    old_failed_count = failed_count
    old_success_count = success_count
    seen_urls = set()
    for url in urls:
        # The first successful download already replaces every occurrence of
        # this URL, so fetching it again would only create orphan files.
        if url in seen_urls:
            continue
        seen_urls.add(url)
        # The data files store URLs with escaped slashes ("http:\/\/...").
        real_url = url.replace('\\/', '/')
        try:
            response = requests.get(real_url, timeout=request_timeout)
        except requests.exceptions.RequestException:
            # Network error or timeout: skip this image like any other failure.
            failed_count += 1
            continue
        if response.status_code != requests.codes.ok:  # e.g. 404, do nothing
            failed_count += 1
            continue
        success_count += 1
        # Look for the extension only in the last path segment, so a URL
        # without one cannot yield something like ".com/media/xyz".
        ext_match = extension_re.search(real_url.rsplit('/', 1)[-1])
        extension = ext_match.group(1) if ext_match else ''
        media_num += 1
        basename = 'media_%d%s' % (media_num, extension)
        with open(os.path.join('media_backup', basename), 'wb') as fd:
            fd.write(response.content)
        # Always use a forward slash in the rewritten reference: a Windows
        # path separator would be a backslash inside the JS string.
        javascript = javascript.replace(url, 'media_backup/' + basename)
    # Only rewrite the data file when at least one reference was replaced.
    if success_count != old_success_count:
        with io.open(tweet_js_filename, 'w', encoding='utf-8', newline='') as fd:
            fd.write(javascript)
    print(" Backed up %d, failed on %d" % (success_count - old_success_count, failed_count - old_failed_count))
print("Successfully backed up %d pieces of media, failed on %d" % (success_count, failed_count))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment