@vrypan
Last active December 24, 2015 16:47
Improve an exported Twitter archive
#! /usr/bin/env python
"""
This script will parse an unzipped Twitter archive export,
look for media links and download them locally, and replace
the links in the export to point to the local media copies.
It will also do the same for user avatars.
For more info visit: https://blog.vrypan.net/2015/12/24/how-to-archive-your-tweets/
"""
import urllib2
from urlparse import urlparse
import json
import os
import sys
import httplib
import sqlite3
import codecs
db_conn = sqlite3.connect(
    os.path.join(os.path.expanduser('~'), 'expanded_urls.db'),
    isolation_level="IMMEDIATE"
)
db_conn.row_factory = sqlite3.Row
db_cur = db_conn.cursor()
db_cur.execute("""CREATE TABLE IF NOT EXISTS REDIRECTS (
    src TEXT UNIQUE NOT NULL,
    dst TEXT NOT NULL);
""")
def expand_url(url, depth=0):
    print "Expanding %s..." % url
    if depth > 10:
        return unicode(url)
    parsed = urlparse(url)
    try:
        h = httplib.HTTPConnection(parsed.netloc)
        h.request('HEAD', url)
        response = h.getresponse()
        print response.status, response.reason
        if response.status in range(300, 400) and response.getheader('Location'):
            return expand_url(response.getheader('Location'), depth + 1)
        else:
            return unicode(url)
    except:
        print '** Failed to expand %s' % url
        return unicode(url)
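# Example (hypothetical link): expand_url('http://t.co/abc123') sends a HEAD
# request; t.co usually answers "301 Moved Permanently" with a Location
# header, so the function recurses (at most 10 hops deep) and returns the
# final, non-redirecting URL as unicode.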
def _update_links_urls(i):
    # Walk the tweet structure recursively and replace every short URL in a
    # 'urls' entity with its fully expanded destination.
    if type(i) is dict:
        for k in i:
            if k == 'urls' and len(i[k]) > 0:
                urls = i[k]
                for u in urls:
                    if 'original_url' in u:
                        # Already processed on a previous run.
                        continue
                    db_cur.execute("SELECT * FROM REDIRECTS WHERE src=?", (u['url'],))
                    r = db_cur.fetchone()
                    if r:
                        x_url = r['dst']
                    else:
                        x_url = expand_url(u['url'])
                        db_cur.execute("INSERT INTO REDIRECTS VALUES(?,?)", (u['url'], x_url))
                        db_conn.commit()
                    u['original_url'] = u['url']
                    u['url'] = x_url
                    u['expanded_url'] = u['url']
                    parsed_url = urlparse(u['url'])
                    u['display_url'] = parsed_url.netloc + parsed_url.path
                    if parsed_url.query:
                        # Keep the '?' separator when a query string is present.
                        u['display_url'] += '?' + parsed_url.query
                    if len(u['display_url']) > 27:
                        u['display_url'] = u['display_url'][0:26] + '...'
            else:
                if type(i[k]) is dict or type(i[k]) is tuple or type(i[k]) is list:
                    i[k] = _update_links_urls(i[k])
    if type(i) is list or type(i) is tuple:
        i = [_update_links_urls(j) for j in i]
    return i
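# Sketch of a 'urls' entity before and after processing (values are
# illustrative, not real tweet data):
#   before: {'url': 'http://t.co/abc123',
#            'expanded_url': 'http://t.co/abc123',
#            'display_url': 't.co/abc123'}
#   after:  {'url': 'https://example.com/post',
#            'expanded_url': 'https://example.com/post',
#            'original_url': 'http://t.co/abc123',
#            'display_url': 'example.com/post'}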
def _update_media_urls(i):
    # Walk the tweet structure recursively, download every media file and
    # avatar into img/, and point the corresponding fields at the local copy.
    if type(i) is dict:
        for k in i:
            if k in ('media_url', 'media_url_https', 'profile_image_url_https'):
                url = i[k]
                local_file = os.path.join('img', urlparse(url).path.split('/')[-1])
                if not os.path.isfile(local_file):
                    try:
                        if not os.path.isdir('img'):
                            os.makedirs('img')
                        media_file = urllib2.urlopen(url)
                        # Images are binary data: write raw bytes, not utf-8 text.
                        output = open(local_file, 'wb')
                        output.write(media_file.read())
                        output.close()
                        i[k] = local_file
                    except:
                        print '** Failed to download %s' % url
                else:
                    i[k] = local_file
            else:
                if type(i[k]) is dict or type(i[k]) is tuple or type(i[k]) is list:
                    i[k] = _update_media_urls(i[k])
    if type(i) is list or type(i) is tuple:
        i = [_update_media_urls(j) for j in i]
    return i
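# For example, a (hypothetical) media entity such as
#   'media_url': 'http://pbs.twimg.com/media/ABCDEF.jpg'
# becomes
#   'media_url': 'img/ABCDEF.jpg'
# a path relative to wherever the script is run, so run it from the
# archive's root directory if you want the HTML pages to find the files.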
if len(sys.argv) == 1:
    print './improve_twitter_archive.py <path to unzipped Twitter export>'
else:
    path = sys.argv[1]
    for data_file in os.listdir(os.path.join(path, 'data', 'js', 'tweets')):
        print '\nParsing %s.' % data_file
        data_raw = codecs.open(
            os.path.join(path, 'data', 'js', 'tweets', data_file),
            mode='r', encoding='utf-8'
        ).read().splitlines(True)
        if data_raw:
            # Skip the "Grailbird.data... =" first line and parse the JSON array.
            data_js = json.loads('\n'.join(data_raw[1:]))
            data_js_updated = json.dumps(
                _update_media_urls(_update_links_urls(data_js)),
                indent=4, separators=(',', ': '))
            fp = codecs.open(
                os.path.join(path, 'data', 'js', 'tweets', data_file),
                encoding='utf-8', mode='w')
            # Write the original first line back, followed by the updated JSON.
            fp.write(data_raw[0])
            fp.write(data_js_updated)
            fp.close()
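# Usage sketch (assuming the export was unzipped to ~/twitter-archive; run
# from inside it so the img/ folder is created next to index.html):
#   cd ~/twitter-archive
#   python improve_twitter_archive.py .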
Fil commented Dec 24, 2015

In http://blog.vrypan.net/2015/12/24/how-to-archive-your-tweets/index.html you write "python ./download_twitter_media.py", but here the script is called improve_twitter_archive.py; also, you need to give it the path to your archive.
