# Twitpic Image and Data Archiver: archive your Twitpic photos and metadata.
# Adds exception handling, timeouts, and retries to page-info and image downloads.
#
# A cleaned-up fork of Terence Eden's original archiver:
# http://shkspr.mobi/blog/2013/08/exporting-twitpic-images-python/
#
# License: MIT
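#
# Usage (Python 2, which provides urllib2): set USERNAME below, then run the
# script from the directory where the archive should be saved, e.g.
#
#   python archive-twitpic-data.py
#
# (That file name comes from the forked gist and is only illustrative; any
# file name works.)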
import shutil
import urllib2
import socket
import json
import time
import os
USERNAME = "your_username_goes_here"
TMP_FILE_NAME = "tmpfile"
MAX_RETRIES = 5  # attempts per page or image before giving up
SLEEP_TIME = 2   # seconds to wait between retries
TIMEOUT = 5      # urlopen's timeout argument is in seconds, not milliseconds
page = 1
has_more_page = True
# Twitpic user API endpoint; the page number is appended per request
api = "https://api.twitpic.com/2/users/show.json?username=%s&page=" % USERNAME
# Get the data about the target page
while has_more_page:
    print "Processing page: " + str(page)
    has_page_error = True
    for i in range(MAX_RETRIES):
        try:
            raw_data = urllib2.urlopen(api + str(page), timeout=TIMEOUT)
            has_page_error = False
            if i > 0:
                print "Retry successful for page: " + str(page)
            break
        except urllib2.URLError:
            print "Failed retrieving page: " + str(page)
            time.sleep(SLEEP_TIME)
        except socket.timeout:
            print "Timeout retrieving page: " + str(page)
            time.sleep(SLEEP_TIME)
    if has_page_error:
        # All retries failed; assume there is nothing more to fetch
        has_more_page = False
        break
    json_data = json.load(raw_data)
    # Save the raw page metadata alongside the images
    with open("page-%s.json" % page, "w") as page_file:
        page_file.write(json.dumps(json_data, indent=2))
    # Get the info about each image on the page
    images = json_data["images"]
    if not images:
        # An empty page means the end of the archive has been reached
        break
    page += 1
    for item in images:
        file_id = item["short_id"]
        file_type = item["type"]
        # Parse the upload timestamp so it can be stamped onto the file
        file_time = time.mktime(time.strptime(item["timestamp"], "%Y-%m-%d %H:%M:%S"))
        file_url = "https://twitpic.com/show/full/" + file_id
        file_name = file_id + "." + file_type
        if not os.path.exists(file_name):
            for i in range(MAX_RETRIES):
                # Remove the temp file if it exists
                try:
                    os.remove(TMP_FILE_NAME)
                except OSError:
                    pass
                try:
                    # Download to a temporary file first
                    req = urllib2.urlopen(file_url, timeout=TIMEOUT)
                    with open(TMP_FILE_NAME, "wb") as tmp_file:
                        shutil.copyfileobj(req, tmp_file)
                    # Rename to the final name only after a complete download
                    os.rename(TMP_FILE_NAME, file_name)
                    # Set the file's mtime to the photo's upload time
                    os.utime(file_name, (file_time, file_time))
                    if i > 0:
                        print "Retry successful for image ID: " + file_id
                    break
                except urllib2.URLError:
                    print "Failed retrieving image ID: " + file_id
                    time.sleep(SLEEP_TIME)
                except socket.timeout:
                    print "Timeout retrieving image ID: " + file_id
                    time.sleep(SLEEP_TIME)
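
# The parsing above assumes the Twitpic API returned each page as JSON shaped
# roughly like the sketch below. The field names are exactly the ones the
# loop reads; the values are purely illustrative:
#
# {
#   "images": [
#     {
#       "short_id": "abc123",
#       "type": "jpg",
#       "timestamp": "2013-08-01 12:34:56"
#     },
#     ...
#   ]
# }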