Skip to content

Instantly share code, notes, and snippets.

@icedraco
Created November 27, 2014 09:52
Show Gist options
  • Save icedraco/edbc32b12e115fefadea to your computer and use it in GitHub Desktop.
Save icedraco/edbc32b12e115fefadea to your computer and use it in GitHub Desktop.
Retrieves a library/album page from a given PhotoBucket URL and extracts all the URLs to the full images
###--# PhotoBucket URL Extractor 0.1 [20140730-2207]
#
# This script retrieves a library/album page from a given URL using urllib2 and
# extracts all the URLs to the full images featured in that page. The URLs are
# printed to stdout and can later be redirected through the shell.
#
# Author: IceDragon <icedragon at quickfox org>
import urllib2
import re
import sys
# Captures the quoted value that follows a "fullsizeUrl" key; the value is
# taken as everything up to the next double quote.
RE_FULLSIZE_URL_PART = re.compile(r'"fullsizeUrl":"([^"]*)"')
def clean_url(dirty_url):
    r'''Return dirty_url with every backslash character removed.

    dirty_url -- URL text as captured from the fullsizeUrl JSON data

    JSON string escaping can leave backslashes in the captured text (for
    example ``\/`` standing for ``/``); removing them gives the plain URL.
    A string without backslashes is returned unchanged, so cleaning the
    same URL twice is harmless.
    '''
    # The docstring is a raw string so that the literal backslashes in it
    # are not parsed as (invalid) escape sequences.
    return dirty_url.replace('\\', '')
def find_urls(html_data):
    '''Return a list of the full-size image URLs found in a PhotoBucket page.

    html_data -- page source text to scan

    Every value captured by RE_FULLSIZE_URL_PART is passed through
    clean_url(). The URLs are returned in the order they occur in
    html_data; the list is empty when nothing matches.
    '''
    # A list comprehension (rather than map()) yields a real list on
    # Python 3 too, where map() returns a one-shot iterator.  No "global"
    # statement is needed just to read a module-level name.
    return [clean_url(raw_url)
            for raw_url in RE_FULLSIZE_URL_PART.findall(html_data)]
def main(argv):
    '''Print the full-size image URLs of every page listed in argv.

    argv -- iterable of page URLs to fetch (program name not included)

    Each page is downloaded with urllib2.urlopen(), scanned with
    find_urls(), and the resulting URLs are printed to stdout one per
    line.  Returns 0.  Errors raised while downloading are not caught
    here and propagate to the caller.
    '''
    for scan_url in argv:
        response = urllib2.urlopen(scan_url)
        try:
            html_data = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()

        # find_urls() already returns cleaned URLs, so no second
        # clean_url() pass is needed before printing.
        for image_url in find_urls(html_data):
            # Single-argument print(...) gives the same output under the
            # Python 2 print statement and the Python 3 print function.
            print(image_url)
    return 0
if __name__ == "__main__":
    # Script entry point: pass the command-line arguments (minus the
    # program name) to main() and exit with its return value.
    sys.exit(main(sys.argv[1:]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment