Created
November 27, 2014 09:52
-
-
Save icedraco/edbc32b12e115fefadea to your computer and use it in GitHub Desktop.
Retrieves a library/album page from a given PhotoBucket URL and extracts all the URLs to the full images
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###--# PhotoBucket URL Extractor 0.1 [20140730-2207]
#
# This script retrieves a library/album page from a given URL using urllib2 and
# extracts all the URLs to the full images featured in that page. The URLs are
# printed to stdout and can later be redirected through the shell.
#
# Author: IceDragon <icedragon at quickfox org>
import urllib2 | |
import re | |
import sys | |
# Captures the raw value of every "fullsizeUrl" string field found in the
# page data; the captured text is everything up to the next double quote.
RE_FULLSIZE_URL_PART = re.compile(r'"fullsizeUrl":"([^"]*)"')
def clean_url(dirty_url):
    """Return dirty_url with every backslash character removed.

    The URLs taken from the fullsizeUrl JSON data carry backslashes;
    dropping them yields the plain URL.
    """
    # Splitting on the backslash and gluing the pieces back together removes
    # every occurrence of it.
    pieces = dirty_url.split('\\')
    return ''.join(pieces)
def find_urls(html_data):
    """Return a list of the image URLs featured in a PhotoBucket HTML page.

    Every "fullsizeUrl" value matched by RE_FULLSIZE_URL_PART is passed
    through clean_url(); the results keep the order in which the matches
    appear in html_data. A page with no matches yields an empty list.
    """
    # A list comprehension guarantees the documented list return type on any
    # interpreter (map() is lazy on Python 3). No `global` statement is needed
    # because the module-level pattern is only read here, never rebound.
    return [clean_url(raw) for raw in RE_FULLSIZE_URL_PART.findall(html_data)]
def main(argv):
    """Print the full-size image URLs found on each page listed in argv.

    argv -- iterable of library/album page URLs to scan.

    Each page is downloaded with urllib2.urlopen() and every URL returned by
    find_urls() is printed on its own line. Returns 0, which the script entry
    point uses as the process exit status. Errors raised while downloading
    are not caught here and propagate to the caller.
    """
    # Imported locally so this block does not depend on a new file-level
    # import; urllib2 responses are not context managers, hence closing().
    from contextlib import closing

    for scan_url in argv:
        # Close the HTTP response as soon as the body has been read instead
        # of leaving the connection open until garbage collection.
        with closing(urllib2.urlopen(scan_url)) as response:
            html_data = response.read()

        # find_urls() already runs clean_url() on every match, so the URLs
        # are printed as returned rather than being cleaned a second time.
        for image_url in find_urls(html_data):
            print(image_url)

    return 0
# Run the extractor only when executed as a script; the value returned by
# main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment