Last active
February 4, 2017 22:41
-
-
Save cwoebker/ac77b70e62adbce29616 to your computer and use it in GitHub Desktop.
A simple script that crawls NASA's Astronomy Picture of the Day.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
********************************************* | |
Nasa's picture of the day since 1995. | |
by Cecil Woebker (http://cwoebker.com) | |
********************************************* | |
Usage: | |
$ ./apod.py | |
Saves all pictures from Nasa's picture of the day archive to the current directory. | |
""" | |
import os | |
import cPickle as pickle | |
import urllib2 | |
from bs4 import BeautifulSoup | |
from clint.textui import progress, puts, colored | |
ROOT_URL = 'http://apod.nasa.gov/apod/' | |
def load():
    """Fetch the APOD archive index and return the link targets it lists.

    Downloads ``archivepix.html`` below ``ROOT_URL``, parses it with
    BeautifulSoup and collects the ``href`` of every anchor found inside the
    first ``<b>`` element of the page.

    Returns:
        list: the ``href`` values, in the order ``find_all`` yields them.

    Raises:
        urllib2.URLError: if the archive page cannot be fetched.
        AttributeError: if the page contains no ``<b>`` element.
    """
    puts("Loading archive...")
    response = urllib2.urlopen(ROOT_URL + 'archivepix.html')
    try:
        data = response.read()
    finally:
        # Release the connection even when the read fails part-way.
        response.close()

    puts("Opening archive...")
    soup = BeautifulSoup(data, 'lxml')
    # href=True skips anchors that carry no link target; indexing those
    # with ['href'] would abort the whole run with a KeyError.
    anchors = soup.find('b').find_all('a', href=True)
    urls = [anchor['href'] for anchor in progress.bar(anchors)]

    puts(colored.green("Found %d links." % len(urls)))
    return urls
def getPhotos(urls, thumbs=False):
    """Resolve each archive page to the image it displays.

    Every entry of *urls* is fetched relative to ``ROOT_URL`` and the first
    ``<img>`` element of the page is looked up. Pages that fail are counted
    per failure category and skipped; the counts are printed at the end.

    As a side effect the resulting mapping is pickled to ``photos.pkl`` in
    the current working directory.

    Args:
        urls: iterable of page links as returned by ``load()``.
        thumbs: when true, record the ``src`` of the ``<img>`` itself;
            otherwise record the ``href`` of the element enclosing it.

    Returns:
        dict: page link -> image reference, for the pages that succeeded.
    """
    puts("Locating Photos...")
    photos = {}
    typeErrorCount = 0
    keyErrorCount = 0
    urlErrorCount = 0

    for url in progress.bar(urls):
        try:
            response = urllib2.urlopen(ROOT_URL + url)
            try:
                data = response.read()
            finally:
                # One request per archive page: close each response
                # explicitly instead of leaving it to the garbage collector.
                response.close()

            soup = BeautifulSoup(data, 'lxml')
            result = soup.find('img')
            if result is None:
                # Page has no image at all; counted with the type errors.
                typeErrorCount += 1
                continue

            if thumbs:
                photos[url] = result['src']
            else:
                photos[url] = result.parent['href']
        except TypeError:
            typeErrorCount += 1
        except KeyError:
            # The expected 'src' / 'href' attribute is missing.
            keyErrorCount += 1
        except urllib2.URLError:
            # Also covers urllib2.HTTPError, which subclasses URLError.
            urlErrorCount += 1

    puts(colored.green("Found %d photos." % len(photos)))
    puts(colored.red("URL Error Count: %d" % urlErrorCount))
    puts(colored.red("Key Error Count: %d" % keyErrorCount))
    puts(colored.red("Type Error Count: %d" % typeErrorCount))

    with open('photos.pkl', 'wb') as output:
        pickle.dump(photos, output, pickle.HIGHEST_PROTOCOL)
    return photos
def downloadPhoto(folder, photo):
    """Download the file at URL *photo* into the directory *folder*.

    The local file name is the last ``/``-separated component of the URL.
    Failures to open the URL are reported on the console and swallowed so
    that one bad link does not abort a long crawl.

    Args:
        folder: existing directory to write into.
        photo: absolute URL of the file to fetch.
    """
    try:
        response = urllib2.urlopen(photo)
    except urllib2.HTTPError as err:
        # Report the real status instead of assuming every failure is a 404.
        puts(colored.red("HTTPError - %d" % err.code))
        return
    except urllib2.URLError as err:
        # Unreachable host / DNS failure: non-fatal, as in getPhotos().
        puts(colored.red("URLError - %s" % err.reason))
        return

    try:
        target = os.path.join(folder, photo.split('/')[-1])
        with open(target, "wb") as localFile:
            localFile.write(response.read())
    finally:
        # Close the connection even if reading or writing fails.
        response.close()
def main():
    """Crawl the archive and download every picture into dated folders.

    Prints the module docstring, collects the archive links with ``load()``,
    resolves them to image references with ``getPhotos()`` and downloads each
    image with ``downloadPhoto()``. A page name is cut into two-character
    chunks and chunks 1-3 become the nested target folder, so a name such as
    ``ap170204.html`` is stored under ``17/02/04``.
    """
    # Local import: only this function needs it (Python 2 module name).
    import urlparse

    print(__doc__)
    urls = load()
    photos = getPhotos(urls)

    puts("--------------")
    puts(colored.yellow("Downloading..."))
    puts("--------------")

    for key, href in progress.bar(list(photos.items())):
        name = key.split('.')[0]
        parts = [name[i:i+2] for i in range(0, len(name), 2)]
        if len(parts) < 4:
            # Name too short to derive a folder from; skip it rather than
            # abort the whole crawl with an IndexError.
            puts(colored.red("Skipping unexpected page name: %s" % key))
            continue

        folder = os.path.join(parts[1], parts[2], parts[3])
        if not os.path.exists(folder):
            os.makedirs(folder)

        # urljoin leaves absolute hrefs intact and resolves relative ones
        # against ROOT_URL; plain concatenation corrupted absolute links.
        item = urlparse.urljoin(ROOT_URL, href)
        downloadPhoto(folder, item)

    puts(colored.green("Finished."))
if __name__ == "__main__": | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment