Skip to content

Instantly share code, notes, and snippets.

@sadovnychyi
Last active August 29, 2015 14:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sadovnychyi/11025054 to your computer and use it in GitHub Desktop.
Save sadovnychyi/11025054 to your computer and use it in GitHub Desktop.
Google Developers docs image downloader.
import urllib2
from bs4 import BeautifulSoup
import urllib
import os
import sys
import Queue
import threading
import logging
import time
# Append timestamped records to log.txt. New and changed images are reported
# through this log (see the logging.critical calls in update_picture).
# NOTE: the keyword is `filemode`, not `mode`; an unknown keyword is silently
# ignored by Python 2 and rejected with ValueError by newer versions.
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    filename='log.txt', filemode='a', level=logging.DEBUG)
def update_picture(url, attempt=None):
    """Download the image at *url* and save it locally if it is new or changed.

    The image is stored in the current working directory under a file name
    derived from the URL. If a file with that name already exists its bytes
    are compared with the freshly downloaded ones and the file is rewritten
    only when they differ. New and changed images are reported through
    ``logging.critical``.

    Args:
        url: URL of the image to fetch.
        attempt: Number of the first download attempt (1-based). ``None``
            means 1. Attempts numbered above 5 are not made.

    Returns:
        None. Gives up (after logging an error) when every attempt failed.
    """
    max_attempts = 5
    if attempt is None:
        attempt = 1

    img = None
    while attempt <= max_attempts:
        print('Checking picture %s...' % url)
        try:
            # urllib2 (unlike urllib.urlopen) raises HTTPError for error
            # statuses, so an error page is never stored as an image.
            response = urllib2.urlopen(url)
            try:
                img = response.read()
            finally:
                response.close()
        except (urllib2.HTTPError, urllib2.URLError, IOError):
            print('error')
            attempt += 1
        else:
            break

    if img is None:
        # `attempt` points one past the last attempt that was actually made.
        logging.error('Download of %s failed after %s attempts.',
                      url, attempt - 1)
        return

    # Flatten the URL into a single file name. The exact replacement chain is
    # kept unchanged so files saved by earlier runs are still found.
    fname = url.replace('/', '-').replace(':', '-').replace('--', '-')

    if os.path.isfile(fname):
        # Compare raw bytes: the file is written in binary mode, so it must
        # be read back in binary mode as well.
        with open(fname, 'rb') as existing:
            if existing.read() == img:
                return
        logging.critical('Image updated: %s', url)
    else:
        logging.critical('New image found: %s', url)

    with open(fname, 'wb') as out:
        out.write(img)
def parse(url='https://developers.google.com/appengine',
          base='https://developers.google.com/appengine',
          visited=None):
    """Crawl *url* and the pages beneath it, checking every image under *base*.

    The page is fetched and parsed with BeautifulSoup. Each ``<a href>`` whose
    absolute form starts with *url* (and is not *url* itself) is crawled
    recursively; each ``<img src>`` whose absolute form starts with *base* is
    handed to ``update_picture``. Hrefs and srcs starting with ``/`` are
    resolved against ``https://developers.google.com``.

    Args:
        url: Page to fetch.
        base: URL prefix an image must have to be checked.
        visited: Set of page and image URLs already handled during this
            crawl; shared by the recursive calls so nothing is fetched twice.
            Callers normally omit it and a fresh set is created.

    Returns:
        None. A page that cannot be opened is reported with ``print`` and
        skipped.
    """
    host = 'https://developers.google.com'
    if visited is None:
        visited = set()
    visited.add(url)

    print('Parsing: %s' % url)
    try:
        response = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError, IOError):
        print('error')
        return
    try:
        content = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(content, 'html.parser')

    for link in soup.findAll('a', href=True):
        href = link['href']
        if href.startswith('/'):
            href = host + href
        # A fragment addresses a position inside the same document, so
        # "page#section" must not be crawled as a separate page.
        href = href.split('#', 1)[0]
        # Only descend: the target must extend the current URL.
        if href.startswith(url) and href != url and href not in visited:
            parse(href, base=base, visited=visited)

    for image in soup.findAll('img', src=True):
        src = image['src']
        if src.startswith('/'):
            src = host + src
        if src.startswith(base) and src not in visited:
            # Images shared by several pages are checked once per crawl.
            visited.add(src)
            update_picture(src)
# Documentation roots to monitor. Built once (it never changes between
# passes); set() guards against accidental duplicates and sorted() makes the
# crawl order deterministic.
urls = sorted(set([
    'https://developers.google.com/appengine',
    'https://developers.google.com/glass/',
    'https://developers.google.com/bigquery/',
    'https://developers.google.com/datastore/',
    'https://developers.google.com/cloud-dns/',
    'https://developers.google.com/compute/',
    'https://developers.google.com/cloud-sql/',
    'https://developers.google.com/storage/',
    'https://developers.google.com/prediction/',
    'https://developers.google.com/translate/',
    'https://developers.google.com/mobile/',
    'https://developers.google.com/startups/',
    'https://developers.google.com/games/',
    'https://developers.google.com/webmasters/',
    'https://developers.google.com/chrome/',
    'https://developers.google.com/advertise/',
    'https://developers.google.com/monetize/',
    'https://developers.google.com/cloud/',
    'https://developers.google.com/+/',
    'https://developers.google.com/google-apps/',
    'https://developers.google.com/android/',
    'https://developers.google.com/tv/',
    'https://developers.google.com/wallet/',
    'https://developers.google.com/analytics/',
    'https://developers.google.com/youtube/',
    'https://developers.google.com/international/',
    'https://developers.google.com/maps/',
    'https://developers.google.com/freebase/',
    'https://developers.google.com/oauthplayground/',
    'https://developers.google.com/orkut/',
    'https://developers.google.com/kml/',
    'https://developers.google.com/experts/',
    'https://developers.google.com/eclipse/',
    'https://developers.google.com/chart/',
    'https://developers.google.com/loader/',
    'https://developers.google.com/cast/',
    'https://developers.google.com/genomics/',
    'https://developers.google.com/gdata/',
    'https://developers.google.com/discovery/',
    'https://developers.google.com/speed/',
    'https://developers.google.com/places/',
    'https://developers.google.com/v8/',
    'https://developers.google.com/blogger/',
    'https://developers.google.com/accounts/',
    'https://developers.google.com/fusiontables/',
    'https://developers.google.com/octane/',
    'https://developers.google.com/earth/',
    'https://developers.google.com/transit/',
    'https://developers.google.com/gadgets/',
    'https://developers.google.com/fonts/',
    'https://developers.google.com/recaptcha/',
    'https://developers.google.com/picker/',
    'https://developers.google.com/feed/',
    'https://developers.google.com/coordinate/',
    'https://developers.google.com/spectrum/',
    'https://developers.google.com/adwords/api/',
    'https://developers.google.com/console/help/',
    'https://developers.google.com/wallet/objects/',
    'https://developers.google.com/maps-engine/',
    'https://developers.google.com/identity-toolkit/',
    'https://developers.google.com/drive/',
    'https://developers.google.com/doubleclick-publishers/',
    'https://developers.google.com/custom-search/',
    'https://developers.google.com/civic-information/',
    'https://developers.google.com/cloud-print/',
    'https://developers.google.com/public-data/',
    'https://developers.google.com/webmaster-tools/',
    'https://developers.google.com/virtual-keyboard/',
    'https://developers.google.com/shopping-content/',
    'https://developers.google.com/igoogle-themes/',
    'https://developers.google.com/translator-toolkit/',
]))

# Monitor forever: crawl every root, then pause briefly before the next pass.
while True:
    for url in urls:
        # Each root doubles as the image prefix, so only images that live
        # under the root being crawled are checked.
        parse(url, url)
    print('Finished...')
    time.sleep(5)
@shantanuthatte
Copy link

You may want to add developer.android.com to the list.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment