Skip to content

Instantly share code, notes, and snippets.

@sharno
Created December 21, 2015 12:11
Show Gist options
  • Save sharno/0995a5e1d21d42673f44 to your computer and use it in GitHub Desktop.
Save sharno/0995a5e1d21d42673f44 to your computer and use it in GitHub Desktop.
import urllib2
import urllib
from bs4 import BeautifulSoup
# Root of the GoComics site; the relative "next" links scraped from each
# page are appended to this to form the next absolute URL.
domain = "http://www.gocomics.com"
# Page of the first strip to download; the path ends in YYYY/MM/DD, which
# is also what the saved image's filename is derived from.
url = "http://www.gocomics.com/calvinandhobbes/1985/11/18"
# Browser-like headers sent with every page request (presumably so the site
# does not reject the script's default urllib2 user agent -- unverified).
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}


def make_soup(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``lxml`` parser.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
            request fails.
    """
    from contextlib import closing

    request = urllib2.Request(url, headers=_REQUEST_HEADERS)
    # Python 2 urllib2 responses are not context managers; closing() makes
    # sure the connection is released even if read() raises.
    with closing(urllib2.urlopen(request)) as response:
        html = response.read()
    return BeautifulSoup(html, "lxml")
def get_image(soup, url):
    """Download the comic strip image found in *soup* and return its URL.

    The page is first searched for ``<img class="strip">`` tags queried with
    ``width=None`` (treated as the high-resolution strip, per the fallback
    message below); if none match, any ``<img class="strip">`` is accepted.
    The first match is saved to the current directory as a ``.gif`` named
    after the last ten characters of *url* (``YYYY-MM-DD`` for the dated
    comic URLs this script crawls).

    Args:
        soup: Parsed page, as returned by ``make_soup``.
        url: URL the page was fetched from; used to build the filename.

    Returns:
        The image's ``src`` URL, or ``None`` when the page has no usable
        strip image (nothing is downloaded in that case).
    """
    images = soup.findAll('img', {'class': 'strip', 'width': None})
    print(str(len(images)) + " images found.")
    if not images:
        print('high resolution not found, downloading lower resolution')
        images = soup.findAll('img', {'class': 'strip'})

    link = images[0].get('src') if images else None
    if not link:
        # Skip pages without a strip instead of dying with IndexError, so a
        # single odd page does not abort the whole crawl.
        print('no strip image found for ' + url)
        return None

    # Strip a trailing slash first so the slice still lands on YYYY/MM/DD.
    filename = url.rstrip('/')[-10:].replace('/', '-') + '.gif'
    urllib.urlretrieve(link, filename)
    return link
def next_page(soup):
    """Return the href of the page's "next" link, or ``None`` if absent.

    Only ``<a class="next">`` anchors whose ``href`` contains
    ``'calvinandhobbes'`` are considered; the first such href is returned
    exactly as it appears in the page.

    Args:
        soup: Parsed page, as returned by ``make_soup``.

    Returns:
        The href string of the first matching anchor, or ``None`` when no
        anchor matches, so callers can stop on a falsy result instead of
        hitting an IndexError.
    """
    for anchor in soup.findAll('a', {'class': 'next'}):
        href = anchor.get('href')
        # Anchors without an href yield None; skip them rather than letting
        # the substring test raise TypeError.
        if href and 'calvinandhobbes' in href:
            return href
    return None
# Download the starting strip, then keep following the "next" link,
# downloading one strip per page, until next_page() yields a falsy value.
soup = make_soup(url)
get_image(soup, url)
n = next_page(soup)
while n:
    # Advance to the next page *before* fetching; fetching first would
    # download the starting page a second time and never download the
    # last page that was discovered.
    url = domain + n
    print(url)
    soup = make_soup(url)
    get_image(soup, url)
    n = next_page(soup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment