Created
June 13, 2017 20:19
-
-
Save shreyansh26/0d9ed62940e5ac4e830ea765427f82f4 to your computer and use it in GitHub Desktop.
A web scraper in Python to download all XKCD comics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python3
# downloadXkcd.py - Downloads every single XKCD comic.
#
# Starts at the front page, saves the comic image found on each page into
# ./xkcd, then follows that page's "Prev" link. The loop stops when the next
# URL ends in '#', or when a page has no Prev link at all.

import os
from urllib.parse import urljoin

import bs4
import requests

url = 'http://xkcd.com'  # starting url
os.makedirs('xkcd', exist_ok=True)  # store comics in ./xkcd

while not url.endswith('#'):
    # Download the page.
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    # Name the parser explicitly: without it bs4 emits a warning and picks
    # whichever parser happens to be installed, which can differ per machine.
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the comic image.
    comicElem = soup.select('#comic img')
    comicSrc = comicElem[0].get('src') if comicElem else None
    if not comicSrc:
        print('Could not find comic image.')
    else:
        # The src attribute may be scheme-less ("//host/path") or relative;
        # resolve it against the page URL so requests gets an absolute URL.
        comicUrl = urljoin(url, comicSrc)
        # Download the image.
        print('Downloading image %s...' % (comicUrl))
        res = requests.get(comicUrl)
        res.raise_for_status()
        # Save the image to ./xkcd; the with-block closes the file even if a
        # write fails part-way through.
        imagePath = os.path.join('xkcd', os.path.basename(comicUrl))
        with open(imagePath, 'wb') as imageFile:
            for chunk in res.iter_content(100000):
                imageFile.write(chunk)

    # Get the Prev button's url.
    prevLinks = soup.select('a[rel="prev"]')
    if not prevLinks:
        # Without a Prev link there is nowhere left to go; stop cleanly
        # instead of failing with an IndexError.
        print('Could not find Prev link; stopping.')
        break
    # Plain concatenation (not urljoin) is deliberate: an href of '#' must
    # survive as a trailing '#' so the loop condition can detect the end.
    url = 'http://xkcd.com' + prevLinks[0].get('href')

print('Done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment