Skip to content

Instantly share code, notes, and snippets.

Last active December 12, 2017 15:57
Show Gist options
  • Save samarthbhargav/cd89204b11b70eab25eb to your computer and use it in GitHub Desktop.
Save samarthbhargav/cd89204b11b70eab25eb to your computer and use it in GitHub Desktop.
XKCD Scraper - using BeautifulSoup and requests
from bs4 import BeautifulSoup
import requests
import shutil
import glob
from os import listdir
from os.path import isfile, join
def save_image(url, filename):
    """Download ``url`` and write the response body to ``filename``.

    The body is streamed to disk rather than loaded into memory.

    Args:
        url: Absolute URL of the image to fetch.
        filename: Path of the file to create (overwritten if it exists).

    Returns:
        True if the server answered with HTTP 200 and the body was written,
        False for any other status code (no file is written in that case).
    """
    response = requests.get(url, stream=True)
    try:
        if response.status_code != 200:
            return False
        # response.raw is the undecoded socket stream; ask urllib3 to undo
        # any gzip/deflate Content-Encoding so the bytes on disk are the
        # actual image rather than a compressed payload.
        response.raw.decode_content = True
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
        return True
    finally:
        # With stream=True the connection stays checked out until the body
        # is consumed or the response is closed; `del` alone does not
        # release it.
        response.close()
def scrape_xkcd_comics(c_range, folder, base_url="https://xkcd.com"):
    """Download the comic image for every number in ``c_range`` into ``folder``.

    A comic counts as already downloaded when ``folder`` contains a regular
    file whose name, up to the first ".", equals the comic number; those
    numbers are reported and skipped. For the rest, the comic page is
    fetched, the ``<img>`` inside ``<div id="comic">`` is located, and the
    image is saved as ``<folder>/<number>.<ext>`` via ``save_image``.
    Progress and failures are reported on standard output.

    Args:
        c_range: Iterable of comic numbers (consumed in a single pass).
        folder: Directory that holds the downloaded images; created if it
            does not exist yet.
        base_url: Site root the comic number is appended to.
            NOTE(review): the original code requested ``"{}".format(c)``,
            which has no scheme or host; xkcd.com is assumed to be the
            intended site -- confirm.

    Returns:
        None.
    """
    import os  # local import: only listdir/isfile/join are imported at file level

    if not os.path.isdir(folder):
        os.makedirs(folder)

    # A set gives O(1) membership tests and, unlike a map() iterator on
    # Python 3, can be queried any number of times.
    existing = {
        name.split(".")[0]
        for name in listdir(folder)
        if isfile(join(folder, name))
    }

    # Partition in one pass so that one-shot iterators work as well.
    to_download = []
    already_done = []
    for c in c_range:
        if str(c) in existing:
            already_done.append(c)
        else:
            to_download.append(c)
    print("The following are already done, so skipping it: {}".format(already_done))

    root = base_url.rstrip("/")
    for c in to_download:
        xkcd = requests.get("{}/{}/".format(root, c))
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(xkcd.text, "html.parser")
        comic = soup.find("div", {"id": "comic"})
        if comic is None:
            print("Couldn't scrape {}".format(c))
            continue
        if comic.img is None:
            print("{} does not have an image tag".format(c))
            continue

        url = comic.img["src"]
        extension = None
        for candidate in ("png", "jpg", "gif"):
            if url.endswith(candidate):
                extension = candidate
                break
        if extension is None:
            print("Skipping {}".format(c))
            continue

        # Honour the ``folder`` argument instead of a hard-coded "comics/".
        file_name = join(folder, "{}.{}".format(c, extension))
        # Only protocol-relative sources ("//host/path") need a scheme.
        if url.startswith("//"):
            url = "http:{}".format(url)

        if save_image(url, file_name):
            print("{} done".format(c))
        else:
            print("Couldn't do {} for some reason. Damn!".format(c))
# Script entry: request comics numbered 0 through 1510, storing them in "comics".
scrape_xkcd_comics(c_range=range(1511), folder="comics")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment