Skip to content

Instantly share code, notes, and snippets.

@jatinkrmalik
Created March 22, 2017 07:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jatinkrmalik/b540615e7c6b449430f456a72b897d77 to your computer and use it in GitHub Desktop.
Save jatinkrmalik/b540615e7c6b449430f456a72b897d77 to your computer and use it in GitHub Desktop.
Download all XKCD comics with their original names
#!/usr/bin/env python3
"""Download xkcd comics, naming each image file after the comic's alt text.

For every comic number in the configured range, the comic page is fetched,
the comic image is located, and the image is streamed into the current
working directory as ``<cleaned alt text>.<extension>``.
"""
import re
import string
import sys
import time
from datetime import datetime
from urllib.parse import urljoin
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://xkcd.com/'
FIRST_COMIC = 1
LAST_COMIC = 1812
# 404: not found and 1525 is not an image
SKIPPED_COMICS = frozenset({404, 1525})
# Seconds to wait on a single HTTP request before giving up on that comic.
REQUEST_TIMEOUT = 30
CHUNK_SIZE = 1024

# Alt text that carries inline markup: keep the text after the opening tag
# and the text following the closing </span>.
_STYLED_ALT_RE = re.compile(r'>(.*)</span>(.*)')


def _filename_stem(alt, number):
    """Return the file name (without extension) for a comic.

    Args:
        alt: The ``alt`` attribute of the comic image, or ``None`` if the
            image has no such attribute.
        number: The comic number, used to build a fallback name.

    Returns:
        The alt text with whitespace (or ``/`` separators) removed and
        punctuation-only tokens dropped. Falls back to ``dude<number>`` when
        there is no alt text or nothing is left after cleaning.
    """
    fallback = 'dude' + str(number)
    if alt is None:
        alt = fallback

    parts = None
    if 'style' in alt:
        match = _STYLED_ALT_RE.search(alt)
        # "style" can also occur in ordinary alt text; only use the markup
        # pattern when it really matches, otherwise split like plain text.
        if match:
            parts = list(match.groups())
    if parts is None:
        parts = alt.split('/') if '/' in alt else alt.split()

    # ``token in string.punctuation`` is a substring test, so this drops
    # empty tokens and tokens made only of a run of punctuation characters;
    # punctuation attached to a word is kept.
    stem = ''.join(token for token in parts if token not in string.punctuation)
    # An empty stem would make every such comic overwrite the same file.
    return stem or fallback


def _download_comic(session, number):
    """Fetch one comic page and save its image to the working directory.

    Args:
        session: A ``requests.Session`` used for both HTTP requests.
        number: The comic number to download.

    Returns:
        The name of the file written, or ``None`` if no comic image could be
        located on the page.

    Raises:
        requests.RequestException: If either request fails, times out, or
            returns an HTTP error status.
        OSError: If the image file cannot be written.
    """
    page_url = '{}{}/'.format(BASE_URL, number)
    page = session.get(page_url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page.text, 'html.parser')
    images = soup.find_all('img')
    # NOTE(review): the comic is taken to be the second <img> on the page,
    # as in the original script -- confirm against the current page layout.
    if len(images) < 2:
        return None
    comic = images[1]

    src = comic.get('src')
    if not src:
        return None

    extension = src.strip('/').split('.')[-1]
    filename = ''.join([_filename_stem(comic.get('alt'), number), '.', extension])

    # urljoin resolves protocol-relative ("//host/path"), absolute and
    # site-relative image sources against the page URL.
    pic_url = urljoin(page_url, src)
    with session.get(pic_url, stream=True, timeout=REQUEST_TIMEOUT) as pic:
        # Check the status before opening the file so an error page is never
        # written to disk as if it were an image.
        pic.raise_for_status()
        with open(filename, 'wb') as fd:
            for chunk in pic.iter_content(CHUNK_SIZE):
                fd.write(chunk)
    return filename


def main():
    """Download every comic in the configured range and report timing."""
    started = time.time()
    print('downloading xkcd comics ..')
    print("time started: ", datetime.now())

    with requests.Session() as session:
        for number in range(FIRST_COMIC, LAST_COMIC + 1):
            if number in SKIPPED_COMICS:
                continue
            try:
                saved = _download_comic(session, number)
            except (requests.RequestException, OSError) as err:
                # One failed comic should not abort the whole run.
                print('download failed xkcd comics {}: {}'.format(number, err))
                continue
            if saved is None:
                print('no image found for xkcd comics {}'.format(number))
                continue
            print(''.join(['download completed ', 'xkcd comics ', str(number)]))

    finished = time.time()
    print("time finished: ", datetime.now())
    print("total time taken: ", finished - started)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment