Skip to content

Instantly share code, notes, and snippets.

@HakurouKen
Created January 24, 2016 16:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HakurouKen/ce2cdae743676fc443bb to your computer and use it in GitHub Desktop.
Save HakurouKen/ce2cdae743676fc443bb to your computer and use it in GitHub Desktop.
get xkcd comic
import urllib,urllib2,json
import re
import os
import logging
from HTMLParser import HTMLParser
class HTMLStripper(HTMLParser):
    '''
    strip html tags

    Collects every piece of character data handed to handle_data() so the
    text can be retrieved afterwards with get_data().

    Solution from: http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python#answer-925630
    '''

    def __init__(self):
        # Run the real base-class initializer instead of calling reset()
        # directly, so any state the base class sets up beyond reset() is
        # present. The explicit form is used (not super()) so this also works
        # where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        '''Parser callback: remember one fragment of text.'''
        self.fed.append(d)

    def get_data(self):
        '''Return all text collected so far, joined into one string.'''
        return ''.join(self.fed)
def strip_tags(html):
    '''
    Return the text of *html* with the markup removed.

    Some comic titles contain html tags (comic 472 is one), so titles are
    passed through here before being used in a file name.
    '''
    stripper = HTMLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
class Xkcd(object):
    '''
    get xkcd comic

    Wraps one comic number: get() reads the comic's JSON metadata and
    save() downloads an image to disk. Both report their outcome through
    integer status codes rather than exceptions.
    '''
    # URL template for a comic's metadata; the placeholder is the comic number.
    link = 'http://www.xkcd.com/{}/info.0.json'

    def __init__(self, index):
        '''
        index -- comic number.

        Raises TypeError when index is not an int.
        '''
        if not isinstance(index, int):
            raise TypeError('Parameter "index" must be integer.')
        self.index = index
        # Instance attribute shadows the class-level template with the
        # concrete metadata URL for this comic.
        self.link = Xkcd.link.format(index)

    @staticmethod
    def _fetch(url):
        '''Return the raw body found at url, always closing the response.'''
        resp = urllib2.urlopen(url)
        try:
            return resp.read()
        finally:
            resp.close()

    def get(self):
        '''
        Read this comic's metadata.

        Returns a (status, payload) tuple:
          0, dict  -- success; the dict has 'img' (the image URL) and
                      'title' (a file name built from the index, the
                      tag-stripped title and the image suffix)
          -2, str  -- the comic does not exist (HTTP 404, or index 404)
          -1, str  -- any other failure
        '''
        # Comic No.404 does not exist, just like its name says.
        if self.index == 404:
            return -2, 'The comic 404 is a joke.'
        try:
            info = json.loads(self._fetch(self.link))
            img = info['img']
            # Reuse the image URL's extension; fall back to .jpg without one.
            suffix_match = re.search(r'\.\w*?$', img)
            suffix = suffix_match.group() if suffix_match else '.jpg'
            # make the title with '/' a valid filename
            title = re.sub('/', '_', strip_tags(info['title']))
            # Unicode template: formatting a non-ASCII title into a byte
            # string raises UnicodeEncodeError on Python 2.
            name = u"{index} - {title}{suffix}".format(
                index=self.index, title=title, suffix=suffix)
            return 0, {'img': img, 'title': name}
        except urllib2.HTTPError as error:
            if error.code == 404:
                return -2, 'Comic {} does not exists.'.format(self.index)
            return -1, 'Error at comic {} ,HTTPError {} happend.'.format(self.index, error.code)
        except Exception:
            # Exception, not a bare except, so Ctrl-C / SystemExit still
            # stop the program.
            return -1, 'Error at comic {}, unknown error happend.'.format(self.index)

    def save(self, src, path):
        '''
        Download src and write it to path.

        Returns 0 on success, -1 when src answers HTTP 404, and -2 on any
        other failure.
        '''
        # Download before opening the destination so a failed download does
        # not leave an empty file behind.
        try:
            data = self._fetch(src)
        except urllib2.HTTPError as error:
            # some xkcd has no picture, such as comic 1608
            if error.code == 404:
                return -1
            return -2
        except Exception:
            return -2
        try:
            with open(path, 'wb') as f:
                f.write(data)
        except Exception:
            return -2
        return 0
class Progress(object):
    '''
    save and load the progress to file.

    Only one instance is ever created: every call to Progress(...) returns
    the same object, and __init__ runs again on it with the new filename.
    '''
    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            # object.__new__ takes no extra arguments; the constructor
            # arguments are consumed by __init__ only.
            cls._instance = super(Progress, cls).__new__(cls)
        return cls._instance

    def __init__(self, filename):
        '''filename -- path of the file that stores the comic index.'''
        self.filename = filename

    def load(self):
        '''
        Return the stored index.

        Falls back to 1 when the file does not exist or its first line is
        not an integer.
        '''
        if not os.path.isfile(self.filename):
            return 1
        with open(self.filename) as f:
            s = f.readline().strip()
        try:
            return int(s)
        except ValueError:
            return 1

    def save(self, index):
        '''
        Overwrite the progress file with index.

        Raises TypeError when index is not an int.
        '''
        if not isinstance(index, int):
            raise TypeError('Parameter "index" must be integer.')
        with open(self.filename, 'w') as f:
            f.write(str(index))
if __name__ == '__main__':
    # Download comics one after another, starting at the index stored in
    # progress.data, until a comic that does not exist is reached. Images go
    # to ./comic and every outcome is logged to xkcd.log.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        datefmt='[%Y-%m-%d %H:%M:%S]',
        filename='xkcd.log'
    )
    progress = Progress('progress.data')
    index = progress.load()
    if not os.path.isdir('comic'):
        os.mkdir('comic')
    try:
        while True:
            comic = Xkcd(index)
            ret, info = comic.get()
            if ret == 0:
                saved = comic.save(info['img'], os.path.join('comic', info['title']))
                if saved == 0:
                    logging.info('Comic {} saved successfully.'.format(index))
                elif saved == -1:
                    logging.info('Comic {} has no image.'.format(index))
                else:
                    logging.warning('Comic {} picture saved error.'.format(index))
            else:
                logging.warning(info)
            # -2 means "no such comic": that is the end of the archive,
            # except for No.404, which is skipped and the run continues.
            if ret == -2 and index != 404:
                break
            index += 1
    finally:
        # Persist the position on every exit path (end of archive, Ctrl-C,
        # unexpected error) so the next run resumes at this comic.
        progress.save(index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment