HakurouKen/xkcd.py

## xkcd.py
import urllib,urllib2,json
import re
import os
import logging
from HTMLParser import HTMLParser

class HTMLStripper(HTMLParser):
    '''
        Collect the text content of an HTML fragment, dropping the tags.
        Solution from: http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python#answer-925630

        Usage: feed() the markup, then read the text with get_data().
    '''
    def __init__(self):
        # Run the base initializer instead of a bare reset(): it performs
        # the reset itself and also sets up whatever extra parser state
        # the running Python version expects before feed() is called.
        HTMLParser.__init__(self)
        # Text chunks in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        '''
            Parser callback: remember one chunk of text.
        '''
        self.fed.append(d)

    def get_data(self):
        '''
            Return all text collected so far as a single string.
        '''
        return ''.join(self.fed)


def strip_tags(html):
    '''
        Return the text of "html" with its tags removed.
        Needed because some titles contain markup, such as comic 472.
    '''
    stripper = HTMLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text


class Xkcd(object):
    '''
        Fetch the metadata of one xkcd comic and download its image.

        "link" is the URL template of the per-comic JSON document; every
        instance overrides it with the URL built for its own index.
    '''
    link = 'http://www.xkcd.com/{}/info.0.json'

    def __init__(self, index):
        '''
            index: number of the comic (int).
            Raises TypeError when index is not an int.
        '''
        if not isinstance(index, int):
            raise TypeError('Parameter "index" must be integer.')
        self.index = index
        self.link = Xkcd.link.format(index)

    def get(self):
        '''
            Download and parse the comic's JSON description.

            Returns a (status, payload) tuple:
                (0, {'img': image url, 'title': file name}) on success;
                (-2, message) when the comic does not exist (index 404,
                    which is never requested, or an HTTP 404 answer);
                (-1, message) on any other error.
        '''
        # Comic No.404 does not exist, just like its name.
        if self.index == 404:
            return -2, 'The comic 404 is a joke.'
        try:
            resp = urllib2.urlopen(self.link)
            try:
                info = json.loads(resp.read())
            finally:
                # Release the connection even if reading or parsing fails.
                resp.close()
            img = info['img']
            # Reuse the extension of the image url; fall back to '.jpg'.
            suffix_match = re.search(r'\.\w*?$', img)
            suffix = suffix_match.group() if suffix_match else '.jpg'
            # make the title with '/' a valid filename
            title = strip_tags(info['title']).replace('/', '_')
            file_name = "{index} - {title}{suffix}".format(
                index=self.index, title=title, suffix=suffix)
            return 0, {'img': img, 'title': file_name}
        except urllib2.HTTPError as error:
            if error.code == 404:
                return -2, 'Comic {} does not exists.'.format(self.index)
            return -1, 'Error at comic {} ,HTTPError {} happend.'.format(self.index, error.code)
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still stop the program.
            return -1, 'Error at comic {}, unknown error happend.'.format(self.index)

    def save(self, src, path):
        '''
            Download "src" and write its content to the file "path".

            Returns 0 on success, -1 when the server answers HTTP 404 and
            -2 on any other error. The file is only created once the
            download has succeeded, so a failed download leaves no empty
            file behind.
        '''
        try:
            resp = urllib2.urlopen(src)
            try:
                data = resp.read()
            finally:
                resp.close()
            with open(path, 'wb') as f:
                f.write(data)
            return 0
        except urllib2.HTTPError as error:
            # some xkcd has no picture, such as comic 1608
            if error.code == 404:
                return -1
            return -2
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still stop the program.
            return -2


class Progress(object):
    '''
        Save and load the progress (a comic index) to a file.

        The class is a singleton: every construction returns the same
        instance, and __init__ re-binds its filename on each call.
    '''
    _instance = None

    def __new__(cls, *args, **kwargs):
        # The constructor arguments are consumed by __init__ only;
        # object.__new__ must not receive them.
        if cls._instance is None:
            cls._instance = super(Progress, cls).__new__(cls)
        return cls._instance

    def __init__(self, filename):
        '''
            filename: path of the file that stores the index.
        '''
        self.filename = filename

    def load(self):
        '''
            Return the index stored in the first line of the file.
            Returns 1 when the file does not exist or the line is not a
            valid integer.
        '''
        if not os.path.isfile(self.filename):
            return 1
        with open(self.filename) as f:
            first_line = f.readline().strip()
        try:
            return int(first_line)
        except ValueError:
            return 1

    def save(self, index):
        '''
            Overwrite the file with "index".
            Raises TypeError when index is not an int.
        '''
        if not isinstance(index, int):
            raise TypeError('Parameter "index" must be integer.')
        with open(self.filename, 'w') as f:
            f.write(str(index))


if __name__ == '__main__':
    # Download comics one after another into the "comic" directory,
    # starting at the index stored in progress.data. The loop stops at the
    # first comic that does not exist (other than No.404) and stores that
    # index, so the next run resumes from there.
    logging.basicConfig(
        level=logging.INFO,
        format= '%(asctime)s [%(levelname)s] %(message)s',
        datefmt='[%Y-%m-%d %H:%M:%S]',
        filename='xkcd.log'
    )
    progress = Progress('progress.data')
    index = progress.load()
    if not os.path.isdir('comic'):
        os.mkdir('comic')
    while True:
        comic = Xkcd(index)
        ret,info = comic.get()
        if ret == 0:
            saved = comic.save(info['img'],os.path.join('comic',info['title']))
            if saved == 0:
                logging.info('Comic {} saved successfully.'.format(index))
            elif saved == -1:
                logging.info('Comic {} has no image.'.format(index))
            else:
                # logging.warning replaces the deprecated alias logging.warn.
                logging.warning('Comic {} picture saved error.'.format(index))
        else:
            logging.warning(info)
            # -2 means "does not exist": the end of the series has been
            # reached, unless it is the deliberately missing comic 404.
            if ret == -2 and index != 404:
                progress.save(index)
                break
        index += 1
	import urllib,urllib2,json
	import re
	import os
	import logging
	from HTMLParser import HTMLParser

	class HTMLStripper(HTMLParser):
	    '''
	        Collect the text content of an HTML fragment, dropping the tags.
	        Solution from: http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python#answer-925630

	        Usage: feed() the markup, then read the text with get_data().
	    '''
	    def __init__(self):
	        # Run the base initializer instead of a bare reset(): it performs
	        # the reset itself and also sets up whatever extra parser state
	        # the running Python version expects before feed() is called.
	        HTMLParser.__init__(self)
	        # Text chunks in the order the parser reported them.
	        self.fed = []

	    def handle_data(self, d):
	        '''
	            Parser callback: remember one chunk of text.
	        '''
	        self.fed.append(d)

	    def get_data(self):
	        '''
	            Return all text collected so far as a single string.
	        '''
	        return ''.join(self.fed)


	def strip_tags(html):
	    '''
	        Return the text of "html" with its tags removed.
	        Needed because some titles contain markup, such as comic 472.
	    '''
	    s = HTMLStripper()
	    s.feed(html)
	    return s.get_data()



	class Xkcd(object):
	    '''
	        Fetch the metadata of one xkcd comic and download its image.

	        "link" is the URL template of the per-comic JSON document; every
	        instance overrides it with the URL built for its own index.
	    '''
	    link = 'http://www.xkcd.com/{}/info.0.json'

	    def __init__(self, index):
	        '''
	            index: number of the comic (int).
	            Raises TypeError when index is not an int.
	        '''
	        if not isinstance(index, int):
	            raise TypeError('Parameter "index" must be integer.')
	        self.index = index
	        self.link = Xkcd.link.format(index)

	    def get(self):
	        '''
	            Download and parse the comic's JSON description.

	            Returns a (status, payload) tuple:
	                (0, {'img': image url, 'title': file name}) on success;
	                (-2, message) when the comic does not exist (index 404,
	                    which is never requested, or an HTTP 404 answer);
	                (-1, message) on any other error.
	        '''
	        # Comic No.404 does not exist, just like its name.
	        if self.index == 404:
	            return -2, 'The comic 404 is a joke.'
	        try:
	            resp = urllib2.urlopen(self.link)
	            try:
	                info = json.loads(resp.read())
	            finally:
	                # Release the connection even if reading or parsing fails.
	                resp.close()
	            img = info['img']
	            # Reuse the extension of the image url; fall back to '.jpg'.
	            suffix_match = re.search(r'\.\w*?$', img)
	            suffix = suffix_match.group() if suffix_match else '.jpg'
	            # make the title with '/' a valid filename
	            title = strip_tags(info['title']).replace('/', '_')
	            file_name = "{index} - {title}{suffix}".format(
	                index=self.index, title=title, suffix=suffix)
	            return 0, {'img': img, 'title': file_name}
	        except urllib2.HTTPError as error:
	            if error.code == 404:
	                return -2, 'Comic {} does not exists.'.format(self.index)
	            return -1, 'Error at comic {} ,HTTPError {} happend.'.format(self.index, error.code)
	        except Exception:
	            # Exception (not a bare except) so that KeyboardInterrupt and
	            # SystemExit still stop the program.
	            return -1, 'Error at comic {}, unknown error happend.'.format(self.index)

	    def save(self, src, path):
	        '''
	            Download "src" and write its content to the file "path".

	            Returns 0 on success, -1 when the server answers HTTP 404 and
	            -2 on any other error. The file is only created once the
	            download has succeeded, so a failed download leaves no empty
	            file behind.
	        '''
	        try:
	            resp = urllib2.urlopen(src)
	            try:
	                data = resp.read()
	            finally:
	                resp.close()
	            with open(path, 'wb') as f:
	                f.write(data)
	            return 0
	        except urllib2.HTTPError as error:
	            # some xkcd has no picture, such as comic 1608
	            if error.code == 404:
	                return -1
	            return -2
	        except Exception:
	            # Exception (not a bare except) so that KeyboardInterrupt and
	            # SystemExit still stop the program.
	            return -2


	class Progress(object):
	    '''
	        Save and load the progress (a comic index) to a file.

	        The class is a singleton: every construction returns the same
	        instance, and __init__ re-binds its filename on each call.
	    '''
	    _instance = None

	    def __new__(cls, *args, **kwargs):
	        # The constructor arguments are consumed by __init__ only;
	        # object.__new__ must not receive them.
	        if cls._instance is None:
	            cls._instance = super(Progress, cls).__new__(cls)
	        return cls._instance

	    def __init__(self, filename):
	        '''
	            filename: path of the file that stores the index.
	        '''
	        self.filename = filename

	    def load(self):
	        '''
	            Return the index stored in the first line of the file.
	            Returns 1 when the file does not exist or the line is not a
	            valid integer.
	        '''
	        if not os.path.isfile(self.filename):
	            return 1
	        with open(self.filename) as f:
	            first_line = f.readline().strip()
	        try:
	            return int(first_line)
	        except ValueError:
	            return 1

	    def save(self, index):
	        '''
	            Overwrite the file with "index".
	            Raises TypeError when index is not an int.
	        '''
	        if not isinstance(index, int):
	            raise TypeError('Parameter "index" must be integer.')
	        with open(self.filename, 'w') as f:
	            f.write(str(index))


	if __name__ == '__main__':
	    # Download comics one after another into the "comic" directory,
	    # starting at the index stored in progress.data. The loop stops at the
	    # first comic that does not exist (other than No.404) and stores that
	    # index, so the next run resumes from there.
	    logging.basicConfig(
	        level=logging.INFO,
	        format= '%(asctime)s [%(levelname)s] %(message)s',
	        datefmt='[%Y-%m-%d %H:%M:%S]',
	        filename='xkcd.log'
	    )
	    progress = Progress('progress.data')
	    index = progress.load()
	    if not os.path.isdir('comic'):
	        os.mkdir('comic')
	    while True:
	        comic = Xkcd(index)
	        ret,info = comic.get()
	        if ret == 0:
	            saved = comic.save(info['img'],os.path.join('comic',info['title']))
	            if saved == 0:
	                logging.info('Comic {} saved successfully.'.format(index))
	            elif saved == -1:
	                logging.info('Comic {} has no image.'.format(index))
	            else:
	                # logging.warning replaces the deprecated alias logging.warn.
	                logging.warning('Comic {} picture saved error.'.format(index))
	        else:
	            logging.warning(info)
	            # -2 means "does not exist": the end of the series has been
	            # reached, unless it is the deliberately missing comic 404.
	            if ret == -2 and index != 404:
	                progress.save(index)
	                break
	        index += 1