Skip to content

Instantly share code, notes, and snippets.

@zheli
Created October 14, 2014 07:55
Show Gist options
  • Save zheli/a322a11594bc49404a0f to your computer and use it in GitHub Desktop.
Save zheli/a322a11594bc49404a0f to your computer and use it in GitHub Desktop.
xkcd fetcher
  • Create a new virtual environment.
  • Run `pip install -r requirements.txt` to install the dependencies.
  • Run `python fetcher.py` to fetch the image.
  • Run `python tests.py` to run the unit tests.
import logging
import os

import requests
from bs4 import BeautifulSoup

try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin
# Configure the root logger so INFO-level messages from the crawler are shown.
logging.basicConfig(level=logging.INFO)
# Page that is crawled when this module is run as a script.
URL = 'http://xkcd.com/'
# Chunk size passed to ``iter_content`` when streaming the image to disk.
CHUNK_SIZE = 1024
class Crawler(object):
    """Fetch a web page once and offer simple lookups on its parsed HTML."""

    def __init__(self, url, timeout=30):
        """Download ``url`` and parse the response body as HTML.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: On connection failure, timeout or a
                4xx/5xx response status.
        """
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._logger.info('Fetching [%s]', url)
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(self._url, timeout=timeout)
        # Fail loudly on an HTTP error instead of parsing an error page.
        response.raise_for_status()
        # Name the parser explicitly so the resulting tree does not depend
        # on which optional parsers happen to be installed.
        self._soup = BeautifulSoup(response.text, 'html.parser')

    def find_block_by_id(self, id):
        """Return the result of looking up ``id`` in the parsed page.

        The parameter keeps its original name (which shadows the ``id``
        builtin) so that keyword callers continue to work.
        """
        return self._soup.find(id=id)

    def get_tag_attribute(self, soup_obj, tag, attribute):
        """Return ``attribute`` of the ``tag`` object found on ``soup_obj``.

        Args:
            soup_obj: Object on which ``tag`` is looked up as an attribute.
            tag: Name of the tag, e.g. ``'img'``.
            attribute: Name of the tag attribute to read, e.g. ``'src'``.

        Returns:
            Whatever ``get(attribute)`` on the tag object returns.

        Raises:
            AttributeError: If ``soup_obj`` has no ``tag`` attribute (for
                example because it is ``None``) or the tag object has no
                ``get`` method. The error is logged before being re-raised.
        """
        try:
            return getattr(soup_obj, tag).get(attribute)
        except AttributeError:
            self._logger.error('Cannot find tag %s!!', tag)
            raise
class ImageFetcher(Crawler):
    """Crawler that downloads the ``img`` found inside a given page element."""

    def __init__(self, url, image_div_id):
        """Fetch ``url`` and remember the id of the element holding the image.

        Args:
            url: Address of the page to crawl.
            image_div_id: ``id`` of the element that contains the ``img`` tag.
        """
        super(ImageFetcher, self).__init__(url)
        self._image_div_id = image_div_id

    def _get_image_div_block(self):
        """Return the page element looked up by the configured id."""
        return self.find_block_by_id(self._image_div_id)

    def _get_image_url(self):
        """Return the ``src`` of the ``img`` tag inside the image element."""
        self._logger.info('Parsing image url')
        return self.get_tag_attribute(self._get_image_div_block(), 'img', 'src')

    def _write_file_in_chunk(self, file_response, file_path):
        """Stream the body of ``file_response`` into ``file_path``.

        The body is read ``CHUNK_SIZE`` bytes at a time so the whole image
        never has to be held in memory.
        """
        with open(file_path, 'wb') as image_file:
            for chunk in file_response.iter_content(CHUNK_SIZE):
                image_file.write(chunk)
        self._logger.info('Image saved at %s', file_path)

    def download(self, path, timeout=30):
        """Download the page's image into the directory ``path``.

        The file is named after the last path segment of the image URL.

        Args:
            path: Directory in which the image file is created.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            RuntimeError: If the ``img`` tag has no usable ``src`` value.
            requests.RequestException: On connection failure, timeout or a
                4xx/5xx response status.
        """
        image_url = self._get_image_url()
        if not image_url:
            self._logger.error('No image url found!')
            raise RuntimeError('No image url found!')

        # ``src`` may be relative or protocol-relative ("//host/img.png");
        # requests needs an absolute URL, so resolve it against the page URL.
        image_url = urljoin(self._url, image_url)
        filename = image_url.split('/')[-1]
        self._logger.info('Downloading image file from [%s]', image_url)
        file_response = requests.get(image_url, stream=True, timeout=timeout)
        try:
            # Do not save an HTML error page under an image file name.
            file_response.raise_for_status()
            self._write_file_in_chunk(file_response, os.path.join(path, filename))
        finally:
            # A streamed response keeps its connection until it is closed.
            file_response.close()
if __name__ == '__main__':
    # Save the image from the 'comic' element next to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    ImageFetcher(URL, 'comic').download(script_dir)
beautifulsoup4==4.3.2
mock==1.0.1
requests==2.4.3
import unittest
import mock
import fetcher
# Placeholder inputs for the tests below. The tests patch ``fetcher.requests``
# and replace the file-writing helper, so the URL is never contacted and
# nothing is written under the path.
FAKE_URL = 'http://doesn.t.exist/'
FAKE_PATH = '/doesn_t_exist/'
class TestImageFetcher(unittest.TestCase):
    """Tests for ``fetcher.ImageFetcher.download`` with network access mocked."""

    def setUp(self):
        """Patch out HTTP and HTML parsing, then build the fetcher under test."""
        # Register the cleanup before anything that can fail, so the patches
        # are always undone and cannot leak into other tests.
        self.addCleanup(mock.patch.stopall)
        mock.patch('fetcher.requests').start()
        mock.patch('fetcher.BeautifulSoup').start()
        self.image_fetcher = fetcher.ImageFetcher(FAKE_URL, 'fake_id')

    def test_download_empty_image_url(self):
        """``download`` raises when no image URL can be found."""
        self.image_fetcher._get_image_url = mock.Mock(return_value=None)
        self.assertRaises(Exception, self.image_fetcher.download, FAKE_PATH)

    def test_download_image_url(self):
        """``download`` writes the image to ``<path>/<last URL segment>``."""
        self.image_fetcher._get_image_url = mock.Mock(
            return_value='http://123.com/123.png')
        # Replace the writer so the test never touches the file system.
        write_mock = mock.MagicMock()
        self.image_fetcher._write_file_in_chunk = write_mock

        self.image_fetcher.download(FAKE_PATH)

        self.assertTrue(write_mock.called)
        positional_args = write_mock.call_args[0]
        # FAKE_PATH ends with a slash, so joining it with the file name is
        # plain concatenation.
        self.assertEqual(positional_args[1], FAKE_PATH + '123.png')
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment