- create a new virtual environment
- `pip install -r requirements.txt`
- `python fetcher.py` to fetch the image
- `python tests.py` to run the unit tests
Created
October 14, 2014 07:55
-
-
Save zheli/a322a11594bc49404a0f to your computer and use it in GitHub Desktop.
xkcd fetcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import logging | |
import requests | |
from bs4 import BeautifulSoup | |
# Emit INFO-level progress messages from the crawler classes to stderr.
logging.basicConfig(level=logging.INFO)
URL = 'http://xkcd.com/'  # page scraped for the latest comic image
CHUNK_SIZE = 1024  # bytes per chunk when streaming the image to disk
class Crawler(object):
    """Fetch a URL once at construction time and expose simple lookups
    over the parsed HTML tree.

    :param url: page to download and parse.
    """

    def __init__(self, url):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._logger.info('Fetching [{}]'.format(url))
        # A timeout keeps the constructor from hanging forever on a
        # stalled server; raise_for_status fails loudly on HTTP errors
        # instead of silently parsing an error page.
        response = requests.get(self._url, timeout=30)
        response.raise_for_status()
        self._soup = BeautifulSoup(response.text)

    def find_block_by_id(self, id):
        """Return the first element whose HTML id attribute equals ``id``,
        or None when no such element exists."""
        return self._soup.find(id=id)

    def get_tag_attribute(self, soup_obj, tag, attribute):
        """Return ``attribute`` of the first ``tag`` child of ``soup_obj``.

        Logs and re-raises AttributeError when ``soup_obj`` has no such tag
        (e.g. when the lookup that produced ``soup_obj`` returned None).
        """
        try:
            tag_obj = getattr(soup_obj, tag)
            attribute_value = tag_obj.get(attribute)
        except AttributeError:
            self._logger.error('Cannot find tag {}!!'.format(tag))
            raise
        else:
            return attribute_value
class ImageFetcher(Crawler):
    """Crawler that locates an <img> inside a known div and saves it to disk.

    :param url: page to crawl.
    :param image_div_id: HTML id of the div wrapping the target image
        (e.g. ``'comic'`` on xkcd.com).
    """

    def __init__(self, url, image_div_id):
        super(ImageFetcher, self).__init__(url)
        self._image_div_id = image_div_id

    def _get_image_div_block(self):
        # The div that wraps the comic image on the fetched page.
        return self.find_block_by_id(self._image_div_id)

    def _get_image_url(self):
        """Return the src attribute of the <img> inside the configured div."""
        self._logger.info('Parsing image url')
        return self.get_tag_attribute(self._get_image_div_block(), 'img', 'src')

    def _write_file_in_chunk(self, file_response, file_path):
        # Stream to disk in fixed-size chunks so a large image never has
        # to sit fully in memory.
        with open(file_path, 'wb') as image_file:
            for chunk in file_response.iter_content(CHUNK_SIZE):
                image_file.write(chunk)
            self._logger.info('Image saved at {}'.format(file_path))

    def download(self, path):
        """Download the image into directory ``path``, named after the last
        URL path segment.

        :raises ValueError: when no image URL could be extracted.
            (ValueError subclasses Exception, so existing callers that
            catch Exception keep working, and the failure now carries a
            message instead of a bare ``raise Exception``.)
        """
        image_url = self._get_image_url()
        if not image_url:
            self._logger.error('No image url found!')
            raise ValueError(
                'no image url found in div [{}]'.format(self._image_div_id))
        filename = image_url.split('/')[-1]
        self._logger.info('Downloading image file from [{}]'.format(image_url))
        # stream=True defers the body download to iter_content; timeout
        # prevents an indefinite hang on a stalled server.
        file_response = requests.get(image_url, stream=True, timeout=30)
        self._write_file_in_chunk(file_response, os.path.join(path, filename))
if __name__ == '__main__':
    # Save today's comic image into the directory containing this script.
    fetcher_instance = ImageFetcher(URL, 'comic')
    target_dir = os.path.dirname(os.path.abspath(__file__))
    fetcher_instance.download(target_dir)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
beautifulsoup4==4.3.2 | |
mock==1.0.1 | |
requests==2.4.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
import mock | |
import fetcher | |
FAKE_URL = 'http://doesn.t.exist/' | |
FAKE_PATH = '/doesn_t_exist/' | |
class TestImageFetcher(unittest.TestCase):
    """Unit tests for fetcher.ImageFetcher with all network access mocked out."""

    def setUp(self):
        # Patch the HTTP and parsing layers at module level so constructing
        # the fetcher performs no real request. stopall undoes both patches.
        mock.patch('fetcher.requests').start()
        mock.patch('fetcher.BeautifulSoup').start()
        self.image_fetcher = fetcher.ImageFetcher(FAKE_URL, 'fake_id')
        self.addCleanup(mock.patch.stopall)

    def test_download_empty_image_url(self):
        """download() must raise when no image URL can be extracted."""
        self.image_fetcher._get_image_url = mock.Mock(return_value=None)
        self.assertRaises(Exception, self.image_fetcher.download, FAKE_PATH)

    def test_download_image_url(self):
        """download() must stream the file to disk when a URL is found."""
        self.image_fetcher._get_image_url = mock.Mock(
            return_value='http://123.com/123.png')
        # fetcher.requests is already patched in setUp, so the download is
        # a mock call; only the write step needs stubbing here. (The old
        # `self.image_fetcher.requests = MagicMock()` assignment was a
        # no-op: download() uses the module-level fetcher.requests.)
        self.image_fetcher._write_file_in_chunk = mock.MagicMock()
        self.image_fetcher.download(FAKE_PATH)
        self.assertTrue(self.image_fetcher._write_file_in_chunk.called)
if __name__ == '__main__':
    # Allow running this test module directly: `python tests.py`.
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment