"""script for scraping images. | |
1) overview | |
input: site URL | |
output: image files | |
2) usage | |
$ python scrape_image.py https://pt.wikipedia.org/wiki/Veneza | |
3) prerequisite | |
$ pip3 install requests, bs4 | |
4) concept | |
1) Get thumnail URL. | |
2) Exchange it to an original image URL. | |
3) Download the original image file. | |
5) You should define two parameters | |
and one method for each concreate class of Site. | |
1) regex | |
- reguration expression for img src of target thumbnail image. | |
2) exchange_url_from_thumbnail_to_original | |
""" | |
import abc
import os
import re
import sys
import urllib.parse

import bs4
import requests
def scrape_image(page_url):
    # 1. Instantiate a site object.
    page = Site(page_url)
    # 2. Get image urls.
    image_url_list = page.make_image_url_list()
    # 3. Make a directory.
    dir_name = os.path.join(page.site_name, page.page_name)
    os.makedirs(dir_name, exist_ok=True)

    for image_url in image_url_list:
        # 4. Download an image.
        response = requests.get(image_url)  # requests
        # 5. Save an image.
        file_name = image_url.split('/')[-1]
        file_path = os.path.join(dir_name, file_name)
        with open(file_path, 'wb') as file:
            file.write(response.content)
            print(file.name)
#
# abstract base class
#
class SiteRegister(abc.ABCMeta):
    """Metaclass that registers each Site subclass under its hostname."""

    site_dispatch = {}

    def __init__(self, name, bases, name_space):
        # Map the class attribute site_url (a hostname) to the class itself
        # and remember the lowercased class name as the site name.
        self.site_dispatch[self.site_url] = self
        self.site_name = name.lower()
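# Note: each class statement below runs SiteRegister.__init__ once at class
# creation time, so e.g. defining Wikipedia adds the entry
# 'ja.wikipedia.org' -> Wikipedia to site_dispatch, and Site.__new__ uses
# that mapping to pick the concrete class for a given page URL.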
class Site(metaclass=SiteRegister):

    def __new__(cls, page_url):
        # Dispatch to the concrete class registered for the URL's hostname.
        site_url = cls._make_site_url(page_url)
        if site_url in cls.site_dispatch:
            ConcreteSite = cls.site_dispatch[site_url]
            return super().__new__(ConcreteSite)
        else:
            raise Exception(f'{site_url} is not registered.')

    def __init__(self, page_url):
        response = requests.get(page_url)  # requests
        self.soup = bs4.BeautifulSoup(response.text, 'html.parser')  # bs4
        self.page_name = self.soup.title.text
        self.page_url = page_url
        # page_name_by_percent = page_url.split('/')[-1].split('.')[0]
        # page_name = urllib.parse.unquote(page_name_by_percent)

    def make_image_url_list(self):
        image_url_list = [
            self.make_image_url(thumbnail_url)
            for thumbnail_url in self._make_thumbnail_url_list()
        ]
        return image_url_list

    def _make_thumbnail_url_list(self):
        thumbnail_url_or_none_generator = (
            re.search(self.thumbnail_url_regex, str(img_tag))  # re
            for img_tag in self.soup('img')
        )
        # print(*(str(img_tag) for img_tag in self.soup('img')), sep='\n')
        # print(*thumbnail_url_or_none_generator, sep='\n')
        thumbnail_url_list = [
            thumbnail_url.group()
            for thumbnail_url in filter(bool, thumbnail_url_or_none_generator)
        ]
        # print(*thumbnail_url_list, sep='\n')
        return thumbnail_url_list

    @staticmethod
    def _make_site_url(page_url):
        return urllib.parse.urlparse(page_url).hostname

    #
    # two parameters and one method each concrete class must define
    #
    site_url = ''
    thumbnail_url_regex = r''

    @staticmethod
    @abc.abstractmethod
    def make_image_url(thumbnail_url):
        raise NotImplementedError
#
# concrete classes
#
class Wikipedia(Site):
    site_url = 'ja.wikipedia.org'
    thumbnail_url_regex = (
        r'upload\.wikimedia\.org'
        r'/wikipedia/commons/thumb/\w+/\w+/.*?\.(jpg|png)'
    )

    @staticmethod
    def make_image_url(thumbnail_url):
        return 'https://' + thumbnail_url.replace('/thumb', '')
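# The non-greedy `.*?` in Wikipedia.thumbnail_url_regex stops the match at
# the first '.jpg'/'.png', so the trailing '/<width>px-<name>' segment that
# Wikimedia thumb URLs usually carry is cut off; removing '/thumb' from the
# remainder then points at the original file (this assumes the usual
# upload.wikimedia.org/wikipedia/commons/thumb/... layout).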
class Vippers(Site):
    site_url = 'vippers.jp'
    thumbnail_url_regex = (
        r'http://livedoor\.blogimg\.jp'
        r'/vsokuvip/imgs/\w+/\w+/.*\.(jpg|gif)'
    )

    @staticmethod
    def make_image_url(thumbnail_url):
        # the thumbnail name carries an '-s' suffix; dropping it yields
        # the original image URL
        return thumbnail_url.replace('-s', '')
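# A hypothetical sketch of adding support for another site, following item 5
# of the module docstring (ExampleBlog, the img.example.blog.jp hostname, and
# the '-thumb' naming rule are all made up for illustration):
class ExampleBlog(Site):
    site_url = 'example.blog.jp'
    thumbnail_url_regex = (
        r'https://img\.example\.blog\.jp/photos/\w+/.*?-thumb\.(jpg|png)'
    )

    @staticmethod
    def make_image_url(thumbnail_url):
        # assumed convention: the thumbnail name carries a '-thumb' suffix
        return thumbnail_url.replace('-thumb', '')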
#
# main code
#
if __name__ == "__main__":
    page_url = sys.argv[1]
    scrape_image(page_url)