Skip to content

Instantly share code, notes, and snippets.

@domodomodomo
Created March 16, 2019 17:52
Show Gist options
  • Save domodomodomo/4483bbd94c3752cc209542966aa40e30 to your computer and use it in GitHub Desktop.
Save domodomodomo/4483bbd94c3752cc209542966aa40e30 to your computer and use it in GitHub Desktop.
"""script for scraping images.
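1) Overview
    input: page URL
    output: image files
2) Usage
    $ python scrape_image.py https://ja.wikipedia.org/wiki/ヴェネツィア
    (the host of the URL must be one that a concrete Site class registers
    as its site_url, e.g. ja.wikipedia.org or vippers.jp)
3) Prerequisites
    $ pip3 install requests bs4
4) Concept
    1) Get the thumbnail URL.
    2) Convert it to the original image URL.
    3) Download the original image file.
5) Each concrete subclass of Site must define two class attributes
   and one static method.
    1) site_url
        - host name of the site the class handles.
    2) thumnail_url_regex (the attribute name is spelled this way in the code)
        - regular expression matching the img src of a target thumbnail image.
    3) make_image_url
        - converts a thumbnail URL to the original image URL.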
"""
import abc
import os
import re
import sys
import urllib
import urllib.parse

import bs4
import requests
def scrape_image(page_url, timeout=30):
    """Download the original images behind the thumbnails on a page.

    The images are written to ``<site_name>/<page_name>/``, where both names
    come from the ``Site`` object built for *page_url*.  The path of every
    file written is printed to stdout; downloads that fail are reported on
    stderr and skipped so that one bad image does not abort the whole run.

    Args:
        page_url: URL of the page to scrape.  Its host must belong to a
            registered ``Site`` subclass.
        timeout: Seconds to wait for each image request before giving up.
    """
    # 1. Instantiate a site object.
    page = Site(page_url)

    # 2. Get image urls.  dict.fromkeys drops duplicates while keeping the
    #    page order, so an image used twice on the page is fetched once.
    image_url_list = list(dict.fromkeys(page.make_image_url_list()))

    # 3. Make a directory.
    dir_name = os.path.join(page.site_name, page.page_name)
    os.makedirs(dir_name, exist_ok=True)

    for image_url in image_url_list:
        # 4. Download an image.
        try:
            response = requests.get(image_url, timeout=timeout)
            # Without this check an HTTP error page would be saved as if it
            # were the image.
            response.raise_for_status()
        except requests.RequestException as error:
            print(f'skipped {image_url}: {error}', file=sys.stderr)
            continue

        # 5. Save an image under the last path segment of its URL.
        file_name = image_url.split('/')[-1]
        file_path = os.path.join(dir_name, file_name)
        with open(file_path, 'wb') as image_file:
            image_file.write(response.content)
        print(file_path)
#
# abstract base class
#
class SiteRegister(abc.ABCMeta):
    """Metaclass that records every class it creates in ``site_dispatch``.

    Each class built with this metaclass must expose a ``site_url``
    attribute (defined or inherited).  On creation the class is stored in
    ``site_dispatch`` under that value and receives a ``site_name``
    attribute holding its own name in lower case.
    """

    # Registry shared by all classes using this metaclass: site_url -> class.
    site_dispatch = {}

    def __init__(self, name, bases, name_space):
        """Register the newly created class and derive its ``site_name``.

        Args:
            name: Name of the class being created.
            bases: Tuple of its base classes.
            name_space: Class body namespace.

        Raises:
            AttributeError: If the class has no ``site_url`` attribute.
        """
        # Chain to the parent initializer so the metaclass stays cooperative.
        super().__init__(name, bases, name_space)
        self.site_dispatch[self.site_url] = self
        self.site_name = name.lower()
class Site(metaclass=SiteRegister):
    """Abstract scraper for one page; dispatches to a per-site subclass.

    ``Site(page_url)`` looks up the host name of *page_url* in
    ``site_dispatch`` (filled in by the metaclass) and returns an instance
    of the class registered for that host.

    Each concrete subclass defines:

    * ``site_url`` -- host name the class is registered under.
    * ``thumnail_url_regex`` -- pattern searched for in the text of every
      ``<img>`` tag of the page.  The correctly spelled
      ``thumbnail_url_regex`` is accepted as well; the misspelled name wins
      when both are set to a non-empty value.
    * ``make_image_url`` -- static method turning a matched thumbnail URL
      into the URL of the original image.
    """

    # Host name used as the dispatch key; empty for this abstract base.
    site_url = ''
    # Default (empty) thumbnail pattern; subclasses provide the real one.
    thumbnail_url_regex = (
        r''
    )
    # Seconds to wait for the page request before giving up.
    request_timeout = 30

    def __new__(cls, page_url):
        """Create an instance of the class registered for *page_url*'s host.

        Raises:
            ValueError: If no class is registered for the host.
        """
        site_url = cls._make_site_url(page_url)
        try:
            concrete_site = cls.site_dispatch[site_url]
        except KeyError:
            raise ValueError(f'{site_url} is not registered.') from None
        return super().__new__(concrete_site)

    def __init__(self, page_url):
        """Fetch and parse the page.

        Sets ``soup`` (parsed document), ``page_name`` (text of the
        ``<title>`` tag, or a name derived from the URL when the title is
        missing or empty) and ``page_url``.

        Raises:
            requests.RequestException: If the page cannot be fetched or the
                server answers with an HTTP error status.
        """
        response = requests.get(page_url, timeout=self.request_timeout)
        # Fail loudly instead of scraping an error page.
        response.raise_for_status()
        self.soup = bs4.BeautifulSoup(response.text, 'html.parser')
        title_tag = self.soup.title
        title = title_tag.text if title_tag is not None else ''
        self.page_name = title or self._page_name_from_url(page_url)
        self.page_url = page_url

    def make_image_url_list(self):
        """Return the original-image URL for every thumbnail on the page."""
        return [
            self.make_image_url(thumbnail_url)
            for thumbnail_url in self._make_thumbnail_url_list()
        ]

    def _make_thumbnail_url_list(self):
        """Return the pattern match found in each ``<img>`` tag, in order.

        Tags in which the pattern does not match are skipped.
        """
        # Subclasses historically spell the attribute "thumnail"; fall back
        # to the correctly spelled one when that is absent or empty.
        pattern_text = (
            getattr(self, 'thumnail_url_regex', '')
            or self.thumbnail_url_regex
        )
        pattern = re.compile(pattern_text)
        matches = (pattern.search(str(img_tag)) for img_tag in self.soup('img'))
        return [match.group() for match in matches if match]

    @staticmethod
    def _make_site_url(page_url):
        """Return the host name of *page_url* (``None`` if it has none)."""
        return urllib.parse.urlparse(page_url).hostname

    @staticmethod
    def _page_name_from_url(page_url):
        """Return the unquoted last path segment of *page_url*, sans suffix."""
        path = urllib.parse.urlparse(page_url).path
        last_segment = path.rstrip('/').split('/')[-1]
        return urllib.parse.unquote(last_segment.split('.')[0])

    # abc.abstractstaticmethod is deprecated; stacking the two decorators is
    # the documented replacement.
    @staticmethod
    @abc.abstractmethod
    def make_image_url(thumnail_url):
        """Convert a thumbnail URL to the URL of the original image."""
        raise NotImplementedError
#
# concrete classes
#
class Wikipedia(Site):
    """Scraper for pages on ja.wikipedia.org.

    The pattern matches a scheme-less Wikimedia Commons thumbnail path and
    stops at the first ``.jpg`` or ``.png`` it meets (non-greedy match).
    """

    site_url = 'ja.wikipedia.org'
    # The attribute name keeps the spelling that Site reads.
    thumnail_url_regex = (
        r'upload\.wikimedia\.org'
        r'/wikipedia/commons/thumb/\w+/\w+/.*?\.(jpg|png)'
    )

    @staticmethod
    def make_image_url(thumbnail_url):
        """Return the https URL of the original image for a thumbnail match.

        Only the first ``/thumb/`` path segment is removed, so a file whose
        own name starts with "thumb" (e.g. ``thumbnail.jpg``) is left intact.
        """
        return 'https://' + thumbnail_url.replace('/thumb/', '/', 1)
class Vippers(Site):
    """Scraper for pages on vippers.jp.

    The pattern matches images hosted under
    ``http://livedoor.blogimg.jp/vsokuvip/imgs/`` ending in ``.jpg`` or
    ``.gif``.
    """

    site_url = 'vippers.jp'
    # The attribute name keeps the spelling that Site reads.
    thumnail_url_regex = (
        r'http://livedoor\.blogimg\.jp'
        r'/vsokuvip/imgs/\w+/\w+/.*\.(jpg|gif)'
    )

    @staticmethod
    def make_image_url(thumbnail_url):
        """Return the original image URL for a thumbnail URL.

        Presumably thumbnails carry a ``-s`` marker right before the file
        extension -- TODO confirm against the site.  Only that marker is
        removed; a ``-s`` elsewhere in the URL is left untouched.
        """
        return re.sub(r'-s(?=\.(?:jpg|gif)$)', '', thumbnail_url)
#
# main code
#
if __name__ == "__main__":
    # Expect the page URL as the first command-line argument; exit with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: python {sys.argv[0]} PAGE_URL')
    scrape_image(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment