
@RyanEager
Last active May 18, 2023 06:52
Scrape full-sized images from vangoghmuseum.nl
"""
Scrape full-sized images from vangoghmuseum.nl
---- requirements.txt ---------------------------------------------------------
beautifulsoup4==4.6.0
bs4==0.0.1
certifi==2018.4.16
chardet==3.0.4
idna==2.7
Pillow==5.2.0
python-slugify==1.2.5
requests==2.19.1
Unidecode==1.0.22
urllib3==1.23
-------------------------------------------------------------------------------
"""
import re
import requests
from io import BytesIO
from PIL import Image
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from slugify import slugify


def download_img(id_val, file_name):
    # levels[0] of the asset server's response holds the tile grid plus the
    # width/height of the stitched full-sized image.
    response = requests.get('https://vangoghmuseum-assetserver.appspot.com/tiles?id=%s' % id_val)
    data = response.json()
    img_tiles = data['levels'][0]['tiles']
    width = data['levels'][0]['width']
    height = data['levels'][0]['height']
    new_img = Image.new('RGB', (width, height))
    x_offset = 0
    y_offset = 0
    last_y = 0
    last_y_height = 0
    for tile in img_tiles:
        img_rsp = requests.get(tile['url'])
        img_tile = Image.open(BytesIO(img_rsp.content))
        last_y_height = img_tile.size[0]
        # a changed 'y' value means this tile starts a new row: move the paste
        # position down and back to the left edge
        if tile['y'] != last_y:
            last_y = tile['y']
            y_offset += last_y_height
            x_offset = 0
        new_img.paste(img_tile, (x_offset, y_offset))
        x_offset += img_tile.size[0]
    new_img.save(file_name)

def scrape_urls(num):
    '''
    Change the URL below to limit the scrape to a single artist or to other
    filters on the search page, e.g.
    https://www.vangoghmuseum.nl/en/search/collection?q=&artist=Vincent%20van%20Gogh&pagesize=
    to get just van Gogh's own work (a parameterised variant is sketched after
    the script).
    '''
    resp = requests.get("https://www.vangoghmuseum.nl/en/search/collection?q=&pagesize=" + str(num))
    # prefer the encoding declared in the HTML itself, fall back to the HTTP header
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=encoding)
    # collect the links to the individual collection pages from the search results
    urls = []
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('/en/collection/'):
            urls.append(link['href'])
    for url in urls:
        resp = requests.get('https://www.vangoghmuseum.nl' + url)
        http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding=encoding)
        title = slugify(soup.find('a', attrs={'name': 'info'}).contents[0])
        data_id = soup.find(attrs={'data-id': re.compile(r"\d+")})['data-id']
        info = soup.find('div', attrs={'data-role': 'info'}).getText()
        # clean up info text
        info = re.sub(r'\n{3,}', '\n', info)
        info = re.sub(r'Search in the collection:.*(?=\nObject data)', '', info, flags=re.DOTALL)
        with open(title + '.txt', 'wb') as f:
            f.write(info.encode('utf8'))
        print('Downloading: %s' % title)
        download_img(data_id, title + '.jpg')


if __name__ == '__main__':
    # As of 2018-07-10 there are 1744 works, so a page size of 2000 gets them all.
    scrape_urls(2000)
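
The docstring above mentions that the search page accepts extra filters such as artist. A minimal sketch of how the search URL could be built from parameters instead of hard-coded; the helper name build_search_url and its arguments are illustrative, not part of the original gist:

import requests


def build_search_url(pagesize, artist=None):
    # Let requests encode the query string, so spaces in the artist name
    # (e.g. 'Vincent van Gogh') are escaped automatically.
    params = {'q': '', 'pagesize': pagesize}
    if artist:
        params['artist'] = artist
    return requests.Request(
        'GET', 'https://www.vangoghmuseum.nl/en/search/collection', params=params
    ).prepare().url

scrape_urls() could then call requests.get(build_search_url(num, artist='Vincent van Gogh')) in place of the hard-coded string concatenation.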
@RyanEager (Author)

Saves both images and painting info.

@serycjon commented Jul 11, 2018

I have noticed weird artifacts in most of the images downloaded this way; note the misplaced stripe at the bottom. Any idea what's wrong?
(attached image: an-old-woman-of-arles)
It is always the last row of tiles that is placed incorrectly...
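
One plausible cause, not verified: when the loop in download_img moves to a new row it advances y_offset by last_y_height, but that variable is set from img_tile.size[0], which is the tile's width rather than its height. A sketch of the stitching step with the row advance taken from the previously pasted tile's height (the helper name stitch_tiles is mine; this is a guess at the cause, not a confirmed fix):

import requests
from io import BytesIO
from PIL import Image


def stitch_tiles(img_tiles, width, height):
    # Same traversal as download_img(), but the vertical offset for a new row
    # is advanced by the previous row's tile height instead of its width.
    new_img = Image.new('RGB', (width, height))
    x_offset = y_offset = last_y = last_y_height = 0
    for tile in img_tiles:
        img_tile = Image.open(BytesIO(requests.get(tile['url']).content))
        if tile['y'] != last_y:            # first tile of a new row
            last_y = tile['y']
            y_offset += last_y_height      # move down by the previous row's height
            x_offset = 0
        new_img.paste(img_tile, (x_offset, y_offset))
        x_offset += img_tile.size[0]       # advance by this tile's width
        last_y_height = img_tile.size[1]   # record the height after pasting
    return new_img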

@serycjon
You don't get all of the images, because their names are not unique; e.g. there are 14 paintings named "woman", so later downloads overwrite earlier ones.
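
One way around the collisions, sketched against the script above: suffix the slugified title with the work's data-id, which the script already scrapes and which is unique per work (the helper name unique_name is mine, not from the gist):

def unique_name(title, data_id):
    # 'woman' becomes 'woman-<data_id>', so the fourteen paintings titled
    # 'woman' no longer overwrite each other's files.
    return '%s-%s' % (title, data_id)

# In scrape_urls() the two save sites would then read:
#     with open(unique_name(title, data_id) + '.txt', 'wb') as f:
#         f.write(info.encode('utf8'))
#     download_img(data_id, unique_name(title, data_id) + '.jpg')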

@imluckie
It no longer works. I couldn't run the script successfully.
