
@RyanEager
Last active May 18, 2023 06:52
Scrape full-sized images from vangoghmuseum.nl
"""
Scrape full-sized images from vangoghmuseum.nl
---- requirements.txt ---------------------------------------------------------
beautifulsoup4==4.6.0
bs4==0.0.1
certifi==2018.4.16
chardet==3.0.4
idna==2.7
Pillow==5.2.0
python-slugify==1.2.5
requests==2.19.1
Unidecode==1.0.22
urllib3==1.23
-------------------------------------------------------------------------------
"""
import re
import requests
from io import BytesIO
from PIL import Image
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from slugify import slugify


def download_img(id_val, file_name):
    # levels[0] of the asset server's response holds the tile grid plus the
    # width/height of the stitched full-sized image.
    response = requests.get('https://vangoghmuseum-assetserver.appspot.com/tiles?id=%s' % id_val)
    data = response.json()
    img_tiles = data['levels'][0]['tiles']
    width = data['levels'][0]['width']
    height = data['levels'][0]['height']
    new_img = Image.new('RGB', (width, height))
    x_offset = 0
    y_offset = 0
    last_y = 0
    last_y_height = 0
    for tile in img_tiles:
        img_rsp = requests.get(tile['url'])
        img_tile = Image.open(BytesIO(img_rsp.content))
        last_y_height = img_tile.size[0]
        # a changed 'y' value means this tile starts a new row: move the paste
        # position down and back to the left edge
        if tile['y'] != last_y:
            last_y = tile['y']
            y_offset += last_y_height
            x_offset = 0
        new_img.paste(img_tile, (x_offset, y_offset))
        x_offset += img_tile.size[0]
    new_img.save(file_name)

def scrape_urls(num):
    '''
    Change the URL below to limit the scrape to a single artist or to other
    filters on the search page, e.g.
    https://www.vangoghmuseum.nl/en/search/collection?q=&artist=Vincent%20van%20Gogh&pagesize=
    to get just van Gogh's own work (a parameterised variant is sketched after
    the script).
    '''
    resp = requests.get("https://www.vangoghmuseum.nl/en/search/collection?q=&pagesize=" + str(num))
    # prefer the encoding declared in the HTML itself, fall back to the HTTP header
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=encoding)
    # collect the links to the individual collection pages from the search results
    urls = []
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('/en/collection/'):
            urls.append(link['href'])
    for url in urls:
        resp = requests.get('https://www.vangoghmuseum.nl' + url)
        http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding=encoding)
        title = slugify(soup.find('a', attrs={'name': 'info'}).contents[0])
        data_id = soup.find(attrs={'data-id': re.compile(r"\d+")})['data-id']
        info = soup.find('div', attrs={'data-role': 'info'}).getText()
        # clean up info text
        info = re.sub(r'\n{3,}', '\n', info)
        info = re.sub(r'Search in the collection:.*(?=\nObject data)', '', info, flags=re.DOTALL)
        with open(title + '.txt', 'wb') as f:
            f.write(info.encode('utf8'))
        print('Downloading: %s' % title)
        download_img(data_id, title + '.jpg')


if __name__ == '__main__':
    # As of 2018-07-10 there are 1744 works, so a page size of 2000 gets them all.
    scrape_urls(2000)
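
The docstring above mentions that the search page accepts extra filters such as artist. A minimal sketch of how the search URL could be built from parameters instead of hard-coded; the helper name build_search_url and its arguments are illustrative, not part of the original gist:

import requests


def build_search_url(pagesize, artist=None):
    # Let requests encode the query string, so spaces in the artist name
    # (e.g. 'Vincent van Gogh') are escaped automatically.
    params = {'q': '', 'pagesize': pagesize}
    if artist:
        params['artist'] = artist
    return requests.Request(
        'GET', 'https://www.vangoghmuseum.nl/en/search/collection', params=params
    ).prepare().url

scrape_urls() could then call requests.get(build_search_url(num, artist='Vincent van Gogh')) in place of the hard-coded string concatenation.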
@RyanEager (Author)

Saves both images and painting info.

@serycjon commented Jul 11, 2018

I have noticed weird artifacts in most of the images downloaded this way; note the misplaced stripe at the bottom. Any idea what's wrong?
(attached image: an-old-woman-of-arles)
It is always the last row of tiles that is placed incorrectly...
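
One plausible cause, not verified: when the loop in download_img moves to a new row it advances y_offset by last_y_height, but that variable is set from img_tile.size[0], which is the tile's width rather than its height. A sketch of the stitching step with the row advance taken from the previously pasted tile's height (the helper name stitch_tiles is mine; this is a guess at the cause, not a confirmed fix):

import requests
from io import BytesIO
from PIL import Image


def stitch_tiles(img_tiles, width, height):
    # Same traversal as download_img(), but the vertical offset for a new row
    # is advanced by the previous row's tile height instead of its width.
    new_img = Image.new('RGB', (width, height))
    x_offset = y_offset = last_y = last_y_height = 0
    for tile in img_tiles:
        img_tile = Image.open(BytesIO(requests.get(tile['url']).content))
        if tile['y'] != last_y:            # first tile of a new row
            last_y = tile['y']
            y_offset += last_y_height      # move down by the previous row's height
            x_offset = 0
        new_img.paste(img_tile, (x_offset, y_offset))
        x_offset += img_tile.size[0]       # advance by this tile's width
        last_y_height = img_tile.size[1]   # record the height after pasting
    return new_img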

@serycjon
You don't get all of the images, because their names are not unique; e.g. there are 14 paintings named "woman", so later downloads overwrite earlier ones.
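
One way around the collisions, sketched against the script above: suffix the slugified title with the work's data-id, which the script already scrapes and which is unique per work (the helper name unique_name is mine, not from the gist):

def unique_name(title, data_id):
    # 'woman' becomes 'woman-<data_id>', so the fourteen paintings titled
    # 'woman' no longer overwrite each other's files.
    return '%s-%s' % (title, data_id)

# In scrape_urls() the two save sites would then read:
#     with open(unique_name(title, data_id) + '.txt', 'wb') as f:
#         f.write(info.encode('utf8'))
#     download_img(data_id, unique_name(title, data_id) + '.jpg')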

@imluckie
It no longer works. I couldn't run the script successfully.
