Skip to content

Instantly share code, notes, and snippets.

@worbit
Last active January 17, 2023 14:54
Show Gist options
  • Save worbit/e22c43059b0ffedbee63 to your computer and use it in GitHub Desktop.
Save worbit/e22c43059b0ffedbee63 to your computer and use it in GitHub Desktop.
Download images from moma.org
# required libraries in order of appearance
import pandas as pd
import urllib
from bs4 import BeautifulSoup
from skimage import io
# Load the MoMA artworks table from the CSV export in the working directory.
df = pd.read_csv(
    'Artworks.csv',
    encoding='utf-8',
)
# Keep only the rows whose URL column is populated.
dfurl = df.dropna(subset=['URL'])
# definition of method to save (first only) image for item with index i
def save_image(i):
    """Download the first image of the i-th artwork that has a URL.

    Fetches the artwork page at ``dfurl.URL.iloc[i]``, takes the first
    element matching the CSS selector ``section div img`` and, if it carries
    a ``srcset`` attribute, downloads the last candidate listed there and
    saves it as ``<ObjectID>.png`` in the current working directory.

    This is a best-effort helper: any failure while fetching, parsing or
    saving is reported on stdout as ``ERROR <i>`` and does not propagate.
    When the page offers no usable image, the page URL is printed instead.

    Args:
        i: Positional (``iloc``) index into the module-level ``dfurl`` frame.

    Raises:
        IndexError: If ``i`` is out of range for ``dfurl``; the lookup is
            done outside the error handler, so this is not swallowed.
    """
    # urlopen/urljoin live in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlopen
        from urllib.parse import urljoin
    except ImportError:
        from urllib import urlopen
        from urlparse import urljoin

    url = dfurl.URL.iloc[i]
    try:
        site = urlopen(url)
        try:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(site, 'html.parser')
        finally:
            # Release the HTTP connection even if parsing fails.
            site.close()
        bild = soup.select('section div img')
        # Only the first match is used; pages with several images would
        # need a loop here.
        if not bild or 'srcset' not in bild[0].attrs:
            # if no image available
            print(url)
            return
        prts = bild[0]['srcset'].split(', ')
        # Per the original author's note, prts[-1] is the biggest
        # resolution (2000 px) available; alternative resolutions are
        # 320, 640, ..., 1440 px.
        pth = prts[-1].split(' ')[0]
        # urljoin copes with root-relative ('/media/...') and absolute
        # paths, which plain string concatenation does not.
        pic = io.imread(urljoin('http://www.moma.org/', pth))
        io.imsave(str(dfurl.ObjectID.iloc[i]) + '.png', pic)
    except Exception:
        # if loading unsuccessful; Exception (not a bare except) so that
        # Ctrl-C and interpreter shutdown still get through
        print('ERROR %s' % i)
# Sample run: call save_image for the indices 0 through 9.
for index in range(10):
    save_image(index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment