# worbit/moma_img_download.py

## moma_img_download.py
# required libraries in order of appearance
import pandas as pd
import urllib
from bs4 import BeautifulSoup
from skimage import io

# Load the MoMA collection export (read as UTF-8) into a DataFrame.
df = pd.read_csv('Artworks.csv', encoding='utf-8')
# Keep just the rows whose URL column is filled in; the scraper below
# needs a page address for every row it visits.
dfurl = df.loc[df['URL'].notnull()]

# definition of method to save (first only) image for item with index i
def save_image(i):
    """Download the first image of one artwork page and save it as a PNG.

    Looks up row ``i`` (positional) of the module-level ``dfurl`` frame,
    fetches the page at that row's ``URL``, takes the first element matching
    the CSS selector ``section div img`` and, when it has a ``srcset``
    attribute, downloads the last candidate listed there and writes it to
    ``<ObjectID>.png`` in the current working directory.

    Args:
        i: Positional index into ``dfurl``.

    Returns:
        None.  Progress is reported on stdout only: the page URL is printed
        when no usable image is found, and ``ERROR <i>`` is printed when
        fetching, parsing, downloading or saving fails.

    Raises:
        IndexError: If ``i`` is outside ``dfurl``; the row lookup is
            deliberately left outside the guarded section.
    """
    # urlopen lives in urllib.request on Python 3 and directly in urllib on
    # Python 2; use whichever this interpreter provides.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    url = dfurl.URL.iloc[i]
    try:
        site = urlopen(url)
        try:
            # Name the parser explicitly so the parse result does not depend
            # on which optional parsers happen to be installed.
            soup = BeautifulSoup(site, 'html.parser')
        finally:
            # Release the connection even if parsing fails.
            site.close()
        bild = soup.select('section div img')
        # Only the first match is used; a loop would be needed to handle
        # pages that show several images.
        if bild and 'srcset' in bild[0].attrs:
            prts = bild[0]['srcset'].split(', ')
            # The last srcset entry is taken; per the original author's
            # note it is the largest rendition (2000 px), the others being
            # 320, 640, ..., 1440 px.  Each entry is "<path> <descriptor>".
            pth = prts[-1].split(' ')[0]
            # Drop a leading slash so a root-relative path does not produce
            # "http://www.moma.org//...".
            pic = io.imread('http://www.moma.org/' + pth.lstrip('/'))
            io.imsave(str(dfurl.ObjectID.iloc[i]) + '.png', pic)
        else:
            # No image (or no srcset) on this page: report the page itself.
            print(url)
    except Exception:
        # Best-effort download: report the failing index and carry on.
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        print('ERROR %s' % (i,))

# sample query: try the first 10 rows that have a URL, or all of them when
# the table holds fewer, so the positional lookup in save_image stays in range
for i in range(min(10, len(dfurl))):
    save_image(i)
	# required libraries in order of appearance
# NOTE(review): from this line on the file repeats the script above; the
# statements are restored to column 0 so that they parse.
import pandas as pd
import urllib
from bs4 import BeautifulSoup
from skimage import io

# create pandas dataframe from MoMA csv
df = pd.read_csv('Artworks.csv', encoding='utf-8')
# only take entries with a URL
dfurl = df[df.URL.notnull()]

	# definition of method to save (first only) image for item with index i
def save_image(i):
    """Download the first image of one artwork page and save it as a PNG.

    Looks up row ``i`` (positional) of the module-level ``dfurl`` frame,
    fetches the page at that row's ``URL``, takes the first element matching
    the CSS selector ``section div img`` and, when it has a ``srcset``
    attribute, downloads the last candidate listed there and writes it to
    ``<ObjectID>.png`` in the current working directory.

    Args:
        i: Positional index into ``dfurl``.

    Returns:
        None.  Progress is reported on stdout only: the page URL is printed
        when no usable image is found, and ``ERROR <i>`` is printed when
        fetching, parsing, downloading or saving fails.

    Raises:
        IndexError: If ``i`` is outside ``dfurl``; the row lookup is
            deliberately left outside the guarded section.
    """
    # urlopen lives in urllib.request on Python 3 and directly in urllib on
    # Python 2; use whichever this interpreter provides.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    url = dfurl.URL.iloc[i]
    try:
        site = urlopen(url)
        try:
            # Name the parser explicitly so the parse result does not depend
            # on which optional parsers happen to be installed.
            soup = BeautifulSoup(site, 'html.parser')
        finally:
            # Release the connection even if parsing fails.
            site.close()
        bild = soup.select('section div img')
        # Only the first match is used; a loop would be needed to handle
        # pages that show several images.
        if bild and 'srcset' in bild[0].attrs:
            prts = bild[0]['srcset'].split(', ')
            # The last srcset entry is taken; per the original author's
            # note it is the largest rendition (2000 px), the others being
            # 320, 640, ..., 1440 px.  Each entry is "<path> <descriptor>".
            pth = prts[-1].split(' ')[0]
            # Drop a leading slash so a root-relative path does not produce
            # "http://www.moma.org//...".
            pic = io.imread('http://www.moma.org/' + pth.lstrip('/'))
            io.imsave(str(dfurl.ObjectID.iloc[i]) + '.png', pic)
        else:
            # No image (or no srcset) on this page: report the page itself.
            print(url)
    except Exception:
        # Best-effort download: report the failing index and carry on.
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        print('ERROR %s' % (i,))

	# sample query
# Visit the first 10 rows that have a URL, or all of them when the table
# holds fewer, so the positional lookup in save_image stays in range.
for i in range(min(10, len(dfurl))):
    save_image(i)