Skip to content

Instantly share code, notes, and snippets.

@eliascotto
Last active July 18, 2021 06:30
Show Gist options
  • Save eliascotto/b83a7509345b2d2956f1e3a1df9e8cb9 to your computer and use it in GitHub Desktop.
Save eliascotto/b83a7509345b2d2956f1e3a1df9e8cb9 to your computer and use it in GitHub Desktop.
"Best Artworks of All Time" dataset downloader for kaggle - https://www.kaggle.com/ikarus777/best-artworks-of-all-time
import os, sys
import json, re
import random
import wikipedia
from urllib import request
from time import sleep
import pandas as pd
# NOTE(review): MIN_SET / MAX_SET are not referenced anywhere in the visible
# code; presumably they bound the painter ids offered by the site - confirm.
MIN_SET = 31
MAX_SET = 118

# Remote location of the per-painter folders and the metadata file inside each.
BASE_URL = 'http://artchallenge.me/painters/'
FILE_NAME = '/data.json'

# Local outputs: the CSV written by the script and the root folder for images.
OUTPUT = 'artists.csv'
IMAGE_PATH = 'images/'

# Painter ids that are fetched, in processing order.
artist_set = [
    1, 4, 7, 9, 14, 15, 17, 19, 21, 22,
    24, 26, 27, 28, 29, 30, 32, 33, 34, 35,
    36, 39, 40, 41, 42, 43, 45, 46, 49, 50,
    53, 54, 55, 57, 58, 61, 62, 63, 69, 73,
    75, 77, 79, 80, 82, 83, 94, 95, 112, 118,
]
def createDir(path):
    """Create directory *path*, including missing parents, if it is absent.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(path, exist_ok=True)
def getSummary(name):
    """Return the first line of the Wikipedia summary for *name*.

    The full name is looked up first. If Wikipedia reports an error for it,
    the lookup is retried with the second space-separated word of *name*.

    Args:
        name: Artist name to look up.

    Returns:
        The summary text up to (not including) its first newline.

    Raises:
        wikipedia.exceptions.WikipediaException: If the first lookup fails and
            *name* has no second word, or if the fallback lookup fails too.
    """
    try:
        summ = wikipedia.summary(name)
    except wikipedia.exceptions.WikipediaException:
        words = name.split(' ')
        if len(words) < 2:
            # Nothing to fall back to: surface the real lookup error instead
            # of an unrelated IndexError.
            raise
        summ = wikipedia.summary(words[1])
    # Returning here rather than from a ``finally`` clause lets a failed
    # fallback propagate instead of being masked by an UnboundLocalError.
    return summ.split('\n')[0]
def downloadImages(artist):
    """Download all paintings of *artist* and write resized copies.

    Images are fetched from ``{BASE_URL}{id}/{n}.jpg`` for ``n`` in
    ``1..paintings`` and stored as
    ``{IMAGE_PATH}{Name}/{Name}_{n}.jpg`` (spaces in the name become
    underscores). ImageMagick's ``mogrify`` then writes 60% sized copies at
    quality 82 into the ``resized/`` subfolder.

    Args:
        artist: Mapping with the keys ``'name'``, ``'id'`` and ``'paintings'``
            (the number of images to fetch).
    """
    # Local import so the block does not depend on the file-level import list.
    import subprocess

    name = artist['name']
    count = artist['paintings']
    artist_id = str(artist['id'])  # loop-invariant, so computed once
    filename = name.replace(' ', '_')
    artist_dir = f'{IMAGE_PATH}{filename}/'
    resized_dir = f'{artist_dir}resized/'

    createDir(artist_dir)
    for n in range(1, count + 1):
        url = f'{BASE_URL}{artist_id}/{n}.jpg'
        request.urlretrieve(url, f'{artist_dir}{filename}_{n}.jpg')
        # NOTE(review): sleep() takes seconds, so this waits 50-250 s per
        # image. The source's indentation was lost, so confirm both the
        # placement inside the loop and the intended unit.
        sleep(random.uniform(50, 250))

    createDir(resized_dir)
    # The artist name comes from remote JSON, so pass an argument list rather
    # than interpolating it into a shell string. The original quoted the glob,
    # so mogrify (not the shell) expanded it; that is still the case here.
    mogrify_cmd = [
        'mogrify',
        '-path', resized_dir,
        '-resize', '60%',
        '-quality', '82',
        f'{artist_dir}*.jpg',
    ]
    try:
        # check=False: like os.system before, a failing mogrify is not fatal.
        subprocess.run(mogrify_cmd, check=False)
    except OSError as err:
        # mogrify missing or not executable: keep the download best-effort.
        print(f'Could not run mogrify for {name}: {err}')

    print(f'Downloaded {name}, {count} images.')
def extractData():
    """Fetch metadata and images for every id in ``artist_set``.

    For each painter id the remote ``data.json`` is read, one row is recorded
    (including a Wikipedia summary line obtained via ``getSummary``), and the
    painter's images are downloaded via ``downloadImages``.

    Returns:
        A pandas DataFrame with one row per painter and the columns
        name, years, genre, nationality, bio, wikipedia, paintings.
    """
    columns = ['name', 'years', 'genre', 'nationality', 'bio', 'wikipedia', 'paintings']
    rows = []

    for painter_id in artist_set:
        endpoint = f'{BASE_URL}{painter_id}{FILE_NAME}'
        with request.urlopen(endpoint) as response:
            payload = response.read().decode()
        data = json.loads(payload)

        record = {
            'name': data['name'],
            'years': data['years'],
            # List-valued fields are flattened to comma-separated strings.
            'genre': ','.join(data['genre']),
            'nationality': ','.join(data['nationality']),
            # Wikipedia lookup; the original author's note on this line
            # reads "comment for download".
            'bio': getSummary(data['name']),
            'wikipedia': data['link']['wikipedia']['en'],
            'paintings': data['paintings'],
        }
        rows.append(record)

        downloadImages(data)

    return pd.DataFrame(rows, columns=columns)
if __name__ == '__main__':
    # Script entry point: make sure the image root exists, collect the artist
    # table (which also downloads the images), then write it out as CSV.
    createDir(IMAGE_PATH)
    artists = extractData()
    # Label the positional row index so it is written as the "id" column.
    artists.index.name = 'id'
    artists.to_csv(OUTPUT, sep=',')
    print('Finished!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment