Skip to content

Instantly share code, notes, and snippets.

@Riksi
Last active July 22, 2022 08:34
Show Gist options
  • Save Riksi/a01d8835648f47cc24c658c4b2a7f1e7 to your computer and use it in GitHub Desktop.
Save Riksi/a01d8835648f47cc24c658c4b2a7f1e7 to your computer and use it in GitHub Desktop.
FID files
import glob
import os
import pickle
import time

import requests
from bs4 import BeautifulSoup as Soup
from PIL import Image
def _flower_classes(text):
    """Return the sorted, de-duplicated flower classes mentioned in *text*.

    Matching is a plain substring test, so a keyword embedded in a longer
    word (e.g. 'rose' inside 'prose') also counts. 'daisies' and
    'marguerite' are folded into the 'daisy' class.
    """
    aliases = {'daisies': 'daisy', 'marguerite': 'daisy'}
    keywords = ('rose', 'daisy', 'tulip', 'sunflower',
                'marguerite', 'dandelion', 'daisies')
    return sorted({aliases.get(kw, kw) for kw in keywords if kw in text})


def download_save(link, timeout=30):
    """Download the artwork image behind *link* into ``data/<class>/``.

    The page's ``h1.artwork-title`` text (plus the ``ul.tags`` text when
    present) is searched for flower keywords. If exactly one class is found
    the image goes into ``data/<class>/``, otherwise into ``data/mixed/``.
    The file is named after the last path segment of *link*. When a file
    with that name already exists under any ``data/*/`` folder, nothing is
    downloaded and the existing file is used.

    Args:
        link: URL of the artwork page.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        ``(height, width)`` of the stored image on success, or *link*
        itself when any step fails, so the caller can tell the two apart
        with ``isinstance(result, tuple)``.
    """
    # One link is one unit of work: any failure (network, unexpected page
    # layout, unreadable image) is reported and turned into a "failed link"
    # result rather than aborting the whole batch.
    try:
        page = requests.get(link, timeout=timeout)
        page.raise_for_status()
        # Tolerate a trailing slash so the file name is never empty.
        name = link.rstrip('/').split('/')[-1]
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = Soup(page.text, 'html.parser')
        title = soup.find('h1', class_='artwork-title')
        text = title.text.lower()
        tags = soup.find('ul', class_='tags')
        if tags is not None:
            text += tags.text.lower()

        # Escape the name so characters such as '[' are matched literally.
        existing = glob.glob('data/*/{}.jpg'.format(glob.escape(name)))
        if existing:
            print('Saved already {}'.format(name))
            img_path = existing[0]
        else:
            classes = _flower_classes(text)
            folder = classes[0] if len(classes) == 1 else 'mixed'
            src = soup.find('div', class_='single_img').img['src']
            img_res = requests.get(src, timeout=timeout)
            # Do not store an HTTP error page under a .jpg name.
            img_res.raise_for_status()
            # Create the folder only once there is something to put in it.
            path = os.path.join('data', folder)
            os.makedirs(path, exist_ok=True)
            img_path = os.path.join(path, name + '.jpg')
            with open(img_path, 'wb') as out:
                out.write(img_res.content)
            print('Saved {}, found class {}'.format(name, ', '.join(classes)))

        # PIL reports size as (width, height); callers get (height, width).
        with Image.open(img_path) as img:
            width, height = img.size
    except Exception as err:
        print(err)
        print('Failed to download from {}'.format(link))
        return link
    return (height, width)
if __name__ == '__main__':
    # Script entry point: download every link listed in links.txt, then
    # pickle the image shapes of the successes and the links that failed.
    with open('links.txt') as f:
        # str.split() with no arguments already drops all whitespace and
        # never yields empty strings, so no further stripping is needed.
        links = f.read().split()

    shapes = []
    failures = []
    for i, link in enumerate(links):
        result = download_save(link)
        # download_save returns a shape tuple on success, the link on failure.
        if isinstance(result, tuple):
            shapes.append(result)
        else:
            failures.append(result)
        # Pause for a second on every 50th link (including the first).
        if i % 50 == 0:
            time.sleep(1)

    # pickle.dump needs a binary file opened for writing.
    with open('shapes.pkl', 'wb') as f:
        pickle.dump(obj=shapes, file=f)
    with open('failures.pkl', 'wb') as f:
        pickle.dump(obj=failures, file=f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment