Skip to content

Instantly share code, notes, and snippets.

@michaelneu
Last active January 3, 2019 16:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michaelneu/b8a192687487aa044c3f07b3a57d4cc2 to your computer and use it in GitHub Desktop.
Save michaelneu/b8a192687487aa044c3f07b3a57d4cc2 to your computer and use it in GitHub Desktop.
Scrape all "Hide the Pain Harold" stock images from Dreamstime
#!/usr/bin/env python3
import urllib.request
import os
import re
import sys
from concurrent.futures import ProcessPoolExecutor as PoolExecutor, as_completed
from PIL import Image
# Matches a thumbnail `src="https://thumbs.dreamstime.com/t/..."` attribute and
# captures everything after the "/t/" segment up to the closing quote.
image_page_link_pattern = re.compile(r"src=\"https:\/\/thumbs\.dreamstime\.com\/t\/([^\"]+)")
# Matches the pagination link carrying `class="next-button-v2 black"` and
# captures its href (the address of the next result page).
next_page_link_pattern = re.compile(r"class=\"next-button-v2 black\" href=\"([^\"]+)\"")
def log_debug(message):
    """Print ``message`` to stdout behind the ``[*]`` debug marker."""
    marker = "[*]"
    print(marker, message)
def log_info(message):
    """Print ``message`` to stdout behind the ``[+]`` info marker."""
    marker = "[+]"
    print(marker, message)
def read_webpage(url, timeout=30):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request; handed unchanged to ``urllib.request.urlopen``.
        timeout: Seconds to wait on blocking network operations. Pass ``None``
            to wait without a limit.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``read()`` raise on failure
        (for example ``urllib.error.URLError``); nothing is caught here.
    """
    log_debug("getting '%s'" % url)
    # Without a timeout a single stalled connection would block its caller
    # (and, in a worker pool, that worker) indefinitely.
    with urllib.request.urlopen(url, timeout=timeout) as page:
        return page.read()
def find_image_pages(url):
    """Collect image URLs from ``url`` and every page reachable via "next".

    Each page is fetched with ``read_webpage`` and decoded as UTF-8. Every
    thumbnail match of ``image_page_link_pattern`` is rewritten to the
    corresponding ``https://thumbs.dreamstime.com/z/...`` address. Pagination
    is followed through ``next_page_link_pattern`` until a page has no "next"
    link or a page that was already visited comes up again.

    Args:
        url: Address of the first result page.

    Returns:
        A list of image URLs in the order they were encountered.
    """
    # Local import keeps this block self-contained; urllib.parse is stdlib.
    from urllib.parse import urljoin

    urls = []
    visited = set()
    page_url = url
    # Iterating instead of recursing keeps the stack flat no matter how many
    # pages exist, and the visited set stops a cyclic "next" link from looping.
    while page_url not in visited:
        visited.add(page_url)
        log_info("searching images on '%s'" % page_url)
        html = read_webpage(page_url).decode("utf-8")
        urls.extend(
            "https://thumbs.dreamstime.com/z/%s" % filename
            for filename in image_page_link_pattern.findall(html)
        )
        next_page_match = next_page_link_pattern.search(html)
        if not next_page_match:
            break
        log_debug("following 'next' button")
        # Resolve against the current page so a relative href still works;
        # absolute hrefs pass through urljoin essentially unchanged.
        page_url = urljoin(page_url, next_page_match.group(1))
    return urls
def partition_list(list, partition_count):
    """Split a sequence into at most ``partition_count`` contiguous slices.

    All slices have the same length except possibly the last, which holds the
    remainder. Order is preserved.

    Args:
        list: Sliceable sequence to split. (The name shadows the builtin but is
            kept so existing keyword callers keep working.)
        partition_count: Upper bound on the number of slices; must be >= 1.

    Returns:
        A list of slices of the input; empty when the input is empty.

    Raises:
        ValueError: If ``partition_count`` is smaller than 1.
    """
    if partition_count < 1:
        raise ValueError("partition_count must be at least 1, got %r" % (partition_count,))
    item_count = len(list)
    if item_count == 0:
        return []
    # Ceiling division. The previous `len // count + 1` oversized every slice
    # whenever the length divided evenly, yielding far fewer slices than
    # requested (e.g. 64 items / 32 -> 22 slices of 3 instead of 32 of 2).
    partition_size = -(-item_count // partition_count)
    return [list[start:start + partition_size] for start in range(0, item_count, partition_size)]
def download_images(urls, download_folder):
    """Download every indexed image URL into ``download_folder``.

    Each file is named ``<index zero-padded to 3 digits>-<basename of url>``.
    Failures are logged and skipped so one bad image does not stop the rest.

    Args:
        urls: Iterable of ``(index, url)`` pairs.
        download_folder: Directory the files are written into.

    Returns:
        The number of images that were fetched and written successfully.
    """
    successful_download_count = 0
    for index, url in urls:
        log_debug("downloading image %d at '%s'" % (index, url))
        filename = "%03d-%s" % (index, os.path.basename(url))
        path = os.path.join(download_folder, filename)
        try:
            # Fetch first, open second: a failed request must not leave an
            # empty file behind for later processing steps to trip over.
            image_bytes = read_webpage(url)
            with open(path, "wb") as image_file:
                image_file.write(image_bytes)
        except Exception as error:
            # Best effort per image, but report the failure instead of hiding
            # it. `Exception` (not bare `except`) lets Ctrl-C still interrupt.
            log_debug("failed to download '%s': %s" % (url, error))
            continue
        successful_download_count += 1
    return successful_download_count
def remove_watermark_strip(filenames):
    """Crop a 10% strip off each image file and overwrite the file in place.

    For landscape images (width > height) the strip is removed from the bottom
    edge; otherwise it is removed from the right edge. Files that cannot be
    opened, cropped or saved are logged and skipped.

    Args:
        filenames: Iterable of paths to image files.

    Returns:
        The number of files that were cropped and saved successfully.
    """
    removed_count = 0
    watermark_strip_size_percentage = 0.1
    for filename in filenames:
        log_debug("removing watermark strip from '%s'" % filename)
        try:
            # Opening inside the try keeps one unreadable file (e.g. an empty
            # leftover from a failed download) from aborting the whole batch;
            # the context manager releases the file handle promptly.
            with Image.open(filename) as image:
                width, height = image.size
                if width > height:
                    watermark_strip_size = int(height * watermark_strip_size_percentage)
                    box = (0, 0, width, height - watermark_strip_size)
                else:
                    watermark_strip_size = int(width * watermark_strip_size_percentage)
                    box = (0, 0, width - watermark_strip_size, height)
                cropped = image.crop(box)
            cropped.save(filename)
        except Exception as error:
            # Best effort per file, but report the failure instead of hiding
            # it. `Exception` (not bare `except`) lets Ctrl-C still interrupt.
            log_debug("could not remove watermark strip from '%s': %s" % (filename, error))
            continue
        removed_count += 1
    return removed_count
def main():
    """Scrape the image URLs, download them in parallel, then crop watermarks.

    Work is split into at most ``worker_count`` partitions, each handled by
    one task in a process pool. Progress and totals are printed through
    ``log_debug`` / ``log_info``.
    """
    worker_count = 32
    download_folder = "harold"
    start_url = "http://www.dreamstime.com/same-stock-photo-model-image33471644"

    # exist_ok replaces the check-then-create sequence, which could race with
    # another process creating the folder in between.
    os.makedirs(download_folder, exist_ok=True)

    image_urls = find_image_pages(start_url)
    log_debug("found %d image urls" % len(image_urls))

    # Keep each URL's global index so file names stay unique across partitions.
    image_urls_with_indices = list(enumerate(image_urls))
    image_url_partitions = partition_list(image_urls_with_indices, worker_count)
    with PoolExecutor(worker_count) as executor:
        futures = [
            executor.submit(download_images, partition, download_folder)
            for partition in image_url_partitions
        ]
        image_count = sum(future.result() for future in as_completed(futures))
    log_info("downloaded %d out of %d images" % (image_count, len(image_urls)))

    # Every entry in the folder is processed, not only this run's downloads.
    files = [os.path.join(download_folder, filename) for filename in os.listdir(download_folder)]
    file_partitions = partition_list(files, worker_count)
    with PoolExecutor(worker_count) as executor:
        futures = [executor.submit(remove_watermark_strip, partition) for partition in file_partitions]
        removed_count = sum(future.result() for future in as_completed(futures))
    log_info("removed watermark strips from %d images" % removed_count)


# The guard is required: process pools may re-import this module in workers.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment