michaelneu/harold-scraper.py

## harold-scraper.py
#!/usr/bin/env python3

import urllib.request
import os
import re
import sys
from concurrent.futures import ProcessPoolExecutor as PoolExecutor, as_completed
from PIL import Image

# Captures the path that follows "/t/" in every `src` attribute pointing at
# https://thumbs.dreamstime.com (group 1 runs up to the closing quote).
image_page_link_pattern = re.compile(r"src=\"https:\/\/thumbs\.dreamstime\.com\/t\/([^\"]+)")
# Captures the `href` value of the element whose class attribute is exactly
# "next-button-v2 black" and is immediately followed by the href.
next_page_link_pattern = re.compile(r"class=\"next-button-v2 black\" href=\"([^\"]+)\"")

def log_debug(message):
    """Print *message* to stdout behind the "[*]" marker."""
    marker = "[*]"
    print(marker, message)

def log_info(message):
    """Print *message* to stdout behind the "[+]" marker."""
    marker = "[+]"
    print(marker, message)

def read_webpage(url, timeout=30):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to request; handed unchanged to
            ``urllib.request.urlopen``.
        timeout: Seconds to wait on a blocking socket operation before
            giving up. Without it a stalled server would block the calling
            worker indefinitely.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails.
        OSError: On lower-level network failures, including timeouts.
    """
    log_debug("getting '%s'" % url)

    with urllib.request.urlopen(url, timeout=timeout) as page:
        return page.read()

def find_image_pages(url):
    """Collect image URLs from *url* and from every page reached via "next".

    Each page is downloaded with ``read_webpage``; every match of
    ``image_page_link_pattern`` is rewritten to a
    ``https://thumbs.dreamstime.com/z/<name>`` URL, and the link matched by
    ``next_page_link_pattern`` (if any) is followed.

    Args:
        url: Address of the first result page.

    Returns:
        A list of image URLs in the order they were found.

    Raises:
        Whatever ``read_webpage`` raises when a page cannot be fetched.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    urls = []
    visited = set()
    page_url = url

    # Iterating instead of recursing keeps the stack flat for long result
    # sets; the visited set stops a pagination loop from running forever.
    while page_url not in visited:
        visited.add(page_url)
        log_info("searching images on '%s'" % page_url)

        # The patterns are pure ASCII, so replacing undecodable bytes cannot
        # hide a match but does stop one stray byte from aborting the crawl.
        html = read_webpage(page_url).decode("utf-8", errors="replace")
        filenames = image_page_link_pattern.findall(html)
        urls.extend("https://thumbs.dreamstime.com/z/%s" % filename for filename in filenames)

        next_page_match = next_page_link_pattern.search(html)
        if not next_page_match:
            break

        log_debug("following 'next' button")
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        page_url = urljoin(page_url, next_page_match.group(1))

    return urls

def partition_list(list, partition_count):
    """Split *list* into at most *partition_count* contiguous chunks.

    Every chunk except possibly the last has the same length, namely the
    ceiling of ``len(list) / partition_count``.

    Args:
        list: Sequence to split; it must support ``len()`` and slicing. The
            name shadows the builtin but is kept so keyword callers still
            work.
        partition_count: Maximum number of chunks to produce; at least 1.

    Returns:
        A list of slices of *list* in their original order. An empty input
        yields an empty list.

    Raises:
        ValueError: If *partition_count* is smaller than 1.
    """
    if partition_count < 1:
        raise ValueError("partition_count must be at least 1")

    # Ceiling division. The former "floor + 1" overshot whenever the length
    # was an exact multiple of partition_count, leaving workers idle (e.g.
    # 64 items over 32 workers became 22 chunks of 3). max() keeps the range
    # step valid for an empty input.
    partition_size = max(1, -(-len(list) // partition_count))
    return [list[i:i + partition_size] for i in range(0, len(list), partition_size)]

def download_images(urls, download_folder):
    """Download every ``(index, url)`` pair into *download_folder*.

    Each file is named ``"<index, zero-padded to 3 digits>-<basename of url>"``.
    The download is best effort: a failure is logged and skipped so the
    remaining URLs are still processed.

    Args:
        urls: Iterable of ``(index, url)`` pairs.
        download_folder: Directory that receives the files.

    Returns:
        The number of images that were fetched and written successfully.
    """
    successful_download_count = 0

    for index, url in urls:
        log_debug("downloading image %d at '%s'" % (index, url))

        filename = "%03d-%s" % (index, os.path.basename(url))
        path = os.path.join(download_folder, filename)

        try:
            # Fetch before opening the target so a failed request does not
            # leave an empty file behind in the download folder.
            image_bytes = read_webpage(url)
            with open(path, "wb") as image_file:
                image_file.write(image_bytes)
        except Exception as error:
            # Unlike a bare except this lets KeyboardInterrupt/SystemExit
            # through; the failure is reported instead of silently dropped.
            log_debug("failed to download '%s': %s" % (url, error))
            continue

        successful_download_count += 1

    return successful_download_count

def remove_watermark_strip(filenames):
    """Crop a 10% strip off each image and overwrite the file in place.

    When an image is wider than it is tall, the bottom 10% of its height is
    removed; otherwise the right-hand 10% of its width is removed. The work
    is best effort: a file that cannot be opened, cropped or saved is logged
    and skipped.

    Args:
        filenames: Iterable of image file paths.

    Returns:
        The number of images that were cropped and saved successfully.
    """
    removed_count = 0
    watermark_strip_size_percentage = 0.1

    for filename in filenames:
        log_debug("removing watermark strip from '%s'" % filename)

        try:
            # Opening inside the try keeps one unreadable file from aborting
            # the whole batch; the with-block releases the file handle.
            with Image.open(filename) as image:
                width, height = image.size

                if width > height:
                    watermark_strip_size = int(height * watermark_strip_size_percentage)
                    box = (0, 0, width, height - watermark_strip_size)
                else:
                    watermark_strip_size = int(width * watermark_strip_size_percentage)
                    box = (0, 0, width - watermark_strip_size, height)

                cropped = image.crop(box)

            # Saved after the source handle is closed because the target is
            # the very same path.
            cropped.save(filename)
        except Exception as error:
            log_debug("could not remove watermark strip from '%s': %s" % (filename, error))
            continue

        removed_count += 1

    return removed_count

def main():
    """Scrape the image URLs, download them in parallel and crop each file.

    Steps: make sure the download folder exists, collect the image URLs
    starting from the hard-coded start page, download them in partitions
    across a worker pool, then run the watermark-strip removal over every
    regular file found in the download folder.
    """
    worker_count = 32
    download_folder = "harold"
    start_url = "http://www.dreamstime.com/same-stock-photo-model-image33471644"

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(download_folder, exist_ok=True)

    image_urls = find_image_pages(start_url)
    log_debug("found %d image urls" % len(image_urls))

    with PoolExecutor(worker_count) as executor:
        image_urls_with_indices = list(enumerate(image_urls))
        image_url_partitions = partition_list(image_urls_with_indices, worker_count)
        futures = [executor.submit(download_images, partition, download_folder) for partition in image_url_partitions]

        image_count = sum(future.result() for future in as_completed(futures))

        log_info("downloaded %d out of %d images" % (image_count, len(image_urls)))

    # Only regular files are handed to the cropping step; a subdirectory in
    # the download folder cannot be opened as an image.
    # NOTE(review): this also picks up files left over from earlier runs, so
    # they are cropped again - confirm whether that is intended.
    paths = [os.path.join(download_folder, filename) for filename in os.listdir(download_folder)]
    files = [path for path in paths if os.path.isfile(path)]
    file_partitions = partition_list(files, worker_count)

    with PoolExecutor(worker_count) as executor:
        futures = [executor.submit(remove_watermark_strip, partition) for partition in file_partitions]

        removed_count = sum(future.result() for future in as_completed(futures))

        log_info("removed watermark strips from %d images" % removed_count)


if __name__ == "__main__":
    main()

## requirements.txt
Pillow==5.3.0
	#!/usr/bin/env python3

	import urllib.request
	import os
	import re
	import sys
	from concurrent.futures import ProcessPoolExecutor as PoolExecutor, as_completed
	from PIL import Image

	# Captures the path that follows "/t/" in every `src` attribute pointing at
	# https://thumbs.dreamstime.com (group 1 runs up to the closing quote).
	image_page_link_pattern = re.compile(r"src=\"https:\/\/thumbs\.dreamstime\.com\/t\/([^\"]+)")
	# Captures the `href` value of the element whose class attribute is exactly
	# "next-button-v2 black" and is immediately followed by the href.
	next_page_link_pattern = re.compile(r"class=\"next-button-v2 black\" href=\"([^\"]+)\"")

	def log_debug(message):
	    """Print *message* to stdout behind the "[*]" marker."""
	    # Body indentation restored; it had been flattened to the def's level.
	    print("[*]", message)

	def log_info(message):
	    """Print *message* to stdout behind the "[+]" marker."""
	    # Body indentation restored; it had been flattened to the def's level.
	    print("[+]", message)

	def read_webpage(url, timeout=30):
	    """Fetch *url* and return the raw response body.

	    Args:
	        url: Address to request; handed unchanged to
	            ``urllib.request.urlopen``.
	        timeout: Seconds to wait on a blocking socket operation before
	            giving up. Without it a stalled server would block the calling
	            worker indefinitely.

	    Returns:
	        The complete response body as ``bytes``.

	    Raises:
	        urllib.error.URLError: If the request fails.
	        OSError: On lower-level network failures, including timeouts.
	    """
	    # Nested indentation restored; the flattened original did not parse.
	    log_debug("getting '%s'" % url)

	    with urllib.request.urlopen(url, timeout=timeout) as page:
	        return page.read()

	def find_image_pages(url):
	    """Collect image URLs from *url* and from every page reached via "next".

	    Each page is downloaded with ``read_webpage``; every match of
	    ``image_page_link_pattern`` is rewritten to a
	    ``https://thumbs.dreamstime.com/z/<name>`` URL, and the link matched by
	    ``next_page_link_pattern`` (if any) is followed.

	    Args:
	        url: Address of the first result page.

	    Returns:
	        A list of image URLs in the order they were found.

	    Raises:
	        Whatever ``read_webpage`` raises when a page cannot be fetched.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import urljoin

	    urls = []
	    visited = set()
	    page_url = url

	    # Iterating instead of recursing keeps the stack flat for long result
	    # sets; the visited set stops a pagination loop from running forever.
	    while page_url not in visited:
	        visited.add(page_url)
	        log_info("searching images on '%s'" % page_url)

	        # The patterns are pure ASCII, so replacing undecodable bytes cannot
	        # hide a match but does stop one stray byte from aborting the crawl.
	        html = read_webpage(page_url).decode("utf-8", errors="replace")
	        filenames = image_page_link_pattern.findall(html)
	        urls.extend("https://thumbs.dreamstime.com/z/%s" % filename for filename in filenames)

	        next_page_match = next_page_link_pattern.search(html)
	        if not next_page_match:
	            break

	        log_debug("following 'next' button")
	        # urljoin leaves absolute hrefs untouched and resolves relative ones.
	        page_url = urljoin(page_url, next_page_match.group(1))

	    return urls

	def partition_list(list, partition_count):
	    """Split *list* into at most *partition_count* contiguous chunks.

	    Every chunk except possibly the last has the same length, namely the
	    ceiling of ``len(list) / partition_count``.

	    Args:
	        list: Sequence to split; it must support ``len()`` and slicing. The
	            name shadows the builtin but is kept so keyword callers still
	            work.
	        partition_count: Maximum number of chunks to produce; at least 1.

	    Returns:
	        A list of slices of *list* in their original order. An empty input
	        yields an empty list.

	    Raises:
	        ValueError: If *partition_count* is smaller than 1.
	    """
	    if partition_count < 1:
	        raise ValueError("partition_count must be at least 1")

	    # Ceiling division. The former "floor + 1" overshot whenever the length
	    # was an exact multiple of partition_count, leaving workers idle.
	    # max() keeps the range step valid for an empty input.
	    partition_size = max(1, -(-len(list) // partition_count))
	    return [list[i:i + partition_size] for i in range(0, len(list), partition_size)]

	def download_images(urls, download_folder):
	    """Download every ``(index, url)`` pair into *download_folder*.

	    Each file is named ``"<index, zero-padded to 3 digits>-<basename of url>"``.
	    The download is best effort: a failure is logged and skipped so the
	    remaining URLs are still processed.

	    Args:
	        urls: Iterable of ``(index, url)`` pairs.
	        download_folder: Directory that receives the files.

	    Returns:
	        The number of images that were fetched and written successfully.
	    """
	    successful_download_count = 0

	    for index, url in urls:
	        log_debug("downloading image %d at '%s'" % (index, url))

	        filename = "%03d-%s" % (index, os.path.basename(url))
	        path = os.path.join(download_folder, filename)

	        try:
	            # Fetch before opening the target so a failed request does not
	            # leave an empty file behind in the download folder.
	            image_bytes = read_webpage(url)
	            with open(path, "wb") as image_file:
	                image_file.write(image_bytes)
	        except Exception as error:
	            # Unlike a bare except this lets KeyboardInterrupt/SystemExit
	            # through; the failure is reported instead of silently dropped.
	            log_debug("failed to download '%s': %s" % (url, error))
	            continue

	        successful_download_count += 1

	    return successful_download_count

	def remove_watermark_strip(filenames):
	    """Crop a 10% strip off each image and overwrite the file in place.

	    When an image is wider than it is tall, the bottom 10% of its height is
	    removed; otherwise the right-hand 10% of its width is removed. The work
	    is best effort: a file that cannot be opened, cropped or saved is logged
	    and skipped.

	    Args:
	        filenames: Iterable of image file paths.

	    Returns:
	        The number of images that were cropped and saved successfully.
	    """
	    removed_count = 0
	    watermark_strip_size_percentage = 0.1

	    for filename in filenames:
	        log_debug("removing watermark strip from '%s'" % filename)

	        try:
	            # Opening inside the try keeps one unreadable file from aborting
	            # the whole batch; the with-block releases the file handle.
	            with Image.open(filename) as image:
	                width, height = image.size

	                if width > height:
	                    watermark_strip_size = int(height * watermark_strip_size_percentage)
	                    box = (0, 0, width, height - watermark_strip_size)
	                else:
	                    watermark_strip_size = int(width * watermark_strip_size_percentage)
	                    box = (0, 0, width - watermark_strip_size, height)

	                cropped = image.crop(box)

	            # Saved after the source handle is closed because the target is
	            # the very same path.
	            cropped.save(filename)
	        except Exception as error:
	            log_debug("could not remove watermark strip from '%s': %s" % (filename, error))
	            continue

	        removed_count += 1

	    return removed_count

	def main():
	    """Scrape the image URLs, download them in parallel and crop each file.

	    Steps: make sure the download folder exists, collect the image URLs
	    starting from the hard-coded start page, download them in partitions
	    across a worker pool, then run the watermark-strip removal over every
	    regular file found in the download folder.
	    """
	    worker_count = 32
	    download_folder = "harold"
	    start_url = "http://www.dreamstime.com/same-stock-photo-model-image33471644"

	    # exist_ok avoids the check-then-create race of exists() + mkdir().
	    os.makedirs(download_folder, exist_ok=True)

	    image_urls = find_image_pages(start_url)
	    log_debug("found %d image urls" % len(image_urls))

	    with PoolExecutor(worker_count) as executor:
	        image_urls_with_indices = list(enumerate(image_urls))
	        image_url_partitions = partition_list(image_urls_with_indices, worker_count)
	        futures = [executor.submit(download_images, partition, download_folder) for partition in image_url_partitions]

	        image_count = sum(future.result() for future in as_completed(futures))

	        log_info("downloaded %d out of %d images" % (image_count, len(image_urls)))

	    # Only regular files are handed to the cropping step; a subdirectory in
	    # the download folder cannot be opened as an image.
	    # NOTE(review): this also picks up files left over from earlier runs, so
	    # they are cropped again - confirm whether that is intended.
	    paths = [os.path.join(download_folder, filename) for filename in os.listdir(download_folder)]
	    files = [path for path in paths if os.path.isfile(path)]
	    file_partitions = partition_list(files, worker_count)

	    with PoolExecutor(worker_count) as executor:
	        futures = [executor.submit(remove_watermark_strip, partition) for partition in file_partitions]

	        removed_count = sum(future.result() for future in as_completed(futures))

	        log_info("removed watermark strips from %d images" % removed_count)


	if __name__ == "__main__":
	    main()