Last active
January 3, 2019 16:15
-
-
Save michaelneu/b8a192687487aa044c3f07b3a57d4cc2 to your computer and use it in GitHub Desktop.
Scrape all "Hide the Pain Harold" images from Dreamstime
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import urllib.request | |
import os | |
import re | |
import sys | |
from concurrent.futures import ProcessPoolExecutor as PoolExecutor, as_completed | |
from PIL import Image | |
# Captures the filename part of each thumbnail ``src`` attribute that points
# at the ``/t/`` path on thumbs.dreamstime.com.
image_page_link_pattern = re.compile(
    r"src=\"https:\/\/thumbs\.dreamstime\.com\/t\/([^\"]+)"
)

# Captures the ``href`` of the pagination "next" button.
next_page_link_pattern = re.compile(
    r"class=\"next-button-v2 black\" href=\"([^\"]+)\""
)
def log_debug(message):
    """Print *message* to stdout, tagged with the ``[*]`` debug marker."""
    marker = "[*]"
    print(marker, message)
def log_info(message):
    """Print *message* to stdout, tagged with the ``[+]`` info marker."""
    marker = "[+]"
    print(marker, message)
def read_webpage(url):
    """Fetch *url* and return the raw response body as bytes.

    The URL is logged at debug level before the request is made. Any error
    raised by ``urllib.request.urlopen`` or by reading the response
    propagates to the caller.
    """
    log_debug("getting '%s'" % url)
    response = urllib.request.urlopen(url)
    # The response is a context manager; it is closed once the body is read.
    with response:
        return response.read()
def find_image_pages(url):
    """Collect image URLs from a listing page and every page linked after it.

    Starting at *url*, each page is downloaded, decoded as UTF-8 and scanned
    with ``image_page_link_pattern``. Every captured thumbnail filename is
    rewritten to the same filename under the ``/z/`` path on
    thumbs.dreamstime.com (presumably the larger rendition -- confirm
    against the site). The "next" button matched by
    ``next_page_link_pattern`` is then followed until a page has none.

    Pagination is walked with a loop rather than recursion so that a long
    listing cannot hit the interpreter's recursion limit. Pages already
    visited are never fetched again, so a "next" link that points back to an
    earlier page ends the walk instead of looping forever. The ``href`` is
    resolved against the current page so relative links work too.

    Args:
        url: Address of the first listing page.

    Returns:
        A list of image URL strings, in the order the pages were visited.

    Raises:
        Whatever ``read_webpage`` raises for a failed request, and
        ``UnicodeDecodeError`` if a page is not valid UTF-8.
    """
    from urllib.parse import urljoin

    urls = []
    visited = set()
    current_url = url

    while current_url not in visited:
        visited.add(current_url)

        log_info("searching images on '%s'" % current_url)
        html = read_webpage(current_url).decode("utf-8")

        urls.extend(
            "https://thumbs.dreamstime.com/z/%s" % filename
            for filename in image_page_link_pattern.findall(html)
        )

        next_page_match = next_page_link_pattern.search(html)
        if not next_page_match:
            break

        # Absolute hrefs pass through urljoin unchanged; relative ones are
        # resolved against the page they were found on.
        next_page_url = urljoin(current_url, next_page_match.group(1))
        if next_page_url in visited:
            break

        log_debug("following 'next' button")
        current_url = next_page_url

    return urls
def partition_list(list, partition_count):
    """Split a sequence into at most *partition_count* contiguous chunks.

    Chunks have ``ceil(len(list) / partition_count)`` elements each, except
    possibly the last one, which holds the remainder. When there are fewer
    elements than requested partitions, one single-element chunk is returned
    per element.

    Args:
        list: The sequence to split. It must support ``len()`` and slicing.
            The name shadows the builtin; it is kept so that existing
            keyword callers keep working.
        partition_count: Maximum number of chunks to produce (>= 1).

    Returns:
        A list of slices of the input, in original order. Empty input yields
        an empty list.

    Raises:
        ValueError: If *partition_count* is smaller than 1.
    """
    if partition_count < 1:
        raise ValueError("partition_count must be at least 1")

    # Ceiling division without floats. max() keeps the slice step non-zero
    # for empty input.
    partition_size = max(1, -(-len(list) // partition_count))
    return [list[i:i + partition_size] for i in range(0, len(list), partition_size)]
def download_images(urls, download_folder):
    """Download a batch of images into *download_folder*.

    Each image is saved as ``"<index, zero-padded to 3 digits>-<basename of
    url>"``. The download is best-effort: a failure for one image is logged
    and the loop moves on to the next one.

    The response body is fetched *before* the target file is opened, so a
    failed download does not leave an empty file in the folder.

    Args:
        urls: Iterable of ``(index, url)`` pairs.
        download_folder: Directory the files are written to.

    Returns:
        The number of images that were downloaded and written successfully.
    """
    successful_download_count = 0

    for index, url in urls:
        log_debug("downloading image %d at '%s'" % (index, url))

        filename = "%03d-%s" % (index, os.path.basename(url))
        path = os.path.join(download_folder, filename)

        try:
            image_bytes = read_webpage(url)
            with open(path, "wb") as image_file:
                image_file.write(image_bytes)
        except Exception as error:
            # Deliberately broad: one bad image must not abort the batch.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit through.
            log_debug("failed to download '%s': %s" % (url, error))
            continue

        successful_download_count += 1

    return successful_download_count
def remove_watermark_strip(filenames):
    """Crop a 10% strip off each image and overwrite the file in place.

    For images wider than tall, the bottom 10% of the height is removed.
    For all other images (portrait or square), the right-hand 10% of the
    width is removed.

    The operation is best-effort: a file that cannot be opened, cropped or
    saved (for example a non-image file in the folder) is logged and
    skipped.

    Args:
        filenames: Iterable of paths to image files.

    Returns:
        The number of files that were cropped and saved successfully.
    """
    removed_count = 0
    watermark_strip_size_percentage = 0.1

    for filename in filenames:
        log_debug("removing watermark strip from '%s'" % filename)

        try:
            # The context manager closes the underlying file handle, which
            # Image.open() otherwise leaves open.
            with Image.open(filename) as image:
                width, height = image.size

                if width > height:
                    watermark_strip_size = int(height * watermark_strip_size_percentage)
                    box = (0, 0, width, height - watermark_strip_size)
                else:
                    watermark_strip_size = int(width * watermark_strip_size_percentage)
                    box = (0, 0, width - watermark_strip_size, height)

                cropped = image.crop(box)

            # Saved after the source handle is closed, because the same path
            # is overwritten.
            cropped.save(filename)
        except Exception as error:
            # Deliberately broad: one unreadable file must not abort the
            # batch. KeyboardInterrupt and SystemExit still propagate.
            log_debug("could not crop '%s': %s" % (filename, error))
            continue

        removed_count += 1

    return removed_count
if __name__ == "__main__":
    # Script entry point: discover the image URLs, download them in
    # parallel, then crop the watermark strip off every file in the folder.
    worker_count = 32
    download_folder = "harold"
    start_url = "http://www.dreamstime.com/same-stock-photo-model-image33471644"

    if not os.path.exists(download_folder):
        os.mkdir(download_folder)

    image_urls = find_image_pages(start_url)
    log_debug("found %d image urls" % len(image_urls))

    # Pair each URL with its global index before splitting, so filenames
    # stay unique across workers.
    indexed_urls = list(enumerate(image_urls))
    url_chunks = partition_list(indexed_urls, worker_count)

    with PoolExecutor(worker_count) as executor:
        download_jobs = [
            executor.submit(download_images, chunk, download_folder)
            for chunk in url_chunks
        ]
        image_count = sum(job.result() for job in as_completed(download_jobs))

    log_info("downloaded %d out of %d images" % (image_count, len(image_urls)))

    # Every entry in the folder is processed, not only files written by
    # this run.
    files = [os.path.join(download_folder, entry) for entry in os.listdir(download_folder)]
    file_chunks = partition_list(files, worker_count)

    with PoolExecutor(worker_count) as executor:
        crop_jobs = [
            executor.submit(remove_watermark_strip, chunk)
            for chunk in file_chunks
        ]
        removed_count = sum(job.result() for job in as_completed(crop_jobs))

    log_info("removed watermark strips from %d images" % removed_count)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Pillow==5.3.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment