Created
September 1, 2013 12:57
-
-
Save jeffThompson/6404296 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python | |
import errno
import os
import subprocess

from PIL import Image
from PIL import ImageStat
'''
SORT PDFs BY BRIGHTNESS
Jeff Thompson | 2013 | www.jeffreythompson.org

Experiment in auto-curating for a residency with Internet Archive: all PDFs with mention
of the word "brightness", sorted by brightness.

BRIGHTNESS MEASURE:
via: http://stackoverflow.com/a/596243/1167783
+ standard, objective: (0.2126*R) + (0.7152*G) + (0.0722*B)
+ perceived, option 1: (0.299*R + 0.587*G + 0.114*B)
+ perceived, option 2, slower to calculate: sqrt(0.241*R^2 + 0.691*G^2 + 0.068*B^2)

IMAGE EXPORT SETTINGS:
+ density - sets resolution; we downsample to save space
+ background - prevents images from having transparent backgrounds (default)

REQUIRES:
+ ImageMagick
+ Ghostscript
'''
# When True, each input PDF is first split into per-page PNG files with
# ImageMagick's `convert` before brightness is measured. When False, only
# the PNGs already present under output_dir are measured.
split_pdfs = False

# Folder containing one sub-folder per item; each sub-folder holds that
# item's PDFs. The trailing slash is required: paths below are built by
# plain string concatenation.
input_dir = "InputPDFs/"

# Folder receiving one sub-folder of page PNGs per item (trailing slash
# required for the same reason as input_dir).
output_dir = "SplitPNGs/"

# CSV report, opened in append mode; one "id,brightness,url" row per item.
csv_file = "brightness.csv"
def create_dir(path):
    """Create the directory ``path`` (including missing parents) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, or if ``path`` already
            exists but is not a directory.
    """
    # Try first and inspect the failure instead of checking os.path.exists()
    # beforehand: the check-then-create form can race with another process,
    # and it also treats a regular file sitting at ``path`` as success.
    try:
        os.makedirs(path)
    except OSError as err:
        if err.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def get_first_pdf(id):
    """Return the name of the first color PDF inside ``input_dir + id``.

    A file counts as a color PDF when its name ends with ``.pdf`` but not
    with ``_bw.pdf``. Candidates are examined in sorted name order, because
    ``os.listdir`` returns entries in arbitrary order and "first" would
    otherwise vary between runs and machines.

    Args:
        id: Name of the item's sub-folder inside ``input_dir``. (The
            parameter name shadows the ``id`` builtin; it is kept unchanged
            for compatibility with existing callers.)

    Returns:
        The bare file name (no directory part) of the selected PDF, or
        ``None`` when the folder contains no matching file.
    """
    for file_name in sorted(os.listdir(input_dir + id)):
        if file_name.endswith(".pdf") and not file_name.endswith("_bw.pdf"):
            return file_name
    # no color PDF in this folder; callers must handle None
    return None
def get_image_brightness(path):
    """Return the average brightness of the image stored at ``path``.

    Brightness is the luminance-weighted sum
    ``0.2126*R + 0.7152*G + 0.0722*B`` averaged over all pixels, so the
    result lies between 0.0 (black) and 255.0 (white). The path is echoed
    to stdout as a progress indicator.

    Args:
        path: Path of an image file readable by PIL.

    Returns:
        The mean brightness as a float; 0.0 for an image without pixels.
    """
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print(" " + path)

    img = Image.open(path)
    img = img.convert("RGB")    # guarantees exactly three bands: R, G, B
    width, height = img.size
    if width == 0 or height == 0:
        return 0.0

    # The weighting is linear, so the mean of the weighted pixel values
    # equals the weighted sum of the per-band means. ImageStat computes the
    # band means in native code, avoiding a Python-level getpixel() call
    # for every single pixel.
    mean_r, mean_g, mean_b = ImageStat.Stat(img).mean
    return (0.2126 * mean_r) + (0.7152 * mean_g) + (0.0722 * mean_b)
# clear terminal window first :)
os.system('cls' if os.name == 'nt' else 'clear')

# write header to csv file, but only when the file is new or still empty:
# it is opened in append mode, so writing the header on every run would glue
# a second header onto the end of the last data row of an earlier run
if not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0:
    with open(csv_file, 'a') as csv_out:
        csv_out.write('id,brightness,url')

# iterate pdfs and split into png files
if split_pdfs:
    for item_id in os.listdir(input_dir):
        # skip hidden entries and stray files; only item folders hold PDFs
        if item_id.startswith('.') or not os.path.isdir(input_dir + item_id):
            continue

        pdf = get_first_pdf(item_id)
        if pdf is None:
            # nothing to convert; previously this crashed with a TypeError
            print(item_id + ": no color PDF found, skipping")
            continue

        create_dir(output_dir + item_id)
        print(item_id + "/" + pdf)
        print(" splitting into PNG files...\n")

        # input_dir already ends with a slash, so no extra separator is
        # inserted between it and the item folder
        status = subprocess.call([
            "convert",
            "-density", "72",
            "-background", "white",
            input_dir + item_id + "/" + pdf,
            output_dir + item_id + "/%04d.png",
        ])
        if status != 0:
            # best effort: report the failure and carry on with other items
            print(" convert exited with status " + str(status))

# iterate all pngs, calculate average brightness
for item_id in os.listdir(output_dir):
    # skip hidden entries and anything that is not an item folder
    if item_id.startswith('.') or not os.path.isdir(output_dir + item_id):
        continue

    # list all pngs in the folder, extract overall brightness for pdf
    print(item_id)
    page_values = [
        get_image_brightness(output_dir + item_id + "/" + png)
        for png in os.listdir(output_dir + item_id)
        if png.endswith('.png')
    ]
    if not page_values:
        # no pages were exported for this item; nothing to report
        continue
    brightness = sum(page_values) / len(page_values)

    # each row starts with a newline because the header (and every earlier
    # row) is written without a trailing one
    url = '<a href="http://www.archive.org/details/' + item_id + '">link</a>'
    with open(csv_file, 'a') as csv_out:
        csv_out.write('\n' + item_id + ',' + str(brightness) + ',' + url)
    print("brightness: " + str(brightness) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment