Last active
August 5, 2020 09:20
-
-
Save mszegedy/f5100d7c67cad4362e6c4be44a85621a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Crops a series of screenshots, splits them based on vertical blackspace (i.e.
# completely black scanlines), then aligns and merges them by y-concatenation.
# If a piece can't be y-catted to the previous one, a new file is created for
# it. This is useful for ripping ebooks that have been pirated from temporary
# views via screenshots, such as those lent out by archive.org.
import os | |
from os.path import join | |
import sys | |
import numpy as np | |
from PIL import Image | |
# Directory whose entries are listed, sorted and read as input screenshots.
SOURCE_PATH = 'source/'
# Directory the finished images are saved into.
OUT_PATH = 'images/'
# Box handed to Image.crop() for every screenshot: (left, upper, right, lower).
CROPBOX = (538, 186, 1369, 1046)
# Finished pieces shorter than this many rows are not saved.
MIN_HEIGHT = 830
# File extension (lower-cased on use) appended to each saved image's name.
EXTENSION = 'jpg'
def is_black(im):
    """Report whether every pixel of `im` has the colour (0, 0, 0).

    The colour table returned by `im.getcolors()` is searched for the single
    entry "all pixels, colour (0, 0, 0)". Only the three-component colour
    tuple is matched, so the check is written for three-band (RGB-style)
    images.
    """
    width, height = im.size
    pixel_count = width * height
    # Asking for up to `pixel_count` colours means the table is never
    # truncated: an image cannot hold more distinct colours than pixels.
    colour_table = im.getcolors(pixel_count)
    uniformly_black = (pixel_count, (0, 0, 0))
    return uniformly_black in colour_table
def is_same(im_1, im_2):
    """Return True when both images hold exactly the same pixel data.

    Both arguments are converted with `np.asarray`, so PIL images, arrays and
    nested sequences are all accepted.

    `np.array_equal` is used instead of `(a == b).all()` because the latter
    broadcasts: inputs of different but broadcast-compatible shapes could
    compare equal, and incompatible shapes fail instead of yielding False.
    Here differently shaped inputs are simply "not the same".
    """
    return np.array_equal(np.asarray(im_1), np.asarray(im_2))
def scanline(im, i):
    """Return row `i` of `im` as a full-width, one-pixel-tall crop."""
    full_width = im.size[0]
    left, upper = 0, i
    right, lower = full_width, i + 1
    return im.crop((left, upper, right, lower))
def scanlines(im):
    """Lazily yield the rows of `im`, top to bottom, as one-row crops."""
    row_indices = range(im.size[1])
    return (scanline(im, row) for row in row_indices)
def vcat(im_1, im_2):
    """Return a new image with `im_2` stacked directly underneath `im_1`."""
    upper_pixels = np.asarray(im_1)
    lower_pixels = np.asarray(im_2)
    stacked = np.vstack((upper_pixels, lower_pixels))
    return Image.fromarray(stacked)
def align_and_merge(im_1, im_2):
    """Stack `im_2` under `im_1`, dropping the rows of `im_2` that repeat
    the tail of `im_1`.

    The top scanline of `im_2` is looked up in `im_1`. The bottom-most row of
    `im_1` that equals it is taken as the place where `im_2` starts inside
    `im_1`; everything in `im_1` from that row down is treated as overlap and
    the same number of rows is cut from the top of `im_2` before stacking.
    Only that single scanline is compared; the rest of the overlap is not
    verified.

    Args:
        im_1: the upper image.
        im_2: the lower image; must be as wide as `im_1`.

    Returns:
        A new image containing `im_1` followed by the non-overlapping rows of
        `im_2`.

    Raises:
        ValueError: if no row of `im_1` equals the first row of `im_2`.
    """
    # Kept as an assertion on purpose: the caller in this file treats
    # ValueError as "no overlap", so raising ValueError here would silently
    # turn a width mismatch into a page break.
    assert im_1.size[0] == im_2.size[0]
    height_1 = im_1.size[1]
    first_line = scanline(im_2, 0)
    offset = None
    # Search upwards and stop at the first hit. This yields the same offset
    # as scanning every row and keeping the last match, without visiting the
    # rows above it.
    for y in reversed(range(height_1)):
        if is_same(scanline(im_1, y), first_line):
            offset = y
            break
    if offset is None:
        raise ValueError('Images don\'t overlap.')
    overlap = height_1 - offset
    # NOTE(review): if `overlap` exceeds the height of im_2 the crop box is
    # inverted; what crop() does then depends on the Pillow version. Confirm
    # whether that case can occur with real input.
    return vcat(im_1, im_2.crop((0, overlap, *im_2.size)))
def parts_from_path(path, source_im=None):
    """Split one screenshot into its bands of consecutive non-black rows.

    The image at `path` is cropped to CROPBOX and walked row by row. Every
    maximal run of rows for which `is_black` is false becomes one part.

    Args:
        path: file name of the screenshot to read.
        source_im: optional image carried over from the previous screenshot.
            When given, it is merged on top of the first part with
            `align_and_merge`; if that raises ValueError it is instead placed
            in front of the parts as a separate image.

    Returns:
        A list of images, top to bottom. When the screenshot contains only
        black rows the list is `[source_im]` if a carry-over was given and
        empty otherwise.
    """
    # Read the pixels while the file is open so the handle is released here
    # rather than whenever the image object happens to be collected.
    with Image.open(path) as screenshot:
        # is_black() only recognises three-component colour tuples, so other
        # modes are normalised to RGB. For RGB input this is a plain copy.
        im = screenshot.crop(CROPBOX).convert('RGB')
    width, height = im.size
    parts = []
    run_start = None  # first row of the band being scanned, None in a gap
    for i in range(height):
        if is_black(scanline(im, i)):
            if run_start is not None:
                # A black row closes the current band: cut it out in one go
                # instead of growing it row by row, which re-copies the
                # whole band for every added row.
                parts.append(im.crop((0, run_start, width, i)))
                run_start = None
        elif run_start is None:
            run_start = i
    if run_start is not None:
        # The last band runs to the bottom edge of the crop.
        parts.append(im.crop((0, run_start, width, height)))
    if source_im is not None:
        if not parts:
            # Nothing to merge with; keep carrying the previous piece forward
            # rather than failing on parts[0].
            return [source_im]
        try:
            parts[0] = align_and_merge(source_im, parts[0])
        except ValueError:
            parts.insert(0, source_im)
    return parts
def output():
    """Save every finished part and carry the last one over.

    Works on the module-level variables `parts`, `name_i` and `last_part`:
    all entries of `parts` except the final one are saved to OUT_PATH as
    "<number>.<extension>", numbered upwards from `name_i`, provided they are
    at least MIN_HEIGHT rows tall. Parts that are too short are not saved
    but still use up their number. The final entry becomes `last_part` and
    `name_i` advances by the number of entries handled.

    Does nothing when `parts` is empty, leaving `last_part` and `name_i`
    untouched.
    """
    # `parts` is only read, so it needs no `global` declaration.
    global last_part
    global name_i
    if not parts:
        return
    *finished, carry = parts
    for i, im in enumerate(finished):
        if im.size[1] >= MIN_HEIGHT:
            file_name = '{}.{}'.format(name_i + i, EXTENSION.lower())
            im.save(join(OUT_PATH, file_name))
    last_part = carry
    name_i += len(finished)
if __name__ == '__main__':
    # Saving into a directory that does not exist would fail, so create it.
    os.makedirs(OUT_PATH, exist_ok=True)
    paths = os.listdir(SOURCE_PATH)
    # Screenshots are processed in lexicographic file-name order.
    paths.sort()
    last_part = None  # piece still waiting to be merged with the next shot
    name_i = 0        # number of the next output file
    for path in paths:
        path = join(SOURCE_PATH, path)
        parts = parts_from_path(path, last_part)
        if not parts:
            # Only black rows and nothing carried over: nothing to do.
            continue
        # Saves all parts but the last and updates last_part / name_i.
        output()
    # The piece carried out of the final screenshot has no successor to be
    # merged with; write it out here so the last page is not lost.
    if last_part is not None and last_part.size[1] >= MIN_HEIGHT:
        last_part.save(join(OUT_PATH,
                            '.'.join((str(name_i), EXTENSION.lower()))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment