Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save barbequesauce/4f919e2e83aacac7672bae769b045714 to your computer and use it in GitHub Desktop.

Select an option

Save barbequesauce/4f919e2e83aacac7672bae769b045714 to your computer and use it in GitHub Desktop.
Remove scanner pages from CBZs
#!/usr/bin/env python3
"""
A tool to find and remove scanner pages from cbz files. It's possible to operate on a single file
or recursively over tens of thousands of files.
Two basic use-cases:
Two-pass use-case:
1) Run in identify mode (-i), and generate a cache of images, and an index.
Compares page filenames to others in zip to find outlier names. Also uses
blacklist regex.
1a) Optional aspect ratio test (-a) is slower and will result in more false positives,
but may catch more scanner pages. Looks for an outlier aspect ratio on the final page.
2) Manually use OS's image browser to visually inspect and delete images that are NOT scanner pages (false positives)
3) Run in removal mode (-r), which will only delete pages that are matched in the cache. Uses index
and cache generated in step #1
Fast mode (-f):
Looks for a single page in zip that starts with a series of z's, y's, or x's. Single pass, deletes only one page
when it finds one. Use at own risk! Good for automated removal.
Limitations:
* Will only fully work on UNIX systems.
* Windows globing not done
* Requires external zip tool to remove file
* No warning in removal modes.
TODO:
extra verification and warnings for removal pass and fastmode
add arg to set white list and black list
add args to set index file, cache folder
code clean up
windows globbing
windows zip remove
Notes:
To pull page names from cache:
ls --quoting-style literal scanner_page_cache/ | sed 's/_[a-z0-9]\{64\}\.\(jpg\|png\|gif\|jpeg\)//i'|sort|uniq >>known-page-names.txt
"""
import csv
import hashlib
import os
import re
import subprocess
import sys
import zipfile
from argparse import ArgumentParser
from difflib import get_close_matches
from io import BytesIO

from PIL import Image
# Module-level state and pattern tables.
doQuiet = False  # suppress log() output when True

# Folder where identified scanner-page images are cached for manual review.
PAGE_CACHE_FOLDER = "scanner_page_cache"
# CSV index mapping zip -> page -> cached image, consumed by removal mode.
PAGE_CSV_FILE = "scanner_page_list.csv"

# Instant scanner-page matches.
# Raw strings so regex escapes like \. and \d are not treated as
# (invalid) string escape sequences.
BLACKLIST = [
    r"[zZ]+\.(jpg|JPG)",
    r"[xX]+\.(jpg|JPG)",
    r".*-*[tT]ag\.(jpg|JPG)$",
]
BLACKLIST_REGEX = re.compile("(" + "|".join(BLACKLIST) + ")")

# Instant non-scanner pages, to avoid excessive false positives.
WHITELIST = [
    r".*\(.*([vV]ariant|[cC]over).*\).*",
    r"^(| |_|-)[cC]over",
    r".*[cC]over\..{3}$",
    r".*-Addendum[\d]{0,1}\.(jpg|JPG)",
    r".*-Indicia\.(jpg|JPG)",
    r".*-dustjacket\.(jpg|JPG)",
    r".*-(IFC|[Ii]fc)\.(jpg|JPG|gif)",
    r"acvr[abc]{0,1}\.jpg",
]
WHITELIST_REGEX = re.compile("(" + "|".join(WHITELIST) + ")")
# ".*(\d{2,3}FC.*|\d{2,3}( |-)(ifc|fc|cover).*|\d{2,3}(cover|fc|ifc).*| FCi.*|letters*.)"
#-------------------------------------------
def get_recursive_filelist(pathlist):
    """Return a flat list of every file found under the given paths.

    Directory entries are walked recursively; plain file paths are
    kept as-is.
    """
    collected = []
    for path in pathlist:
        if not os.path.isdir(path):
            collected.append(path)
            continue
        for root, _dirs, files in os.walk(path):
            collected.extend(os.path.join(root, name) for name in files)
    return collected
#-------------------------------------------
def log(output, noNewLine=False):
    """Print *output* unless quiet mode (module global doQuiet) is on.

    When noNewLine is true the trailing newline is suppressed.
    """
    global doQuiet
    if doQuiet:
        return
    terminator = '' if noNewLine else '\n'
    print(output, end=terminator)
#-------------------------------------------
def log_image_info(text, info):
    """Log a one-line summary (hash, size, dims, aspect ratio) of an image.

    *info* is an image-info dict from extract_image_info(); a falsy value
    logs " None".
    """
    outtext = " None"
    if info:
        try:
            ar = str(round(1.0 * info['width'] / info['height'], 2))
        except (ZeroDivisionError, KeyError, TypeError):
            # Height may be 0 (or the sentinel dims unset); keep the
            # placeholder a string, consistent with the success path.
            ar = "0"
        outtext = " [{:20}] {} {:10} {:4}x{:4} ({}) {}".format(
            text,
            info['sha256'],
            info['file_size'],
            info['width'],
            info['height'],
            ar,
            info['basename'])
    log(outtext)
#-------------------------------------------
def calc_checksum(info):
    """Fill info['sha256'] with the hex SHA-256 of the image bytes.

    Best-effort: if the zip entry cannot be read, the placeholder hash
    already in the dict is left untouched.
    """
    try:
        with info['zipfile'].open(info['name']) as img:
            image_data = img.read()
        # SHA-256 of the raw image bytes identifies duplicate pages.
        info['sha256'] = hashlib.sha256(image_data).hexdigest()
    except (KeyError, OSError, RuntimeError, zipfile.BadZipFile):
        # Missing entry, unreadable archive, or encrypted member.
        pass
#-------------------------------------------
def calc_dims(info):
    """Fill info['width'] / info['height'] by decoding the image.

    Skips the work if dimensions were already computed (height != -1);
    decoding is comparatively slow, which is why it is done lazily.
    """
    if info['height'] != -1:
        return
    try:
        with info['zipfile'].open(info['name']) as img:
            image_data = img.read()
        im = Image.open(BytesIO(image_data))
        info['width'], info['height'] = im.size
    except Exception:
        # Best-effort: unreadable or corrupt images keep the -1 sentinel.
        # (Exception, not bare except, so Ctrl-C still works.)
        pass
#-------------------------------------------
def write_image_to_cache(zip, info, csvwriter=None):
    """Copy the image named in *info* out of *zip* into the cache folder.

    The cached filename embeds the SHA-256 so identical pages dedupe.
    If *csvwriter* is given, a (zip, page, cache path) row is recorded
    for the later removal pass.
    """
    if not info:
        return
    calc_checksum(info)
    directory = PAGE_CACHE_FOLDER
    if not os.path.exists(directory):
        os.makedirs(directory)
    try:
        with zip.open(info['name']) as img:
            image_data = img.read()
        _, ext = os.path.splitext(info['basename'])
        fname = os.path.join(directory, info['basename'] + "_" + info['sha256'] + ext)
        if not os.path.exists(fname):
            # Context manager closes the handle even if the write fails
            # (the original leaked it on error).
            with open(fname, "wb") as out:
                out.write(image_data)
        if csvwriter:
            csvwriter.writerow([zip.filename, info['name'], fname])
    except Exception as e:
        # Best-effort: log and continue with the remaining pages.
        log("write_image_to_cache: {} - {}".format(info['basename'], e))
#-------------------------------------------
def remove_page(filename, pagename):
    """Delete *pagename* from the archive *filename* via the external
    `zip` tool (UNIX only; see module limitations).

    The page name is escaped so `zip` itself does not misinterpret it.
    """
    log("removing page named {} from {}".format(pagename, filename))
    # Escape initial '-' with a '\' so zip does not treat it as an option.
    if pagename.startswith("-"):
        pagename = "\\" + pagename
    # zip treats [ and ] as wildcard characters; escape them.
    pagename = pagename.replace("[", "\\[")
    pagename = pagename.replace("]", "\\]")
    # Argument list with shell=False instead of os.system: avoids shell
    # quoting/injection problems with archive names containing quotes,
    # spaces, $, or backticks.  zip receives exactly the same bytes the
    # old double-quoted shell command delivered.
    subprocess.run(["zip", filename, "-d", pagename])
#-------------------------------------------
def remove_identified_pages(args):
    """Second-pass removal: delete every page still present in the cache.

    Reads the CSV index written during identify mode.  Rows whose cached
    image was manually deleted (i.e. judged a false positive) are skipped.
    """
    with open(PAGE_CSV_FILE, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            filename, pagename, cache_image = row[0], row[1], row[2]
            if not os.path.exists(cache_image):
                log("Skipping {} [{}] because it has no match in the cache!".format(pagename, filename))
                continue
            remove_page(filename, pagename)
#-------------------------------------------
def find_blacklisted_names(image_info_list):
    """Return every image whose basename matches a blacklist pattern.

    These are treated as instant scanner-page matches.
    """
    matches = []
    for candidate in image_info_list:
        if not BLACKLIST_REGEX.match(candidate['basename']):
            continue
        matches.append(candidate)
        log(" Blacklist: {} [{}]".format(candidate['basename'], candidate["zipfile"].filename))
    return matches
#-------------------------------------------
def analyze_start_chars(image_info_list):
    """Return the single page whose name starts with repeated x/y/z chars.

    More recent scanners prefix their page with a run of z's, y's, or x's
    so it sorts last.  Longer runs are tried first; a prefix shared by
    more than one page is rejected.  Returns one info dict or None.
    """
    def sole_match(prefix):
        # The one info whose lowercased basename carries this prefix,
        # or None when zero or several pages match.
        found = None
        for candidate in image_info_list:
            if not candidate['basename'].lower().startswith(prefix):
                continue
            if found is not None:
                return None  # prefix is not unique
            found = candidate
        return found

    for letter in ('z', 'y', 'x'):
        for run_length in (5, 4, 3, 2, 1):
            hit = sole_match(letter * run_length)
            if hit:
                return hit
    return None
#-------------------------------------------
def analyze_filename_diffs(image_info_list):
    """Return pages whose names are dissimilar to the other page names.

    Whitelisted names are excluded up front.  Each remaining name is
    compared against all the others with difflib; a name with fewer than
    3 close matches (itself included) is flagged as a likely scanner page.
    Side effect: sets 'similarity_count' on each examined info dict.
    """
    candidates = [info for info in image_info_list
                  if not WHITELIST_REGEX.match(info['basename'])]
    names = [info['basename'] for info in candidates]

    flagged = []
    for info in candidates:
        # The fewer page names that resemble this one, the more likely
        # it is a scanner page.
        info['similarity_count'] = len(
            get_close_matches(info['basename'], names, len(names)))
        if info['similarity_count'] < 3:
            log(" Dissimilar: {} [{}]".format(info['basename'], info["zipfile"].filename))
            flagged.append(info)
    return flagged
#-------------------------------------------
def analyze_image_aspect_ratio(image_info_list):
    """Return the final page if its aspect ratio is unlike every other page.

    A scanner page with an unremarkable name is almost certainly last
    alphabetically, so pages are grouped into buckets keyed by rounded
    aspect ratio; the final page is flagged when it is the sole member of
    its bucket.  Returns the final page's info dict or None.
    """
    if not image_info_list:
        # BUGFIX: an empty page list used to raise IndexError below.
        return None
    ar_buckets = {}
    for info in image_info_list:
        calc_dims(info)
        if info['height'] == -1:
            # Could not decode this image; abandon bucket building early
            # (same best-effort behavior as before).
            break
        aspect_ratio = str(round(1.0 * info['width'] / info['height'], 2))
        ar_buckets.setdefault(aspect_ratio, []).append(info)
    # The list is sorted by name, so [-1] is the alphabetically-last page.
    last_page = image_info_list[-1]
    for bucket in ar_buckets.values():
        if len(bucket) == 1 and last_page in bucket:
            # BUGFIX: previously logged via the stale loop variable `info`,
            # which names the wrong page after an early break above.
            log(" Aspect: {} [{}]".format(last_page['basename'], last_page["zipfile"].filename))
            return last_page
    return None
#-------------------------------------------
"""
def analyze_image_dimensions(image_info_list):
# For most comics, we expect the heights of all pages to be same
# (the width may vary for double page spreads) Often the scanner page
# will have unique dimensions. Compare heights, and look for a page
# with a unique height
# Count the frequency of page heights
height_counts = {}
for info in image_info_list:
calc_dims(info)
if info['height'] == -1:
break
height_value = info['height']
if height_value in height_counts:
height_counts[height_value] += 1
else:
height_counts[height_value] = 1
# see if there is only one height occurance that is unique (count of 1)
unique_height = None
unique_height_ocurrances = 0
for k in height_counts.keys():
if height_counts[k] == 1:
unique_height = k
unique_height_ocurrances += 1
if unique_height_ocurrances > 1:
unique_height = None
break
if unique_height:
# find the matching info item
for info in image_info_list:
if info['height'] == unique_height:
return info
"""
#-------------------------------------------
def extract_image_info(zip):
    """Return a list of info dicts, one per image file inside the zipfile.

    Each dict carries the entry name, basename, owning zipfile object, and
    uncompressed entry size.  Dimensions and checksum are expensive, so
    they start as sentinels (-1 / dummy hash) and are filled lazily by
    calc_dims() / calc_checksum().  The result is sorted by entry name.
    """
    image_exts = [".jpg", ".jpeg", ".png", ".gif", ".webp"]
    image_info_list = []
    for name in zip.namelist():
        # Only consider entries with an image extension.
        ext = os.path.splitext(name)[1]
        if ext.lower() not in image_exts:
            continue
        entry = zip.getinfo(name)
        image_info_list.append({
            'name': name,
            'basename': name.split('/')[-1],
            'zipfile': zip,
            'file_size': entry.file_size,
            # Slow-to-compute fields start as sentinels; filled on demand.
            'height': -1,
            'width': -1,
            'sha256': "x" * 64,
        })
    image_info_list.sort(key=lambda item: item['name'])
    return image_info_list
#-------------------------------------------
def identify_scanner_page(args, filename, csvwriter):
    """Run the selected detection mode(s) against a single comic archive.

    In identify mode, candidate scanner pages are cached to disk and
    indexed via *csvwriter* for the later removal pass.  In fast mode, a
    high-confidence start-char match is removed immediately.
    """
    try:
        zip = zipfile.ZipFile(filename)
    except (zipfile.BadZipFile, OSError):
        # Not a readable zip (or not a file at all); skip silently.
        return
    if args.verbose:
        log("Processing: {}".format(filename))
    image_info_list = extract_image_info(zip)
    guesses = []
    if args.identify:
        # Apply blacklist first: those are instant matches.
        guesses.extend(find_blacklisted_names(image_info_list))
        for guess in analyze_filename_diffs(image_info_list):
            if guess not in guesses:
                guesses.append(guess)
        if args.aspectratio:
            ar_based_guess = analyze_image_aspect_ratio(image_info_list)
            # BUGFIX: previously this appended the stale `guess` loop
            # variable instead of ar_based_guess, and did so even when
            # the aspect-ratio analysis returned None.
            if ar_based_guess and ar_based_guess not in guesses:
                guesses.append(ar_based_guess)
        for info in guesses:
            write_image_to_cache(zip, info, csvwriter)
    elif args.fastmode:
        match = analyze_start_chars(image_info_list)
        # A start_char match is very high confidence; safe to delete.
        if match:
            remove_page(filename, match['name'])
    if args.debug:
        for info in image_info_list:
            calc_checksum(info)
            calc_dims(info)
            log_image_info("debug", info)
#-------------------------------------------
def parseArgs():
    """Parse command-line arguments and validate the run-mode selection.

    Exactly one of --identify / --remove / --fast must be given, and
    (except for remove mode) at least one comic file or a list file.
    parser.error() terminates the program on violation.
    """
    parser = ArgumentParser()
    parser.description = "Identify or remove the scanner page from comic file (CBZ only)"
    parser.add_argument("-l", "--listfile", dest="listfile", default=None,
                        help="read LISTFILE for the list of comic files to operate on",
                        metavar='LISTFILE')
    parser.add_argument('comicfile', nargs='*', metavar='COMICFILE',
                        help="comic file(s) to operate on",)
    parser.add_argument("-f", "--fast", dest="fastmode",
                        action="store_true", default=False,
                        help="Use one-pass fast mode to identify and remove scanner pages using starting char method")
    parser.add_argument("-r", "--remove", dest="remove",
                        action="store_true", default=False,
                        help="Remove scanner pages found during identify mode")
    parser.add_argument("-i", "--identify", dest="identify",
                        action="store_true", default=False,
                        help="Try to identify scanner pages in comics")
    parser.add_argument("-a", "--aspectratio", dest="aspectratio",
                        action="store_true", default=False,
                        help="Use aspect ratio when identifying (slower)")
    parser.add_argument("-R", "--recursive", dest="recursive",
                        action="store_true", default=False,
                        help="Recursively include files in sub-folders.")
    parser.add_argument("-v", "--verbose", dest="verbose",
                        action="store_true", default=False,
                        help="Verbose output")
    parser.add_argument("-d", "--debug", dest="debug",
                        action="store_true", default=False,
                        help="Debug output")
    args = parser.parse_args()

    # Exactly one run mode must be selected.  parser.error() raises
    # SystemExit, so the old unreachable sys.exit(0) calls are dropped.
    mode_count = sum([args.remove, args.identify, args.fastmode])
    if mode_count != 1:
        parser.error("Must specify exactly one run mode: identify, remove, or fast")
    if args.listfile is None and len(args.comicfile) == 0 and not args.remove:
        parser.error("Need to specify comic files on command line, or use -l to point a file with comic file list")
    return args
#-------------------------------------------
def main():
    """Entry point: build the comic file list and dispatch on run mode."""
    args = parseArgs()

    # Determine the list of comic files to operate on.
    if args.listfile is not None:
        if not os.path.exists(args.listfile):
            print("Can't find {}".format(args.listfile))
            sys.exit(-1)
        # `with` closes the handle (the original leaked it).
        with open(args.listfile) as lf:
            filenames = [line.rstrip('\n') for line in lf]
    else:
        filenames = args.comicfile

    if args.remove:
        remove_identified_pages(args)
        return

    if args.recursive:
        filenames = get_recursive_filelist(filenames)

    csvfile = None
    csvwriter = None
    if args.identify:
        # newline='' per the csv module docs so rows are not double-spaced
        # on platforms with \r\n line endings; PAGE_CSV_FILE keeps the
        # writer and the removal-pass reader in agreement.
        csvfile = open(PAGE_CSV_FILE, 'w', newline='')
        csvwriter = csv.writer(csvfile,
                               delimiter=',',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL)
    try:
        # Process each comic file.
        for filename in filenames:
            identify_scanner_page(args, filename, csvwriter)
            if csvfile:
                # Flush per file so an interrupted run keeps partial results.
                csvfile.flush()
    finally:
        if csvfile:
            csvfile.close()
#-------------------------------------------
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment