Remove scanner pages from CBZs
#!/usr/bin/env python3
"""
A tool to find and remove scanner pages from CBZ files. It can operate on a single file
or recursively over tens of thousands of files.

There are two basic use-cases:

Two-pass use-case:
1) Run in identify mode (-i) to generate a cache of images and an index.
   Compares page filenames to the others in the zip to find outlier names. Also uses
   a blacklist regex.
1a) The optional aspect ratio test (-a) is slower and will produce more false positives,
    but may catch more scanner pages. Looks for an outlier aspect ratio on the final page.
2) Manually use the OS's image browser to visually inspect the cache and delete images
   that are NOT scanner pages (false positives).
3) Run in removal mode (-r), which will only delete pages that are matched in the cache.
   Uses the index and cache generated in step 1.

Fast mode (-f):
Looks for a single page in the zip whose name starts with a series of z's, y's, or x's.
Single pass; deletes at most one page when it finds a match. Use at your own risk!
Good for automated removal.

Limitations:
  * Will only fully work on UNIX systems.
  * Windows globbing not done.
  * Requires an external zip tool to remove files.
  * No warning in removal modes.

TODO:
    extra verification and warnings for removal pass and fast mode
    add args to set the whitelist and blacklist
    add args to set the index file and cache folder
    code clean up
    windows globbing
    windows zip remove

Notes:
    To pull page names from the cache:
    ls --quoting-style literal scanner_page_cache/ | sed 's/_[a-z0-9]\{64\}\.\(jpg\|png\|gif\|jpeg\)//i'|sort|uniq >>known-page-names.txt
"""
import os
import sys
import zipfile
import hashlib
import re
from argparse import ArgumentParser
from PIL import Image
from io import BytesIO
from difflib import get_close_matches
import csv

doQuiet = False

PAGE_CACHE_FOLDER = "scanner_page_cache"
PAGE_CSV_FILE = "scanner_page_list.csv"

# instant scanner page matches
BLACKLIST = [
    r"[zZ]+\.(jpg|JPG)",
    r"[xX]+\.(jpg|JPG)",
    r".*-*[tT]ag\.(jpg|JPG)$"
]
BLACKLIST_REGEX = re.compile("(" + "|".join(BLACKLIST) + ")")

# instant non-scanner pages, to avoid excessive false positives
WHITELIST = [
    r".*\(.*([vV]ariant|[cC]over).*\).*",
    r"^(| |_|-)[cC]over",
    r".*[cC]over\..{3}$",
    r".*-Addendum[\d]{0,1}\.(jpg|JPG)",
    r".*-Indicia\.(jpg|JPG)",
    r".*-dustjacket\.(jpg|JPG)",
    r".*-(IFC|[Ii]fc)\.(jpg|JPG|gif)",
    r"acvr[abc]{0,1}\.jpg",
]
WHITELIST_REGEX = re.compile("(" + "|".join(WHITELIST) + ")")
# ".*(\d{2,3}FC.*|\d{2,3}( |-)(ifc|fc|cover).*|\d{2,3}(cover|fc|ifc).*| FCi.*|letters*.)"
#-------------------------------------------
def get_recursive_filelist(pathlist):
    """Get a recursive list of all files under all path items in the list"""
    #filename_encoding = sys.getfilesystemencoding()
    filelist = []
    for p in pathlist:
        if os.path.isdir(p):
            for root, dirs, files in os.walk(p):
                for f in files:
                    filelist.append(os.path.join(root, f))
        else:
            filelist.append(p)
    return filelist

#-------------------------------------------
def log(output, noNewLine=False):
    global doQuiet
    if not doQuiet:
        if noNewLine:
            print(output, end='')
        else:
            print(output)

#-------------------------------------------
def log_image_info(text, info):
    outtext = " None"
    if info:
        try:
            ar = str(round(1.0 * info['width'] / info['height'], 2))
        except Exception:
            ar = 0
        outtext = " [{:20}] {} {:10} {:4}x{:4} ({}) {}".format(
            text,
            info['sha256'],
            info['file_size'],
            info['width'],
            info['height'],
            ar,
            info['basename'])
    log(outtext)

#-------------------------------------------
def calc_checksum(info):
    try:
        with info['zipfile'].open(info['name']) as img:
            image_data = img.read()
            # compute the sha256 checksum
            m = hashlib.sha256()
            m.update(image_data)
            info['sha256'] = m.hexdigest()
    except Exception:
        pass

#-------------------------------------------
def calc_dims(info):
    if info['height'] != -1:
        return
    try:
        with info['zipfile'].open(info['name']) as img:
            image_data = img.read()
            # image dims
            image_file = BytesIO(image_data)
            im = Image.open(image_file)
            info['width'], info['height'] = im.size
    except Exception:
        pass

#-------------------------------------------
def write_image_to_cache(zip, info, csvwriter=None):
    if not info:
        return
    calc_checksum(info)
    directory = PAGE_CACHE_FOLDER
    if not os.path.exists(directory):
        os.makedirs(directory)
    try:
        with zip.open(info['name']) as img:
            image_data = img.read()
            _, ext = os.path.splitext(info['basename'])
            fname = os.path.join(directory, info['basename'] + "_" + info['sha256'] + ext)
            if not os.path.exists(fname):
                with open(fname, "wb") as out:
                    out.write(image_data)
            if csvwriter:
                csvwriter.writerow([zip.filename, info['name'], fname])
    except Exception as e:
        log("write_image_to_cache: {} - {}".format(info['basename'], e))

#-------------------------------------------
def remove_page(filename, pagename):
    log("removing page named {} from {}".format(pagename, filename))
    # Escape an initial '-' with a '\' so zip doesn't parse it as an option
    if pagename.startswith("-"):
        pagename = "\\" + pagename
    pagename = pagename.replace("[", r"\[")
    pagename = pagename.replace("]", r"\]")
    cmd = "zip \"{}\" -d \"{}\"".format(filename, pagename)
    #log(cmd)
    retval = os.system(cmd)
    if retval != 0:
        log("zip returned an error ({}) while removing {}".format(retval, pagename))
#-------------------------------------------
def remove_identified_pages(args):
    with open(PAGE_CSV_FILE, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in csvreader:
            filename = row[0]
            pagename = row[1]
            cache_image = row[2]
            if os.path.exists(cache_image):
                remove_page(filename, pagename)
            else:
                log("Skipping {} [{}] because it has no match in the cache!".format(pagename, filename))

#-------------------------------------------
def find_blacklisted_names(image_info_list):
    """
    Quick regex match against the blacklisted patterns
    """
    match_list = []
    for info in image_info_list:
        if BLACKLIST_REGEX.match(info['basename']):
            match_list.append(info)
            log(" Blacklist: {} [{}]".format(info['basename'], info["zipfile"].filename))
    return match_list

#-------------------------------------------
def analyze_start_chars(image_info_list):
    """
    Find a unique name that starts with repeated x's, y's, or z's.
    Returns at most a single item.
    """
    def find_unique_prefix(image_info_list, prefix):
        # Count how many filenames start with the given prefix
        prefix_count = 0
        matched_info = None
        for info in image_info_list:
            if info['basename'].lower().startswith(prefix):
                prefix_count += 1
                if prefix_count == 1:
                    matched_info = info
                else:
                    # prefix is not unique
                    matched_info = None
                    break
        return matched_info

    # More recent scanners use a number of z's or x's at the start of their page
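    # Try longer runs first (e.g. "zzzzz" before "zz") so a page with a long
    # unique prefix is still found when several pages share a shorter one.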
    for letter in ['z', 'y', 'x']:
        for count in range(5, 0, -1):
            info = find_unique_prefix(image_info_list, letter * count)
            if info:
                return info
    return None

#-------------------------------------------
def analyze_filename_diffs(image_info_list):
    """
    Find probable scanner pages based on how unique a page's name is
    relative to the other page names.
    """
    # don't consider any whitelisted patterns
    analyze_list = []
    for info in image_info_list:
        if not WHITELIST_REGEX.match(info['basename']):
            analyze_list.append(info)

    pagename_list = []
    for info in analyze_list:
        pagename_list.append(info['basename'])

    # For each page name, count how many others are similar. The less similar the name is
    # to the others, the more likely it is to be a scanner page.
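    # Note: get_close_matches() uses difflib's default similarity cutoff of
    # 0.6, and a page's own name always matches itself, so a count below 3
    # means fewer than two other pages have similar names.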
    likely_scanner_pages = []
    for info in analyze_list:
        info['similarity_count'] = len(get_close_matches(info['basename'], pagename_list, len(pagename_list)))
        if info['similarity_count'] < 3:
            log(" Dissimilar: {} [{}]".format(info['basename'], info["zipfile"].filename))
            likely_scanner_pages.append(info)
    return likely_scanner_pages

#-------------------------------------------
def analyze_image_aspect_ratio(image_info_list):
    # The scanner page's name might be unremarkable. If so, it's almost
    # certainly alphabetically last.
    # Make a histogram to group together pages with common aspect ratios.
    # If the final page is an outlier, it might be a scanner page.
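    # Illustrative numbers: typical comic pages run roughly 0.65 wide:tall,
    # so a near-square scanner credit page (ratio around 1.0) lands alone
    # in its own bucket.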
    if not image_info_list:
        return None
    ar_buckets = {}
    for info in image_info_list:
        calc_dims(info)
        if info['height'] == -1:
            # couldn't read the image dimensions; don't trust partial data
            return None
        aspect_ratio = str(round(1.0 * info['width'] / info['height'], 2))
        if aspect_ratio not in ar_buckets:
            ar_buckets[aspect_ratio] = []
        ar_buckets[aspect_ratio].append(info)

    # Now look for a singular aspect ratio held only by the final page
    last_page = image_info_list[-1]
    for key in ar_buckets.keys():
        #log(" {}: {}".format(key, len(ar_buckets[key])))
        if len(ar_buckets[key]) == 1 and last_page in ar_buckets[key]:
            log(" Aspect: {} [{}]".format(last_page['basename'], last_page["zipfile"].filename))
            return last_page
    return None
#-------------------------------------------
"""
def analyze_image_dimensions(image_info_list):
    # For most comics, we expect the heights of all pages to be the same
    # (the width may vary for double page spreads). Often the scanner page
    # will have unique dimensions. Compare heights, and look for a page
    # with a unique height.

    # Count the frequency of page heights
    height_counts = {}
    for info in image_info_list:
        calc_dims(info)
        if info['height'] == -1:
            break
        height_value = info['height']
        if height_value in height_counts:
            height_counts[height_value] += 1
        else:
            height_counts[height_value] = 1

    # see if there is exactly one height that is unique (count of 1)
    unique_height = None
    unique_height_occurrences = 0
    for k in height_counts.keys():
        if height_counts[k] == 1:
            unique_height = k
            unique_height_occurrences += 1
            if unique_height_occurrences > 1:
                unique_height = None
                break

    if unique_height:
        # find the matching info item
        for info in image_info_list:
            if info['height'] == unique_height:
                return info
"""
#-------------------------------------------
def extract_image_info(zip):
    """
    Get a list of dicts with info for each image in a zipfile
    """
    # only look at image files
    image_name_list = []
    image_exts = [".jpg", ".jpeg", ".png", ".gif", ".webp"]
    for f in zip.namelist():
        base, ext = os.path.splitext(f)
        if ext.lower() in image_exts:
            image_name_list.append(f)

    # find info about each image in zip file
    image_info_list = []
    for name in image_name_list:
        image_info = {}
        image_info['name'] = name
        image_info['basename'] = name.split('/')[-1]
        image_info['zipfile'] = zip
        zip_info = zip.getinfo(name)
        image_info['file_size'] = zip_info.file_size
        # the following items are slow, so only get them as needed
        image_info['height'] = -1
        image_info['width'] = -1
        image_info['sha256'] = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
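        # (the placeholder is 64 'x' characters, the width of a real sha256
        # hexdigest, so debug output columns stay aligned)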
        image_info_list.append(image_info)

    # Sort the list by name
    image_info_list = sorted(image_info_list, key=lambda k: k['name'])
    return image_info_list

#-------------------------------------------
def identify_scanner_page(args, filename, csvwriter):
    try:
        zip = zipfile.ZipFile(filename)
    except Exception:
        #log("Unable to read as zipfile: {}".format(filename))
        return

    if args.verbose:
        log("Processing: {}".format(filename))

    image_info_list = extract_image_info(zip)
    guesses = []
    if args.identify:
        # Apply blacklist
        guesses.extend(find_blacklisted_names(image_info_list))
        diff_based_guesses = analyze_filename_diffs(image_info_list)
        for guess in diff_based_guesses:
            if guess not in guesses:
                guesses.append(guess)
        if args.aspectratio:
            ar_based_guess = analyze_image_aspect_ratio(image_info_list)
            if ar_based_guess and ar_based_guess not in guesses:
                guesses.append(ar_based_guess)
        for info in guesses:
            write_image_to_cache(zip, info, csvwriter)
    elif args.fastmode:
        match = analyze_start_chars(image_info_list)
        # a start_char match is very high confidence, so we can safely delete
        if match:
            remove_page(filename, match['name'])

    if args.debug:
        for info in image_info_list:
            calc_checksum(info)
            calc_dims(info)
            log_image_info("debug", info)

#-------------------------------------------
def parseArgs():
    global doQuiet

    parser = ArgumentParser()
    parser.description = "Identify or remove the scanner page from a comic file (CBZ only)"
    parser.add_argument("-l", "--listfile", dest="listfile", default=None,
                        help="read LISTFILE for the list of comic files to operate on",
                        metavar='LISTFILE')
    parser.add_argument('comicfile', nargs='*', metavar='COMICFILE',
                        help="comic file(s) to operate on")
    parser.add_argument("-f", "--fast", dest="fastmode",
                        action="store_true", default=False,
                        help="Use one-pass fast mode to identify and remove scanner pages using the starting-char method")
    parser.add_argument("-r", "--remove", dest="remove",
                        action="store_true", default=False,
                        help="Remove scanner pages found during identify mode")
    parser.add_argument("-i", "--identify", dest="identify",
                        action="store_true", default=False,
                        help="Try to identify scanner pages in comics")
    parser.add_argument("-a", "--aspectratio", dest="aspectratio",
                        action="store_true", default=False,
                        help="Use aspect ratio when identifying (slower)")
    parser.add_argument("-R", "--recursive", dest="recursive",
                        action="store_true", default=False,
                        help="Recursively include files in sub-folders")
    parser.add_argument("-v", "--verbose", dest="verbose",
                        action="store_true", default=False,
                        help="Verbose output")
    parser.add_argument("-d", "--debug", dest="debug",
                        action="store_true", default=False,
                        help="Debug output")
    args = parser.parse_args()

    mode_count = 0
    if args.remove:
        mode_count += 1
    if args.identify:
        mode_count += 1
    if args.fastmode:
        mode_count += 1
    if mode_count != 1:
        # parser.error() exits the program
        parser.error("Must specify exactly one run mode: identify, remove, or fast")
    if args.listfile is None and len(args.comicfile) == 0 and not args.remove:
        parser.error("Need to specify comic files on the command line, or use -l to point to a file with a list of comic files")

    return args

#-------------------------------------------
def main():
    args = parseArgs()

    # determine the list of comic files
    filenames = []
    if args.listfile is not None:
        if os.path.exists(args.listfile):
            with open(args.listfile) as f:
                filenames = [line.rstrip('\n') for line in f]
        else:
            print("Can't find {}".format(args.listfile))
            sys.exit(-1)
    else:
        filenames = args.comicfile
    # remove empty strings from list
    #filenames = filter(None, filenames)

    if args.remove:
        remove_identified_pages(args)
        return

    if args.recursive:
        filenames = get_recursive_filelist(filenames)

    if args.identify:
        csvfile = open(PAGE_CSV_FILE, 'w', newline='')
        csvwriter = csv.writer(csvfile,
                               delimiter=',',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL)
    else:
        csvwriter = None

    # process each comic file
    for filename in filenames:
        identify_scanner_page(args, filename, csvwriter)
        if args.identify:
            csvfile.flush()
    if args.identify:
        csvfile.close()

#-------------------------------------------
if __name__ == '__main__':
    main()