@GRAYgoose124
Last active March 26, 2023 22:06
A script to clean up HTML files saved by the SingleFile browser extension: it scans a download folder for pages whose titles match a blacklist and for duplicate saves of the same page, then deletes them (with a confirmation prompt and a YAML results file for dry runs).
#!/usr/bin/env python
import argparse
import datetime
import re
import os
import yaml
from pathlib import Path
# TODO: Filter out malformed pages
# TODO: Whitelist titles to never remove
# TODO: better blacklist pattern matching to not accidentally delete pages
BLACKLIST = [
    'Youtube',
    'DDoS-Guard',
    'No Title',
    'Sign In',
    'Element _ _',
    'GitHub',
    'Discord',
    'Download',
    'Google Search',
    "'s gists",
    "Gmail",
]
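# A note on how the list above is applied (see the blacklist check in
# parse_html_folder below): each entry is compared case-insensitively as a
# *substring* of the page title, so a short entry like 'Download' can also
# mark titles such as "Download Manager - Settings" for removal -- hence the
# TODO about better pattern matching.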
def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-dlp', '--html_download_path', type=Path, default=os.getcwd(),
                        help='folder of SingleFile .html downloads to scan (default: current directory)')
    parser.add_argument('-bl', '--blacklist', nargs='+', default=BLACKLIST,
                        help='title substrings that mark a page for removal')
    parser.add_argument('-rf', '--results_file', type=Path, default=Path('single_page_results.yml'),
                        help='YAML file where dry-run results are stored and loaded from')
    parser.add_argument('-r', '--RUN', action='store_true', default=False,
                        help='delete files without asking for confirmation')
    parser.add_argument('-urf', '--USE_RESULTS_FILE', action='store_true', default=False,
                        help='load removal candidates from the results file instead of rescanning')
    return parser
def parse_html_folder(path, blacklist=None):
    # glob all html files
    html_files = path.glob('*.html')

    # regex to extract title and time from names like
    #   "Some Page Title (3_26_2023 10_06_12 PM).html"
    # with an optional "(1)"-style suffix that browsers add to duplicate downloads
    title = r"(.*)"
    time = r"(\(\d{1,2}_\d{1,2}_\d{4} \d{1,2}_\d{1,2}_\d{1,2} [AP]M\))"
    optional_duplicate = r"(?: ?(\(\d+\)))?"
    pattern = rf'{title} {time}{optional_duplicate}\.html'
    html_name_regex = re.compile(pattern)

    # clean up files
    duplicates = {}
    to_remove = set()
    seen = set()
    not_yet_duplicated = {}
    for html_file in html_files:
        matched = html_name_regex.search(html_file.name)
        if matched is None:
            continue
        title, time, duplicate = matched.groups()
        if duplicate is not None:
            duplicate = int(duplicate[1:-1])
        else:
            duplicate = 0
        time = datetime.datetime.strptime(time[1:-1], '%m_%d_%Y %I_%M_%S %p')

        # blacklist: case-insensitive substring match against the title
        if blacklist is not None:
            for blacklisted in blacklist:
                if blacklisted.lower() in title.lower():
                    to_remove.add(html_file)
                    break

        # duplicates: group every copy of a title by its saved timestamp
        if title in seen:
            if title not in duplicates:
                duplicates[title] = {}
            if time not in duplicates[title]:
                duplicates[title][time] = []
            if title in not_yet_duplicated:
                # fold the first copy we saw into the duplicate group as well
                t, d, hf = not_yet_duplicated.pop(title)
                if t not in duplicates[title]:
                    duplicates[title][t] = []
                duplicates[title][t].append((d, hf))
            duplicates[title][time].append((duplicate, html_file))
        else:
            not_yet_duplicated[title] = time, duplicate, html_file
        seen.add(title)

    # remove duplicates, etc: keep the earliest saved copy of each title
    flattened = flatten_duplicates(duplicates)
    for title, files in flattened.items():
        files.sort(key=lambda x: x[0])
        to_remove.update([f for _, f in files[1:]])
    return to_remove
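# Rough sketch of what parse_html_folder returns for a folder like this
# (filenames are made up for illustration):
#
#   Interesting Article (3_26_2023 10_06_12 PM).html
#   Interesting Article (3_27_2023 9_00_00 AM).html
#   Sign In (3_26_2023 10_07_00 PM).html
#
# "Sign In ..." matches the blacklist and the 3_27 save of "Interesting
# Article" is a later duplicate, so the returned set holds those two paths;
# the earliest "Interesting Article" save is kept on disk.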
def flatten_duplicates(d):
"""Duplicates are indiced by title then time. This function flattens the
dictionary so that it is indexed by title only.
"""
flattened = {}
for title, times in d.items():
for time, duplicates in times.items():
for _, html_file in duplicates:
if title not in flattened:
flattened[title] = []
flattened[title].append((time, html_file))
return flattened
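# For example (a sketch, not real data), flatten_duplicates turns
#   {'Interesting Article': {t1: [(0, p1)], t2: [(0, p2), (1, p3)]}}
# into
#   {'Interesting Article': [(t1, p1), (t2, p2), (t2, p3)]}
# dropping the per-file duplicate index along the way.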
def init_yaml():
    # yaml constructor so dumped pathlib.Path objects can be loaded back
    def path_constructor(loader, node):
        return Path(*loader.construct_sequence(node))

    yaml.add_constructor('tag:yaml.org,2002:python/object/apply:pathlib.WindowsPath', path_constructor)
    yaml.add_constructor('tag:yaml.org,2002:python/object/apply:pathlib.PosixPath', path_constructor)
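# Why this is needed: yaml.dump writes pathlib paths with
# python/object/apply:pathlib.WindowsPath / PosixPath tags, and yaml.full_load
# will not rebuild them unless constructors for those tags are registered, as
# done above. Mapping both tags to the local Path class also keeps a results
# file usable if it was written on a different OS.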
def main():
    init_yaml()
    parser = argparser()
    args = parser.parse_args()

    if args.results_file.exists() and args.USE_RESULTS_FILE:
        print(f'Loading results from {args.results_file} \U0001F98C')
        with open(args.results_file, 'r') as f:
            to_remove = yaml.full_load(f)
        LOADED_FROM_FILE = True
    else:
        to_remove = parse_html_folder(args.html_download_path, blacklist=args.blacklist)
        LOADED_FROM_FILE = False

    count = len(to_remove)
    if count == 0:
        print('No files to remove. Exiting... \U0001F98B')
        return
    else:
        print(f'Found {count} files to remove. \U0001F98E')

    if not args.RUN:
        run = input('Run? [y/N] ')
    if args.RUN or run.lower() == 'y':
        print('Removing files now...')
        # remove files
        for html_file in to_remove:
            html_file.unlink()
        if args.results_file.exists():
            args.results_file.unlink()
        print('Done!\tGoodbye.\t\t\t\U0001F98B')
    else:
        # dump results to file
        if not LOADED_FROM_FILE:
            with open(args.results_file, 'w') as f:
                yaml.dump(to_remove, f)
            print(f'Wrote results to {args.results_file} \U0001F98B')
        print('Dry run complete!\tGoodbye.')
if __name__ == '__main__':
    main()
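# Typical workflow, sketched from the flags defined in argparser() above
# ("cleanup.py" is a placeholder name for however this gist is saved):
#
#   python cleanup.py            # scan the current folder, report matches, and
#                                # ask "Run? [y/N]"; declining writes the
#                                # candidates to single_page_results.yml
#   python cleanup.py -urf -r    # reload that results file and delete the
#                                # listed pages without prompting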