@GRAYgoose124
Last active March 26, 2023 22:06
A script to clean up HTML files saved by the SingleFile browser extension: it scans a download folder for pages whose titles match a blacklist and for duplicate saves of the same page, then deletes them (with a confirmation prompt and a YAML results file for dry runs).
#!/usr/bin/env python
import argparse
import datetime
import re
import os
import yaml
from pathlib import Path
# TODO: Filter out malformed pages
# TODO: Whitelist titles to never remove
# TODO: better blacklist pattern matching to not accidentally delete pages
BLACKLIST = [
    'Youtube',
    'DDoS-Guard',
    'No Title',
    'Sign In',
    'Element _ _',
    'GitHub',
    'Discord',
    'Download',
    'Google Search',
    "'s gists",
    "Gmail",
]
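# A note on how the list above is applied (see the blacklist check in
# parse_html_folder below): each entry is compared case-insensitively as a
# *substring* of the page title, so a short entry like 'Download' can also
# mark titles such as "Download Manager - Settings" for removal -- hence the
# TODO about better pattern matching.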
def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-dlp', '--html_download_path', type=Path, default=os.getcwd(),
                        help='folder of SingleFile .html downloads to scan (default: current directory)')
    parser.add_argument('-bl', '--blacklist', nargs='+', default=BLACKLIST,
                        help='title substrings that mark a page for removal')
    parser.add_argument('-rf', '--results_file', type=Path, default=Path('single_page_results.yml'),
                        help='YAML file where dry-run results are stored and loaded from')
    parser.add_argument('-r', '--RUN', action='store_true', default=False,
                        help='delete files without asking for confirmation')
    parser.add_argument('-urf', '--USE_RESULTS_FILE', action='store_true', default=False,
                        help='load removal candidates from the results file instead of rescanning')
    return parser
def parse_html_folder(path, blacklist=None):
    # glob all html files
    html_files = path.glob('*.html')

    # regex to extract title and time from names like
    #   "Some Page Title (3_26_2023 10_06_12 PM).html"
    # with an optional "(1)"-style suffix that browsers add to duplicate downloads
    title = r"(.*)"
    time = r"(\(\d{1,2}_\d{1,2}_\d{4} \d{1,2}_\d{1,2}_\d{1,2} [AP]M\))"
    optional_duplicate = r"(?: ?(\(\d+\)))?"
    pattern = rf'{title} {time}{optional_duplicate}\.html'
    html_name_regex = re.compile(pattern)

    # clean up files
    duplicates = {}
    to_remove = set()
    seen = set()
    not_yet_duplicated = {}
    for html_file in html_files:
        matched = html_name_regex.search(html_file.name)
        if matched is None:
            continue
        title, time, duplicate = matched.groups()
        if duplicate is not None:
            duplicate = int(duplicate[1:-1])
        else:
            duplicate = 0
        time = datetime.datetime.strptime(time[1:-1], '%m_%d_%Y %I_%M_%S %p')

        # blacklist: case-insensitive substring match against the title
        if blacklist is not None:
            for blacklisted in blacklist:
                if blacklisted.lower() in title.lower():
                    to_remove.add(html_file)
                    break

        # duplicates: group every copy of a title by its saved timestamp
        if title in seen:
            if title not in duplicates:
                duplicates[title] = {}
            if time not in duplicates[title]:
                duplicates[title][time] = []
            if title in not_yet_duplicated:
                # fold the first copy we saw into the duplicate group as well
                t, d, hf = not_yet_duplicated.pop(title)
                if t not in duplicates[title]:
                    duplicates[title][t] = []
                duplicates[title][t].append((d, hf))
            duplicates[title][time].append((duplicate, html_file))
        else:
            not_yet_duplicated[title] = time, duplicate, html_file
        seen.add(title)

    # remove duplicates, etc: keep the earliest saved copy of each title
    flattened = flatten_duplicates(duplicates)
    for title, files in flattened.items():
        files.sort(key=lambda x: x[0])
        to_remove.update([f for _, f in files[1:]])
    return to_remove
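# Rough sketch of what parse_html_folder returns for a folder like this
# (filenames are made up for illustration):
#
#   Interesting Article (3_26_2023 10_06_12 PM).html
#   Interesting Article (3_27_2023 9_00_00 AM).html
#   Sign In (3_26_2023 10_07_00 PM).html
#
# "Sign In ..." matches the blacklist and the 3_27 save of "Interesting
# Article" is a later duplicate, so the returned set holds those two paths;
# the earliest "Interesting Article" save is kept on disk.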
def flatten_duplicates(d):
"""Duplicates are indiced by title then time. This function flattens the
dictionary so that it is indexed by title only.
"""
flattened = {}
for title, times in d.items():
for time, duplicates in times.items():
for _, html_file in duplicates:
if title not in flattened:
flattened[title] = []
flattened[title].append((time, html_file))
return flattened
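# For example (a sketch, not real data), flatten_duplicates turns
#   {'Interesting Article': {t1: [(0, p1)], t2: [(0, p2), (1, p3)]}}
# into
#   {'Interesting Article': [(t1, p1), (t2, p2), (t2, p3)]}
# dropping the per-file duplicate index along the way.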
def init_yaml():
    # yaml constructor so dumped pathlib.Path objects can be loaded back
    def path_constructor(loader, node):
        return Path(*loader.construct_sequence(node))

    yaml.add_constructor('tag:yaml.org,2002:python/object/apply:pathlib.WindowsPath', path_constructor)
    yaml.add_constructor('tag:yaml.org,2002:python/object/apply:pathlib.PosixPath', path_constructor)
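# Why this is needed: yaml.dump writes pathlib paths with
# python/object/apply:pathlib.WindowsPath / PosixPath tags, and yaml.full_load
# will not rebuild them unless constructors for those tags are registered, as
# done above. Mapping both tags to the local Path class also keeps a results
# file usable if it was written on a different OS.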
def main():
    init_yaml()
    parser = argparser()
    args = parser.parse_args()

    if args.results_file.exists() and args.USE_RESULTS_FILE:
        print(f'Loading results from {args.results_file} \U0001F98C')
        with open(args.results_file, 'r') as f:
            to_remove = yaml.full_load(f)
        LOADED_FROM_FILE = True
    else:
        to_remove = parse_html_folder(args.html_download_path, blacklist=args.blacklist)
        LOADED_FROM_FILE = False

    count = len(to_remove)
    if count == 0:
        print('No files to remove. Exiting... \U0001F98B')
        return
    else:
        print(f'Found {count} files to remove. \U0001F98E')

    if not args.RUN:
        run = input('Run? [y/N] ')
    if args.RUN or run.lower() == 'y':
        print('Removing files now...')
        # remove files
        for html_file in to_remove:
            html_file.unlink()
        if args.results_file.exists():
            args.results_file.unlink()
        print('Done!\tGoodbye.\t\t\t\U0001F98B')
    else:
        # dump results to file
        if not LOADED_FROM_FILE:
            with open(args.results_file, 'w') as f:
                yaml.dump(to_remove, f)
            print(f'Wrote results to {args.results_file} \U0001F98B')
        print('Dry run complete!\tGoodbye.')
if __name__ == '__main__':
    main()
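# Typical workflow, sketched from the flags defined in argparser() above
# ("cleanup.py" is a placeholder name for however this gist is saved):
#
#   python cleanup.py            # scan the current folder, report matches, and
#                                # ask "Run? [y/N]"; declining writes the
#                                # candidates to single_page_results.yml
#   python cleanup.py -urf -r    # reload that results file and delete the
#                                # listed pages without prompting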