Last active
September 8, 2020 07:28
-
-
Save kylemcdonald/86d4cc68d52c56d0df20ddf502bcf98a to your computer and use it in GitHub Desktop.
Search for dates and times across all files in a folder.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Search for dates and times in a folder.
$ python3 search-dates.py facebook-kcimc/
$ python3 search-dates.py Takeout/
""" | |
import os | |
import sys | |
import re | |
import shutil | |
from pathlib import Path | |
from itertools import islice | |
# Folder to search: the first command-line argument. When no argument is
# given, fall back to the current directory instead of failing with IndexError.
root = sys.argv[1] if len(sys.argv) > 1 else '.'
ignore_keywords = ['messages']  # keywords to ignore in directories and filenames
valid_extensions = ['.txt', '.html', '.json', '.csv']  # extensions to search
min_results = 5  # files with fewer matches than this are not reported
max_results = 1000  # how many matches per file before we stop counting
printed_results = 10  # how many example matches to print out
# Terminal width in columns; (80, 20) is the fallback size when the real
# terminal size cannot be determined.
terminal_width = shutil.get_terminal_size((80, 20)).columns
# Characters of context kept on each side of a match: half a terminal row.
padding = terminal_width // 2
# One regex alternation that finds:
# - "15" followed by 8-11 more digits (10-13 digits in total): epoch
#   timestamps in seconds (10 digits) or milliseconds (13 digits) that start
#   with 15, i.e. roughly July 2017 through mid-September 2020
# - the years 2018, 2019 and 2020 as standalone numbers
# - a time of day such as 7:28:00 or 07:28:00
# Adjacent string literals inside the parentheses are joined by the parser,
# so no "+" or backslash continuations are needed.
pattern = (
    r'\b15\d{8,11}\b|'
    r'\b20(18|19|20)\b|'
    r'\d\d?:\d\d:\d\d'
)
def yellow(e):
    """Return ``e`` wrapped in the ANSI escape codes for bright yellow text.

    ``\\033[93m`` switches the terminal to bright yellow and ``\\033[00m``
    resets the attributes afterwards.
    """
    return f'\033[93m{e}\033[00m'
def cyan(e):
    """Return ``e`` wrapped in the ANSI escape codes for bright cyan text.

    ``\\033[96m`` switches the terminal to bright cyan and ``\\033[00m``
    resets the attributes afterwards.
    """
    return f'\033[96m{e}\033[00m'
def highlight_matches(e, color, regex=None):
    """Return ``e`` with every regex match passed through ``color``.

    Args:
        e: The text to decorate.
        color: Callable that takes the matched substring and returns the
            string to put in its place (for example ``cyan``).
        regex: Regular expression to search for. When omitted, the
            module-level ``pattern`` is used.

    Returns:
        A copy of ``e`` in which each non-overlapping match has been replaced
        by ``color(matched_text)``; text between matches is left untouched.
    """
    if regex is None:
        regex = pattern
    # With a callable replacement, re.sub inserts the returned string as-is
    # (no backslash/group expansion), which is exactly the manual
    # "text before + colored match + text after" splice.
    return re.sub(regex, lambda match: color(match.group(0)), e)
for path in Path(root).rglob('*'): | |
if not path.is_file(): | |
continue | |
if any([e in str(path) for e in ignore_keywords]): | |
continue | |
ext = os.path.splitext(path)[1] | |
if ext not in valid_extensions: | |
continue | |
with open(path) as f: | |
raw = f.read() | |
results = [] | |
match_iter = re.finditer(pattern, raw) | |
for match in islice(match_iter, max_results): | |
start = max(0, match.start() - padding) | |
end = min(len(raw) - 1, match.end() + padding) | |
text = raw[start:end].replace('\n', ' ').replace('\t', ' ') | |
text = text[:terminal_width] | |
results.append(highlight_matches(text, cyan)) | |
if len(results) < min_results: | |
continue | |
results_str = str(len(results)) | |
if len(results) == max_results: | |
results_str += '+' | |
print(yellow(f'({results_str}) {path}')) | |
print('\n'.join(results[:printed_results])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment