Skip to content

Instantly share code, notes, and snippets.

@kylemcdonald
Last active September 8, 2020 07:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kylemcdonald/86d4cc68d52c56d0df20ddf502bcf98a to your computer and use it in GitHub Desktop.
Save kylemcdonald/86d4cc68d52c56d0df20ddf502bcf98a to your computer and use it in GitHub Desktop.
Search for dates and times across all files in a folder.
"""
Search for dates and times in a folder.
$ python3 search-dates.py facebook-kcimc/
$ python3 search-dates.py Takeout/
"""
import os
import sys
import re
import shutil
from pathlib import Path
from itertools import islice
# Folder to search. Falls back to the current directory when no argument is
# supplied instead of dying with a bare IndexError.
root = sys.argv[1] if len(sys.argv) > 1 else '.'
ignore_keywords = ['messages']  # keywords to ignore in directories and filenames
valid_extensions = ['.txt', '.html', '.json', '.csv']  # extensions to search
min_results = 5  # how many matches per file before printing out examples
max_results = 1000  # how many matches per file before we stop counting
printed_results = 10  # how many example matches to print out
# Width used to size the context shown around each match; (80, 20) is the
# fallback when the terminal size cannot be determined.
terminal_width = shutil.get_terminal_size((80, 20)).columns
padding = terminal_width // 2  # characters of context kept on each side of a match
# find:
# - timestamps in seconds or milliseconds
# - years 2018-2020
# - time of day
pattern = '|'.join([
    # Unix timestamps beginning with "15" (roughly July 2017 through
    # September 2020): exactly 10 digits for seconds or 13 digits for
    # milliseconds. 11- and 12-digit numbers are deliberately excluded; they
    # are not timestamps and commonly turn out to be phone numbers.
    r'\b15(?:\d{8}|\d{11})\b',
    # Four-digit years 2018, 2019 or 2020 standing alone.
    r'\b20(18|19|20)\b',
    # Time of day such as 9:05:33 or 23:59:59.
    r'\d\d?:\d\d:\d\d',
])
def yellow(e):
    """Return ``e`` rendered as text wrapped in ANSI bright-yellow escapes.

    ``e`` may be any object; it is converted with f-string formatting. The
    result starts with the SGR 93 (bright yellow foreground) sequence and
    ends with the SGR reset sequence so following output is unaffected.
    """
    return f'\033[93m{e}\033[00m'
def cyan(e):
    """Return ``e`` rendered as text wrapped in ANSI bright-cyan escapes.

    ``e`` may be any object; it is converted with f-string formatting. The
    result starts with the SGR 96 (bright cyan foreground) sequence and ends
    with the SGR reset sequence so following output is unaffected.
    """
    return f'\033[96m{e}\033[00m'
def highlight_matches(e, color, regex=None):
    """Return ``e`` with every regex match passed through ``color``.

    Args:
        e: Text to scan.
        color: Callable that receives each matched substring and returns the
            text to put in its place (for example an ANSI colouring helper).
        regex: Pattern (string or compiled) to search for. When omitted, the
            module-level ``pattern`` is used.

    Returns:
        A new string in which each non-overlapping match has been replaced by
        ``color(match_text)``; text outside the matches is left unchanged.
    """
    if regex is None:
        regex = pattern
    # With a callable replacement, re.sub inserts the returned string
    # verbatim (no backslash/group-reference processing), which is exactly
    # the "splice coloured match between untouched pieces" behaviour needed.
    return re.sub(regex, lambda match: color(match.group(0)), e)
# Walk every file under `root` and, for each file with at least `min_results`
# date/time matches, print a header line followed by a few example snippets.
for path in Path(root).rglob('*'):
    if not path.is_file():
        continue
    # Skip anything whose path contains an ignored keyword.
    if any(e in str(path) for e in ignore_keywords):
        continue
    # Compare extensions case-insensitively so e.g. ".JSON" is searched too.
    if path.suffix.lower() not in valid_extensions:
        continue
    try:
        # Decode explicitly as UTF-8 and substitute undecodable bytes so one
        # oddly-encoded file cannot abort the whole scan.
        with open(path, encoding='utf-8', errors='replace') as f:
            raw = f.read()
    except OSError as err:
        # Unreadable file (permissions, vanished, ...): report and move on.
        print(f'skipping {path}: {err}', file=sys.stderr)
        continue
    # Stop collecting after `max_results` matches; beyond that we only
    # report "N+".
    matches = list(islice(re.finditer(pattern, raw), max_results))
    if len(matches) < min_results:
        continue
    # Only the snippets that will actually be printed are built.
    results = []
    for match in matches[:printed_results]:
        start = max(0, match.start() - padding)
        # Slice ends are exclusive, so clamp to len(raw) (not len(raw) - 1)
        # to keep the final character of the file in the snippet.
        end = min(len(raw), match.end() + padding)
        text = raw[start:end].replace('\n', ' ').replace('\t', ' ')
        # Keep each example to a single terminal line.
        text = text[:terminal_width]
        results.append(highlight_matches(text, cyan))
    results_str = str(len(matches))
    if len(matches) == max_results:
        results_str += '+'
    print(yellow(f'({results_str}) {path}'))
    print('\n'.join(results))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment