Instantly share code, notes, and snippets.

Embed
What would you like to do?
[Validate README format with Python] #python
"""
Usage:
python ./scripts/validate_readmes.py
validate every README (~1min)
python ./scripts/validate_readmes.py --debug
validate every README with details on what headers are missing (~1min)
python ./scripts/validate_readmes.py --debug --service bpager
validate specific service README (time varies by service directory structure)
"""
import argparse
import glob
import re
import unicodedata
# Headers that every README must contain to be reported as valid.
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}
# Captures each line whose first non-blank character is "#". Trailing spaces
# and tabs are left out of the capture so that "## System  " still compares
# equal to the expected "## System".
title = re.compile(r"^\s*(#.+?)[ \t]*$", re.MULTILINE)
# READMEs whose path contains this text are skipped.
invalid = re.compile("node_modules")


def find_headers(contents):
    """Return the Markdown header lines found in *contents*, in file order.

    The text is NFKD-normalized before matching so that compatibility
    characters (for example a non-breaking space) compare equal to their
    plain equivalents in ``valid_headers``.
    """
    return title.findall(unicodedata.normalize("NFKD", contents))


def validate_readme(filename, debug=False):
    """Print a validation report for one README and return its validity.

    Args:
        filename: path of the README to check.
        debug: when true, also print the found, expected and missing headers.

    Returns:
        True when every entry of ``valid_headers`` appears in the file.
    """
    # Decode explicitly instead of relying on the locale; errors="replace"
    # keeps one undecodable byte from aborting the whole run.
    # NOTE(review): assumes the READMEs are UTF-8 -- confirm.
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        matches = find_headers(f.read())

    missing_headers = valid_headers - set(matches)
    is_valid = not missing_headers

    print(filename, "\n\n")
    if matches:
        if debug:
            print("Found headers:\n", matches, "\n\n")
            print("Expected headers:\n", valid_headers, "\n\n")
            print("Missing headers:\n", missing_headers, "\n\n")
        print("Is README valid?\n", is_valid)
    else:
        print("no matching headers found? so will presume this README is invalid")
    print("\n\n---\n\n")
    return is_valid


def main():
    """Parse the command line and validate every matching README."""
    parser = argparse.ArgumentParser(description="Validate READMEs")
    parser.add_argument("-d", "--debug",
                        help="Show failure details", action="store_true")
    parser.add_argument("-s", "--service", help="Validate specific service",
                        default="**")
    parser.set_defaults(debug=False)
    args = parser.parse_args()

    # With the default service of "**" and recursive=True the pattern matches
    # a README.md at any depth; a named service matches only its own README.
    path = f"../buzzfeed/mono/{args.service}/README.md"
    for filename in glob.iglob(path, recursive=True):
        # Decide whether to skip before opening the file at all.
        if invalid.search(filename):
            continue
        validate_readme(filename, debug=args.debug)


if __name__ == "__main__":
    main()
# Python 3.4 has no iglob recursive option so we have to walk the entire tree (very slow)
#!/usr/bin/env python3
"""
Usage:
./scripts/validate_readmes
validate every README (~40s)
./scripts/validate_readmes --missing
report how many READMEs are missing from top level service directories
./scripts/validate_readmes --debug
validate every README with details on what headers are missing (~40s+)
./scripts/validate_readmes --debug --service bpager
validate specific service README (time varies by service directory structure)
./scripts/validate_readmes --result
validate every README and show the number of valid vs invalid READMEs (~40s+)
./scripts/validate_readmes --service bpager --result
validate specific service README and show the number of valid vs invalid READMEs
./scripts/validate_readmes --csv > readmes.csv
generate CSV data highlighting valid vs invalid READMEs (~40s+)
./scripts/validate_readmes --service bpager --csv > bpager.csv
generate CSV data highlighting valid vs invalid READMEs for a specific service
"""
import argparse
import fnmatch
import json
import re
import os
import unicodedata
# Headers that every README must contain to be reported as valid.
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}
# Captures each line whose first non-blank character is "#". Trailing spaces
# and tabs are left out of the capture so that "## System  " still compares
# equal to the expected "## System".
title = re.compile(r"^\s*(#.+?)[ \t]*$", re.MULTILINE)
# READMEs whose path contains this text are skipped.
invalid = re.compile("node_modules")


def build_parser():
    """Return the argument parser for the command-line flags."""
    parser = argparse.ArgumentParser(description="Validate READMEs")
    parser.add_argument("-d", "--debug",
                        help="Show failure details", action="store_true")
    parser.add_argument("-s", "--service", help="Validate specific service",
                        default="")
    parser.add_argument("-r", "--result",
                        help="Show only the results of the validation checker",
                        action="store_true")
    parser.add_argument("-c", "--csv", help="Generate CSV of results",
                        action="store_true")
    parser.add_argument("-m", "--missing",
                        help="Show services that have no README file",
                        action="store_true")
    parser.set_defaults(debug=False, result=False, csv=False, missing=False)
    return parser


def percentage(part, total):
    """Return *part* of *total* as a rounded percentage, or 0 when total is 0.

    Guarding the zero case keeps the summary lines from raising
    ZeroDivisionError when nothing was found.
    """
    return round(part / total * 100) if total else 0


def find_headers(contents):
    """Return the Markdown header lines found in *contents*, in file order.

    The text is NFKD-normalized before matching so that compatibility
    characters (for example a non-breaking space) compare equal to their
    plain equivalents in ``valid_headers``.
    """
    return title.findall(unicodedata.normalize("NFKD", contents))


def find_readmes(root):
    """Return the paths of all README.md files below *root*.

    Paths matching ``invalid`` are excluded.
    """
    file_matches = []
    for current, dirnames, filenames in os.walk(root):
        # Pruning in place stops os.walk from descending into directories
        # whose READMEs would be discarded anyway.
        dirnames[:] = [d for d in dirnames if not invalid.search(d)]
        for filename in fnmatch.filter(filenames, "README.md"):
            fn = os.path.join(current, filename)
            # Still needed for the case where *root* itself matches.
            if not invalid.search(fn):
                file_matches.append(fn)
    return file_matches


def report_missing(debug=False):
    """Print how many non-hidden top level directories lack a README.md.

    Only the current working directory's immediate children are inspected.
    With *debug* the per-directory result is printed as JSON first.
    """
    top_level_dirs = {}
    for item in sorted(os.listdir("./")):
        path = os.path.join("./", item)
        if os.path.isdir(path) and not path.startswith("./."):
            top_level_dirs[path] = "README.md" in os.listdir(path)
    if debug:
        print(json.dumps(top_level_dirs, indent=2, sort_keys=True))
    total = len(top_level_dirs)
    missing = total - sum(top_level_dirs.values())
    percent = percentage(missing, total)
    print("\n{}% of top level directories are missing a README ({} out of {})\n".format(percent, missing, total))


def main():
    """Parse the command line and run the requested report."""
    args = build_parser().parse_args()

    if args.missing:
        report_missing(debug=args.debug)
        return

    if args.service == "" and not args.csv:
        print("hold tight, this can take up to 40 seconds to process every README in mono...")

    file_matches = find_readmes("./" + args.service)
    valid = 0
    csv_rows = ["file,valid\n"]
    # Per-file reports are suppressed by both --result and --csv.
    verbose = not args.result and not args.csv

    for filename in file_matches:
        # Decode explicitly instead of relying on the locale; errors="replace"
        # keeps one undecodable byte from aborting the whole run.
        # NOTE(review): assumes the READMEs are UTF-8 -- confirm.
        with open(filename, "r", encoding="utf-8", errors="replace") as f:
            matches = find_headers(f.read())
        missing_headers = valid_headers - set(matches)
        is_valid = not missing_headers

        if args.csv:
            csv_rows.append("{},{}\n".format(filename, is_valid))
        if is_valid:
            valid += 1
        if verbose:
            print(filename, "\n\n")
            if matches:
                if args.debug:
                    print("Found headers:\n", matches, "\n\n")
                    print("Expected headers:\n", valid_headers, "\n\n")
                    print("Missing headers:\n", missing_headers, "\n\n")
                print("Is README valid?\n", is_valid)
            else:
                print("no matching headers found? so will presume this README is invalid")
            print("\n\n---\n\n")

    if args.result and not args.csv:
        percent = percentage(valid, len(file_matches))
        print("\n{}% of the READMEs found are valid ({} out of {})\n".format(percent, valid, len(file_matches)))
    if args.csv:
        print("".join(csv_rows))


if __name__ == "__main__":
    main()
"""
Inconsistent Numbers?
When using the `--missing` flag, the script doesn't walk every _nested_ directory (like it does when validating READMEs using the `--result` flag). This is because not all nested directories would logically have a README.
e.g. `/foo-service/random-dir/` can't necessarily be classed as _missing_ a README as there may well be no need for one there. Whereas the top level directories in mono are all expected to have a README at the very least.
But some top level directories will indeed have a nested directory that _does_ have a README and if so we should at least validate those the best we can against the standard format.
e.g. `./scripts/validate_readmes -s bpager` reports there are two READMEs `./bpager/README.md` (valid) and `./bpager/tests/e2e/README.md` (invalid).
Subsequently, because `--missing` only checks the top level directories, we'll likely find that number (i.e. the number of directories "missing" a README) will be a much lower number than the number of "invalid vs valid" READMEs reported by the `--result` flag.
If you look at the examples given above, that's why the number of "found" READMEs is 378 (because it recursively searched all nested directories) whereas the total number of top level directories is reported as 469. If you were to do `469-378` you won't see the same result as what is being reported as directories with "missing" READMEs (205) because that's not a comparison that can be made.
The reason the script validates all found READMEs, rather than just validating the READMEs in the top level directories, is because there are lots of Python package related READMEs (e.g. `./packages/foo-package/README.md`) that need to be validated and those would otherwise be missed. We could potentially add a whitelist or blacklist to account for "known" directories that we should recurse into but then we could in future end up skipping over directories whose READMEs we might want to validate.
"""
$ python validate.py --debug
# note: this would print out 'all' READMEs...
---
../buzzfeed/mono/video_player/README.md
Found headers:
['# Video Player Service (VPS)', '## Usage', '## Data Response', '## Running the service', '## System', '## Runbook', '## Monitoring', '### Logs', '### Dashboards', '### Alarms', '## Documentation']
Expected headers:
{'## System', '## Documentation', '## Running the service', '## Monitoring', '## Point of Contact and Slack channel', '## Runbook', '## Usage'}
Missing headers:
{'## Point of Contact and Slack channel'}
Is README valid?
False
---
$ python validate.py
# note: this would print out 'all' READMEs...
---
../buzzfeed/mono/video_player/README.md
Is README valid?
False
---
$ python validate.py --debug --service bpager
# note: this would print out 'only' the bpager README...
---
../buzzfeed/mono/bpager/README.md
Found headers:
['# BPager', '## Running the service', '## Point of Contact and Slack channel', '## Runbook', '## System', '## Monitoring', '### Logs', '### Dashboards', '## Documentation']
Expected headers:
{'## System', '## Monitoring', '## Running the service', '## Runbook', '## Point of Contact and Slack channel', '## Documentation'}
Missing headers:
set()
Is README valid?
True
---
$ python validate.py --service bpager
# note: this would print out 'only' the bpager README...
---
../buzzfeed/mono/bpager/README.md
Is README valid?
True
---
./scripts/validate_readmes --service bpager --result
1 of 2 are valid
./scripts/validate_readmes --result
hold tight, this can take up to 40 seconds to process every README in mono...
19 of 1156 are valid
# no improvement in performance as the os.walk is the bottleneck and can't be sped up
import fnmatch
import itertools
import multiprocessing
import os
import re
import unicodedata
# Headers that every README must contain to be reported as valid.
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}
# Captures each line whose first non-blank character is "#". A raw string
# avoids the invalid "\s" escape, and trailing spaces/tabs are left out of the
# capture so that "## System  " still compares equal to "## System".
title = re.compile(r"^\s*(#.+?)[ \t]*$", re.MULTILINE)
# READMEs whose path contains this text are skipped.
invalid = re.compile("node_modules")
def worker(filename):
    """Validate one README and print the outcome.

    Paths matching ``invalid`` are skipped without being opened and produce
    no output. For every other file the header lines are collected and
    compared against ``valid_headers``.

    Args:
        filename: path of the README to check.
    """
    # Decide whether to skip before paying for the open().
    if invalid.search(filename):
        return

    # Decode explicitly instead of relying on the locale; errors="replace"
    # keeps one undecodable byte from failing the whole task.
    # NOTE(review): assumes the READMEs are UTF-8 -- confirm.
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        contents = unicodedata.normalize("NFKD", f.read())

    matches = title.findall(contents)
    is_valid = valid_headers.issubset(matches)

    print(filename, "\n\n")
    if matches:
        print("Is README valid?\n", is_valid)
    else:
        print("no matching headers found? so will presume this README is invalid")
    print("\n\n---\n\n")
def readme_paths(top="./"):
    """Yield the path of every README.md below *top*.

    Directories whose name contains "node_modules" are pruned from the walk,
    so their contents are never listed. The worker skips such paths anyway,
    so the printed output is unaffected.
    """
    for root, dirs, files in os.walk(top):
        # Assigning to the slice edits the list os.walk is iterating over,
        # which is how a top-down walk is told not to descend.
        dirs[:] = [d for d in dirs if "node_modules" not in d]
        for file in fnmatch.filter(files, "README.md"):
            yield os.path.join(root, file)


# The guard is required for multiprocessing: under the "spawn" start method
# each child re-imports this module, and unguarded pool creation would be
# re-executed in every child.
if __name__ == "__main__":
    with multiprocessing.Pool(48) as pool:  # pool of 48 processes
        # this does the parallel processing
        results_of_work = pool.map(worker, readme_paths("./"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment