Last active
January 5, 2018 18:12
-
-
Save Integralist/03279be86a356119b1f820da6ebb8740 to your computer and use it in GitHub Desktop.
[Validate README format with Python] #python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Usage:
    python ./scripts/validate_readmes.py
        validate every README (~1min)
    python ./scripts/validate_readmes.py --debug
        validate every README with details on what headers are missing (~1min)
    python ./scripts/validate_readmes.py --debug --service bpager
        validate specific service README (time varies by service directory structure)
"""
import argparse | |
import glob | |
import re | |
import unicodedata | |
parser = argparse.ArgumentParser(description="Validate READMEs")
parser.add_argument("-d", "--debug",
                    help="Show failure details", action="store_true")
parser.add_argument("-s", "--service", help="Validate specific service",
                    default="**")
parser.set_defaults(debug=False)

# Section headers every README must contain. A header only counts when the
# captured line equals one of these strings exactly.
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}

# Raw string: "\s" inside a normal string literal is an invalid escape
# sequence and triggers a warning on modern Python versions.
# NOTE(review): any line starting with '#' matches, including '#' comments
# inside fenced code blocks of the README.
title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
invalid = re.compile("node_modules")


def check_readme(contents):
    """Check README text against the required headers.

    The text is NFKD-normalised first so that compatibility characters
    (e.g. a non-breaking space) compare equal to their plain counterparts.

    Args:
        contents: full text of a README file.

    Returns:
        A ``(matches, missing_headers, is_valid)`` tuple: the list of header
        lines found (in document order), the set of required headers that
        were not found, and ``True`` when nothing is missing.
    """
    matches = title.findall(unicodedata.normalize("NFKD", contents))
    missing_headers = valid_headers - set(matches)
    return matches, missing_headers, not missing_headers


def main():
    """Validate every README matched by the command-line options and print a report."""
    args = parser.parse_args()
    # With the default service "**" and recursive=True the pattern matches
    # README.md files at any depth below the mono directory.
    path = f"../buzzfeed/mono/{args.service}/README.md"

    for filename in glob.iglob(path, recursive=True):
        # Decide whether to skip *before* touching the file, so third-party
        # READMEs under node_modules are never opened.
        if invalid.search(filename):
            continue

        print(filename, "\n\n")
        # Explicit encoding keeps results independent of the machine locale;
        # errors="replace" stops one badly encoded README aborting the run.
        with open(filename, "r", encoding="utf-8", errors="replace") as f:
            contents = f.read()

        matches, missing_headers, is_valid = check_readme(contents)
        if matches:
            if args.debug:
                print("Found headers:\n", matches, "\n\n")
                print("Expected headers:\n", valid_headers, "\n\n")
                print("Missing headers:\n", missing_headers, "\n\n")
            print("Is README valid?\n", is_valid)
        else:
            print("no matching headers found? so will presume this README is invalid")
        print("\n\n---\n\n")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Python 3.4 has no iglob recursive option so we have to walk the entire tree (very slow)
"""
Usage:
    ./scripts/validate_readmes
        validate every README (~40s)
    ./scripts/validate_readmes --missing
        report how many READMEs are missing from top level service directories
    ./scripts/validate_readmes --debug
        validate every README with details on what headers are missing (~40s+)
    ./scripts/validate_readmes --debug --service bpager
        validate specific service README (time varies by service directory structure)
    ./scripts/validate_readmes --result
        validate every README and show the number of valid vs invalid READMEs (~40s+)
    ./scripts/validate_readmes --service bpager --result
        validate specific service README and show the number of valid vs invalid READMEs
    ./scripts/validate_readmes --csv > readmes.csv
        generate CSV data highlighting valid vs invalid READMEs (~40s+)
    ./scripts/validate_readmes --service bpager --csv > bpager.csv
        generate CSV data highlighting valid vs invalid READMEs for a specific service
"""
import argparse | |
import fnmatch | |
import json | |
import re | |
import os | |
import unicodedata | |
parser = argparse.ArgumentParser(description="Validate READMEs")
parser.add_argument("-d", "--debug",
                    help="Show failure details", action="store_true")
parser.add_argument("-s", "--service", help="Validate specific service",
                    default="")
parser.add_argument("-r", "--result",
                    help="Show only the results of the validation checker",
                    action="store_true")
parser.add_argument("-c", "--csv", help="Generate CSV of results",
                    action="store_true")
parser.add_argument("-m", "--missing",
                    help="Show services that have no README file",
                    action="store_true")
parser.set_defaults(debug=False, result=False, csv=False, missing=False)

# Section headers every README must contain. A header only counts when the
# captured line equals one of these strings exactly.
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}

# Raw string: "\s" inside a normal string literal is an invalid escape
# sequence and triggers a warning on modern Python versions.
title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
invalid = re.compile("node_modules")


def percentage(part, whole):
    """Return ``part`` as a rounded percentage of ``whole`` (0 when ``whole`` is 0)."""
    return round(part / whole * 100) if whole else 0


def scan_top_level(base="./"):
    """Map each non-hidden directory directly under ``base`` to README presence.

    Args:
        base: directory whose immediate sub-directories are inspected.

    Returns:
        dict of ``{directory_path: bool}`` where the value is ``True`` when
        the directory has an entry named ``README.md``.
    """
    top_level_dirs = {}
    for item in sorted(os.listdir(base)):
        path = os.path.join(base, item)
        # Hidden entries (".git", ...) are not services.
        if item.startswith(".") or not os.path.isdir(path):
            continue
        top_level_dirs[path] = "README.md" in os.listdir(path)
    return top_level_dirs


def find_readmes(start):
    """Return the paths of all README.md files below ``start``.

    Paths containing ``node_modules`` are excluded. Matching directories are
    pruned from the walk itself, so their contents are never traversed.
    """
    file_matches = []
    for root, dirnames, filenames in os.walk(start):
        # In-place assignment is what tells os.walk not to descend.
        dirnames[:] = [d for d in dirnames if not invalid.search(d)]
        for filename in fnmatch.filter(filenames, "README.md"):
            fn = os.path.join(root, filename)
            # Still needed for the case where ``start`` itself matches.
            if not invalid.search(fn):
                file_matches.append(fn)
    return file_matches


def check_readme(contents):
    """Check README text against the required headers.

    Returns:
        A ``(matches, missing_headers, is_valid)`` tuple: the header lines
        found (after NFKD normalisation), the set of required headers not
        found, and ``True`` when nothing is missing.
    """
    matches = title.findall(unicodedata.normalize("NFKD", contents))
    missing_headers = valid_headers - set(matches)
    return matches, missing_headers, not missing_headers


def main():
    """Run the report selected by the command-line flags."""
    args = parser.parse_args()

    if args.missing:
        top_level_dirs = scan_top_level("./")
        if args.debug:
            print(json.dumps(top_level_dirs, indent=2, sort_keys=True))
        total = len(top_level_dirs)
        missing = total - sum(top_level_dirs.values())
        # percentage() guards against an empty working directory.
        percent = percentage(missing, total)
        print("\n{}% of top level directories are missing a README ({} out of {})\n".format(percent, missing, total))
        return

    if args.service == "" and not args.csv:
        print("hold tight, this can take up to 40 seconds to process every README in mono...")

    file_matches = find_readmes("./" + args.service)
    valid = 0
    # Rows are collected in a list and joined once at the end.
    csv_rows = ["file,valid\n"]
    summary_only = args.result or args.csv

    for filename in file_matches:
        # Explicit encoding keeps results independent of the machine locale;
        # errors="replace" stops one badly encoded README aborting the run.
        with open(filename, "r", encoding="utf-8", errors="replace") as f:
            contents = f.read()
        matches, missing_headers, is_valid = check_readme(contents)

        if args.csv:
            csv_rows.append("{},{}\n".format(filename, is_valid))
        if is_valid:
            valid += 1
        if summary_only:
            continue

        print(filename, "\n\n")
        if matches:
            if args.debug:
                print("Found headers:\n", matches, "\n\n")
                print("Expected headers:\n", valid_headers, "\n\n")
                print("Missing headers:\n", missing_headers, "\n\n")
            print("Is README valid?\n", is_valid)
        else:
            print("no matching headers found? so will presume this README is invalid")
        print("\n\n---\n\n")

    if args.result and not args.csv:
        # percentage() guards against a service with no READMEs at all.
        percent = percentage(valid, len(file_matches))
        print("\n{}% of the READMEs found are valid ({} out of {})\n".format(percent, valid, len(file_matches)))
    if args.csv:
        print("".join(csv_rows))


if __name__ == "__main__":
    main()
"""
Inconsistent Numbers?

When using the `--missing` flag, the script doesn't walk every _nested_ directory (as it does when validating READMEs using the `--result` flag). This is because not all nested directories would logically have a README.

e.g. `/foo-service/random-dir/` can't necessarily be classed as _missing_ a README, as there may well be no need for one there, whereas the top level directories in mono are all expected to have a README at the very least.

But some top level directories will indeed have a nested directory that _does_ have a README, and if so we should at least validate those as best we can against the standard format.

e.g. `./scripts/validate_readmes -s bpager` reports there are two READMEs: `./bpager/README.md` (valid) and `./bpager/tests/e2e/README.md` (invalid).

Subsequently, because `--missing` only checks the top level directories, we'll likely find that number (i.e. the number of directories "missing" a README) will be much lower than the number of "invalid vs valid" READMEs reported by the `--result` flag.

If you look at the examples given above, that's why the number of "found" READMEs is 378 (because it recursively searched all nested directories), whereas the total number of top level directories is reported as 469. If you were to do `469-378` you won't see the same result as what is being reported as directories with "missing" READMEs (205), because that's not a comparison that can be made.

The reason the script validates all found READMEs, rather than just validating the READMEs in the top level directories, is that there are lots of Python package related READMEs (e.g. `./packages/foo-package/README.md`) that need to be validated, and those would otherwise be missed. We could potentially add a whitelist or blacklist to account for "known" directories that we should recurse into, but then we could in future end up skipping over directories whose READMEs we might want to validate.
"""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python validate.py --debug | |
# note: this would print out 'all' READMEs... | |
--- | |
../buzzfeed/mono/video_player/README.md | |
Found headers: | |
['# Video Player Service (VPS)', '## Usage', '## Data Response', '## Running the service', '## System', '## Runbook', '## Monitoring', '### Logs', '### Dashboards', '### Alarms', '## Documentation'] | |
Expected headers: | |
{'## System', '## Documentation', '## Running the service', '## Monitoring', '## Point of Contact and Slack channel', '## Runbook', '## Usage'} | |
Missing headers: | |
{'## Point of Contact and Slack channel'} | |
Is README valid? | |
False | |
--- | |
$ python validate.py | |
# note: this would print out 'all' READMEs... | |
--- | |
../buzzfeed/mono/video_player/README.md | |
Is README valid? | |
False | |
--- | |
$ python validate.py --debug --service bpager | |
# note: this would print out 'only' the bpager README... | |
--- | |
../buzzfeed/mono/bpager/README.md | |
Found headers: | |
['# BPager', '## Running the service', '## Point of Contact and Slack channel', '## Runbook', '## System', '## Monitoring', '### Logs', '### Dashboards', '## Documentation'] | |
Expected headers: | |
{'## System', '## Monitoring', '## Running the service', '## Runbook', '## Point of Contact and Slack channel', '## Documentation'} | |
Missing headers: | |
set() | |
Is README valid? | |
True | |
--- | |
$ python validate.py --service bpager | |
# note: this would print out 'only' the bpager README... | |
--- | |
../buzzfeed/mono/bpager/README.md | |
Is README valid? | |
True | |
--- | |
./scripts/validate_readmes --service bpager --result | |
1 of 2 are valid | |
./scripts/validate_readmes --result | |
hold tight, this can take up to 40 seconds to process every README in mono... | |
19 of 1156 are valid |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# No improvement in performance: the os.walk over the whole tree is the bottleneck, not the per-file validation work.
import fnmatch | |
import itertools | |
import multiprocessing | |
import os | |
import re | |
import unicodedata | |
# Section headers every README must contain. A header only counts when the
# captured line equals one of these strings exactly.
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}

# Raw string: "\s" inside a normal string literal is an invalid escape
# sequence and triggers a warning on modern Python versions.
title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
invalid = re.compile("node_modules")


def worker(filename):
    """Validate a single README and print the verdict to stdout.

    Paths containing ``node_modules`` are skipped without being opened and
    produce no output.

    Args:
        filename: path of the README file to check.

    Returns:
        None. The report is printed.
    """
    # Check the path first so skipped files are never opened.
    if invalid.search(filename):
        return

    # Explicit encoding keeps results independent of the machine locale;
    # errors="replace" stops one badly encoded README aborting the run.
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        contents = f.read()

    matches = title.findall(unicodedata.normalize("NFKD", contents))
    is_valid = valid_headers.issubset(matches)

    if matches:
        verdict = "Is README valid?\n {}\n".format(is_valid)
    else:
        verdict = "no matching headers found? so will presume this README is invalid\n"

    # Emit the whole report through one print call (same bytes as the
    # original separate calls) to reduce the chance of reports from
    # concurrently running workers interleaving.
    print("{} \n\n\n{}\n\n---\n\n\n".format(filename, verdict), end="")
# Number of worker processes in the pool.
POOL_SIZE = 48


def iter_readmes(top="./"):
    """Yield the path of every README.md below ``top``.

    Directories whose name contains "node_modules" (the same text the
    module-level ``invalid`` pattern looks for) are pruned from the walk, so
    their contents are never traversed. ``worker`` would skip those files
    anyway; not walking them at all shortens the walk, which is the
    bottleneck of this script.
    """
    for root, dirs, files in os.walk(top):
        # In-place assignment is what tells os.walk not to descend.
        dirs[:] = [d for d in dirs if "node_modules" not in d]
        for name in fnmatch.filter(files, "README.md"):
            yield os.path.join(root, name)


def main():
    """Validate every README below the current directory using a process pool."""
    with multiprocessing.Pool(POOL_SIZE) as pool:
        # map() blocks until every README has been processed; worker only
        # prints, so the returned list is not needed.
        pool.map(worker, iter_readmes("./"))


# The guard is required for multiprocessing: with the "spawn" start method
# each child re-imports this module, and an unguarded Pool at import time
# would try to start pools recursively and fail.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment