Integralist/1. Validate README format with Python.py

## 1. Validate README format with Python.py
"""
Usage:

    python ./scripts/validate_readmes.py
        validate every README (~1min)

    python ./scripts/validate_readmes.py --debug
        validate every README with details on what headers are missing (~1min)

    python ./scripts/validate_readmes.py --debug --service bpager
        validate specific service README (time varies by service directory structure)
"""

import argparse
import glob
import re
import unicodedata

# Command-line interface: --debug adds per-README header details, --service
# replaces the "**" wildcard in the glob pattern with a single directory name.
parser = argparse.ArgumentParser(description="Validate READMEs")
parser.add_argument("-d", "--debug",
                    help="Show failure details", action="store_true")
parser.add_argument("-s", "--service", help="Validate specific service",
                    default="**")
parser.set_defaults(debug=False)
args = parser.parse_args()

# Headers a README must contain to be reported as valid (exact string match).
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}

# Raw string: "\s" must reach the regex engine as a character class instead of
# being treated as an (invalid) string escape sequence.
title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
invalid = re.compile("node_modules")
service = args.service
path = f"../buzzfeed/mono/{service}/README.md"


for filename in glob.iglob(path, recursive=True):
    # Filter first so READMEs under node_modules are never opened.
    if invalid.search(filename):
        continue

    # Decode as UTF-8 rather than the locale default; undecodable bytes are
    # replaced so one odd file does not abort the whole run.
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        contents = f.read()

    print(filename, "\n\n")

    # NFKD folds compatibility characters (e.g. a non-breaking space) into
    # their plain equivalents before the headers are compared.
    contents = unicodedata.normalize("NFKD", contents)
    matches = title.findall(contents)
    missing_headers = valid_headers - set(matches)
    is_valid = not missing_headers

    if matches:
        if args.debug:
            print("Found headers:\n", matches, "\n\n")
            print("Expected headers:\n", valid_headers, "\n\n")
            print("Missing headers:\n", missing_headers, "\n\n")
        print("Is README valid?\n", is_valid)
    else:
        print("no matching headers found? so will presume this README is invalid")

    print("\n\n---\n\n")

## 2. Python 3.4 compatible version for Rig VM (Mono Repo) + RESULT & CSV OPTIONS + Missing etc.py
# Python 3.4 has no iglob recursive option so we have to walk the entire tree (very slow)

#!/usr/bin/env python3

"""
Usage:

    ./scripts/validate_readmes
        validate every README (~40s)

    ./scripts/validate_readmes --missing
        report how many READMEs are missing from top level service directories

    ./scripts/validate_readmes --debug
        validate every README with details on what headers are missing (~40s+)

    ./scripts/validate_readmes --debug --service bpager
        validate specific service README (time varies by service directory structure)

    ./scripts/validate_readmes --result
        validate every README and show the number of valid vs invalid READMEs (~40s+)

    ./scripts/validate_readmes --service bpager --result
        validate specific service README and show the number of valid vs invalid READMEs

    ./scripts/validate_readmes --csv > readmes.csv
        generate CSV data highlighting valid vs invalid READMEs (~40s+)

    ./scripts/validate_readmes --service bpager --csv > bpager.csv
        generate CSV data highlighting valid vs invalid READMEs for a specific service
"""

import argparse
import fnmatch
import json
import re
import os
import unicodedata

# Command-line flags; every switch defaults to off and --service defaults to
# "" so that the walk starts at "./".
parser = argparse.ArgumentParser(description="Validate READMEs")
parser.add_argument("-d", "--debug",
                    help="Show failure details", action="store_true")
parser.add_argument("-s", "--service", help="Validate specific service",
                    default="")
parser.add_argument("-r", "--result", help="Show only the results of the validation checker",
                    action="store_true")
parser.add_argument("-c", "--csv", help="Generate CSV of results",
                    action="store_true")
parser.add_argument("-m", "--missing", help="Show services that have no README file", action="store_true")
parser.set_defaults(debug=False, result=False, csv=False, missing=False)
args = parser.parse_args()

# Headers a README must contain to be counted as valid (exact string match).
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}

# Raw string: "\s" must reach the regex engine as a character class instead of
# being treated as an (invalid) string escape sequence.
title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
invalid = re.compile("node_modules")
service = args.service
valid = 0
csv_rows = ["file,valid"]
top_level_dirs = {}

# --missing: report which non-hidden top level directories lack a README.md,
# then stop without validating anything.
if args.missing:
    for item in sorted(os.listdir("./")):
        path = os.path.join("./", item)
        if os.path.isdir(path) and not path.startswith("./."):
            top_level_dirs[path] = "README.md" in os.listdir(path)

    if args.debug:
        print(json.dumps(top_level_dirs, indent=2, sort_keys=True))

    total = len(top_level_dirs)
    missing = total - sum(top_level_dirs.values())
    # Guard the division: an empty working directory has no top level dirs.
    percent = round(missing / total * 100) if total else 0
    print("\n{}% of top level directories are missing a README ({} out of {})\n".format(percent, missing, total))
    # Raised directly instead of calling exit(), which only exists when the
    # site module has been loaded.
    raise SystemExit

if service == "" and not args.csv:
    print("hold tight, this can take up to 40 seconds to process every README in mono...")

file_matches = []
for root, dirnames, filenames in os.walk("./" + args.service):
    # Prune in place so os.walk never descends into node_modules trees; their
    # READMEs would be discarded by the check below anyway.
    dirnames[:] = [d for d in dirnames if not invalid.search(d)]
    for filename in fnmatch.filter(filenames, "README.md"):
        fn = os.path.join(root, filename)
        if not invalid.search(fn):
            file_matches.append(fn)

for filename in file_matches:
    # Decode as UTF-8 rather than the locale default; undecodable bytes are
    # replaced so one odd file does not abort the whole run.
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        contents = f.read()

    # NFKD folds compatibility characters (e.g. a non-breaking space) into
    # their plain equivalents before the headers are compared.
    contents = unicodedata.normalize("NFKD", contents)
    matches = title.findall(contents)
    missing_headers = valid_headers - set(matches)
    is_valid = not missing_headers

    if args.csv:
        csv_rows.append("{},{}".format(filename, is_valid))

    if is_valid:
        valid += 1

    if not args.result and not args.csv:
        print(filename, "\n\n")

        if matches:
            if args.debug:
                print("Found headers:\n", matches, "\n\n")
                print("Expected headers:\n", valid_headers, "\n\n")
                print("Missing headers:\n", missing_headers, "\n\n")
            print("Is README valid?\n", is_valid)
        else:
            print("no matching headers found? so will presume this README is invalid")

        print("\n\n---\n\n")

if args.result and not args.csv:
    found = len(file_matches)
    # Guard the division: a service without any README yields no matches.
    percent = round(valid / found * 100) if found else 0
    print("\n{}% of the READMEs found are valid ({} out of {})\n".format(percent, valid, found))

if args.csv:
    # One join instead of repeated string concatenation; every row, including
    # the header, ends with a newline.
    csv = "\n".join(csv_rows) + "\n"
    print(csv)

"""
Inconsistent Numbers?

When using the `--missing` flag, the script doesn't walk every _nested_ directory (like it does when validating READMEs using the `--result` flag). This is because not all nested directories would logically have a README.

e.g. `/foo-service/random-dir/` can't necessarily be classed as _missing_ a README, as there may well be no need for one there, whereas the top level directories in mono are all expected to have a README at the very least.

But some top level directories will indeed have a nested directory that _does_ have a README and if so we should at least validate those the best we can against the standard format.

e.g. `./scripts/validate_readmes -s bpager` reports there are two READMEs `./bpager/README.md` (valid) and `./bpager/tests/e2e/README.md` (invalid).

Subsequently, because `--missing` only checks the top level directories, we'll likely find that number (i.e. the number of directories "missing" a README) will be a much lower number than the number of "invalid vs valid" READMEs reported by the `--result` flag.

If you look at the examples given above, that's why the number of "found" READMEs is 378 (because it recursively searched all nested directories) whereas the total number of top level directories is reported as 469. If you were to do `469-378` you won't see the same result as what is being reported as directories with "missing" READMEs (205) because that's not a comparison that can be made.

The reason the script validates all found READMEs, rather than just validating the READMEs in the top level directories, is because there are lots of Python package related READMEs (e.g. `./packages/foo-package/README.md`) that need to be validated and those would otherwise be missed. We could potentially add a whitelist or blacklist to account for "known" directories that we should recurse into, but then we could in future end up skipping over directories whose READMEs we might want to validate.
"""

## 3. Example Output.sh
$ python validate.py --debug

# note: this would print out 'all' READMEs...

---


../buzzfeed/mono/video_player/README.md


Found headers:
 ['# Video Player Service (VPS)', '## Usage', '## Data Response', '## Running the service', '## System', '## Runbook', '## Monitoring', '### Logs', '### Dashboards', '### Alarms', '## Documentation']


Expected headers:
 {'## System', '## Documentation', '## Running the service', '## Monitoring', '## Point of Contact and Slack channel', '## Runbook', '## Usage'}


Missing headers:
 {'## Point of Contact and Slack channel'}


Is README valid?
 False


---

$ python validate.py

# note: this would print out 'all' READMEs...

---


../buzzfeed/mono/video_player/README.md


Is README valid?
 False


---

$ python validate.py --debug --service bpager

# note: this would print out 'only' the bpager README...

---

../buzzfeed/mono/bpager/README.md

Found headers:
 ['# BPager', '## Running the service', '## Point of Contact and Slack channel', '## Runbook', '## System', '## Monitoring', '### Logs', '### Dashboards', '## Documentation']


Expected headers:
 {'## System', '## Monitoring', '## Running the service', '## Runbook', '## Point of Contact and Slack channel', '## Documentation'}


Missing headers:
 set()


Is README valid?
 True

---

$ python validate.py --service bpager

# note: this would print out 'only' the bpager README...

---

../buzzfeed/mono/bpager/README.md

Is README valid?
 True

---

./scripts/validate_readmes --service bpager --result

1 of 2 are valid

./scripts/validate_readmes --result

hold tight, this can take up to 40 seconds to process every README in mono...

19 of 1156 are valid

## 4. Multiprocess version.py
# no improvement in performance as the os.walk is the bottleneck and can't be sped up

import fnmatch
import itertools
import multiprocessing
import os
import re
import unicodedata

# Headers a README must contain to be reported as valid (exact string match).
valid_headers = {
    "## Point of Contact and Slack channel",
    "## Running the service",
    "## System",
    "## Runbook",
    "## Monitoring",
    "## Documentation",
}

# Raw string: "\s" must reach the regex engine as a character class instead of
# being treated as an (invalid) string escape sequence.
title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
invalid = re.compile("node_modules")


def worker(filename):
    """Validate one README and print the verdict.

    Args:
        filename: Path of the README to check.

    Returns:
        None when the path contains "node_modules" (the file is skipped
        without being opened); otherwise True when every entry of
        ``valid_headers`` appears among the headers found in the file, and
        False when at least one is absent.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Filter before touching the filesystem: skipped paths are never opened.
    if invalid.search(filename):
        return None

    # Decode as UTF-8 rather than the locale default; undecodable bytes are
    # replaced so one odd file does not fail the whole pool.map call.
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        contents = f.read()

    print(filename, "\n\n")

    # NFKD folds compatibility characters (e.g. a non-breaking space) into
    # their plain equivalents before the headers are compared.
    contents = unicodedata.normalize("NFKD", contents)
    matches = title.findall(contents)
    is_valid = valid_headers.issubset(matches)

    if matches:
        print("Is README valid?\n", is_valid)
    else:
        print("no matching headers found? so will presume this README is invalid")

    print("\n\n---\n\n")
    return is_valid

def _iter_readmes(top):
    """Yield the path of every README.md below *top*, skipping node_modules."""
    for root, dirs, files in os.walk(top):
        # Prune in place so os.walk never descends into node_modules trees;
        # worker() would skip anything found there anyway.
        dirs[:] = [d for d in dirs if not invalid.search(d)]
        for name in fnmatch.filter(files, 'README.md'):
            yield os.path.join(root, name)


# The guard keeps child processes that re-import this module (spawn start
# method) from creating pools of their own.
if __name__ == "__main__":
    with multiprocessing.Pool(48) as pool:  # pool of 48 processes
        # map() blocks until every README has been handled by a worker.
        results_of_work = pool.map(worker, _iter_readmes("./"))
	"""
	Usage:

	python ./scripts/validate_readmes.py
	validate every README (~1min)

	python ./scripts/validate_readmes.py --debug
	validate every README with details on what headers are missing (~1min)

	python ./scripts/validate_readmes.py --debug --service bpager
	validate specific service README (time varies by service directory structure)
	"""

	import argparse
	import glob
	import re
	import unicodedata

	# Command-line interface: --debug adds per-README header details, --service
	# replaces the "**" wildcard in the glob pattern with a single directory name.
	parser = argparse.ArgumentParser(description="Validate READMEs")
	parser.add_argument("-d", "--debug",
	                    help="Show failure details", action="store_true")
	parser.add_argument("-s", "--service", help="Validate specific service",
	                    default="**")
	parser.set_defaults(debug=False)
	args = parser.parse_args()

	# Headers a README must contain to be reported as valid (exact string match).
	valid_headers = {
	    "## Point of Contact and Slack channel",
	    "## Running the service",
	    "## System",
	    "## Runbook",
	    "## Monitoring",
	    "## Documentation",
	}

	# Raw string: "\s" must reach the regex engine as a character class instead of
	# being treated as an (invalid) string escape sequence.
	title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
	invalid = re.compile("node_modules")
	service = args.service
	path = f"../buzzfeed/mono/{service}/README.md"


	for filename in glob.iglob(path, recursive=True):
	    # Filter first so READMEs under node_modules are never opened.
	    if invalid.search(filename):
	        continue

	    # Decode as UTF-8 rather than the locale default; undecodable bytes are
	    # replaced so one odd file does not abort the whole run.
	    with open(filename, "r", encoding="utf-8", errors="replace") as f:
	        contents = f.read()

	    print(filename, "\n\n")

	    # NFKD folds compatibility characters (e.g. a non-breaking space) into
	    # their plain equivalents before the headers are compared.
	    contents = unicodedata.normalize("NFKD", contents)
	    matches = title.findall(contents)
	    missing_headers = valid_headers - set(matches)
	    is_valid = not missing_headers

	    if matches:
	        if args.debug:
	            print("Found headers:\n", matches, "\n\n")
	            print("Expected headers:\n", valid_headers, "\n\n")
	            print("Missing headers:\n", missing_headers, "\n\n")
	        print("Is README valid?\n", is_valid)
	    else:
	        print("no matching headers found? so will presume this README is invalid")

	    print("\n\n---\n\n")
	# Python 3.4 has no iglob recursive option so we have to walk the entire tree (very slow)

	#!/usr/bin/env python3

	"""
	Usage:

	./scripts/validate_readmes
	validate every README (~40s)

	./scripts/validate_readmes --missing
	report how many READMEs are missing from top level service directories

	./scripts/validate_readmes --debug
	validate every README with details on what headers are missing (~40s+)

	./scripts/validate_readmes --debug --service bpager
	validate specific service README (time varies by service directory structure)

	./scripts/validate_readmes --result
	validate every README and show the number of valid vs invalid READMEs (~40s+)

	./scripts/validate_readmes --service bpager --result
	validate specific service README and show the number of valid vs invalid READMEs

	./scripts/validate_readmes --csv > readmes.csv
	generate CSV data highlighting valid vs invalid READMEs (~40s+)

	./scripts/validate_readmes --service bpager --csv > bpager.csv
	generate CSV data highlighting valid vs invalid READMEs for a specific service
	"""

	import argparse
	import fnmatch
	import json
	import re
	import os
	import unicodedata

	# Command-line flags; every switch defaults to off and --service defaults to
	# "" so that the walk starts at "./".
	parser = argparse.ArgumentParser(description="Validate READMEs")
	parser.add_argument("-d", "--debug",
	                    help="Show failure details", action="store_true")
	parser.add_argument("-s", "--service", help="Validate specific service",
	                    default="")
	parser.add_argument("-r", "--result", help="Show only the results of the validation checker",
	                    action="store_true")
	parser.add_argument("-c", "--csv", help="Generate CSV of results",
	                    action="store_true")
	parser.add_argument("-m", "--missing", help="Show services that have no README file", action="store_true")
	parser.set_defaults(debug=False, result=False, csv=False, missing=False)
	args = parser.parse_args()

	# Headers a README must contain to be counted as valid (exact string match).
	valid_headers = {
	    "## Point of Contact and Slack channel",
	    "## Running the service",
	    "## System",
	    "## Runbook",
	    "## Monitoring",
	    "## Documentation",
	}

	# Raw string: "\s" must reach the regex engine as a character class instead of
	# being treated as an (invalid) string escape sequence.
	title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
	invalid = re.compile("node_modules")
	service = args.service
	valid = 0
	csv_rows = ["file,valid"]
	top_level_dirs = {}

	# --missing: report which non-hidden top level directories lack a README.md,
	# then stop without validating anything.
	if args.missing:
	    for item in sorted(os.listdir("./")):
	        path = os.path.join("./", item)
	        if os.path.isdir(path) and not path.startswith("./."):
	            top_level_dirs[path] = "README.md" in os.listdir(path)

	    if args.debug:
	        print(json.dumps(top_level_dirs, indent=2, sort_keys=True))

	    total = len(top_level_dirs)
	    missing = total - sum(top_level_dirs.values())
	    # Guard the division: an empty working directory has no top level dirs.
	    percent = round(missing / total * 100) if total else 0
	    print("\n{}% of top level directories are missing a README ({} out of {})\n".format(percent, missing, total))
	    # Raised directly instead of calling exit(), which only exists when the
	    # site module has been loaded.
	    raise SystemExit

	if service == "" and not args.csv:
	    print("hold tight, this can take up to 40 seconds to process every README in mono...")

	file_matches = []
	for root, dirnames, filenames in os.walk("./" + args.service):
	    # Prune in place so os.walk never descends into node_modules trees; their
	    # READMEs would be discarded by the check below anyway.
	    dirnames[:] = [d for d in dirnames if not invalid.search(d)]
	    for filename in fnmatch.filter(filenames, "README.md"):
	        fn = os.path.join(root, filename)
	        if not invalid.search(fn):
	            file_matches.append(fn)

	for filename in file_matches:
	    # Decode as UTF-8 rather than the locale default; undecodable bytes are
	    # replaced so one odd file does not abort the whole run.
	    with open(filename, "r", encoding="utf-8", errors="replace") as f:
	        contents = f.read()

	    # NFKD folds compatibility characters (e.g. a non-breaking space) into
	    # their plain equivalents before the headers are compared.
	    contents = unicodedata.normalize("NFKD", contents)
	    matches = title.findall(contents)
	    missing_headers = valid_headers - set(matches)
	    is_valid = not missing_headers

	    if args.csv:
	        csv_rows.append("{},{}".format(filename, is_valid))

	    if is_valid:
	        valid += 1

	    if not args.result and not args.csv:
	        print(filename, "\n\n")

	        if matches:
	            if args.debug:
	                print("Found headers:\n", matches, "\n\n")
	                print("Expected headers:\n", valid_headers, "\n\n")
	                print("Missing headers:\n", missing_headers, "\n\n")
	            print("Is README valid?\n", is_valid)
	        else:
	            print("no matching headers found? so will presume this README is invalid")

	        print("\n\n---\n\n")

	if args.result and not args.csv:
	    found = len(file_matches)
	    # Guard the division: a service without any README yields no matches.
	    percent = round(valid / found * 100) if found else 0
	    print("\n{}% of the READMEs found are valid ({} out of {})\n".format(percent, valid, found))

	if args.csv:
	    # One join instead of repeated string concatenation; every row, including
	    # the header, ends with a newline.
	    csv = "\n".join(csv_rows) + "\n"
	    print(csv)

	"""
	Inconsistent Numbers?

	When using the `--missing` flag, the script doesn't walk every _nested_ directory (like it does when validating READMEs using the `--result` flag). This is because not all nested directories would logically have a README.

	e.g. `/foo-service/random-dir/` can't necessarily be classed as _missing_ a README as there may well be no need for one there. Where as the top level directories in mono are all expected to have a README at the very least.

	But some top level directories will indeed have a nested directory that _does_ have a README and if so we should at least validate those the best we can against the standard format.

	e.g. `./scripts/validate_readmes -s bpager` reports there are two READMEs `./bpager/README.md` (valid) and `./bpager/tests/e2e/README.md` (invalid).

	Subsequently, because `--missing` only checks the top level directories, we'll likely find that number (i.e. the number of directories "missing" a README) will be a much lower number than the number of "invalid vs valid" READMEs reported by the `--result` flag.

	If you look at the examples given above, that's why the number of "found" READMEs is 378 (because it recursively searched all nested directories) where as the total number of top level directories is reported as 469. If you were to do `469-378` you won't see the same result as what is being reported as directories with "missing" READMEs (205) because that's not a comparison that can be made.

	The reason the script validates all found READMEs, rather than just validating the READMEs in the top level directories, is because there's lots of Python package related READMEs (e.g. `./packages/foo-package/README.md`) that need to be validated and those would otherwise be missed. We could potentially add a whitelist or blacklist to account for "known" directories that we should recurse into but then we could in future end up skipping over directories that we might want to validate READMEs.
	"""
	$ python validate.py --debug

	# note: this would print out 'all' READMEs...

	---


	../buzzfeed/mono/video_player/README.md


	Found headers:
	['# Video Player Service (VPS)', '## Usage', '## Data Response', '## Running the service', '## System', '## Runbook', '## Monitoring', '### Logs', '### Dashboards', '### Alarms', '## Documentation']


	Expected headers:
	{'## System', '## Documentation', '## Running the service', '## Monitoring', '## Point of Contact and Slack channel', '## Runbook', '## Usage'}


	Missing headers:
	{'## Point of Contact and Slack channel'}


	Is README valid?
	False


	---

	$ python validate.py

	# note: this would print out 'all' READMEs...

	---


	../buzzfeed/mono/video_player/README.md


	Is README valid?
	False


	---

	$ python validate.py --debug --service bpager

	# note: this would print out 'only' the bpager README...

	---

	../buzzfeed/mono/bpager/README.md

	Found headers:
	['# BPager', '## Running the service', '## Point of Contact and Slack channel', '## Runbook', '## System', '## Monitoring', '### Logs', '### Dashboards', '## Documentation']


	Expected headers:
	{'## System', '## Monitoring', '## Running the service', '## Runbook', '## Point of Contact and Slack channel', '## Documentation'}


	Missing headers:
	set()


	Is README valid?
	True

	---

	$ python validate.py --service bpager

	# note: this would print out 'only' the bpager README...

	---

	../buzzfeed/mono/bpager/README.md

	Is README valid?
	True

	---

	./scripts/validate_readmes --service bpager --result

	1 of 2 are valid

	./scripts/validate_readmes --result

	hold tight, this can take up to 40 seconds to process every README in mono...

	19 of 1156 are valid
	# no improvement in performance as the os.walk is the bottleneck and can't be sped up

	import fnmatch
	import itertools
	import multiprocessing
	import os
	import re
	import unicodedata

	# Headers a README must contain to be reported as valid (exact string match).
	valid_headers = {
	    "## Point of Contact and Slack channel",
	    "## Running the service",
	    "## System",
	    "## Runbook",
	    "## Monitoring",
	    "## Documentation",
	}

	# Raw string: "\s" must reach the regex engine as a character class instead of
	# being treated as an (invalid) string escape sequence.
	title = re.compile(r"^\s*(#.+$)", re.MULTILINE)
	invalid = re.compile("node_modules")


	def worker(filename):
	    """Validate one README and print the verdict.

	    Args:
	        filename: Path of the README to check.

	    Returns:
	        None when the path contains "node_modules" (the file is skipped
	        without being opened); otherwise True when every entry of
	        ``valid_headers`` appears among the headers found in the file, and
	        False when at least one is absent.

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    # Filter before touching the filesystem: skipped paths are never opened.
	    if invalid.search(filename):
	        return None

	    # Decode as UTF-8 rather than the locale default; undecodable bytes are
	    # replaced so one odd file does not fail the whole pool.map call.
	    with open(filename, "r", encoding="utf-8", errors="replace") as f:
	        contents = f.read()

	    print(filename, "\n\n")

	    # NFKD folds compatibility characters (e.g. a non-breaking space) into
	    # their plain equivalents before the headers are compared.
	    contents = unicodedata.normalize("NFKD", contents)
	    matches = title.findall(contents)
	    is_valid = valid_headers.issubset(matches)

	    if matches:
	        print("Is README valid?\n", is_valid)
	    else:
	        print("no matching headers found? so will presume this README is invalid")

	    print("\n\n---\n\n")
	    return is_valid

	def _iter_readmes(top):
	    """Yield the path of every README.md below *top*, skipping node_modules."""
	    for root, dirs, files in os.walk(top):
	        # Prune in place so os.walk never descends into node_modules trees;
	        # worker() would skip anything found there anyway.
	        dirs[:] = [d for d in dirs if not invalid.search(d)]
	        for name in fnmatch.filter(files, 'README.md'):
	            yield os.path.join(root, name)


	# The guard keeps child processes that re-import this module (spawn start
	# method) from creating pools of their own.
	if __name__ == "__main__":
	    with multiprocessing.Pool(48) as pool:  # pool of 48 processes
	        # map() blocks until every README has been handled by a worker.
	        results_of_work = pool.map(worker, _iter_readmes("./"))