Skip to content

Instantly share code, notes, and snippets.

@peterbe
Created October 21, 2019 21:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peterbe/f8acd5233efa5f18732cd0a0a9a648ba to your computer and use it in GitHub Desktop.
Save peterbe/f8acd5233efa5f18732cd0a0a9a648ba to your computer and use it in GitHub Desktop.
import os
import sys
import json
from pathlib import Path
import re
from functools import lru_cache
from pyquery import PyQuery
# One entry per `id` attribute examined by test(): the boolean verdict that
# is_valid() returned for it. An ID that occurs several times is recorded
# once per occurrence.
results = []
# Paths appended by test(); run() reports len(tests) as the number of
# analyzed files.
tests = []
# See https://codepen.io/peterbe/pen/xxxgrqw?editors=1011
# Matches any single character that is NOT a word character, ".", "-" or "_".
not_safe = re.compile(r"[^\w\.\-_]")
# Matches when the very first character is a word character.
alpha_start = re.compile(r"^\w")


@lru_cache(maxsize=None)
def is_valid(id):
    """Return True if *id* passes this script's "safe ID" check.

    An ID is rejected when any of the following holds:

    * it is empty (or otherwise falsy),
    * its first character is a digit according to ``str.isdigit``,
    * its first character is not matched by ``\\w``,
    * it contains a character other than ``\\w``, ``.``, ``-`` or ``_``.

    ``\\w`` is Unicode-aware for ``str`` patterns, so non-ASCII letters are
    accepted. Results are memoized without a size bound, so every distinct
    ID is evaluated only once per process.

    The parameter is named ``id`` (shadowing the builtin) to keep the
    existing call signature intact.
    """
    if not id:
        return False
    if id[0].isdigit():
        return False
    # match()/search() stop at the first hit; there is no need to collect
    # every match just to test whether one exists.
    if alpha_start.match(id) is None:
        return False
    return not_safe.search(id) is None
# Invalid IDs that have already been printed, so each distinct one is
# reported only once across all files.
mentions = set()


def test(path):
    """Validate every ``id`` attribute found in one JSON document file.

    The file at *path* is parsed as JSON. If it has a truthy
    ``"documentData"`` entry, that entry's ``"bodyHTML"`` string is parsed
    with PyQuery and each element carrying an ``id`` attribute is checked
    with ``is_valid``. Every verdict is appended to ``results``; each
    distinct invalid ID is printed (as its ``repr``) the first time it is
    seen. Files without ``"documentData"`` are ignored.

    Raises whatever ``open``/``json.load`` raise for unreadable or
    non-JSON files, and ``KeyError`` if ``"bodyHTML"`` is missing.
    """
    # JSON text is UTF-8; don't depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    document = data.get("documentData")
    if not document:
        return

    doc = PyQuery(document["bodyHTML"])
    for element in doc("[id]"):
        element_id = element.attrib["id"]
        if not element_id:
            # An empty ID is still passed to is_valid() below (which rejects
            # it), so it is counted among the invalid IDs as well.
            print("empty ID!", path)
        result = is_valid(element_id)
        results.append(result)
        if not result and element_id not in mentions:
            print(repr(element_id))
            mentions.add(element_id)

    # NOTE(review): the original indentation was lost; this assumes only
    # files that contain documentData count as "analyzed" - confirm.
    tests.append(path)
def walk(root):
    """Recursively visit *root*, passing every non-empty file to ``test``.

    Directories are descended into depth-first in the order produced by
    ``Path.iterdir``; entries whose size is zero bytes are skipped.
    """
    for entry in root.iterdir():
        if entry.is_dir():
            walk(entry)
        elif entry.stat().st_size:
            test(entry)
def run(root):
    """Analyze every file below *root* and print a summary of ID validity.

    Walks the directory tree with ``walk`` and then reports the number of
    files recorded in ``tests`` and how many of the verdicts collected in
    ``results`` are valid versus not valid. The percentage line is printed
    only when at least one ID was collected.

    Raises AssertionError if *root* is not a directory.
    """
    assert root.is_dir(), root
    walk(root)
    print(f"Analyzed {len(tests):,} files.")

    valid = sum(1 for ok in results if ok)
    not_valid = len(results) - valid
    print(f"{not_valid:,} NOT valid IDs, {valid:,} valid.")

    # With no IDs at all the ratio is undefined; skip it rather than crash
    # with ZeroDivisionError.
    total = valid + not_valid
    if total:
        print(f"{100 * not_valid / total:.1f}% are not valid")
if __name__ == "__main__":
    # Exit with a usage message instead of an IndexError traceback when the
    # root directory argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} ROOT_DIRECTORY")
    run(Path(sys.argv[1]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment