Skip to content

Instantly share code, notes, and snippets.

@yaymukund
Last active October 4, 2021 13:18
Show Gist options
  • Save yaymukund/4613ecf344f2c710509767a7022517e8 to your computer and use it in GitHub Desktop.
Save yaymukund/4613ecf344f2c710509767a7022517e8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from bs4 import BeautifulSoup
import sys
# The HTML file to check is given as the first command-line argument.
# Exit with a usage message (status 1) instead of an IndexError traceback
# when it is missing.
if len(sys.argv) < 2:
    sys.exit(f'usage: {sys.argv[0]} FILE')
filename = sys.argv[1]

# Open in binary mode so BeautifulSoup detects the document's encoding itself
# rather than decoding with the platform's default text encoding.
with open(filename, 'rb') as fp:
    soup = BeautifulSoup(fp, 'html.parser')
def has_tag(name):
    """Return True when the parsed document contains at least one `name` tag."""
    first_match = soup.find(name)
    return first_match is not None
# Report when every heading level from <h1> through <h6> occurs in the document.
if all(has_tag(f'h{level}') for level in range(1, 7)):
    print(f'{filename}: found all header tags')

# Report when the document contains more than one <h1>.
h1_tags = soup.find_all('h1')
if len(h1_tags) > 1:
    print(f'{filename}: found multiple <h1>')
# Check that no heading is nested inside another heading (for example an
# <h3> descending from an <h2>). Every parent/child combination of
# h1..h6 is checked, so deeper-level children and <h6> parents are covered too.
# The script exits with status 1 on the first violation found.
heading_levels = range(1, 7)
for parent_index in heading_levels:
    # The set of parents does not depend on the child level; look it up once.
    parents = soup.find_all(f'h{parent_index}')
    for child_index in heading_levels:
        child_name = f'h{child_index}'
        for parent in parents:
            # Tag.find searches descendants only, so a match is a nested heading.
            if parent.find(child_name) is not None:
                print(f'{filename}: found h{parent_index} > h{child_index}')
                sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment