Skip to content

Instantly share code, notes, and snippets.

@Zemke

Zemke/beautiful-soup.py

Last active Mar 30, 2018
Embed
What would you like to do?
Takes an HTML file, or a directory that is searched recursively for HTML files, and associates input fields with their labels by adding matching `id` and `for` attributes.
import os
import re
import sys
from typing import IO
from bs4 import BeautifulSoup
from bs4.element import Tag
# Path to process: the single command-line argument when exactly one is
# given, otherwise the current working directory.
argv_path = sys.argv[1] if len(sys.argv) == 2 else os.getcwd()
class Replacement:
    """Pair an input-like tag with the label that should point at it.

    Attributes:
        input: The tag that carries ``ng-model`` and ``name``.
        label: The label tag paired with ``input``.
        id: The value used for the input's ``id`` and the label's ``for``:
            the input's existing non-empty ``id`` if present, else its
            ``name``.
    """

    input: "Tag" = None
    label: "Tag" = None

    def __init__(self, input_tag: "Tag", label_tag: "Tag"):
        self.input = input_tag
        self.label = label_tag
        self.id = input_tag.get('id') or input_tag.get('name')

    @staticmethod
    def find_matching_label(input_tag: "Tag", tags=None):
        """Return the label that directly precedes ``input_tag``, if any.

        Args:
            input_tag: The tag to find a label for.
            tags: Ordered sequence of label and input tags to search.
                Defaults to ``Main.all_label_and_input_tags``.

        Returns:
            The element immediately before ``input_tag`` in ``tags`` when
            that element is a ``label``; otherwise ``None``. ``None`` is
            also returned when ``input_tag`` is the first element or is
            not in ``tags`` at all.
        """
        if tags is None:
            tags = Main.all_label_and_input_tags

        # Locate the tag by identity rather than equality so that two tags
        # which merely compare equal are never confused with one another.
        position = next(
            (index for index, candidate in enumerate(tags) if candidate is input_tag),
            None,
        )
        # Position 0 has no predecessor; indexing with -1 would wrap around
        # to the last tag and could pair the input with an unrelated label.
        if position is None or position == 0:
            return None

        assumed_label = tags[position - 1]
        if assumed_label.name != 'label':
            return None
        return assumed_label

    @staticmethod
    def has_ng_model_and_name(tag: "Tag") -> bool:
        """Return True when ``tag`` has both ``ng-model`` and ``name``."""
        return tag.has_attr('ng-model') and tag.has_attr('name')

    def __str__(self) -> str:
        """Describe the pair as ``tag[ng-model=..., name=..., label=...]``.

        The label part is the string form of the label's first child, or
        empty when the label has no children.
        """
        contents = self.label.contents
        label_text = str(contents[0]) if contents else ''
        return (
            f"{self.input.name}["
            f"ng-model={self.input.get('ng-model')}, "
            f"name={self.input.get('name')}, "
            f"label={label_text}]"
        )
class Main:
    """Add missing ``id`` / ``for`` attributes to one HTML file.

    Instantiating the class does all the work: it reads the file, pairs
    every tag that has ``ng-model`` and ``name`` with the label directly
    before it, patches the raw text with regular expressions and, if the
    text changed, replaces the file on disk.
    """

    # Labels and ng-model/name tags of the most recently processed file, in
    # the order the soup returned them. Replacement.find_matching_label
    # reads this list.
    all_label_and_input_tags = []

    def __init__(self, file: IO, soup: "BeautifulSoup"):
        """Process ``file`` using the already parsed ``soup``.

        Args:
            file: Open text file positioned at its start; ``file.name`` is
                the path that gets replaced when changes were made.
            soup: Parsed document of the same file.
        """
        original = file.read()
        content = original
        Main.all_label_and_input_tags = soup(['label', Replacement.has_ng_model_and_name])

        replacements = []
        for tag in Main.all_label_and_input_tags:
            if tag.name == 'label':
                continue
            matching_label = Replacement.find_matching_label(tag)
            if matching_label is not None:
                replacements.append(Replacement(tag, matching_label))

        for replacement in replacements:
            # Tags whose name starts with 'pkp' are left untouched
            # (presumably project-specific elements - confirm with author).
            if replacement.input.name.startswith('pkp'):
                continue
            # An input nested inside a label is skipped.
            if replacement.input.parent.name == 'label':
                continue
            # Without a usable value we would only emit id="" / for="".
            if not replacement.id:
                continue
            if not replacement.input.has_attr('id'):
                content = Main.add_id_to_input_tag(content, replacement)
            if not replacement.label.has_attr('for'):
                content = Main.add_for_to_label_tag(content, replacement)

        # Only touch the file when a substitution actually changed the text.
        if content != original:
            out_path = file.name + '.out'
            # Write with the encoding the input was read with, when known.
            with open(out_path, 'w', encoding=getattr(file, 'encoding', None)) as out_file:
                out_file.write(content)
            os.replace(out_path, file.name)

    @staticmethod
    def add_id_to_input_tag(content: str, replacement: "Replacement") -> str:
        """Return ``content`` with an ``id`` added to the matching tag(s).

        A tag matches when it has the input's tag name and contains both
        ``name="..."`` and ``ng-model="..."`` (double-quoted) with the
        input's values, in any order. The ``id`` is appended as the last
        attribute and a trailing ``/`` of a self-closing tag is kept.
        """
        tag = replacement.input
        pattern = (
            r'(<'
            + re.escape(tag.name)
            + r'[^>]*?(?=[^>]*name="'
            + re.escape(tag.get('name'))
            + r'")(?=[^>]*ng-model="'
            + re.escape(tag.get('ng-model'))
            + r'")[^>]*?)(/?)>'
        )

        def with_id(match):
            # Built in a function so the id is inserted literally; in a
            # replacement template its backslashes would be escapes.
            return '{} id="{}"{}>'.format(match.group(1), replacement.id, match.group(2))

        return re.sub(pattern, with_id, content)

    @staticmethod
    def add_for_to_label_tag(content: str, replacement: "Replacement") -> str:
        """Return ``content`` with a ``for`` added to the matching label(s).

        The label is located in the raw text by its contents: every
        space-separated fragment of the label's children (with ``>`` and
        newlines treated as spaces) must occur before the closing
        ``</label>``. Labels that already have a ``for`` attribute are not
        modified.

        NOTE(review): a label whose text merely contains all fragments
        (e.g. "First Name" when looking for "Name") matches as well.
        """
        joined = " ".join(
            str(child).replace('>', ' ').replace('\n', ' ')
            for child in replacement.label.contents
        )
        # Empty fragments would only add lookaheads that always succeed.
        fragments = [fragment for fragment in joined.split(" ") if fragment]

        # The negative lookahead skips labels that already carry `for`.
        pattern = r"<label(?![^>]*\sfor\s*=)([^>]*?>"
        for fragment in fragments:
            pattern += r"(?=(?:(?!</label>).)*{})".format(re.escape(fragment))
        pattern += r".*?</label>)"

        def with_for(match):
            # Insert the id literally (no template escape processing).
            return '<label for="{}"{}'.format(replacement.id, match.group(1))

        return re.sub(pattern, with_for, content, flags=re.DOTALL)
def recurse(path):
    """Process every ``.html`` file below the directory ``path``.

    Sub-directories are descended into recursively; each file whose
    extension is exactly ``.html`` is handed to ``process_html_file``.
    Nothing happens when ``path`` is not a directory.
    """
    if not os.path.isdir(path):
        return

    for entry_name in os.listdir(path):
        entry_path = os.path.join(path, entry_name)
        if os.path.isdir(entry_path):
            recurse(entry_path)
            continue
        _, extension = os.path.splitext(entry_name)
        if extension == '.html':
            process_html_file(entry_path)
def process_html_file(file_path):
    """Run ``Main`` on one HTML file and print the file's path.

    Any exception raised while opening, parsing or rewriting the file is
    reported on stdout as ``ERR <path> <error>`` instead of being
    propagated, so one bad file does not abort a recursive run. The path
    itself is printed afterwards in every case.
    """
    # noinspection PyBroadException
    try:
        # Opened read/write ("r+"), so a missing or read-only file fails
        # here. A single handle serves both the parser and Main, and the
        # `with` block guarantees it is closed.
        with open(file_path, "r+") as file:
            soup = BeautifulSoup(file, "html.parser")
            # The parser consumed the stream; rewind so Main reads it again.
            file.seek(0)
            Main(file, soup)
    except Exception as err:
        print('ERR', file_path, err)
    print(file_path)
# Script entry: a single file is processed directly, a directory is
# searched recursively for HTML files.
if not os.path.isdir(argv_path):
    process_html_file(argv_path)
else:
    recurse(argv_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment