Skip to content

Instantly share code, notes, and snippets.

@lamons
Last active January 1, 2024 12:01
Show Gist options
  • Save lamons/a5828d6f44626d6c2138296aa39ffe74 to your computer and use it in GitHub Desktop.
Save lamons/a5828d6f44626d6c2138296aa39ffe74 to your computer and use it in GitHub Desktop.
import os
from bs4 import BeautifulSoup
def process_html_file(file_path):
    """Collect the distinct characters of one HTML file, split by tag kind.

    The file is read as UTF-8 and parsed with the ``html.parser`` backend.
    Every tag in the document is visited: text from heading tags
    (``h1``-``h6``) and ``strong`` goes into the ``'sans_content'`` set, text
    from any other tag goes into the ``'other_content'`` set.

    NOTE: ``get_text()`` on a tag includes the text of its descendants, so
    characters inside a heading are also counted for every enclosing
    non-heading tag.

    Returns:
        A dict with keys ``'sans_content'`` and ``'other_content'``, each
        mapping to a set of single-character strings.
    """
    print(f"Processing File: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()
    document = BeautifulSoup(markup, 'html.parser')

    emphasized_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong']
    sans_chars = set()
    other_chars = set()
    for element in document.find_all(True):
        # Iterating a string yields its characters, so update() adds them
        # one by one -- same result as extending a list and de-duplicating.
        bucket = sans_chars if element.name in emphasized_tags else other_chars
        bucket.update(element.get_text())

    return {
        'sans_content': sans_chars,
        'other_content': other_chars,
    }
def write_to_file(file_name, content):
    """Concatenate the strings in ``content`` and write them to ``file_name``.

    The file is created or overwritten and encoded as UTF-8. No separator is
    inserted between the pieces.
    """
    with open(file_name, mode='w', encoding='utf-8') as out_handle:
        joined_text = ''.join(content)
        out_handle.write(joined_text)
def combine_and_write_to_file(output_file_name, contents):
    """Merge character collections and write the non-ASCII ones to a file.

    Args:
        output_file_name: Path of the text file to create or overwrite.
        contents: Iterable of iterables of single-character strings
            (for example, one set of characters per processed HTML file).

    The characters are de-duplicated across all inputs, and those with a
    code point of 127 or below are dropped. The remaining glyphs are written
    in ascending code-point order: iterating a set of strings has no stable
    order between interpreter runs, so sorting keeps the output file
    reproducible.
    """
    unique_glyphs = set()
    for content in contents:
        unique_glyphs.update(content)
    # Keep only non-ASCII glyphs, in a deterministic order.
    non_ascii_glyphs = sorted(
        glyph for glyph in unique_glyphs if ord(glyph) > 127
    )
    write_to_file(output_file_name, non_ascii_glyphs)
def main():
    """Scan a directory tree for HTML files and write their glyph lists.

    Every ``.html`` / ``.htm`` file found under ``directory`` is processed
    with ``process_html_file``. The per-file character sets are then merged
    and written to ``./sans_content.txt`` (heading / strong text) and
    ``./other_content.txt`` (text of all other tags).
    """
    directory = '.'  # change this to the directory containing your HTML files
    html_suffixes = ('.html', '.htm')
    heading_sets = []
    body_sets = []
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith(html_suffixes):
                continue
            extracted = process_html_file(os.path.join(folder, name))
            heading_sets.append(extracted['sans_content'])
            body_sets.append(extracted['other_content'])
    combine_and_write_to_file('./sans_content.txt', heading_sets)
    combine_and_write_to_file('./other_content.txt', body_sets)


# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment