@woshichuanqilz
Created July 2, 2024 21:54
Get the text content from HTML files.

import os
from bs4 import BeautifulSoup
from multiprocessing import Pool


def extract_html_content(file_path):
    """Read an HTML file and return its plain-text content."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'html.parser')
    main_content = soup.get_text()
    return main_content


def merge_html_content(html_content, output_file):
    """Write the extracted text of every file into one output file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for content in html_content:
            lines = content.split('\n')
            filtered_lines = [line for line in lines if line.strip()]  # Exclude empty lines
            f.write('\n'.join(filtered_lines) + '\n')


def process_html_files(folder_path):
    """Recursively collect the paths of all .html files under folder_path."""
    html_files = []
    for root, dirs, files in os.walk(folder_path):
        html_files.extend([os.path.join(root, file) for file in files if file.endswith(".html")])
    return html_files


if __name__ == '__main__':
    folder_path = '.'
    html_files = process_html_files(folder_path)
    # Parse the files in parallel; HTML parsing is CPU-bound, so a process pool helps.
    with Pool() as pool:
        html_content = pool.map(extract_html_content, html_files)
    output_file = 'merged_html_content.txt'
    merge_html_content(html_content, output_file)
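
If text from adjacent tags runs together in the merged output, BeautifulSoup's get_text also accepts separator and strip arguments. A minimal sketch of that variant, assuming the same UTF-8 input files (the function name extract_html_content_spaced is illustrative, not part of the original script):

from bs4 import BeautifulSoup

def extract_html_content_spaced(file_path):
    # Variant: put a newline between text from adjacent tags and trim edge whitespace.
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    return soup.get_text(separator='\n', strip=True)

Swapping this in for extract_html_content keeps the rest of the script unchanged, since merge_html_content only expects a string per file.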