@woshichuanqilz
Created July 2, 2024 21:54
Get the text content from HTML files.

import os
from bs4 import BeautifulSoup
from multiprocessing import Pool


def extract_html_content(file_path):
    """Read an HTML file and return its plain-text content."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'html.parser')
    main_content = soup.get_text()
    return main_content


def merge_html_content(html_content, output_file):
    """Write the extracted text of every file into one output file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for content in html_content:
            lines = content.split('\n')
            filtered_lines = [line for line in lines if line.strip()]  # Exclude empty lines
            f.write('\n'.join(filtered_lines) + '\n')


def process_html_files(folder_path):
    """Recursively collect the paths of all .html files under folder_path."""
    html_files = []
    for root, dirs, files in os.walk(folder_path):
        html_files.extend([os.path.join(root, file) for file in files if file.endswith(".html")])
    return html_files


if __name__ == '__main__':
    folder_path = '.'
    html_files = process_html_files(folder_path)
    # Parse the files in parallel; HTML parsing is CPU-bound, so a process pool helps.
    with Pool() as pool:
        html_content = pool.map(extract_html_content, html_files)
    output_file = 'merged_html_content.txt'
    merge_html_content(html_content, output_file)
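
If text from adjacent tags runs together in the merged output, BeautifulSoup's get_text also accepts separator and strip arguments. A minimal sketch of that variant, assuming the same UTF-8 input files (the function name extract_html_content_spaced is illustrative, not part of the original script):

from bs4 import BeautifulSoup

def extract_html_content_spaced(file_path):
    # Variant: put a newline between text from adjacent tags and trim edge whitespace.
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    return soup.get_text(separator='\n', strip=True)

Swapping this in for extract_html_content keeps the rest of the script unchanged, since merge_html_content only expects a string per file.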