import os
from pathlib import Path

from bs4 import BeautifulSoup
def html2txt(fin, fout) -> None:
    """Convert one HTML file to plain text.

    Reads ``fin`` as UTF-8, parses it with BeautifulSoup's ``html.parser``,
    removes every ``<script>`` and ``<style>`` element, and writes the
    remaining document text to ``fout`` as UTF-8.

    Args:
        fin: Path of the HTML file to read.
        fout: Path of the text file to write; it is opened in ``"w"`` mode,
            so existing content is replaced.
    """
    with open(fin, "r", encoding="utf-8") as fi:
        html = fi.read()

    soup = BeautifulSoup(html, "html.parser")

    # Script and style bodies are not page text; drop them before extracting.
    for tag in soup(["script", "style"]):
        tag.decompose()

    text = soup.get_text()

    with open(fout, "w", encoding="utf-8") as fo:
        fo.write(text)
# Convert every "*.html" file found in ./html into a text file in ./txt.
html_dir = Path("./html")
txt_dir = Path("./txt")

# open(..., "w") does not create missing parent directories, so make sure
# the output directory exists before the first file is written.
txt_dir.mkdir(parents=True, exist_ok=True)

# Sorted so files are processed in a deterministic order.
for filename in sorted(os.listdir(html_dir)):
    if not filename.endswith(".html"):
        continue
    # The output keeps the full source name and appends ".txt"
    # (e.g. page.html -> page.html.txt).
    html2txt(html_dir / filename, txt_dir / (filename + ".txt"))
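# NOTE: the text below is GitHub page footer captured with the snippet; it is not part of the script.
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment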