Skip to content

Instantly share code, notes, and snippets.


gugray/ Secret

Created May 19, 2017
What would you like to do?
import os
from bs4 import BeautifulSoup
def html2txt(fin, fout):
    """Extract the text content of an HTML file and write it to a text file.

    Args:
        fin: Path of the HTML file to read; decoded as UTF-8.
        fout: Path of the text file to write; encoded as UTF-8. The file is
            opened in "w" mode, so existing content is replaced.
    """
    with open(fin, "r", encoding="utf8") as fi:
        html = fi.read()
    soup = BeautifulSoup(html, "html.parser")
    # Rip out scripts and style elements before extracting the text
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    with open(fout, "w", encoding="utf8") as fo:
        fo.write(text)
# Convert every .html file in ./html into a text file in ./txt.
# Create the output directory up front: opening a file for writing does not
# create missing parent directories and would raise FileNotFoundError.
os.makedirs("./txt", exist_ok=True)
for filename in os.listdir("./html"):
    if not filename.endswith(".html"):
        continue
    # The output name keeps the full source name: page.html -> page.html.txt
    html2txt("./html/" + filename, "./txt/" + filename + ".txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment