Instantly share code, notes, and snippets.

@gugray /html2txt.py Secret
Created May 19, 2017

Embed
What would you like to do?
import os
from bs4 import BeautifulSoup
def html2txt(fin, fout):
    """Extract the visible text of an HTML file and save it as plain text.

    Args:
        fin: Path of the HTML file to read (decoded as UTF-8).
        fout: Path of the text file to write (encoded as UTF-8); an
            existing file at this path is overwritten.
    """
    with open(fin, "r", encoding="utf8") as source:
        markup = source.read()

    document = BeautifulSoup(markup, "html.parser")

    # Detach <script> and <style> elements first so their contents
    # (code and CSS rules) do not end up in the extracted text.
    for node in document.find_all(["script", "style"]):
        node.extract()

    plain_text = document.get_text()

    with open(fout, "w", encoding="utf8") as sink:
        sink.write(plain_text)
# Batch-convert every ".html" file found directly in ./html into a text
# file in ./txt.
#
# Make sure the output directory exists before writing: opening a file in
# "w" mode does not create missing parent directories and would otherwise
# fail with FileNotFoundError on the first file.
os.makedirs("./txt", exist_ok=True)
for filename in os.listdir("./html"):
    # Skip anything that is not an HTML file.
    if not filename.endswith(".html"):
        continue
    # The output name keeps the full input name and appends ".txt"
    # (e.g. "page.html" -> "page.html.txt").
    html2txt("./html/" + filename, "./txt/" + filename + ".txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment