Skip to content

Instantly share code, notes, and snippets.

@urigoren
Last active July 12, 2022 01:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save urigoren/3890daf24284824834081e88c6c5f3bb to your computer and use it in GitHub Desktop.
Save urigoren/3890daf24284824834081e88c6c5f3bb to your computer and use it in GitHub Desktop.
import re
from html import unescape
# Typographic characters mapped to their plain-ASCII equivalents.
_CHAR_REPLACEMENTS = {
    0x2011: "-",    # non-breaking hyphen
    ord("`"): "'",  # backtick used as an apostrophe
    0x2018: "'",    # left single quotation mark
    0x2019: "'",    # right single quotation mark
    0x201C: '"',    # left double quotation mark
    0x201D: '"',    # right double quotation mark
    0x00A0: " ",    # non-breaking space
}


def html2text(htm: str) -> str:
    """Convert an HTML fragment to plain text using regular expressions.

    Whitespace in the markup is flattened to single spaces, ``<br>`` and the
    closing tags of paragraphs, divs and headings become newlines, every
    other tag becomes a space, character references are decoded and common
    typographic punctuation is replaced by its ASCII equivalent.

    Args:
        htm: The HTML source as text.

    Returns:
        The extracted text. Runs of spaces are collapsed to one space;
        newlines only appear where a line-breaking tag was found.
    """
    # Whitespace in HTML source carries no layout meaning, so flatten it
    # first; the only newlines in the result come from the tags below.
    text = re.sub(r"\s", " ", htm)
    text = re.sub(
        r"<br\s*/?>|</p>|</div>|</h\d>", "\n", text, flags=re.IGNORECASE
    )
    text = re.sub(r"<.*?>", " ", text, flags=re.DOTALL)

    # Decode entities only AFTER the markup is gone. Decoding first would
    # turn escaped text such as "&lt;b&gt;" into a tag that then gets
    # stripped. Work line by line so whitespace produced by an entity
    # (e.g. "&#10;") is still flattened without touching the tag newlines.
    lines = [
        re.sub(r"\s", " ", unescape(line).translate(_CHAR_REPLACEMENTS))
        for line in text.split("\n")
    ]
    return re.sub(r" +", " ", "\n".join(lines))
if __name__ == "__main__":
    # Script entry point: convert 1.html in the working directory to 1.txt.
    # Read raw bytes and decode leniently so invalid UTF-8 sequences are
    # dropped instead of aborting the conversion.
    with open("1.html", "rb") as f:
        htm = f.read().decode("utf8", errors="ignore")
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent characters from the page and would raise UnicodeEncodeError.
    with open("1.txt", "w", encoding="utf-8") as f:
        f.write(html2text(htm))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment