# urigoren/html2text.py

## html2text.py
import re
from html import unescape
# Typographic characters mapped to plain ASCII look-alikes after entity decoding.
_CHAR_REPLACEMENTS = {
    8209: ord('-'),       # U+2011 non-breaking hyphen
    ord('`'): ord("'"),
    ord('’'): ord("'"),   # U+2019 right single quotation mark
    8220: ord('"'),       # U+201C left double quotation mark
    8221: ord('"'),       # U+201D right double quotation mark
    160: ord(' '),        # U+00A0 no-break space
}
_WHITESPACE_RE = re.compile(r"\s")
# <br>, <br/>, <br /> and the closing tags of p/div/h1..h9 end a line.
_LINE_BREAK_TAG_RE = re.compile(r"<br\s*/?>|</(?:p|div|h\d)\s*>", re.IGNORECASE)
# DOTALL so that a tag may span a newline inserted by the line-break step.
_TAG_RE = re.compile(r"<.*?>", re.DOTALL)
_MULTI_SPACE_RE = re.compile(r"  +")


def _decode_text(segment):
    """Decode entities in a tag-free segment and flatten its whitespace."""
    decoded = unescape(segment).translate(_CHAR_REPLACEMENTS)
    return _WHITESPACE_RE.sub(" ", decoded)


def html2text(htm):
    """Convert an HTML string to plain text.

    Steps, in order:
      1. every whitespace character in the markup becomes a single space;
      2. ``<br>`` variants and closing ``p``/``div``/``hN`` tags become "\\n";
      3. every remaining ``<...>`` tag becomes a space;
      4. character references are decoded and a few typographic characters
         (curly quotes, no-break space, non-breaking hyphen, backtick) are
         replaced with ASCII equivalents;
      5. runs of spaces are collapsed to one space.

    Entities are decoded only *after* tags are removed, so escaped text such
    as ``&lt;b&gt;`` survives as the literal ``<b>`` instead of being
    stripped as if it were markup.

    The result is not stripped: leading/trailing spaces and newlines produced
    by the steps above are kept. Contents of elements (including script or
    style elements) are not removed, only the tags themselves.

    Args:
        htm: HTML source as a ``str``.

    Returns:
        The plain-text rendering as a ``str``.
    """
    ret = _WHITESPACE_RE.sub(" ", htm)
    ret = _LINE_BREAK_TAG_RE.sub("\n", ret)
    ret = _TAG_RE.sub(" ", ret)
    # Decode line by line: at this point the only newlines are the ones
    # inserted for line-break tags, and whitespace coming out of entities
    # (e.g. "&#10;") must still be flattened to a space without touching them.
    ret = "\n".join(_decode_text(line) for line in ret.split("\n"))
    return _MULTI_SPACE_RE.sub(" ", ret)

if __name__ == "__main__":
    # Script entry point: convert 1.html in the current directory to 1.txt.
    # The input is read as bytes and decoded leniently so that invalid UTF-8
    # sequences are dropped instead of aborting the conversion.
    with open("1.html", "rb") as f:
        htm = f.read().decode("utf8", errors="ignore")
    # Write with an explicit encoding: the platform default may be unable to
    # represent characters that were decoded from the UTF-8 input.
    with open("1.txt", "w", encoding="utf8") as f:
        f.write(html2text(htm))
	import re
	from html import unescape
	def html2text(htm):
	ret = unescape(htm)
	ret = ret.translate({
	8209: ord('-'),
	ord('`'): ord("'"),
	ord('’'): ord("'"),
	8220: ord('"'),
	8221: ord('"'),
	160: ord(' '),
	})
	ret = re.sub(r"\s", " ", ret, flags = re.MULTILINE)
	ret = re.sub("<br>\|<br />\|</p>\|</div>\|</h\d>", "\n", ret, flags = re.IGNORECASE)
	ret = re.sub('<.*?>', ' ', ret, flags=re.DOTALL \| re.MULTILINE)
	ret = re.sub(r" +", " ", ret)
	return ret

	if __name__=="__main__":
	with open("1.html", "rb") as f:
	htm=f.read().decode("utf8", errors="ignore")
	with open("1.txt", "w") as f:
	f.write(html2text(htm))