# FirefighterBlu3/bs4-no-html-translations.py

## bs4-no-html-translations.py
import bs4.dammit
import bs4.builder._htmlparser
from bs4 import BeautifulSoup

# when analyzing spam, we always run into situations where the sender tries
# hard to obfuscate their input in order to sneak by spam detecting engines.
# instead of "ABC", they'll use HTML entity references: &#65;&#66;&#67;.
# in order to extract the viewer-readable segments, we need a parser.  most
# parsers try to be smart and clean things up.  BeautifulSoup and lxml are
# two common parsers.  lxml is largely compiled C so we can't tweak it very
# easily.

# step #1
# BeautifulSoup insists on always doing entity substitution and there's no
# way to politely tell it to fuck off.  override the hex->int and
# word->symbol conversions, simply append our data to the growing stack
# Keep a reference to the stock text handler; the replacement reference
# handlers below hand it the reference text re-wrapped in its '&...;' form
# instead of the decoded character.
_handle_data = bs4.builder._htmlparser.BeautifulSoupHTMLParser.handle_data


def _keep_charref(parser, name):
    """Append a numeric character reference to the tree as literal text.

    ``parser`` is the BeautifulSoupHTMLParser instance and ``name`` is the
    reference body without the leading '&#' and trailing ';'.
    """
    return _handle_data(parser, '&#' + name + ';')


def _keep_entityref(parser, name):
    """Append a named entity reference to the tree as literal text.

    ``parser`` is the BeautifulSoupHTMLParser instance and ``name`` is the
    entity name without the leading '&' and trailing ';'.
    """
    return _handle_data(parser, '&' + name + ';')


bs4.builder._htmlparser.BeautifulSoupHTMLParser.handle_charref = _keep_charref
bs4.builder._htmlparser.BeautifulSoupHTMLParser.handle_entityref = _keep_entityref

# step #2
# BeautifulSoup insists on further ensuring printed data is always tidy and
# semantically correct, thus it ALWAYS does entity substitution even after
# we refused to do it above.  the below ensures the __str__ methods don't
# attempt to mangle the serialized data.  this simply returns the original
# matched input when the substitution methods are called
def _leave_entity_alone(match):
    """Return the text of a regex match unchanged (no entity substitution)."""
    return match.group(0)


# Both substitution hooks get the same pass-through, so whatever text the
# substitution regexes match is written back exactly as it was found.
bs4.dammit.EntitySubstitution._substitute_html_entity = _leave_entity_alone
bs4.dammit.EntitySubstitution._substitute_xml_entity = _leave_entity_alone

# Demo / self-check: with the overrides above in place, both a numeric
# character reference and a named entity must survive a parse -> serialize
# round trip exactly as written.
#
# NOTE: a second, tab-indented copy of this whole script used to follow the
# assert.  It made the file unparseable (IndentationError) and its literals
# had been entity-decoded, so it no longer tested anything; it was removed.
body = '<html><body><p>&#1234;&rsquo;</p></body></html>'
soup = BeautifulSoup(body, 'html.parser')
assert str(soup.find('p')) == '<p>&#1234;&rsquo;</p>'