# yakovsh/2009_02_09-cleanup2.pl

## 2009_02_09-cleanup2.pl
# Here is another way to clean up bad HTML with Perl and convert it to XML.
# This approach relies on the HTML::TreeBuilder module for parsing and on
# the HTML::DOMbo module for the actual conversion between HTML and XML.

use Encode qw(encode);
use HTML::DOMbo;
use HTML::TreeBuilder;
use XML::LibXML;

# Raw HTML to clean up.  Left empty here as a placeholder; presumably the
# reader fills this in with the markup to convert.
my $html_code = '';

# Parse HTML.  parse() only feeds text to the parser, so eof() must follow
# to tell HTML::TreeBuilder that the document is complete.
# NOTE(review): parse() normally returns the parser object itself, so
# $xml_source is expected to be the same tree as $builder -- confirm.
my $builder = HTML::TreeBuilder->new();
my $xml_source = $builder->parse($html_code);
$builder->eof();

# Convert to XML DOM (conversion provided through HTML::DOMbo).
my $xml_source1 = $xml_source->to_XML_DOM;

# Extract XML and encode UTF-8.  The DOM object is serialized with
# toString() first, because encode() operates on a string of characters,
# not on a DOM object.
# NOTE(review): assumes to_XML_DOM returns an XML::DOM node that supports
# toString() -- confirm against the installed HTML::DOMbo.
my $xml_source2 = encode("utf-8", $xml_source1->toString);
	# Here is another way to clean up bad HTML with Perl and convert it to XML.
	# This approach relies on the HTML::TreeBuilder module for parsing and on
	# the HTML::DOMbo module for the actual conversion between HTML and XML.

	use HTML::DOMbo;
	use HTML::TreeBuilder;
	use XML::LibXML;

	# Raw HTML to clean up.  Left empty here as a placeholder; presumably the
	# reader fills this in with the markup to convert.
	my $html_code = '';

	# Parse HTML.  parse() only feeds text to the parser, so eof() must follow
	# to tell HTML::TreeBuilder that the document is complete.
	# NOTE(review): parse() normally returns the parser object itself, so
	# $xml_source is expected to be the same tree as $builder -- confirm.
	my $builder = HTML::TreeBuilder->new();
	my $xml_source = $builder->parse($html_code);
	$builder->eof();

	# Convert to XML DOM (conversion provided through HTML::DOMbo).
	my $xml_source1 = $xml_source->to_XML_DOM;

	# Extract XML and encode UTF-8.  The DOM object is serialized with
	# toString() first, because encode() operates on a string of characters,
	# not on a DOM object.
	# NOTE(review): assumes to_XML_DOM returns an XML::DOM node that supports
	# toString() -- confirm against the installed HTML::DOMbo.
	my $xml_source2 = encode("utf-8", $xml_source1->toString);