Skip to content

Instantly share code, notes, and snippets.

@briandfoy
Created August 29, 2015 22:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save briandfoy/60f2dc751720975843af to your computer and use it in GitHub Desktop.
Save briandfoy/60f2dc751720975843af to your computer and use it in GitHub Desktop.
#!/Users/brian/bin/perls/perl5.22.0
use strict;
use warnings;
use open qw(:std :utf8);
use v5.10;
use HTTP::Tiny;
use Devel::Peek qw(Dump);
use Encode qw(decode find_encoding);
use HTML::HeadParser;

# Fetch a page, work out which character encoding its body uses (first from
# the HTTP Content-Type header, then from the Content-Type found in the HTML
# <head>), decode the raw octets with that encoding, and print the result.

# Return the charset parameter of a Content-Type value, or nothing if the
# value is undefined or carries no charset. Whitespace after the semicolon is
# optional, the parameter name is matched case-insensitively, and optional
# quotes around the value are not included in the result.
sub _charset_from_content_type {
    my ($content_type) = @_;
    return unless defined $content_type;

    my ($charset) = $content_type =~ m{
        ; \s* charset \s* = \s*
        ["']?            # the value may be quoted
        ( [^\s;"']+ )    # stop at whitespace, a quote, or the next parameter
    }xi;

    return $charset;
}

my $ht = HTTP::Tiny->new;

# The problem URL
my $url = 'http://blogs.perl.org/users/patch/2015/07/noirin-plunkett.html';

my $response = $ht->get( $url );

# Don't try to decode an error page (or HTTP::Tiny's internal error text)
die "Could not fetch $url: $response->{status} $response->{reason}\n"
    unless $response->{success};

# blogs.perl.org doesn't return an encoding in the HTTP header
my $type_in_header = $response->{headers}{'content-type'};

# HTTP::Tiny hands back an array ref when a header is repeated; use the first
$type_in_header = $type_in_header->[0] if ref $type_in_header eq 'ARRAY';
say "Got type [" . ( $type_in_header // '' ) . "] from header";

my $charset_in_header = _charset_from_content_type( $type_in_header );
say "Charset from header is $charset_in_header" if defined $charset_in_header;

# so let's look in the <head>
my $p = HTML::HeadParser->new;
$p->parse( $response->{content} );

my $type = $p->header('Content-Type');
say "Got type [" . ( $type // '' ) . "] from HTML head";

# <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
my $charset_in_html = _charset_from_content_type( $type );
say "Charset from HTML is $charset_in_html" if defined $charset_in_html;

# create an encoding object from the first charset that Encode recognizes,
# preferring the HTTP header over the HTML <head>
my $encoding;
for my $candidate ( grep { defined } $charset_in_header, $charset_in_html ) {
    $encoding = find_encoding( $candidate );
    last if $encoding;
}
die "Could not discover encoding\n" unless $encoding;

# Now, take the octets from the raw response and decode them into
# its Perl string form
my $string = $encoding->decode( $response->{content} );

# Now it should be okay inside Perl
say $string;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment