numeroteca/kiosko-newspaper-names

## kiosko-newspaper-names
#to use it write "$ ./parse_kiosko.pl <lang>" where 'lang' is the language: en, es or fr
#it will output a csv. see example at http://brownbag.me:9001/p/pageonex-kiosko-newspaper-names

#!/usr/bin/env perl

use strict;
use warnings;
use utf8;
use LWP::UserAgent;
use HTML::TreeBuilder 5 -weak;
use URI;

use constant LANG => qw(en es fr);

# No stdout buffering
$|++;

# Force UTF8 output
binmode(STDOUT, ":utf8");

# Define which kiosko root page we're taking
@ARGV == 1 or usage();
my ($lang) = @ARGV;
grep { $_ eq $lang } (LANG) or usage();

my $kiosko_url = "http://$lang.kiosko.net";

# Regexps to get countries
my %regexp = (
    es => q|Periódicos de (.+?)\. Toda la prensa de hoy|,
    en => q|Newspapers in (.+?)\. Today's press covers|,
    fr => q|Les Unes des journaux de (.+?)\. Toute la presse d'aujourd'hui|,
);

# top page
my $tree = HTML::TreeBuilder->new_from_url($kiosko_url);

# Get HTML::Entities from top page
# Look for <a> tags with
# - a defined href
# - a title Periodicos
my @a_tags = $tree->look_down(
    '_tag', 'a',
    sub {
        defined $_[0]->attr('href')  &&
        defined $_[0]->attr('title') && $_[0]->attr('title') =~ /^Periodicos de/
    },
);

# Get a sorted unique list of country codes
# Exclude geo links as we will get it afterwards looking into the country pages
my %uniq;
my @country_pages =
    grep { ! $uniq{$_}++ }
        sort grep { ! m:/geo/: } map { $_->attr('href') } @a_tags;

# Header
print "Country, Country code, Newspaper name, Newspaper code\n";

for my $country_page (@country_pages) {
    my $p_uri = URI->new_abs($country_page, $kiosko_url);
    my $p_tree = HTML::TreeBuilder->new_from_url($p_uri->as_string);

    # Get title of this page to have the country
    # Title of pages have the format i.e
    # "Periódicos de Argentina. Toda la prensa de hoy. Kiosko.net"
    # "Periódicos de R. Dominicana. Toda la prensa de hoy. Kiosko.net"
    my ($p_title_tag) = $p_tree->look_down('_tag', 'title');

    # Cannot do it this way due to Dominican Republic :(
    #my ($country) = (split /\s/, (split /\./, $p_title_tag->as_text)[0])[-1];

    my ($country) = ($p_title_tag->as_text =~ qr|$regexp{$lang}|) ? $1 : $p_title_tag->as_text;

    my $gen_uri = URI->new_abs('general.html', $p_uri);
    my $g_tree = HTML::TreeBuilder->new_from_url($gen_uri->as_string);

    # Links of the newspapers are in the a tags having
    # class=thcover and the attribute href defined
    my @gen_a_tags = $g_tree->look_down(
        '_tag', 'a',
        sub {
            defined $_[0]->attr('class') &&
            $_[0]->attr('class') eq 'thcover' &&
            defined $_[0]->attr('href')
        }

    );

    for my $a (@gen_a_tags) {
        my $n_uri = URI->new_abs($a->attr('href'), $kiosko_url);
        my $n_tree = HTML::TreeBuilder->new_from_url($n_uri->as_string);
        my ($n_title_tag) = $n_tree->look_down('_tag', 'title');

        # Get newspaper name
        # Newspaper pages names have the format, i.e
        # "Periódico Diario Libre (R. Dominicana). Periódicos de R. Dominicana. Toda la prensa de hoy. Kiosko.net"
        my $n_name = $n_title_tag->as_text;
        $n_name =~ s/ \(.+//;
        $n_name =~ s/[^\s]+\s//;

        # Get newspaper code
        # newspaper links have the format: /fr/np/presseocean.html
        my ($n_code) = ( split /\//, $a->attr('href') )[-1];
        $n_code =~ s/\.html//;

        # Remove slashes from country_page
        $country_page =~ s:/::g;

        print join(',', $country, $country_page, $n_name, $n_code), "\n";
    }
}

sub usage {
    die sprintf "Usage: $0 <%s>\n", join '|', (LANG);
}
	#to use it write "$ ./parse_kiosko.pl <lang>" where 'lang' is the language: en, es or fr
	#it will output a csv. see example at http://brownbag.me:9001/p/pageonex-kiosko-newspaper-names

	#!/usr/bin/env perl

	use strict;
	use warnings;
	use utf8;
	use LWP::UserAgent;
	use HTML::TreeBuilder 5 -weak;
	use URI;

	use constant LANG => qw(en es fr);

	# No stdout buffering
	$\|++;

	# Force UTF8 output
	binmode(STDOUT, ":utf8");

	# Define which kiosko root page we're taking
	@ARGV == 1 or usage();
	my ($lang) = @ARGV;
	grep { $_ eq $lang } (LANG) or usage();

	my $kiosko_url = "http://$lang.kiosko.net";

	# Regexps to get countries
	my %regexp = (
	es => q\|Periódicos de (.+?)\. Toda la prensa de hoy\|,
	en => q\|Newspapers in (.+?)\. Today's press covers\|,
	fr => q\|Les Unes des journaux de (.+?)\. Toute la presse d'aujourd'hui\|,
	);

	# top page
	my $tree = HTML::TreeBuilder->new_from_url($kiosko_url);

	# Get HTML::Entities from top page
	# Look for <a> tags with
	# - a defined href
	# - a title Periodicos
	my @a_tags = $tree->look_down(
	'_tag', 'a',
	sub {
	defined $_[0]->attr('href') &&
	defined $_[0]->attr('title') && $_[0]->attr('title') =~ /^Periodicos de/
	},
	);

	# Get a sorted unique list of country codes
	# Exclude geo links as we will get it afterwards looking into the country pages
	my %uniq;
	my @country_pages =
	grep { ! $uniq{$_}++ }
	sort grep { ! m:/geo/: } map { $_->attr('href') } @a_tags;

	# Header
	print "Country, Country code, Newspaper name, Newspaper code\n";

	for my $country_page (@country_pages) {
	my $p_uri = URI->new_abs($country_page, $kiosko_url);
	my $p_tree = HTML::TreeBuilder->new_from_url($p_uri->as_string);

	# Get title of this page to have the country
	# Title of pages have the format i.e
	# "Periódicos de Argentina. Toda la prensa de hoy. Kiosko.net"
	# "Periódicos de R. Dominicana. Toda la prensa de hoy. Kiosko.net"
	my ($p_title_tag) = $p_tree->look_down('_tag', 'title');

	# Cannot do it this way due to Dominican Republic :(
	#my ($country) = (split /\s/, (split /\./, $p_title_tag->as_text)[0])[-1];

	my ($country) = ($p_title_tag->as_text =~ qr\|$regexp{$lang}\|) ? $1 : $p_title_tag->as_text;

	my $gen_uri = URI->new_abs('general.html', $p_uri);
	my $g_tree = HTML::TreeBuilder->new_from_url($gen_uri->as_string);

	# Links of the newspapers are in the a tags having
	# class=thcover and the attribute href defined
	my @gen_a_tags = $g_tree->look_down(
	'_tag', 'a',
	sub {
	defined $_[0]->attr('class') &&
	$_[0]->attr('class') eq 'thcover' &&
	defined $_[0]->attr('href')
	}

	);

	for my $a (@gen_a_tags) {
	my $n_uri = URI->new_abs($a->attr('href'), $kiosko_url);
	my $n_tree = HTML::TreeBuilder->new_from_url($n_uri->as_string);
	my ($n_title_tag) = $n_tree->look_down('_tag', 'title');

	# Get newspaper name
	# Newspaper pages names have the format, i.e
	# "Periódico Diario Libre (R. Dominicana). Periódicos de R. Dominicana. Toda la prensa de hoy. Kiosko.net"
	my $n_name = $n_title_tag->as_text;
	$n_name =~ s/ \(.+//;
	$n_name =~ s/[^\s]+\s//;

	# Get newspaper code
	# newspaper links have the format: /fr/np/presseocean.html
	my ($n_code) = ( split /\//, $a->attr('href') )[-1];
	$n_code =~ s/\.html//;

	# Remove slashes from country_page
	$country_page =~ s:/::g;

	print join(',', $country, $country_page, $n_name, $n_code), "\n";
	}
	}

	sub usage {
	die sprintf "Usage: $0 <%s>\n", join '\|', (LANG);
	}