Skip to content

Instantly share code, notes, and snippets.

@chrishanretty
Last active December 11, 2015 15:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chrishanretty/4621435 to your computer and use it in GitHub Desktop.
Save chrishanretty/4621435 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl -w
use strict;
use Switch;
binmode STDOUT, ":utf8";
use WWW::Mechanize;
use HTML::TokeParser;
use Data::Dumper;
## Walk the poll listing on sondaggipoliticoelettorali.it page by page and
## row by row, open each poll, collect its metadata and its questions and
## answers, and print one tab-separated line per answer line on STDOUT:
##   page, row, <metadata columns in @metadata_fields order>, question, answer

my $start    = 'http://www.sondaggipoliticoelettorali.it/';
my $m        = WWW::Mechanize->new();
my $maxpages = 33;

## Number of rows opened on every listing page.
my $rows_per_page = 13;

## Name prefix shared by the per-question buttons on the questions tab.
my $question_button_prefix
    = 'ctl00$Contenuto$ucGestioneDomande$ucListaDomande$dgDomande_Row';

## Ordered table: fragment of a <p> id  =>  output field name.
## The order matters twice: the first matching pattern wins, and the
## metadata columns are printed in exactly this order, so every output
## line has the same layout.  A plain table is used here rather than the
## construct provided by the Switch source filter.
my @metadata_fields = (
    [ qr/ReadOnly_Titolo/                     => 'title' ],
    [ qr/ReadOnly_Realizzatore/               => 'company' ],
    [ qr/ReadOnly_Committente/                => 'commissioner' ],
    [ qr/ReadOnly_Acquirente/                 => 'purchaser' ],
    [ qr/ReadOnly_DataRealizzazioneDa/        => 'datefrom' ],
    [ qr/ReadOnly_DataRealizzazioneA/         => 'dateto' ],
    [ qr/ReadOnly_Mezzo_Comunic_massa/        => 'outlet' ],
    [ qr/ReadOnly_Data_Pubblicazione/         => 'publicationdate' ],
    [ qr/ReadOnly_Popolazione_Riferimento/    => 'population' ],
    [ qr/ReadOnly_Estensione_Territoriale/    => 'territory' ],
    [ qr/ReadOnly_Metodo_Campionamento/       => 'sampling' ],
    [ qr/ReadOnly_Campione_Intervistati/      => 'samplesize' ],
    [ qr/ReadOnly_Rappresentativa_Campione/   => 'representativeness' ],
    [ qr/ReadOnly_Metodo_Raccolta_Informazioni/ => 'method' ],
);
my @metadata_names = map { $_->[1] } @metadata_fields;

## parse_metadata($html) -> hashref
## Scan every <p> that carries an id; when the id matches an entry of
## @metadata_fields, store the trimmed paragraph text under that entry's
## field name.  Fields that never match are simply absent from the hash.
sub parse_metadata {
    my ($html) = @_;
    my %gestione;
    my $p = HTML::TokeParser->new(\$html);
    PARA:
    while (my $para = $p->get_tag("p")) {
        my $attr = $para->[1];
        next PARA unless defined $attr && defined $attr->{id};
        for my $field (@metadata_fields) {
            my ($pattern, $name) = @{$field};
            next unless $attr->{id} =~ $pattern;
            $gestione{$name} = $p->get_trimmed_text("/p");
            next PARA;    # first match wins
        }
    }
    return \%gestione;
}

## count_questions($html) -> integer
## Return the largest number found among the values of <input> elements
## whose name contains "ListaDomande".  Only values shorter than three
## characters are considered, and non-digits are stripped before comparing.
## Returns 0 when no such input is present.
sub count_questions {
    my ($html) = @_;
    my $numero_domande = 0;
    my $p = HTML::TokeParser->new(\$html);
    while (my $input = $p->get_tag("input")) {
        my $attr = $input->[1];
        next unless defined $attr
            && defined $attr->{value}
            && defined $attr->{name};
        next unless $attr->{name} =~ /ListaDomande/;
        my $altval = $attr->{value};
        next unless length($altval) < 3;
        $altval =~ s/\D//g;
        next unless length($altval) > 0;
        $numero_domande = $altval if $altval > $numero_domande;
    }
    return $numero_domande;
}

## parse_questions($html) -> list of [ question, answer ] pairs
## For every <p> whose id marks a question, the paragraph text is the
## question and the text of the following <p> is the answer (tabs turned
## into spaces).  When that answer holds no word character, the answer is
## rebuilt from the next table instead: a newline per <tr>, and a space
## plus the trimmed cell text per <td>.
sub parse_questions {
    my ($html) = @_;
    my @pairs;
    my $p = HTML::TokeParser->new(\$html);
    while (my $para = $p->get_tag("p")) {
        my $attr = $para->[1];
        next unless defined $attr && defined $attr->{id};
        next
            unless $attr->{id}
            =~ /ctl00_Contenuto_ucGestioneDomande_ucSchedaDomandaReadOnly_Domanda/;

        my $question = $p->get_trimmed_text("/p");
        $p->get_tag("p");
        my $answer = $p->get_text("/p");
        $answer =~ s/\t/ /g;

        if ($answer !~ /\w/) {
            ## Empty answer paragraph: skip ahead to the attachment table.
            $p->get_text("h3");
            $p->get_text("table");
            $answer = '';
            while (my $cell = $p->get_tag("td", "tr", "/table")) {
                my $tag = $cell->[0];
                last if $tag eq '/table';
                if ($tag eq 'tr') {
                    $answer .= "\n";
                }
                else {
                    $answer .= ' ' . $p->get_trimmed_text("/td");
                }
            }
        }
        push @pairs, [ $question, $answer ];
    }
    return @pairs;
}

## NOTE(review): the original bound was "$pageno < $maxpages", so pages
## 1 .. $maxpages - 1 are visited.  That is kept as is; confirm whether the
## last page should be included as well.
for my $pageno (1 .. $maxpages - 1) {
    ## Pause half a second per listing page.  The built-in sleep() only
    ## takes whole seconds (0.5 would be truncated to 0), hence select().
    select(undef, undef, undef, 0.5);

    ## Iterate over thirteen rows
    for my $rowno (1 .. $rows_per_page) {
        ## Navigate from the start page every time: listing page, then row.
        $m->get($start);
        $m->follow_link(text => 'Sondaggi');
        $m->field('ctl00$Contenuto$dgSondaggi_VaiAPaginaTextBox', $pageno);
        $m->click_button(name => 'ctl00$Contenuto$dgSondaggi_VaiAPaginaBottone');
        $m->click_button(
            name => 'ctl00$Contenuto$dgSondaggi_Row' . $rowno . '_DataInserimento');

        my $gestione = parse_metadata($m->content);

        ## Now go to the questions
        $m->click_button(name => 'ctl00$Titolo$TabSondaggio$DomandeRisposte');
        my $numero_domande = count_questions($m->content);

        ## Answers keyed by question text (a repeated question keeps its
        ## last answer), plus the order in which questions were first seen
        ## so the output order is stable.
        my %answer_for;
        my @question_order;

        for my $numero (1 .. $numero_domande) {
            my $button  = $question_button_prefix . $numero . '_RowNumber';
            my $theform = $m->form_number(1);

            unless (defined $theform->find_input('^' . $button)) {
                ## Diagnostics go to STDERR so they never end up inside the
                ## tab-separated data written to STDOUT.
                warn "Failure at page: $pageno, row $rowno!";
                print STDERR "Number : ", $numero_domande, "\n\n";
                print STDERR $m->content(), "\n\n\n\n";
                exit 1;
            }

            $m->click_button(name => $button);
            for my $pair (parse_questions($m->content)) {
                my ($question, $answer) = @{$pair};
                push @question_order, $question
                    unless exists $answer_for{$question};
                $answer_for{$question} = $answer;
            }
        }

        ## One output line per answer line; metadata columns always appear
        ## in @metadata_names order, with '' for fields that were not found.
        my @columns
            = map { defined $gestione->{$_} ? $gestione->{$_} : '' }
            @metadata_names;
        for my $question (@question_order) {
            for my $line (split /\n/, $answer_for{$question}) {
                print join("\t", $pageno, $rowno, @columns, $question, $line),
                    "\n";
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment