Last active
December 11, 2015 15:29
-
-
Save chrishanretty/4621435 to your computer and use it in GitHub Desktop.
Screen-scraper for www.sondaggipoliticoelettorali.it
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
use Switch; | |
binmode STDOUT, ":utf8"; | |
use WWW::Mechanize; | |
use HTML::TokeParser; | |
use Data::Dumper; | |
## Scrape poll metadata and question/answer tables from
## www.sondaggipoliticoelettorali.it and print them to STDOUT as
## tab-separated lines, one line per answer row:
##
##   page, row, <14 metadata columns in fixed order>, question, answer line
##
## The metadata columns always appear in the order of @metadata_map below;
## a field that a poll page does not provide is printed as an empty string,
## so columns line up across polls and across runs.

## Entry page of the site; every poll lookup restarts from here.
my $start = 'http://www.sondaggipoliticoelettorali.it/';
my $m = WWW::Mechanize->new();

## Exclusive upper bound: result pages 1 .. $maxpages - 1 are visited.
## NOTE(review): confirm whether page $maxpages itself should be fetched too.
my $maxpages = 33;

## Rows 1 .. $rows_per_page are opened on every result page.
my $rows_per_page = 13;

## Ordered lookup table: fragment of a <p> id  =>  output field name.
## The first fragment contained in an id wins.  The id-to-field mapping is a
## plain ordered table, so nothing below depends on the source filter loaded
## at the top of the file.
my @metadata_map = (
    [ 'ReadOnly_Titolo'                      => 'title' ],
    [ 'ReadOnly_Realizzatore'                => 'company' ],
    [ 'ReadOnly_Committente'                 => 'commissioner' ],
    [ 'ReadOnly_Acquirente'                  => 'purchaser' ],
    [ 'ReadOnly_DataRealizzazioneDa'         => 'datefrom' ],
    [ 'ReadOnly_DataRealizzazioneA'          => 'dateto' ],
    [ 'ReadOnly_Mezzo_Comunic_massa'         => 'outlet' ],
    [ 'ReadOnly_Data_Pubblicazione'          => 'publicationdate' ],
    [ 'ReadOnly_Popolazione_Riferimento'     => 'population' ],
    [ 'ReadOnly_Estensione_Territoriale'     => 'territory' ],
    [ 'ReadOnly_Metodo_Campionamento'        => 'sampling' ],
    [ 'ReadOnly_Campione_Intervistati'       => 'samplesize' ],
    [ 'ReadOnly_Rappresentativa_Campione'    => 'representativeness' ],
    [ 'ReadOnly_Metodo_Raccolta_Informazioni' => 'method' ],
);

## Output column order for the metadata part of every line.
my @metadata_fields = map { $_->[1] } @metadata_map;

## Collect the poll metadata from the <p id="..."> elements of a detail page.
## Takes the page HTML; returns a hash reference  field name => trimmed text.
sub _extract_metadata {
    my ($html) = @_;

    my %gestione;
    my $parser = HTML::TokeParser->new(\$html);

    PARA:
    while (my $para = $parser->get_tag('p')) {
        my $attr = $para->[1];
        next PARA unless defined $attr && defined $attr->{id};

        for my $entry (@metadata_map) {
            my ($fragment, $field) = @{$entry};
            next if index($attr->{id}, $fragment) < 0;

            $gestione{$field} = $parser->get_trimmed_text('/p');
            next PARA;
        }
    }

    return \%gestione;
}

## Work out how many questions a poll has.
## Looks at every <input> whose name contains "ListaDomande" and whose value
## is shorter than three characters, strips non-digits from the value and
## returns the largest number found (0 when there is none).
sub _count_questions {
    my ($html) = @_;

    my $count  = 0;
    my $parser = HTML::TokeParser->new(\$html);

    while (my $input = $parser->get_tag('input')) {
        my $attr = $input->[1];
        next unless defined $attr
            && defined $attr->{value}
            && defined $attr->{name};
        next unless $attr->{name} =~ /ListaDomande/;

        my $value = $attr->{value};
        next unless length($value) < 3;
        $value =~ s/\D//g;
        next unless length($value) > 0;

        $count = $value if $value > $count;
    }

    return $count;
}

## Pull the question text and its answer text out of a question page.
## Returns a list of [ question, answer ] pairs (normally a single pair).
## The answer is the paragraph following the question; when that paragraph
## holds no word characters the answer is rebuilt from the next table after
## the next <h3>, one line per <tr> with cells separated by spaces.
sub _extract_answers {
    my ($html) = @_;

    my @pairs;
    my $parser = HTML::TokeParser->new(\$html);

    while (my $para = $parser->get_tag('p')) {
        my $attr = $para->[1];
        next unless defined $attr && defined $attr->{id};
        next unless $attr->{id}
            =~ /ctl00_Contenuto_ucGestioneDomande_ucSchedaDomandaReadOnly_Domanda/;

        my $question = $parser->get_trimmed_text('/p');

        ## The answer lives in the paragraph right after the question.
        $parser->get_tag('p');
        my $answer = $parser->get_text('/p');
        $answer =~ s/\t/ /g;    # tabs would break the tab-separated output

        if ($answer !~ /\w/) {
            ## Empty paragraph: skip ahead to the attachment table instead.
            $parser->get_text('h3');
            $parser->get_text('table');
            $answer = '';

            while (my $cell = $parser->get_tag('td', 'tr', '/table')) {
                last if $cell->[0] eq '/table';

                if ($cell->[0] eq 'tr') {
                    $answer .= "\n";
                }
                else {
                    $answer .= ' ' . $parser->get_trimmed_text('/td');
                }
            }
        }

        push @pairs, [ $question, $answer ];
    }

    return @pairs;
}

for my $pageno (1 .. $maxpages - 1) {
    ## Pause half a second between result pages.  The built-in sleep() only
    ## understands whole seconds, so a four-argument select() is used to get
    ## a real fractional delay.
    select(undef, undef, undef, 0.5);

    for my $rowno (1 .. $rows_per_page) {
        ## Navigate: home -> "Sondaggi" -> requested page -> requested row.
        $m->get($start);
        $m->follow_link( text => 'Sondaggi' );
        $m->field( 'ctl00$Contenuto$dgSondaggi_VaiAPaginaTextBox', $pageno );
        $m->click_button( name => 'ctl00$Contenuto$dgSondaggi_VaiAPaginaBottone' );
        $m->click_button(
            name => 'ctl00$Contenuto$dgSondaggi_Row' . $rowno . '_DataInserimento' );

        my $gestione = _extract_metadata( $m->content() );
        #print Dumper($gestione);

        ## Now go to the questions tab.
        $m->click_button( name => 'ctl00$Titolo$TabSondaggio$DomandeRisposte' );
        my $numero_domande = _count_questions( $m->content() );

        ## Answers keyed by question text; @question_order remembers the
        ## order in which questions were first seen so output is stable.
        my %answer_for;
        my @question_order;

        for my $numero (1 .. $numero_domande) {
            my $selector =
                'ctl00$Contenuto$ucGestioneDomande$ucListaDomande$dgDomande_Row'
                . $numero
                . '_RowNumber';

            ## Make sure the button for this question exists before clicking.
            my $theform   = $m->form_number(1);
            my $thebutton = $theform->find_input( '^' . $selector );

            if (!defined $thebutton) {
                warn "Failure at page: $pageno, row $rowno!";
                print "Number : ", $numero_domande, "\n\n";
                print $m->content(), "\n\n\n\n";
                exit 1;    # report the failure to the caller
            }

            $m->click_button( name => $selector );

            for my $pair ( _extract_answers( $m->content() ) ) {
                my ($question, $answer) = @{$pair};
                push @question_order, $question
                    unless exists $answer_for{$question};
                $answer_for{$question} = $answer;
            }
        }

        ## Metadata columns in fixed order, missing fields as empty strings.
        my @metadata_columns =
            map { defined $gestione->{$_} ? $gestione->{$_} : '' }
            @metadata_fields;

        ## Split each answer on linebreaks, hoping each line is a response
        ## associated with a value, and emit one output line per answer line.
        for my $question (@question_order) {
            for my $line ( split /\n/, $answer_for{$question} ) {
                print join( "\t",
                    $pageno, $rowno, @metadata_columns, $question, $line ),
                    "\n";
            }
        }
        #print Dumper(\%answer_for);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment