Created
March 21, 2013 12:22
-
-
Save jreisinger/5212643 to your computer and use it in GitHub Desktop.
DON'T EDIT/REMOVE - referred to in http://perlmonks.org/?node_id=1024748
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# Which companies are the names associated with? | |
use strict; | |
use warnings; | |
use HTML::LinkExtor; | |
use LWP::Simple; | |
use URI::Encode qw(uri_encode); | |
# Usage: script NAMES_FILE
#
# NAMES_FILE holds one person per line in the form "Givenname Surname".
# For every such name the ORSR person search is queried and the URLs of the
# company records linked from the result page are printed.

my $names_file = shift;
die "Usage: $0 NAMES_FILE\n" unless defined $names_file;

# Fail loudly when the input cannot be read; otherwise the script would
# silently process zero names.
open my $fh, "<", $names_file
    or die "Cannot open '$names_file' for reading: $!\n";
my @names = <$fh>;
close $fh;

# Loop-invariant, so defined once outside the loop.
my $base_url = 'http://www.orsr.sk';

for my $name (@names) {
    chomp $name;
    print "Working on '$name'\n";

    # Only plain "given-name surname" pairs are supported by the query below.
    my @name_parts = split ' ', $name;
    if ( @name_parts != 2 ) {    # strange name
        print "Skipping: >$name<\n";
        next;
    }

    my $gname   = uri_encode( lc $name_parts[0] );
    my $surname = uri_encode( lc $name_parts[1] );

    my $url =
      "${base_url}/hladaj_osoba.asp?PR=${surname}&MENO=${gname}&SID=0&T=f0&R=on";
    print ">$url<\n";    # Debug

    sleep int rand(3);    # lets be decent and wait for some random time

    my @ids = get_ids($url);

    # The name is not associated with any company: nothing to report.
    next unless @ids;

    print "$name associated with ", scalar @ids, " companies\n";
    for my $id (@ids) {
        print "\t$base_url/vypis.asp?ID=${id}&SID=5&P=0\n";
    }
}
#################### | |
sub get_ids {
####################
    # Fetch the page at the given URL and collect the numeric IDs from every
    # <a href="vypis.asp?ID=<digits>...P=0"> link found in it.
    #
    # Parameters: $url - address of the search-result page to download.
    # Returns:    list of ID strings (internal ORSR IDs); the list is empty
    #             when the page has no matching links or cannot be fetched.
    my $url = shift;
    my @ids;    # internal ORSR IDs

    # LWP::Simple::get returns undef on failure; report it instead of
    # handing undef to the HTML parser.
    my $html = get($url);
    if ( !defined $html ) {
        warn "Could not fetch '$url'\n";
        return @ids;
    }

    my $parser = HTML::LinkExtor->new();
    $parser->parse($html);
    $parser->eof;

    for my $linkarray ( $parser->links ) {

        # Each entry is [ tag, attribute-name, link, ... ]; only the first
        # attribute/link pair is inspected.
        my ( $elt_type, $attr_name, $link ) = @{$linkarray};
        next unless $elt_type eq 'a' and $attr_name eq 'href';

        # vypis.asp?ID=101797&SID=5&P=0
        push @ids, $1 if $link =~ /^vypis\.asp\?ID=(\d+).*P=0$/;
    }
    return @ids;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment