Last active
August 29, 2015 14:07
-
-
Save chankeypathak/94338d2459644f4eec4c to your computer and use it in GitHub Desktop.
Scraping cromaretail
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# Non-blocking crawler: fetches every URL in @urls through Mojo::UserAgent,
# keeping at most $max_conn requests in flight, and hands each finished
# transaction to get_callback().
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);
use Mojo::UserAgent;

# Work queue of pages still to be fetched.
my @urls = map { Mojo::URL->new($_) } qw(
    http://www.cromaretail.com/Mobile-Phones-c-10.aspx
);

# Upper bound on simultaneously active requests.
my $max_conn = 4;

# Follow up to 5 redirects per request.
my $ua = Mojo::UserAgent->new(max_redirects => 5);

# Pick up proxy settings from the environment, if any.
$ua->proxy->detect;

# Number of requests currently in flight; decremented in get_callback().
my $active = 0;

# Scheduler: a zero-interval recurring timer that tops the in-flight requests
# up to $max_conn and stops the loop once the queue is empty and nothing is
# still running.
Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {
            my $url = shift @urls;

            # Queue drained: keep the loop alive only while requests are
            # still pending, otherwise shut it down.
            if (!defined $url) {
                Mojo::IOLoop->stop unless $active;
                return;
            }

            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);

# Run the event loop unless something else already started it.
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
# Completion callback for each request started by the scheduler.
#
# Arguments:
#   (ignored) - first callback argument (the user agent)
#   $tx       - the finished transaction
#
# Releases one connection slot, then passes the transaction to parse_html()
# only when the response is a 2xx whose Content-Type starts with text/html.
# Returns nothing.
sub get_callback {
    my (undef, $tx) = @_;

    # Free the slot first so the scheduler can start another request even
    # when this response is discarded below.
    --$active;

    my $res = $tx->res;

    # Accept 2xx only. The code is treated as 0 when missing (e.g. no
    # response was received). A plain numeric check avoids relying on the
    # is_status_class helper, which newer Mojolicious releases dropped.
    my $code = $res->code // 0;
    return if $code < 200 or $code >= 300;

    # The Content-Type header may be absent; default to '' so the match
    # does not warn about an uninitialized value.
    my $type = $res->headers->content_type // '';
    return if $type !~ m{^text/html\b}ix;

    # Request URL
    my $url = $tx->req->url;
    parse_html($url, $tx);

    return;
}
# Print the device names, links and prices found in a fetched page.
#
# Arguments:
#   $url - request URL of the page (accepted for the caller's convenience;
#          not used in the output)
#   $tx  - finished transaction whose response body is parsed as HTML
#
# Writes three labelled, newline-separated lists to the selected output
# handle, followed by an empty line. Returns nothing.
sub parse_html {
    my ($url, $tx) = @_;

    # Parse the body once and reuse the DOM for every query.
    my $dom = $tx->res->dom;

    # Names and links both come from the anchors directly under an <h2>.
    my $anchors = $dom->find('h2 > a');

    # Element methods are applied per element via map(); calling them on the
    # collection itself is not supported by current Mojo::Collection.

    # Print the mobile names
    say 'Devices are....';
    say $anchors->map('text')->join("\n");

    # Print the links of mobiles (an anchor without href yields an empty line)
    say 'Links are....';
    say $anchors->map(sub { $_->attr('href') // '' })->join("\n");

    # Print the price of mobiles
    say 'Prices are....';
    say $dom->find('article > h3')->map('text')->join("\n");

    say '';
    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment