Created
May 20, 2011 15:48
-
-
Save kimarx/983210 to your computer and use it in GitHub Desktop.
This script is meant to display examples of sentence scraped from certain web
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
## This script displays example sentences scraped from certain web
## pages.
## Last Update: 2011-05-21 00:48+09:00. | |
## Kim, Yi-Chul <kimarx@gmail.com> | |
use strict; | |
use warnings; | |
use utf8; | |
use Web::Scraper; | |
use URI; | |
use YAML::Syck qw|Dump|; | |
use Encode; | |
use Term::ANSIColor; | |
# use Data::Dumper; | |
# Base search URL used by get_scraped_result().
my $base_uri = "http://search.yahoo.co.jp/search?p=";

# The search phrase: all command-line arguments joined by single spaces.
my $search_string = "@ARGV";

# Without a phrase every request would search for an empty quoted string,
# so stop early with a usage hint instead.
die "Usage: $0 SEARCH_PHRASE\n" unless $search_string =~ /\S/;

# Websites that this script scrapes examples from.
my @sites = (
    'thesun.co.uk',
    'guardian.co.uk',
    'nytimes.com',
    'ft.com',
    'en.wikipedia.org',
    'marxists.org',
    'plato.stanford.edu',
    'nybooks.com',
);

foreach my $site (@sites) {
    print "\n$site\n";

    # Run the fetch inside eval so that an exception raised for one site
    # (e.g. a failed request) does not abort the remaining sites.
    my $result = eval { get_scraped_result( $site, $search_string ) };
    if ( !defined $result ) {
        warn "Skipping $site: ", ( $@ || "no result returned\n" );
        next;
    }

    display_examples($result);
}
# Print every scraped example, one per line, with the search phrase
# highlighted.
#
# Takes the hash reference returned by get_scraped_result(); the examples
# are read from its 'list' entry.  Nothing is printed when that entry is
# undefined.
sub display_examples {
    my ($scraped) = @_;

    my $entries = $scraped->{list};
    return unless defined $entries;

    for my $entry ( @{$entries} ) {
        # Encode to bytes before handing the text to the printing routine.
        my $encoded = encode( 'utf8', $entry );
        highlighting_search_string($encoded);
        print "\n";
    }
}
# Print one example with the search phrase shown in bold reverse video.
#
# The text is split around the file-level $search_string by split_string().
# If neither the leading nor the trailing part was found, nothing is printed.
# Always returns 0.
sub highlighting_search_string {
    my ($text) = @_;

    my ( $before, $hit, $after ) = split_string( $text, $search_string );

    # No leading or trailing context: nothing to show for this example.
    return 0 if !defined $before && !defined $after;

    print $before if $before;
    print colored( ['bold reverse'], $hit );    # Matched string pattern is highlighted.
    print $after if $after;

    return 0;
}
# Split $str around an occurrence of $match (case-insensitive).
#
# Parameters:
#   $str   - the text to search.
#   $match - the phrase to look for; it is matched literally.
#
# Returns the list ($pre_str, $match, $nex_str):
#   $pre_str - text from the start of the line up to and including the
#              whitespace right before $match, or undef if $match is not
#              preceded by whitespace anywhere.
#   $match   - the phrase exactly as it was passed in.
#   $nex_str - text from the whitespace right after $match to the end of
#              the line, or undef if $match is not followed by whitespace.
sub split_string {
    my ( $str, $match ) = @_;

    # \Q...\E keeps regex metacharacters in the phrase from being interpreted.
    # Capturing through list assignment yields undef when a match fails,
    # instead of picking up a stale $1 left behind by an earlier match.
    my ($pre_str) = $str =~ m/(^.*\s)\Q$match\E/i;
    my ($nex_str) = $str =~ m/\Q$match\E(\s.*$)/i;

    return ( $pre_str, $match, $nex_str );
}
# Run a site-restricted phrase search and scrape the result snippets.
#
# Parameters:
#   $uri    - the site to restrict the search to (e.g. 'nytimes.com').
#   $string - the phrase to search for; sent as a quoted intext: query.
#
# Returns whatever the scraper returns for the search page; the callers in
# this file read the scraped texts from its 'list' entry.
sub get_scraped_result {
    my ( $uri, $string ) = @_;

    # Let URI build the query string so that characters such as '&', '#',
    # '+' or '%' in the phrase are escaped instead of corrupting the URL.
    # query_form replaces whatever query part $base_uri already carries.
    my $scrape_uri = URI->new($base_uri);
    $scrape_uri->query_form(
        p  => qq{intext:"$string" site:$uri},
        ei => 'UTF-8',
    );

    # Collect the text of every node matched by the XPath into 'list'.
    my $scraper = scraper {
        process '/html/body/div/div[2]/div/div/ol/li/div', 'list[]' => 'TEXT';
    };

    my $result = $scraper->scrape($scrape_uri);
    return $result;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment