Skip to content

Instantly share code, notes, and snippets.

@kimarx
Created May 20, 2011 15:48
Show Gist options
  • Save kimarx/983210 to your computer and use it in GitHub Desktop.
Save kimarx/983210 to your computer and use it in GitHub Desktop.
This script is meant to display examples of sentence scraped from certain web
#!/usr/bin/perl
## This script displays example sentences scraped from certain web
## pages.
## Last Update: 2011-05-21 00:48+09:00.
## Kim, Yi-Chul <kimarx@gmail.com>
use strict;
use warnings;
use utf8;
use Web::Scraper;
use URI;
use YAML::Syck qw|Dump|;
use Encode;
use Term::ANSIColor;
# use Data::Dumper;
# Base URL of the Yahoo! Japan search endpoint that every query is sent to.
my $base_uri = "http://search.yahoo.co.jp/search?p=";

# The phrase to look up: all command-line arguments joined by single spaces.
my $search_string = "@ARGV";

# Without a phrase there is nothing to search for or to highlight, so stop
# with a usage hint instead of sending an empty query to every site.
die "Usage: $0 SEARCH_STRING...\n" unless $search_string =~ /\S/;

# Web sites that example sentences are scraped from; add or remove host
# names here to change where the script looks.
my @sites = (
    'thesun.co.uk',
    'guardian.co.uk',
    'nytimes.com',
    'ft.com',
    'en.wikipedia.org',
    'marxists.org',
    'plato.stanford.edu',
    'nybooks.com',
);

# Query each site in turn and print the examples found for it under a
# heading that names the site.
foreach my $site (@sites) {
    print "\n$site\n";
    my $result = get_scraped_result( $site, $search_string );
    display_examples($result);
}
# Print every scraped example sentence on its own line, with the search
# string highlighted.
#
# Parameters: the hash reference produced by get_scraped_result(); the
# example texts are read from its 'list' entry.
#
# Nothing is printed when the 'list' entry is undefined. The return value
# carries no meaning and is ignored by the caller.
sub display_examples {
    my $scraped   = shift;
    my $sentences = $scraped->{list};
    return unless defined $sentences;

    for my $sentence ( @{$sentences} ) {
        # Encode into a separate variable so the scraped data itself is
        # left unmodified.
        my $octets = encode( 'utf8', $sentence );
        highlighting_search_string($octets);
        print "\n";
    }
}
# Print one example sentence with the search string highlighted.
#
# The sentence is split around the file-level $search_string by
# split_string(). The text before the match, the match itself (in bold
# reverse video) and the text after it are then printed in that order.
# Nothing is printed when split_string() yields neither a leading nor a
# trailing part.
#
# Parameters: the sentence to print.
# Returns: always 0.
sub highlighting_search_string {
    my $letters = shift;
    my ( $pstr, $mstr, $nstr ) = split_string( $letters, $search_string );

    # Neither a prefix nor a suffix: there is nothing to show.
    return 0 unless defined $pstr || defined $nstr;

    # Check definedness and length rather than truth, so that a fragment
    # consisting of just "0" is not silently dropped.
    print $pstr if defined $pstr && length $pstr;
    print colored( ['bold reverse'], $mstr );    # Highlight the match.
    print $nstr if defined $nstr && length $nstr;

    return 0;
}
# Split a string around the first occurrence of a search phrase.
#
# Parameters:
#   $str   - the text to search.
#   $match - the phrase to look for. It is taken literally (regex
#            metacharacters have no special meaning) and compared
#            case-insensitively.
#
# Returns a three-element list ($before, $phrase, $after):
#   - When the phrase occurs and is not embedded in a longer word, $before
#     and $after hold the (possibly empty) text on either side of the first
#     such occurrence, and $phrase is that occurrence exactly as it is
#     spelled in $str.
#   - Otherwise $before and $after are undef and $phrase is $match.
sub split_string {
    my ( $str, $match ) = @_;

    # An undefined text, or an undefined or empty phrase, can never match.
    return ( undef, $match, undef )
        unless defined $str && defined $match && length $match;

    # A single, checked match supplies all three parts. Reading $1 after
    # separate unchecked matches would pick up a stale capture whenever one
    # of them failed, and would mix different occurrences of the phrase.
    # /s lets the surrounding text span line breaks.
    if ( $str =~ m/\A(.*?)(?<!\w)(\Q$match\E)(?!\w)(.*)\z/is ) {
        return ( $1, $2, $3 );
    }

    return ( undef, $match, undef );
}
# Search one web site for a phrase and scrape the text of the result
# entries.
#
# Parameters:
#   $uri    - host name of the site the search is restricted to.
#   $string - phrase to search for.
#
# Returns the hash reference built by the scraper; the texts of the matched
# result nodes, if any, are stored as an array reference under 'list'.
# When the page cannot be fetched or scraped, a warning is issued and an
# empty hash reference is returned so that the remaining sites can still be
# processed.
sub get_scraped_result {
    my ( $uri, $string ) = @_;

    # Let URI build the query string so that every character of the phrase
    # (spaces, '&', '#', '+', '%', quotes, ...) is escaped correctly. This
    # replaces the whole query part of $base_uri with the parameters below.
    my $scrape_uri = URI->new($base_uri);
    $scrape_uri->query_form(
        p  => qq{intext:"$string" site:$uri},
        ei => 'UTF-8',
    );

    # Collect the text of every node matched by the XPath expression.
    my $scraper = scraper {
        process '/html/body/div/div[2]/div/div/ol/li/div', 'list[]' => 'TEXT';
    };

    # scrape() throws on failure; catch that so one unreachable page does
    # not abort the run for all remaining sites.
    my $result = eval { $scraper->scrape($scrape_uri) };
    if ( !defined $result ) {
        warn "Could not scrape results for $uri: $@";
        return {};
    }

    return $result;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment