kimarx/gist:983210

## gistfile1.pl
#!/usr/bin/perl

## This script displays example sentences, scraped from selected websites,
## that contain the phrase given on the command line.
## Last Update: 2011-05-21 00:48+09:00.
## Kim, Yi-Chul <kimarx@gmail.com>


use strict;
use warnings;
use utf8;

use Web::Scraper;
use URI;
use YAML::Syck qw|Dump|;
use Encode;
use Term::ANSIColor;
# use Data::Dumper;


# Base of the Yahoo! Japan search URL; get_scraped_result() adds the query.
my $base_uri = "http://search.yahoo.co.jp/search?p=";

# The phrase to look up: all command-line arguments joined by single spaces.
my $search_string = "@ARGV";

# Refuse to run without a phrase: an empty query would still send one
# request per site for a meaningless search.
die "Usage: $0 PHRASE...\n" unless $search_string =~ /\S/;

# Websites whose pages are searched for example sentences.
my @sites = ( 'thesun.co.uk', 'guardian.co.uk', 'nytimes.com', 'ft.com', 'en.wikipedia.org', 'marxists.org', 'plato.stanford.edu', 'nybooks.com' );

# Query each site in turn and print the examples found under a heading that
# names the site.
foreach my $site ( @sites ) {
  print "\n$site\n";

  my $result = get_scraped_result($site, $search_string);
  display_examples ( $result );
}


# Print every scraped example, one per line.
#
# Argument: a hash reference whose "list" entry, when defined, is an array
# reference of example strings.
#
# Each string is encoded with encode('utf8', ...), handed to
# highlighting_search_string() and followed by a newline.  Nothing is printed
# when the "list" entry is undefined.
sub display_examples
{
  my ($scraped) = @_;
  my $snippets = $scraped->{list};

  if ( defined $snippets ) {

    # Iterate over the referenced array directly; the encoded bytes go into
    # a fresh variable, so the caller's data is left untouched.
    for my $snippet ( @{$snippets} ) {
      my $bytes = encode( 'utf8', $snippet );
      highlighting_search_string( $bytes );
      print "\n";
    }

  }
}


# Print one example sentence with the search phrase highlighted.
#
# Argument:
#   $letters - the text of the example.
#
# The text is split around the file-level $search_string by split_string().
# Nothing is printed when split_string() finds neither a part before the
# phrase nor a part after it.
#
# Always returns 0.
sub highlighting_search_string
{
  my $letters = shift;

  # Plain call syntax; the "&" sigil form is legacy style and not needed here.
  my ($pstr, $mstr, $nstr) = split_string($letters, $search_string);

  if ( defined $pstr || defined $nstr ) {

    # Test definedness rather than truth so that a part is never dropped
    # merely because its value happens to be false.
    print $pstr if defined $pstr;
    print colored ['bold reverse'], $mstr;    # Matched string pattern is highlighted.
    print $nstr if defined $nstr;
  }

  return 0;
}


# Locate $match inside $str and return the text surrounding it.
#
# Arguments:
#   $str   - the text to search.
#   $match - the literal phrase to look for (compared case-insensitively).
#
# Returns a three-element list ($pre_str, $match, $nex_str):
#   $pre_str - the text up to and including the whitespace right before an
#              occurrence of $match, or undef when no occurrence is preceded
#              by whitespace;
#   $match   - the phrase exactly as it was passed in;
#   $nex_str - the whitespace right after an occurrence of $match and the
#              text following it, or undef when no occurrence is followed
#              by whitespace.
# The two parts are searched for independently, so they may refer to
# different occurrences of $match.
sub split_string
{
  my($str, $match) = @_;

  # \Q...\E makes the phrase match literally even when it contains regex
  # metacharacters such as "(", "?" or ".".
  # Capturing through list assignment yields undef on a failed match instead
  # of the stale $1 left behind by an earlier successful match.
  my ($pre_str) = $str =~ m/(^.*\s)\Q$match\E/i;
  my ($nex_str) = $str =~ m/\Q$match\E(\s.*$)/i;

  return ($pre_str, $match, $nex_str);
}


# Search Yahoo! Japan for pages of one site that contain the phrase and
# scrape the result page.
#
# Arguments:
#   $uri    - host name of the site the search is restricted to.
#   $string - the phrase to search for.
#
# Returns whatever Web::Scraper's scrape() returns for the result page; the
# text of every node matching the XPath below is collected under the "list"
# key.
sub get_scraped_result
{
  my ($uri, $string) = @_;

  # Let URI build the query string so that every character of the phrase
  # (spaces, "&", "#", "+", quotes, non-ASCII bytes) is percent-encoded
  # instead of being pasted into the URL verbatim.  query_form() replaces
  # the empty "p=" that $base_uri ends with.
  my $scrape_uri = URI->new( $base_uri );
  $scrape_uri->query_form(
    p  => qq{intext:"$string" site:$uri},
    ei => 'UTF-8',
  );

  my $scraper = scraper {
    process '/html/body/div/div[2]/div/div/ol/li/div', 'list[]' => 'TEXT';
  };

  # Explicit return instead of relying on the value of a trailing assignment.
  my $result = $scraper->scrape( $scrape_uri );
  return $result;
}
	#!/usr/bin/perl

	## This script displays example sentences, scraped from selected websites,
	## that contain the phrase given on the command line.
	## Last Update: 2011-05-21 00:48+09:00.
	## Kim, Yi-Chul <kimarx@gmail.com>


	use strict;
	use warnings;
	use utf8;

	use Web::Scraper;
	use URI;
	use YAML::Syck qw|Dump|;
	use Encode;
	use Term::ANSIColor;
	# use Data::Dumper;


	# Base of the Yahoo! Japan search URL; get_scraped_result() adds the query.
	my $base_uri = "http://search.yahoo.co.jp/search?p=";

	# The phrase to look up: all command-line arguments joined by single spaces.
	my $search_string = "@ARGV";

	# Refuse to run without a phrase: an empty query would still send one
	# request per site for a meaningless search.
	die "Usage: $0 PHRASE...\n" unless $search_string =~ /\S/;

	# Websites whose pages are searched for example sentences.
	my @sites = ( 'thesun.co.uk', 'guardian.co.uk', 'nytimes.com', 'ft.com', 'en.wikipedia.org', 'marxists.org', 'plato.stanford.edu', 'nybooks.com' );

	# Query each site in turn and print the examples found under a heading that
	# names the site.
	foreach my $site ( @sites ) {
	  print "\n$site\n";

	  my $result = get_scraped_result($site, $search_string);
	  display_examples ( $result );
	}



	# Print every scraped example, one per line.
	#
	# Argument: a hash reference whose "list" entry, when defined, is an array
	# reference of example strings.
	#
	# Each string is encoded with encode('utf8', ...), handed to
	# highlighting_search_string() and followed by a newline.  Nothing is printed
	# when the "list" entry is undefined.
	sub display_examples
	{
	  my ($scraped) = @_;
	  my $snippets = $scraped->{list};

	  if ( defined $snippets ) {

	    # Iterate over the referenced array directly; the encoded bytes go into
	    # a fresh variable, so the caller's data is left untouched.
	    for my $snippet ( @{$snippets} ) {
	      my $bytes = encode( 'utf8', $snippet );
	      highlighting_search_string( $bytes );
	      print "\n";
	    }

	  }
	}


	# Print one example sentence with the search phrase highlighted.
	#
	# Argument:
	#   $letters - the text of the example.
	#
	# The text is split around the file-level $search_string by split_string().
	# Nothing is printed when split_string() finds neither a part before the
	# phrase nor a part after it.
	#
	# Always returns 0.
	sub highlighting_search_string
	{
	  my $letters = shift;

	  # Plain call syntax; the "&" sigil form is legacy style and not needed here.
	  my ($pstr, $mstr, $nstr) = split_string($letters, $search_string);

	  if ( defined $pstr || defined $nstr ) {

	    # Test definedness rather than truth so that a part is never dropped
	    # merely because its value happens to be false.
	    print $pstr if defined $pstr;
	    print colored ['bold reverse'], $mstr; # Matched string pattern is highlighted.
	    print $nstr if defined $nstr;
	  }

	  return 0;
	}


	# Locate $match inside $str and return the text surrounding it.
	#
	# Arguments:
	#   $str   - the text to search.
	#   $match - the literal phrase to look for (compared case-insensitively).
	#
	# Returns a three-element list ($pre_str, $match, $nex_str):
	#   $pre_str - the text up to and including the whitespace right before an
	#              occurrence of $match, or undef when no occurrence is preceded
	#              by whitespace;
	#   $match   - the phrase exactly as it was passed in;
	#   $nex_str - the whitespace right after an occurrence of $match and the
	#              text following it, or undef when no occurrence is followed
	#              by whitespace.
	# The two parts are searched for independently, so they may refer to
	# different occurrences of $match.
	sub split_string
	{
	  my($str, $match) = @_;

	  # \Q...\E makes the phrase match literally even when it contains regex
	  # metacharacters such as "(", "?" or ".".
	  # Capturing through list assignment yields undef on a failed match instead
	  # of the stale $1 left behind by an earlier successful match.
	  my ($pre_str) = $str =~ m/(^.*\s)\Q$match\E/i;
	  my ($nex_str) = $str =~ m/\Q$match\E(\s.*$)/i;

	  return ($pre_str, $match, $nex_str);
	}


	# Search Yahoo! Japan for pages of one site that contain the phrase and
	# scrape the result page.
	#
	# Arguments:
	#   $uri    - host name of the site the search is restricted to.
	#   $string - the phrase to search for.
	#
	# Returns whatever Web::Scraper's scrape() returns for the result page; the
	# text of every node matching the XPath below is collected under the "list"
	# key.
	sub get_scraped_result
	{
	  my ($uri, $string) = @_;

	  # Let URI build the query string so that every character of the phrase
	  # (spaces, "&", "#", "+", quotes, non-ASCII bytes) is percent-encoded
	  # instead of being pasted into the URL verbatim.  query_form() replaces
	  # the empty "p=" that $base_uri ends with.
	  my $scrape_uri = URI->new( $base_uri );
	  $scrape_uri->query_form(
	    p  => qq{intext:"$string" site:$uri},
	    ei => 'UTF-8',
	  );

	  my $scraper = scraper {
	    process '/html/body/div/div[2]/div/div/ol/li/div', 'list[]' => 'TEXT';
	  };

	  # Explicit return instead of relying on the value of a trailing assignment.
	  my $result = $scraper->scrape( $scrape_uri );
	  return $result;
	}