Skip to content

Instantly share code, notes, and snippets.

@clintongormley
Created May 20, 2011 08:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clintongormley/982551 to your computer and use it in GitHub Desktop.
Save clintongormley/982551 to your computer and use it in GitHub Desktop.
Crawl ElasticSearch blogposts
#!/usr/bin/perl
use strict;
use warnings;
use URI;
use Web::Scraper;
use ElasticSearch();
# Blog index page that links to every post we want to crawl.
my $url = URI->new("http://www.elasticsearch.org/blog");

# Connection settings for the search client. use_index / use_type presumably
# set the defaults for later calls, none of which name an index or type.
my %es_options = (
    servers   => 'localhost:9200',
    use_index => 'es_blogs',
    use_type  => 'post',
);
my $es = ElasticSearch->new(%es_options);

# Start from a clean index, then crawl and index every post.
prepare_index($es);
scrape( $es, $url );

print "Done\n";
#===================================
sub scrape {
#===================================
    # Crawl the blog index at $url, fetch every linked post, and index its
    # title, text content and URL through $es.
    #
    # Params:
    #   $es  - client object; only its index( data => \%doc ) method is used
    #   $url - URI object of the blog index page
    #
    # A post that cannot be fetched or parsed is reported with warn and
    # skipped, so one bad link does not abort the whole crawl. Errors raised
    # while indexing are NOT trapped and still propagate to the caller.
    my $es  = shift;
    my $url = shift;

    print "Retrieving blog links from $url\n";

    # retrieve links to all blog posts
    my $urls = scraper {
        process 'ul.posts a', 'urls[]' => '@href';
        result 'urls';
    }
    ->scrape($url);

    # The 'urls' result may be undef when the selector matches nothing;
    # normalise to an empty list so the dereferences below cannot die under
    # strict refs.
    $urls ||= [];

    printf "... Found %d links \n\n", scalar @$urls;

    my $post_scraper = scraper {
        process 'h2.page_title', 'title'   => 'TEXT';
        process '#content',      'content' => 'TEXT';
    };

    for my $post_url (@$urls) {

        # retrieve title and content from blog post; trap fetch/parse
        # failures so the remaining posts are still processed
        my $post = eval { $post_scraper->scrape($post_url) };
        unless ($post) {
            my $error = $@ || "unknown error\n";
            warn "Skipping $post_url: $error";
            next;
        }

        # a page without the expected heading yields no title; print a
        # placeholder rather than triggering an uninitialized-value warning
        my $title = defined $post->{title} ? $post->{title} : '(no title)';
        printf "Indexing: %s\n", $title;

        $post->{url} = $post_url->as_string;
        $es->index( data => $post );
    }
    continue {
        # sleep to avoid github's rate limiting; living in the continue
        # block, this also runs after a skipped post
        sleep 1;
    }

    return;
}
#===================================
sub prepare_index {
#===================================
    # Rebuild the index from scratch: drop any existing index, then create a
    # new one carrying the mapping for the 'post' type.
    #
    # Params:
    #   $es - client object; delete_index() and create_index() are called on it
    #
    # Returns whatever create_index() returns.
    my ($es) = @_;

    # per-field settings
    my %properties = (

        # boost title because important
        title   => { type => 'string', boost => 2 },
        content => { type => 'string' },

        # url shouldn't be analyzed
        url => {
            type           => 'string',
            index          => 'not_analyzed',
            include_in_all => 0,
        },
    );

    my %post_mapping = (

        # to enable highlighting
        _all       => { store => 'yes' },
        properties => \%properties,
    );

    # remove an existing index, if it exists
    $es->delete_index( ignore_missing => 1 );

    # create a new index
    return $es->create_index( mappings => { 'post' => \%post_mapping } );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment