Created
May 20, 2011 08:24
-
-
Save clintongormley/982551 to your computer and use it in GitHub Desktop.
Crawl ElasticSearch blogposts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use URI; | |
use Web::Scraper; | |
use ElasticSearch(); | |
# Page that lists the blog posts to crawl
my $blog_url = URI->new("http://www.elasticsearch.org/blog");

# Settings handed to the ElasticSearch client constructor
my %client_options = (
    servers   => 'localhost:9200',
    use_index => 'es_blogs',
    use_type  => 'post',
);
my $client = ElasticSearch->new(%client_options);

# Recreate the index first, then crawl the blog and index every post
prepare_index($client);
scrape( $client, $blog_url );

print "Done\n";
#===================================
sub scrape {
#===================================
    # Crawl the page at $url, follow every link matched by 'ul.posts a',
    # and index the title, content and url of each linked post.
    #
    # Arguments:
    #   $es  - client object; only its index( data => $hashref ) method
    #          is called here
    #   $url - address of the page that lists the blog posts
    #
    # Progress is printed to STDOUT. A post that cannot be retrieved is
    # reported with warn() and skipped, so one bad page does not abort
    # the rest of the crawl.
    my $es  = shift;
    my $url = shift;

    print "Retrieving blog links from $url\n";

    # retrieve links to all blog posts
    my $urls = scraper {
        process 'ul.posts a', 'urls[]' => '@href';
        result 'urls';
    }
    ->scrape($url);

    # The result can come back undefined when the selector matches no
    # links; use an empty list so the dereferences below don't die
    # under "use strict"
    $urls ||= [];

    printf "... Found %d links \n\n", scalar @$urls;

    my $post_scraper = scraper {
        process 'h2.page_title', 'title'   => 'TEXT';
        process '#content',      'content' => 'TEXT';
    };

    for my $post_url (@$urls) {

        # retrieve title and content from blog post; eval traps a
        # failed fetch so the remaining posts are still processed
        my $post = eval { $post_scraper->scrape($post_url) };

        if ($post) {

            # a page without the title element leaves 'title' unset;
            # print a placeholder rather than an uninitialized value
            my $title
                = defined $post->{title}
                ? $post->{title}
                : '(no title found)';
            printf "Indexing: %s\n", $title;

            $post->{url} = $post_url->as_string;
            $es->index( data => $post );
        }
        else {
            warn "Skipping $post_url: $@";
        }

        # sleep to avoid github's rate limiting
        sleep 1;
    }

    return;
}
#===================================
sub prepare_index {
#===================================
    # Drop the index if it already exists and create it again with the
    # mapping for the 'post' type.
    #
    # Arguments:
    #   $es - client object; its delete_index() and create_index()
    #         methods are called, in that order
    #
    # Returns whatever create_index() returns.
    my ($es) = @_;

    # field definitions for a blog post
    my %post_fields = (

        # boost title because important
        title   => { type => 'string', boost => 2 },
        content => { type => 'string' },

        # url shouldn't be analyzed
        url => {
            type           => 'string',
            index          => 'not_analyzed',
            include_in_all => 0,
        },
    );

    my %post_mapping = (

        # to enable highlighting
        _all       => { store => 'yes' },
        properties => \%post_fields,
    );

    # remove an existing index, if it exists
    $es->delete_index( ignore_missing => 1 );

    # create a new index
    return $es->create_index( mappings => { 'post' => \%post_mapping } );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment