clintongormley/crawl.pl

## crawl.pl
#!/usr/bin/perl
use strict;
use warnings;
use URI;
use Web::Scraper;
use ElasticSearch();

# page that lists the blog posts to crawl
my $url = URI->new("http://www.elasticsearch.org/blog");

# options handed to the ElasticSearch client: the server address plus the
# use_index / use_type settings
my %es_options = (
    servers   => 'localhost:9200',
    use_index => 'es_blogs',
    use_type  => 'post',
);
my $es = ElasticSearch->new(%es_options);

# recreate the index first, then crawl the blog and index every post
prepare_index($es);
scrape( $es, $url );

print "Done\n";

#===================================
sub scrape {
#===================================
    # Fetch the blog listing at $url, follow every post link found on it and
    # index each post's title, text content and URL via $es.
    #
    #   $es  - client object; only $es->index( data => \%post ) is called
    #   $url - location of the listing page, handed to Web::Scraper's scrape()
    #
    # Progress is printed to STDOUT; the return value is not meaningful.
    my ( $es, $url ) = @_;

    print "Retrieving blog links from $url\n";

    # retrieve links to all blog posts
    my $link_scraper = scraper {
        process 'ul.posts a', 'urls[]' => '@href';
        result 'urls';
    };

    # the 'urls' result can be undef when the selector matches nothing;
    # fall back to an empty list so the dereferences below cannot die
    # under strict refs
    my $urls = $link_scraper->scrape($url) || [];

    printf "... Found %d links \n\n", scalar @$urls;

    my $post_scraper = scraper {
        process 'h2.page_title', 'title'   => 'TEXT';
        process '#content',      'content' => 'TEXT';
    };

    for my $post_url (@$urls) {

        # retrieve title and content from blog post
        my $post = $post_scraper->scrape($post_url);

        # a page without an h2.page_title leaves 'title' unset; print a
        # placeholder instead of triggering an uninitialized-value warning
        my $title = defined $post->{title} ? $post->{title} : '(no title)';
        printf "Indexing: %s\n", $title;

        # store the post's address as a plain string alongside its text
        $post->{url} = $post_url->as_string;
        $es->index( data => $post );

        # sleep to avoid github's rate limiting
        sleep 1;

    }
}

#===================================
sub prepare_index {
#===================================
    # Drop the client's index (if there is one) and create it again with the
    # mapping for the 'post' type. No index name is passed here, so the
    # target index is whatever $es itself is configured to use.
    #
    #   $es - client object providing delete_index() and create_index()
    #
    # Returns whatever $es->create_index() returns.
    my ($es) = @_;

    # boost title because important
    my %title_field   = ( type => 'string', boost => 2 );
    my %content_field = ( type => 'string' );

    # url shouldn't be analyzed
    my %url_field = (
        type           => 'string',
        index          => 'not_analyzed',
        include_in_all => 0,
    );

    my %post_mapping = (

        # to enable highlighting
        _all       => { store => 'yes' },
        properties => {
            title   => \%title_field,
            content => \%content_field,
            url     => \%url_field,
        },
    );

    # remove an existing index, if it exists
    $es->delete_index( ignore_missing => 1 );

    # create a new index
    return $es->create_index( mappings => { 'post' => \%post_mapping } );
}
	#!/usr/bin/perl
	use strict;
	use warnings;
	use URI;
	use Web::Scraper;
	use ElasticSearch();

	# page that lists the blog posts to crawl
	my $url = URI->new("http://www.elasticsearch.org/blog");

	# options handed to the ElasticSearch client: the server address plus the
	# use_index / use_type settings
	my %es_options = (
	    servers   => 'localhost:9200',
	    use_index => 'es_blogs',
	    use_type  => 'post',
	);
	my $es = ElasticSearch->new(%es_options);

	# recreate the index first, then crawl the blog and index every post
	prepare_index($es);
	scrape( $es, $url );

	print "Done\n";

	#===================================
	sub scrape {
	#===================================
	    # Fetch the blog listing at $url, follow every post link found on it and
	    # index each post's title, text content and URL via $es.
	    #
	    #   $es  - client object; only $es->index( data => \%post ) is called
	    #   $url - location of the listing page, handed to Web::Scraper's scrape()
	    #
	    # Progress is printed to STDOUT; the return value is not meaningful.
	    my ( $es, $url ) = @_;

	    print "Retrieving blog links from $url\n";

	    # retrieve links to all blog posts
	    my $link_scraper = scraper {
	        process 'ul.posts a', 'urls[]' => '@href';
	        result 'urls';
	    };

	    # the 'urls' result can be undef when the selector matches nothing;
	    # fall back to an empty list so the dereferences below cannot die
	    # under strict refs
	    my $urls = $link_scraper->scrape($url) || [];

	    printf "... Found %d links \n\n", scalar @$urls;

	    my $post_scraper = scraper {
	        process 'h2.page_title', 'title'   => 'TEXT';
	        process '#content',      'content' => 'TEXT';
	    };

	    for my $post_url (@$urls) {

	        # retrieve title and content from blog post
	        my $post = $post_scraper->scrape($post_url);

	        # a page without an h2.page_title leaves 'title' unset; print a
	        # placeholder instead of triggering an uninitialized-value warning
	        my $title = defined $post->{title} ? $post->{title} : '(no title)';
	        printf "Indexing: %s\n", $title;

	        # store the post's address as a plain string alongside its text
	        $post->{url} = $post_url->as_string;
	        $es->index( data => $post );

	        # sleep to avoid github's rate limiting
	        sleep 1;

	    }
	}

	#===================================
	sub prepare_index {
	#===================================
	    # Drop the client's index (if there is one) and create it again with the
	    # mapping for the 'post' type. No index name is passed here, so the
	    # target index is whatever $es itself is configured to use.
	    #
	    #   $es - client object providing delete_index() and create_index()
	    #
	    # Returns whatever $es->create_index() returns.
	    my ($es) = @_;

	    # boost title because important
	    my %title_field   = ( type => 'string', boost => 2 );
	    my %content_field = ( type => 'string' );

	    # url shouldn't be analyzed
	    my %url_field = (
	        type           => 'string',
	        index          => 'not_analyzed',
	        include_in_all => 0,
	    );

	    my %post_mapping = (

	        # to enable highlighting
	        _all       => { store => 'yes' },
	        properties => {
	            title   => \%title_field,
	            content => \%content_field,
	            url     => \%url_field,
	        },
	    );

	    # remove an existing index, if it exists
	    $es->delete_index( ignore_missing => 1 );

	    # create a new index
	    return $es->create_index( mappings => { 'post' => \%post_mapping } );
	}