Skip to content

Instantly share code, notes, and snippets.

@fuba
Created December 25, 2008 03:44
Show Gist options
  • Save fuba/39847 to your computer and use it in GitHub Desktop.
Save fuba/39847 to your computer and use it in GitHub Desktop.
いま自分ではつかってないのでメンテナやる人がいたらてきとうにforkしてください、ダイアリーとかからも誘導しときます
- module: Filter::EntryFullText::LDRFullFeed
  config:
    force_upgrade: 1
    alternative_siteinfo: http://utatane.appjet.net/databases/LDRFullFeed/items.json
    force_alternative_siteinfo: 1
package Plagger::Plugin::Filter::EntryFullText::LDRFullFeed;
use strict;
use base qw( Plagger::Plugin::Filter::EntryFullText );
use JSON;
use Plagger::UserAgent;
use WebService::Wedata;
# Return the class id used for this plugin.
# With the 'impersonate' config option set, the plugin reports the fixed id
# "Filter-EntryFullText"; otherwise the inherited class_id is used.
sub class_id {
    my ($self) = @_;

    return "Filter-EntryFullText" if $self->conf->{impersonate};
    return $self->SUPER::class_id;
}
# Load the plugins handled by the parent class, then append the rules built
# from siteinfo. Any extra arguments are forwarded to the parent unchanged.
sub load_plugins {
    my ($self, @parent_args) = @_;

    $self->SUPER::load_plugins(@parent_args);
    $self->load_plugin_siteinfo;
}
# Turn every siteinfo rule returned by _siteinfo into a SiteInfo plugin object
# and append it to $self->{plugins}. Logs a warning and does nothing when no
# siteinfo is available.
sub load_plugin_siteinfo {
    my ($self) = @_;

    my $rules = $self->_siteinfo;
    unless ($rules) {
        Plagger->context->log(warn => "No siteinfo");
        return;
    }
    Plagger->context->log(debug => "Loaded siteinfo");

    my @site_plugins;
    for my $rule (@{$rules}) {
        push @site_plugins,
            Plagger::Plugin::Filter::EntryFullText::SiteInfo->new($rule);
    }
    push @{ $self->{plugins} }, @site_plugins;
}
# Build the list of extraction rules from LDRFullFeed siteinfo.
#
# Items are fetched from the Wedata 'LDRFullFeed' database. When that fails,
# or when the 'force_alternative_siteinfo' config option is set, the JSON
# document at the 'alternative_siteinfo' URL is used instead. The legacy
# config key 'siteinfo' is still honoured as a fallback for that URL.
#
# Each item is expected to be a hash(-like) ref with a 'data' hash holding
# 'url', 'xpath' and optionally 'type' / 'priority'.
#
# Returns an array ref of { handle => ..., extract_xpath => { body => ... } }
# rules sorted by ascending priority, or undef when no rules could be loaded.
sub _siteinfo {
    my $self = shift;
    my $conf = $self->conf;

    my $ua     = Plagger::UserAgent->new;
    my $wedata = WebService::Wedata->new;
    $wedata->{ua} = $ua;

    # Default priority per siteinfo type (long and abbreviated spellings).
    my %priority = qw/
        SBM        1000
        INDIVIDUAL 100
        IND        100
        SUBGENERAL 10
        SUB        10
        GENERAL    1
        GEN        1
    /;

    my $items_ref;
    if ($conf->{force_alternative_siteinfo}) {
        Plagger->context->log(debug => "Skipping Wedata (force_alternative_siteinfo)");
    }
    else {
        # Keep get_items inside the eval so that a failure at either step
        # falls back to the alternative siteinfo instead of dying.
        $items_ref = eval { $wedata->get_database('LDRFullFeed')->get_items };
        Plagger->context->log(debug => "Wedata seems down") unless $items_ref;
    }

    unless ($items_ref) {
        my $url = $conf->{alternative_siteinfo} || $conf->{siteinfo};
        unless ($url) {
            Plagger->context->log(warn => "No alternative_siteinfo configured");
            return;
        }

        my $res = $ua->get($url);
        unless ($res->is_success) {
            Plagger->context->log(
                warn => sprintf("Failed to fetch %s: %s", $url, $res->status_line)
            );
            return;
        }

        # The document may be wrapped in parentheses; strip them before parsing.
        my $json = $res->decoded_content;
        $json =~ s|^\(||;
        $json =~ s|\)$||;

        $items_ref = eval { from_json($json) };
        unless (ref $items_ref eq 'ARRAY') {
            Plagger->context->log(
                warn => "Can't parse siteinfo from $url" . ($@ ? ": $@" : '')
            );
            return;
        }
    }

    # Give every item a numeric priority: an explicit one wins, then the
    # default for its type, then 0 for a missing or unknown type.
    for my $item (@{$items_ref}) {
        my $type = $item->{data}->{type};
        $item->{data}->{priority}
            ||= (defined $type && $priority{$type}) || 0;
    }

    my @rules;
    for my $item (
        sort { $a->{data}->{priority} <=> $b->{data}->{priority} }
        @{$items_ref}
    ) {
        my $type = $item->{data}->{type};
        Plagger->context->log(
            debug => sprintf(
                'siteinfo: %s %s %s',
                $item->{data}->{url},
                $item->{data}->{xpath},
                defined $type ? $type : ''
            )
        );
        push @rules, {
            handle        => $item->{data}->{url},
            extract_xpath => {
                body => $item->{data}->{xpath}
            },
        };
    }

    return @rules ? \@rules : undef;
}
package Plagger::Plugin::Filter::EntryFullText::SiteInfo;
use base 'Plagger::Plugin::Filter::EntryFullText::YAML';
# Extract fields from fetched page content according to this rule.
#
# Arguments:
#   $self - the rule object; reads the keys extract (regex), extract_capture
#           (whitespace-separated capture names), extract_xpath (hash of
#           field name => XPath), extract_after_hook, extract_date_format and
#           extract_date_timezone.
#   $args - hash ref; $args->{content} is the text to extract from.
#
# Returns a hash ref of field name => extracted string (with 'date' converted
# to a Plagger::Date when present), or nothing when the rule has no extraction
# settings, HTML::TreeBuilder::XPath is unavailable, or nothing matched.
sub extract {
    # NOTE: $self, $args and $data are visible to the string eval of
    # extract_after_hook below, so these names must not be changed.
    my($self, $args) = @_;
    my $data;

    unless ($self->{extract} || $self->{extract_xpath}) {
        Plagger->context->log(error => "SiteInfo doesn't have either 'extract' nor 'extract_xpath'");
        return;
    }

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            # split ' ' skips leading whitespace, so no empty capture name
            # can be produced from a padded extract_capture value.
            my @capture = defined $self->{extract_capture}
                ? split(' ', $self->{extract_capture})
                : ();
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        unless (eval { require HTML::TreeBuilder::XPath; 1 }) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my $xpath    = $self->{extract_xpath}->{$capture};
            my @children = $tree->findnodes($xpath);
            unless (@children) {
                Plagger->context->log(error => "Can't find node matching $xpath");
                next;
            }

            # Serialize with this package's escaper while as_XML runs.
            no warnings 'redefine';
            local *HTML::Element::_xml_escape = \&xml_escape;

            # Store under the field name being captured (e.g. 'body'), so the
            # caller can find the result.
            $data->{$capture} = join '',
                map { $_->isElementNode ? $_->as_XML : $_->getValue } @children;
        }

        # Release the parsed tree explicitly once all fields are serialized.
        $tree->delete;
    }

    return unless $data;

    if ($self->{extract_after_hook}) {
        # SECURITY: this evaluates rule-supplied Perl source. The rules built
        # by _siteinfo in this file set only 'handle' and 'extract_xpath';
        # never populate extract_after_hook from untrusted data.
        eval $self->{extract_after_hook};
        Plagger->context->error($@) if $@;
    }

    if ($data->{date}) {
        if (my $format = $self->{extract_date_format}) {
            $format = [ $format ] unless ref $format;
            # Try each format and keep the first result of the list.
            $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
            if ($data->{date} && $self->{extract_date_timezone}) {
                $data->{date}->set_time_zone($self->{extract_date_timezone});
            }
        } else {
            $data->{date} = Plagger::Date->parse_dwim($data->{date});
        }
    }

    return $data;
}
# Escape every argument in place with Plagger::Util::encode_xml.
# Relies on @_ aliasing the caller's variables, so the caller's values are
# overwritten; the return value is not meaningful.
sub xml_escape {
    $_ = Plagger::Util::encode_xml($_) for @_;
    return;
}
1;
__END__
=head1 NAME

Plagger::Plugin::Filter::EntryFullText::LDRFullFeed - Upgrade feeds to fulltext class by using LDRFullFeed siteinfo

=head1 SYNOPSIS

  - module: Filter::EntryFullText::LDRFullFeed
    config:
      force_upgrade: 1
      alternative_siteinfo: http://utatane.appjet.net/databases/LDRFullFeed/items.json

=head1 DESCRIPTION

=head1 CONFIG

=over 4

=item impersonate

=item store_html_on_failure

=item force_upgrade

=item alternative_siteinfo

This module uses the alternative siteinfo if Wedata is down.

=item force_alternative_siteinfo

=back

=head1 AUTHOR

fuba

=head1 SEE ALSO

L<Plagger>, L<Plagger::Plugin::Filter::EntryFullText>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment