# -*- coding: utf-8 -*-
# = Config example
#
#   - module: Web::GetEntries
#     config:
#       url: http://example.com/
#       extract_xpath:
#         capture: {xpath}
#         split: {xpath}
#         title: {xpath}
#         link: {xpath}
#         description: {xpath}
#         date: {xpath}
#
# NOTE(review): the module name in the example above differs from the class
# defined in this file (Feed::Extract) -- confirm which name is correct.
#
# extract_xpath is optional:
# 1. First try fetching with only url specified (hAtom class names are used
#    to locate the entries).
# 2. If nothing can be extracted that way, or the result is unsuitable,
#    define extract_xpath yourself. When it is given it replaces the hAtom
#    defaults entirely: "split" selects the entry nodes, and the optional
#    "capture" narrows the document before "split" is applied.
#
# = Notes
# When title, link, date, etc. cannot be extracted or are invalid, nil is
# returned and the caller is expected to deal with it.
require "pp"
require "date"
require "time"
require "uri"
require 'yapra/plugin/mechanize_base'

module Yapra::Plugin::Feed
  # Fetches an HTML page and turns each entry found on it into an
  # RSS::RDF::Item, using either user supplied XPath expressions or
  # hAtom microformat class names.
  class Extract < Yapra::Plugin::MechanizeBase
    # Matches hrefs that already carry an http/https scheme and therefore
    # need no resolution against the page URL.
    ABSOLUTE_HTTP_URL = %r{\Ahttps?://}i

    # Builds the default XPath set (hAtom) used when the plugin config has
    # no "extract_xpath" section.
    def initialize
      super
      @xconfig_for_hatom = {
        "capture"     => "//body",
        "split"       => xpath_attr_contains("class", "hentry"),
        "title"       => xpath_attr_contains("class", "entry-title"),
        "link"        => xpath_attr_contains("rel", "bookmark"),
        "description" => xpath_attr_contains("class", "entry-content"),
        "date"        => xpath_attr_contains("class", "published")
      }
    end

    # Returns a relative XPath matching any element whose space separated
    # attribute +attr+ contains the token +value+.
    def xpath_attr_contains(attr, value)
      ".//*[contains(concat(' ',normalize-space(@#{attr}),' '), ' #{value} ')]"
    end

    # Returns the stripped text of the first node matching +xpath+ under
    # +element+, or nil when +xpath+ is nil or nothing matches.
    def get_title(element, xpath)
      node = first_match(element, xpath)
      node ? node.text.strip : nil
    end

    # Returns the href of the first node matching +xpath+ as an absolute URL.
    #
    # Hrefs that already start with http:// or https:// are returned as is.
    # Anything else is resolved against +url+ (the page URL) with URI.join,
    # so root-relative, document-relative and protocol-relative links are
    # all handled. Returns nil when +xpath+ is nil, nothing matches, or the
    # matched node has no href attribute.
    def get_link(element, xpath, url)
      anchor = first_match(element, xpath)
      return nil unless anchor

      href = anchor.attr("href")
      return nil if href.nil?
      return href if ABSOLUTE_HTTP_URL =~ href

      begin
        URI.join(url, href).to_s
      rescue URI::Error
        # The href (or url) is not parseable as a URI; fall back to plain
        # concatenation with "scheme://host" rather than dropping the link.
        url.split("/")[0..2].join("/") + href
      end
    end

    # Returns the first node matching +xpath+ under +element+ (nil when
    # nothing matches), or +element+ itself when +xpath+ is nil.
    def get_description(element, xpath)
      xpath ? element.xpath(xpath)[0] : element
    end

    # Parses the text of the first node matching +xpath+ into a Time.
    #
    # The text goes through Date.parse first, so any time-of-day part in
    # the source text is discarded. Returns nil when +xpath+ is nil, nothing
    # matches, or the text is not a parseable date.
    def get_date(element, xpath)
      node = first_match(element, xpath)
      return nil unless node

      begin
        Time.parse(Date.parse(node.text).to_s)
      rescue ArgumentError
        nil
      end
    end

    # Returns the entry nodes of +page+.
    #
    # +config+ is the plugin config hash; its "extract_xpath" section (or
    # the hAtom defaults) supplies "split", which selects the entries, and
    # the optional "capture", which narrows the document first.
    def get_entries(page, config)
      xconfig = xpath_config(config)
      capture = xconfig['capture']
      root = capture ? page.root.xpath(capture) : page.root
      root.xpath(xconfig['split'])
    end

    # Fetches config["url"] and returns one RSS::RDF::Item per entry found.
    # +data+ is not used. `config` and `agent` are not defined in this file
    # (presumably provided by the base class -- verify).
    def run(data)
      url = config["url"]
      page = agent.get(url)
      xconfig = xpath_config(config)

      get_entries(page, config).map do |entry|
        item = RSS::RDF::Item.new
        item.title       = get_title(entry, xconfig["title"])
        item.link        = get_link(entry, xconfig["link"], url)
        item.description = get_description(entry, xconfig["description"])
        item.date        = get_date(entry, xconfig["date"])
        item
      end
    end

    private

    # Returns the user supplied XPath set from +config+, or the hAtom
    # defaults when "extract_xpath" is absent.
    def xpath_config(config)
      config["extract_xpath"] || @xconfig_for_hatom
    end

    # Returns the first node matching +xpath+ under +element+, or nil when
    # +xpath+ is nil or nothing matches.
    def first_match(element, xpath)
      return nil unless xpath
      element.xpath(xpath)[0]
    end
  end
end