Skip to content

Instantly share code, notes, and snippets.

@danchoi
Created March 23, 2009 18:17
Show Gist options
  • Save danchoi/83702 to your computer and use it in GitHub Desktop.
Save danchoi/83702 to your computer and use it in GitHub Desktop.
# A syndicated feed identified by its feed_url. Downloads the raw XML through
# Feedzirra, parses it, stores entries as items, and records how long each
# download took. Failures are recorded as FeedError rows instead of being
# raised to the caller.
class Feed < ActiveRecord::Base
  has_many :items, :order => "published_at desc", :dependent => :delete_all
  has_many :feed_errors
  # This is problematic because on update, the feed_url can be changed. It can
  # be changed to the feed_url of another feed. TODO Figure out a way to
  # automatically merge two feeds in this case.
  # validates_uniqueness_of :feed_url
  validates_presence_of :feed_url
  has_many :feed_discoveries, :dependent => :destroy
  has_many :subscriptions, :dependent => :destroy

  named_scope :popular, :order => 'subscriptions_count desc', :limit => 50
  named_scope :subscribed_to, :order => "subscriptions_count desc, feeds.title asc",
                              :conditions => "subscriptions_count > 0"

  # The parsed Feedzirra feed object from the most recent #parse (aliased as ff).
  attr_reader :feedzirra_feed
  alias :ff :feedzirra_feed
  # encoding: upper-cased encoding name taken from the XML declaration, or nil.
  # xml: the raw document fetched by the most recent #fetch_and_parse.
  attr_reader :encoding, :xml

  serialize :download_times

  # Before calling this, create or initialize the feed first by setting the
  # feed_url. This method will add or update entries as necessary.
  #
  # Any StandardError raised while fetching or parsing is recorded through
  # create_feed_error_from_exception, in which case this returns false.
  # Signals, SystemExit and the like are deliberately NOT rescued, so a
  # long-running update can still be interrupted.
  def fetch_and_parse
    @xml = fetch_raw(self.feed_url)
    parse
  rescue StandardError => ex
    create_feed_error_from_exception(ex)
  end

  # True when the feed was last downloaded more than an hour ago. A feed that
  # has never been downloaded (nil timestamp) is considered stale.
  def stale?
    last_downloaded_at.nil? || last_downloaded_at < 1.hour.ago
  end

  # Truthy when the median download time is known and greater than 5
  # (the value is a difference of two Time objects, i.e. seconds).
  def slow?
    median_download_time && median_download_time > 5
  end

  # deletes items and starts over
  def start_over
    items.delete_all
    fetch_and_parse
  end

  # Downloads the raw document at +url+ via Feedzirra and returns whatever
  # Feedzirra::Feed.fetch_raw returned. The elapsed time is recorded (and the
  # record saved) as a side effect.
  def fetch_raw(url)
    start_time = Time.now
    raw = Feedzirra::Feed.fetch_raw(url)
    calculate_download_times(start_time)
    raw
  end

  # Appends the time elapsed since +start_time+ to download_times, refreshes
  # the stored mean and median, and saves the record (raising on failure).
  def calculate_download_times(start_time)
    # The serialized column may be nil or hold something other than an Array.
    self.download_times = [] unless self.download_times.is_a?(Array)
    self.download_times = (self.download_times << (Time.now - start_time))
    self.average_download_time = mean(download_times)
    self.median_download_time = median(download_times)
    save!
  end

  # Parses +xml+ (defaults to the most recently fetched document) and, when
  # Feedzirra yields a feed, updates this record and its items from it.
  # A Feedzirra::NoParserAvailable error is recorded as a feed error and
  # makes this method return false.
  def parse(xml=@xml)
    @feedzirra_feed = Feedzirra::Feed.parse xml
    if @feedzirra_feed
      # No errors, means we can create the Feed.
      # Inspect the document that was actually parsed, not a stale @xml.
      analyze_xml_decl(xml)
      self.update_from_feedzirra
    end
  rescue Feedzirra::NoParserAvailable => ex
    create_feed_error_from_exception(ex)
  end

  # Extracts the encoding named in the XML declaration of +xml+ (defaults to
  # the most recently fetched document), e.g.
  #   <?xml version="1.0" encoding="ISO-8859-1"?>
  # Stores it upper-cased in @encoding, or nil when there is no declaration
  # or no encoding attribute. Returns the stored value.
  def analyze_xml_decl(xml=@xml)
    @encoding = nil
    declaration_match = /<\?xml [^>]+\?>/.match(xml)
    return @encoding unless declaration_match

    encoding_match = /encoding=["']([^"']+)["']/i.match(declaration_match[0])
    @encoding = encoding_match ? encoding_match[1].upcase : nil
  end

  # Records +ex+ as a FeedError. An unsaved feed cannot own associated rows,
  # so in that case the error is stored standalone together with the feed_url.
  # Always returns false so callers can use it as their failure value.
  def create_feed_error_from_exception(ex)
    if self.new_record?
      FeedError.create :error_type => ex.class.to_s, :message => ex.message, :feed_url => self.feed_url, :trace => ex.backtrace
    else
      self.feed_errors.create :error_type => ex.class.to_s, :message => ex.message, :trace => ex.backtrace
    end
    false
  end

  # Assumes @feedzirra_feed has been obtained
  def update_from_feedzirra
    #ff.sanitize_entries! # for some reason, this inserts <p> tags in the
    #titles!
    self.update_attributes :last_downloaded_at => Time.now,
                           :title => ff.title,
                           :etag => ff.etag,
                           :web_url => ff.url,
                           :last_modified_at => ff.last_modified
    # Adopt the feed_url reported by the parsed feed when it provides one.
    self.update_attribute :feed_url, ff.feed_url if ff.feed_url
    asciify_title
    if self.items.empty?
      save!
      create_items(ff.entries)
    else
      add_new_items
    end
    update_statistics
    save!
  end

  # Creates an item for each of the first 20 entries in +ff_entries+ and runs
  # process_content on every created item.
  def create_items(ff_entries)
    ff_entries[0,20].each do |e|
      logger.info "-" * 80
      logger.info "Creating item: #{e.title} at #{Time.zone.now}"
      item = self.items.create :published_at => e.published,
                               :title => e.title,
                               :url => e.url,
                               :author => e.author,
                               :summary => e.summary,
                               :content => e.content
      item.process_content
      logger.info "Done processing item: #{e.title} at #{Time.zone.now}"
    end
  end

  # Creates items for parsed entries published after the first existing item
  # (items are ordered by published_at desc). Expects at least one item to
  # exist already. Entries without a publication date are skipped, because
  # they cannot be compared against the cutoff.
  def add_new_items
    return if ff.entries.empty?
    newer_than_date = self.items.first.published_at || self.items.first.created_at
    if ff.entries.first.published.to_i > newer_than_date.to_i
      self.create_items( ff.entries.select {|e| e.published && e.published > newer_than_date} )
    end
  end

  # Recomputes average_words_per_item from the items' word_count, rounded to
  # an integer.
  def update_statistics
    self.average_words_per_item = self.items.with_content.average(:word_count).to_f.round.to_i
    # TODO posting frequency and avg words per day
  end

  # Refreshes every feed in the subscribed_to scope, skipping feeds that were
  # downloaded within the last hour. Feeds that have never been downloaded
  # (nil timestamp) are always refreshed.
  def self.update_subscribed_feeds
    self.subscribed_to.each do |feed|
      if feed.last_downloaded_at && feed.last_downloaded_at > 1.hour.ago
        puts "skipping #{feed.title}"
        next
      end
      feed.fetch_and_parse
      puts "updated #{feed.title}"
    end
  end

  # Strips the title and persists a US-ASCII transliteration of it.
  # NOTE(review): #title falls back to "no title", so a feed whose stored
  # title is NULL gets that placeholder written to the database here --
  # confirm this is intended.
  def asciify_title
    self.title = self.title.strip if self.title
    self.update_attribute :title, Iconv.conv("US-ASCII//IGNORE//TRANSLIT", 'UTF-8', self.title)
  end

  # Runs asciify_title on every feed, printing progress to stdout.
  def self.asciify_titles
    Feed.all.each do |x|
      puts "asciifying #{x.title}"
      x.asciify_title
      puts "asciified #{x.title}"
      puts
    end
  end

  # virtual attributes

  # The stored title, or the placeholder "no title" when none is stored.
  def title
    read_attribute(:title) || "no title"
  end

  # for calculating median and mean download times

  # Arithmetic mean of +array+ as a Float.
  def mean(array)
    array.inject(0) { |sum, x| sum + x } / array.size.to_f
  end

  # Median of +array+, or nil when it is empty. Pass already_sorted=true to
  # skip sorting. For an even number of elements the two middle values are
  # averaged.
  def median(array, already_sorted=false)
    return nil if array.empty?
    array = array.sort unless already_sorted
    m_pos = array.size / 2
    array.size % 2 == 1 ? array[m_pos] : mean(array[m_pos-1..m_pos])
  end
end
# Schema for the "feeds" table backing the Feed model. Consecutive columns of
# the same type (and same options) are declared in one call; column order is
# unchanged.
ActiveRecord::Schema.define(:version => 20090226004897) do
  create_table("feeds", :force => true) do |t|
    t.datetime "last_downloaded_at", "last_modified_at"
    t.string   "title", "subtitle", "feed_url", "web_url", "favicon_url"
    t.integer  "average_words_per_item", "average_items_per_day"
    # Counters start at zero.
    t.integer  "unparseable_entries_count", "subscriptions_count", :default => 0
    t.datetime "created_at", "updated_at"
    t.string   "etag"
    # Text column; Feed declares `serialize :download_times` on it.
    t.text     "download_times"
    t.float    "average_download_time", "median_download_time"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment