Skip to content

Instantly share code, notes, and snippets.

@edvakf
Created March 10, 2009 17:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edvakf/77015 to your computer and use it in GitHub Desktop.
Save edvakf/77015 to your computer and use it in GitHub Desktop.
#!/usr/local/bin/ruby
require 'open-uri'
require 'uri'
require 'rss'
require 'digest/md5'
# Reads a previously written results file into a Hash.
#
# Each line of the file has the form "count<TAB>domain". Blank lines and
# lines without a domain column are skipped.
#
# @param path [String] location of the tab-separated results file
# @return [Hash{String => Integer}] domain => count; empty when the file
#   does not exist
def load_results(path)
  counts = {}
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
  return counts unless File.exist?(path)

  File.foreach(path) do |line|
    # each_line/foreach keep the trailing newline, so strip it before
    # deciding whether the line carries any data.
    count, domain = line.chomp.split("\t")
    next if domain.nil? || domain.empty?

    counts[domain] = count.to_i
  end
  counts
end

# Per-item cache files live in ./cache next to this script.
cache_dir = File.dirname(__FILE__) + '/cache'
Dir.mkdir(cache_dir) unless File.exist?(cache_dir)

# Running tallies from earlier runs are kept in ./results.txt.
result_file = File.dirname(__FILE__) + '/results.txt'
results = load_results(result_file)
# Fetch the aggregated feed and tally which domains its items link to.
# For local debugging, a saved copy can be read instead:
#   File.open('./hoge.txt') do |doc|
#
# Kernel#open no longer accepts URLs (Ruby 3.0+); URI.open from open-uri is
# the supported way to read a remote resource.
URI.open('http://pipes.yahoo.com/pipes/pipe.run?_id=OoFUOg8N3hGZoDfPrbQIDg&_render=rss') do |doc|
  rss = RSS::Parser.parse(doc.read)

  # Replacement table for descriptions that still contain escaped markup.
  # NOTE(review): &quot; is mapped to a single quote; the href pattern below
  # accepts either quote character, so link extraction is unaffected.
  entities = { '&lt;' => '<', '&gt;' => '>', '&quot;' => "'" }

  rss.items.each do |item|
    # An item is identified by the MD5 of its serialized form; a cache file
    # with that name means the item was already counted on an earlier run.
    cache_path = File.join(cache_dir, Digest::MD5.hexdigest(item.to_s))
    next if File.exist?(cache_path)

    File.open(cache_path, 'w+') { |f| f.print(item.to_s) }

    description = item.description
    next if description.nil? || description.empty?

    # A description that already contains <, > or ' is used as-is; otherwise
    # the escaped entities are converted back before scanning for anchors.
    html = if description =~ /<|>|'/
             description
           else
             description.gsub(/&lt;|&gt;|&quot;/) { |entity| entities[entity] }
           end

    # Each match is [whole anchor markup, href value].
    html.scan(/(<a[^>]*href=['"](.*?)['"].*?<\/a)/).each do |anchor, href|
      # Anchors that only wrap an image are not counted.
      next if anchor =~ /<a.*?>\s*<img.+?>\s*<\/a/

      domain = begin
        # Relative hrefs are attributed to the host of the item's own link.
        target = (href =~ %r{^https?://}) ? href : item.link
        URI.parse(target).host
      rescue StandardError
        # Best effort: a link that cannot be parsed is simply not counted.
        nil
      end
      next if domain.nil? || domain.empty?

      results[domain] = (results[domain] || 0) + 1
    end
  end
end
# Drop entries without a usable domain, rank the rest by hit count
# (highest first), and render each one as "count<TAB>domain".
countable = results.to_a.reject { |pair| !pair[0] || pair[0].empty? }
ranked = countable.sort { |left, right| right[1] <=> left[1] }
text = ranked.map { |domain, count| "#{count}\t#{domain}" }.join("\n")

# Overwrite the results file with the fresh ranking (no trailing newline).
File.open(result_file, 'w+') { |out| out.print(text) }
# Evict cache files whose last access time is more than a week old.
max_age = 60 * 60 * 24 * 7 # one week, in seconds
Dir.foreach(cache_dir) do |entry|
  # Dir.foreach yields bare entry names, so build the full path before
  # touching the file; otherwise the delete resolves against the current
  # working directory instead of the cache directory.
  path = File.join(cache_dir, entry)

  # "." and ".." (and any other non-file entries) are not cache files.
  next unless File.file?(path)

  File.delete(path) if Time.now - File.atime(path) > max_age
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment