Skip to content

Instantly share code, notes, and snippets.

@ferostabio
Last active December 12, 2015 07:49
Show Gist options
  • Save ferostabio/4739650 to your computer and use it in GitHub Desktop.
Save ferostabio/4739650 to your computer and use it in GitHub Desktop.
Script that uses Nokogiri and DataMapper to fetch and store data about my posts on Pijama Surf
# encoding: utf-8
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'data_mapper'
require 'dm-sqlite-adapter'
require 'date'
require 'set'
require 'json'
# Connect the default DataMapper repository to a SQLite file named pijama.db
# located in the directory the script is launched from.
database_uri = "sqlite3://#{Dir.pwd}/pijama.db"
DataMapper.setup(:default, database_uri)
# One stored post together with the counters the script collects for it.
class Post
  include DataMapper::Resource

  # Auto-incrementing primary key.
  property :id, Serial

  # Descriptive fields; name and url are limited to 140 characters each.
  property :name, String, length: 140
  property :url,  String, length: 140
  property :date, Date

  # Number of whitespace-separated words counted in the post body.
  property :words, Integer

  # Popularity counters.
  property :visits,   Integer
  property :comments, Integer
  property :twitter,  Integer
  property :facebook, Integer
end
# Finalize the declared models before the schema is touched; DataMapper's
# documentation asks for this once every model has been defined.
DataMapper.finalize
# Bring the database schema in line with the declared properties.
DataMapper.auto_upgrade!
# Asks the Facebook Graph endpoint for the share count of +url+.
#
# @param url [String] absolute URL of the page to look up
# @return [Integer] the 'shares' value reported for +url+, or 0 when the
#   response has no entry for +url+ or the entry carries no 'shares' key
# @raise [JSON::ParserError] when the response body is not valid JSON
def facebook_likes(url)
  # The URL travels as a query-string value, so it is form-encoded: characters
  # such as "&", "?" and "#" inside it must not be read as part of the request.
  endpoint = "http://graph.facebook.com/?ids=#{URI.encode_www_form_component(url)}"
  # URI#read comes from open-uri and, unlike Kernel#open, only ever performs a
  # network fetch.
  data = JSON.parse(URI.parse(endpoint).read)
  entry = data[url] || {}
  entry['shares'] || 0
end
# Asks the Twitter URL-count endpoint how many times +url+ was shared.
#
# @param url [String] absolute URL of the page to look up
# @return [Integer, nil] the 'count' value of the JSON response (nil when the
#   response has no 'count' key)
# @raise [JSON::ParserError] when the response body is not valid JSON
def twitter_shares(url)
  # The URL travels as a query-string value, so it is form-encoded: characters
  # such as "&", "?" and "#" inside it must not be read as part of the request.
  endpoint = "http://urls.api.twitter.com/1/urls/count.json?url=#{URI.encode_www_form_component(url)}"
  # URI#read comes from open-uri and, unlike Kernel#open, only ever performs a
  # network fetch.
  JSON.parse(URI.parse(endpoint).read)['count']
end
# Author archive page whose listing links to every post to be tracked.
URL = 'http://pijamasurf.com/author/federico-erostarbe/'

# Distinct post links found in the listing; a Set drops repeated hrefs.
urls = Set.new

# Fetch through URI#open (open-uri) in block form: it only performs a network
# fetch and closes the downloaded body once the document has been parsed.
base = URI.parse(URL).open { |io| Nokogiri::HTML(io) }

listing = base.at_css('#search-list')
# Fail with a readable message instead of a NoMethodError on nil.
raise "No #search-list element found at #{URL}" if listing.nil?

# Links are looked for among the grandchildren of #search-list; nodes without
# an href attribute are ignored.
listing.children.children.each do |node|
  href = node.attributes['href']
  urls << href.content if href
end
# Visit every collected post URL. A post that is not stored yet gets its
# name, date, url and word count filled in; every post then has its comment,
# visit, Twitter and Facebook counters refreshed and is saved.
urls.each do |url|
  post = Post.first(:url => url)

  # The URL was scraped from a page, so it is fetched through URI#open
  # (open-uri) rather than Kernel#open, which would treat a string starting
  # with "|" as a command to run. The block form closes the downloaded body
  # once the document has been parsed.
  post_base = URI.parse(url).open { |io| Nokogiri::HTML(io) }
  info = post_base.at_css('#post-info').children.children

  if post.nil?
    # Nodes whose text starts with one of these two prefixes are left out of
    # the word count.
    pieces = post_base.at_css('.post-content').children.map(&:content)
    text = pieces.reject do |piece|
      piece.start_with?('Twitter del autor', 'También en Pijama Surf')
    end.join

    # The first dd/mm/yyyy match in the serialized .post-above markup is taken
    # as the publication date.
    string_date = post_base.css('.post-above').to_xml(:indent => 5, :encoding => 'UTF-8')[/\d{2}\/\d{2}\/\d{4}/]
    # The post name is whatever precedes the first ' | ' in the page title.
    name = post_base.css('title').children.first.content.split(' | ').first

    post = Post.new
    post.name = name
    post.date = Date.strptime(string_date, '%d/%m/%Y')
    post.url = url
    post.words = text.split.size
  end

  comments_label = post_base.css('#comments-count').children.first.content
  comments = comments_label.split(' ').first.to_i
  # A label starting with 'Un ' has no leading digit and parses as 0;
  # presumably it is the spelled-out singular, so it counts as one comment.
  comments = 1 if comments.zero? && comments_label.start_with?('Un ')

  # The visit count is the last ' | '-separated field of the first info node.
  visits = info.first.content.split(' | ').last.to_i

  post.twitter = twitter_shares url
  post.facebook = facebook_likes url
  post.comments = comments
  post.visits = visits

  # Report a failed save instead of silently dropping the post.
  warn "Could not save post: #{url}" unless post.save
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment