Skip to content

Instantly share code, notes, and snippets.

@drobune
Last active January 17, 2017 16:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save drobune/644605702413c2f2f4077a7b179a47ee to your computer and use it in GitHub Desktop.
Save drobune/644605702413c2f2f4077a7b179a47ee to your computer and use it in GitHub Desktop.
require 'open-uri'
require 'nokogiri'
require 'json'
# Downloads the page at +url+ and converts every `.entryTd` element found in
# it into an article hash.
#
# For each entry:
# * the title is the first "\r\n"-separated line of the entry text, stripped;
# * the lines are all text lines of the entry followed by one "#word" tag per
#   whitespace-separated word of the title, with whitespace removed from each
#   line and empty lines dropped;
# * any line containing "youtube.com" is wrapped in square brackets;
# * if the `.entryTitle` element at the same index has a link with an href,
#   a final "ソース<href>" line is appended.
#
# Entries whose text yields no lines at all are skipped.
#
# @param url [String] address of the page to download
# @return [Array<Hash>] one `{ title: String, lines: Array<String> }` per entry
def get_content(url)
  charset = nil
  # Use the explicit open-uri entry point: Kernel#open no longer accepts
  # URLs on Ruby 3.0 and later.
  html = URI.open(url) do |f|
    charset = f.charset
    f.read
  end

  doc = Nokogiri::HTML.parse(html, nil, charset)
  # Looked up once; entries and titles are paired by position.
  title_nodes = doc.css('.entryTitle')
  articles = []

  doc.css('.entryTd').each_with_index do |ent, i|
    raw_lines = ent.text.split("\r\n")
    # An entry without any text has no title to build a page from.
    next if raw_lines.empty?

    title = raw_lines.first.strip
    # split(' ') already discards empty fragments, so no extra cleanup needed.
    tags = title.split(' ').map { |word| '#' + word }

    lines = (raw_lines + tags)
            .map { |l| l.gsub(/(\s| )+/, '') }
            .reject(&:empty?)
            .map { |l| l.include?('youtube.com') ? "[#{l}]" : l }

    # Source URL: tolerate a missing title node, link, or href attribute.
    title_node = title_nodes[i]
    link = title_node && title_node.css('a').first
    source = link && link['href']
    lines.push('ソース' + source) if source

    articles << { title: title, lines: lines }
  end

  articles
end
# Walk index pages 1 through 213 of the blog, collecting the converted
# entries of every page into one flat list.
base_url = 'http://blogs.yahoo.co.jp/oya_ji3/MYBLOG/yblog.html?m=lc&p='
articles = 1.upto(213).flat_map do |page|
  entries = get_content("#{base_url}#{page}")
  sleep 1 # pause for a second after each page request
  entries
end

# Print everything as a single JSON document with the entries under "pages"
# (presumably the shape expected by a Scrapbox import; confirm before relying on it).
scrapbox = { pages: articles }
puts JSON.generate(scrapbox)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment