Skip to content

Instantly share code, notes, and snippets.

@reizist
Created February 27, 2017 16:19
Show Gist options
  • Save reizist/4e179b241c501db28bdee26ef9371cda to your computer and use it in GitHub Desktop.
Save reizist/4e179b241c501db28bdee26ef9371cda to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'open-uri'
require 'pry'
require "pry-byebug"
require 'natto'
require 'active_support'
require 'active_support/core_ext'
# Scrapes the entries of a Hatena blog into plain-text files and concatenates
# them into a single digest file.
#
# Files are written relative to the current working directory:
#   doc/<host>/entries/<entry>.txt   one file per blog entry
#   doc/<host>/digest.txt            all entries concatenated
class Parser
  URL = 'http://negineesan.hatenablog.com'

  # @param url [String] root URL of the blog, without a trailing slash
  def initialize(url)
    @url = url
    @host = URI.parse(url).host
    @base_dir = 'doc'
    @save_path = "#{@base_dir}/#{@host}/entries"
  end

  # Downloads every entry linked from the yearly archive pages and stores the
  # text of its "entry-content" div as one file per entry.
  #
  # @param years [Enumerable<Integer>] archive years to crawl
  # @return [Enumerable<Integer>] the years that were crawled
  # @raise [OpenURI::HTTPError, SocketError] when a page cannot be fetched
  def save_entries(years = 2012..2017)
    # Create the target directory once instead of re-checking it per entry.
    FileUtils.mkdir_p(@save_path)

    years.each do |year|
      archive = fetch_document("#{@url}/archive/#{year}")
      archive.xpath('//a[@class="entry-title-link"]').each do |link|
        href = link['href']
        next if href.nil? || href.empty?

        # Resolve against the blog root so relative links work as well.
        entry_url = URI.join(@url, href).to_s
        text = fetch_document(entry_url).xpath('//div[@class="entry-content"]').text
        File.write("#{@save_path}/#{entry_file_name(entry_url)}.txt", text)
      end
    end
  end

  # Concatenates all saved entry files into doc/<host>/digest.txt, one entry
  # after another, in file-name order.
  #
  # @return [Array<String>] paths of the entry files that were merged
  def make_digest
    dir = "#{@base_dir}/#{@host}"
    FileUtils.mkdir_p(dir)
    # Sort explicitly: Dir.glob only guarantees an order on newer Rubies.
    articles = Dir.glob("#{@save_path}/*.txt").sort

    # Block form closes (and therefore flushes) the digest before returning.
    File.open("#{dir}/digest.txt", 'w') do |digest|
      articles.each { |article| digest.puts(File.read(article)) }
    end
  end

  private

  # Fetches +url+ and parses the response as UTF-8 HTML.
  #
  # Goes through URI#read (provided by open-uri) rather than Kernel#open:
  # Kernel#open no longer accepts URLs on Ruby 3 and would execute a command
  # when handed a string starting with "|", which matters because entry links
  # come from scraped pages.
  def fetch_document(url)
    html = URI.parse(url).read
    Nokogiri::HTML.parse(html, nil, 'utf-8')
  end

  # Derives a flat file name from an entry URL: the "/entry/" segment is
  # removed and the remaining slashes become dashes.
  def entry_file_name(entry_url)
    URI.parse(entry_url).path.gsub('/entry/', '').tr('/', '-')
  end
end
# Second-order Markov chain text generator over a MeCab-tokenised corpus.
#
# The transition table maps a pair of consecutive tokens to the list of tokens
# that followed that pair in the corpus (duplicates are kept so sampling is
# frequency-weighted).
class Markov
  DEFAULT_DIGEST = 'doc/negineesan.hatenablog.com/digest.txt'

  # Sentinels are symbols so they can never collide with a real corpus token
  # (tokens are always strings), e.g. the literal words "BEGIN" or "END".
  CHAIN_START = :__begin__
  CHAIN_END = :__end__

  # @param file [String] path of the corpus text file
  # @raise [Errno::ENOENT] when the corpus file does not exist
  def initialize(file = DEFAULT_DIGEST)
    @text = File.read(file)
    @table = {}
    @table_built = false
  end

  # Tokenises the corpus and fills the transition table. The work is done only
  # once; later calls return the already-built table.
  #
  # @return [Hash{Array => Array}] prefix pair => observed following tokens
  def make_markov_table
    return @table if @table_built

    tokens = [CHAIN_START, CHAIN_START]
    Natto::MeCab.new('-Owakati').parse(@text) do |node|
      tokens << node.surface unless node.surface.blank?
    end
    tokens << CHAIN_END

    tokens.each_cons(3) do |first, second, suffix|
      (@table[[first, second]] ||= []) << suffix
    end

    @table_built = true
    @table
  end

  # Generates one passage by walking the chain from the start marker.
  #
  # Generation stops as soon as the walk reaches the token pair that ends the
  # corpus, which keeps the output short. An empty corpus yields "".
  #
  # @return [String] the generated text
  def result
    make_markov_table

    words = []
    prefix = [CHAIN_START, CHAIN_START]
    # The pair whose most recent follower is CHAIN_END is the corpus's final
    # pair. Checking before sampling also covers an empty corpus, where the
    # start pair itself is that final pair.
    until @table.fetch(prefix).last == CHAIN_END
      prefix = [prefix.last, @table.fetch(prefix).sample]
      words << prefix.first unless prefix.first == CHAIN_START
    end
    words << prefix.last unless prefix.last == CHAIN_START
    words.join
  end
end
# Corpus preparation, disabled by default. Uncomment to download the blog
# entries and then merge them into the digest file that Markov reads:
# Parser.new(Parser::URL).save_entries
# Parser.new(Parser::URL).make_digest

# Generate one passage from the existing digest and print it.
generated_text = Markov.new.result
puts generated_text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment