Created
May 13, 2014 06:02
-
-
Save noah/49b92bf6d7269070e57e to your computer and use it in GitHub Desktop.
html-to-markdown.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Rewrites the scraped Markdown copy of every post linked from the blog's
# archive page into _posts/<basename>.md, with a metadata header on top.
#
# Reads:  <base>/archive.html over HTTP and ./scrape/<basename>.md from disk.
# Writes: _posts/<basename>.md, and prints each output path to stdout.
#
# NOTE(review): chronic is required but never referenced in this script —
# confirm nothing depends on it before dropping it.
%w[nokogiri open-uri chronic].each { |lib| require lib }
require 'date' # Date.strptime is used below; don't rely on a gem pulling it in.

base = "https://blog.tilton.co"

# Kernel#open stopped accepting URLs via open-uri in Ruby 3.0; URI.open is the
# supported entry point.
doc = Nokogiri::HTML(URI.open("#{base}/archive.html"))

# Patterns for the boilerplate removed from each scraped file. They do not
# depend on the post, so they are built once here rather than once per post.
# MULTILINE lets `.` cross newlines for the two multi-line blocks.
js_notice      = Regexp.new("\n\nPlease enable JavaScript to view the..*tilton@gmail.com\\).\n", Regexp::MULTILINE)
title_block    = Regexp.new("---.*{#post-title}\n----+$\n\n", Regexp::MULTILINE)
edited_heading = /\n\n(\#\#\# edited at )(.*)( {\#post-edited})$/
# Capture 2 is the text between "### " and the trailing attribute braces.
created_heading = /^.*(\#\#\# )(.*)( {.*{\#post-created})$/
created_line    = /.*{\#post-created}.*\n\n/

doc.search('section.archives a').each do |post|
  post_link = post[:href]
  post_title = post.text

  # The element following the link carries the tags: the text before the first
  # ';' is a comma-separated list. A missing element or empty text means no tags.
  tags_field = post.next_element&.text&.split(';')&.first
  post_tags = tags_field ? tags_field.split(/,\s+/) : []

  post_basename = File.basename(post_link, File.extname(post_link))

  # The scrape files are presumably produced by a pandoc run like the disabled
  # line below (kept from the original; `md` is not defined anywhere here).
  #markdown = system("pandoc -s -r html #{base}/#{post_link} -o ./scrape/#{md}")
  infile = "./scrape/#{post_basename}.md"
  outfile = "_posts/#{post_basename}.md"

  # Strip the boilerplate in the original order: JS notice, title block, then
  # the "edited at" heading.
  body = File.read(infile)
             .gsub(js_notice, '')
             .gsub(title_block, '')
             .gsub(edited_heading, '')

  # Pull the creation date out of the {#post-created} heading before that line
  # is removed from the body.
  created_text = body[created_heading, 2]
  raise ArgumentError, "no {#post-created} heading found in #{infile}" unless created_text

  # The format has no time directives and Date has no time of day, so
  # %H:%M:%S always renders as 00:00:00.
  # NOTE(review): the original appended " 12:00:00" to the parsed string; if a
  # noon timestamp was intended, this needs Time/DateTime — confirm.
  post_created = Date.strptime(created_text, '%a %b %d, %Y').strftime('%Y-%m-%d %H:%M:%S')
  body = body.gsub(created_line, '')

  # Drop the first three dash-separated fields of the basename (presumably a
  # YYYY-MM-DD prefix — confirm); drop(3) yields "" instead of raising when
  # there are fewer than three fields.
  post_slug = post_basename.split('-').drop(3).join('-')

  puts outfile
  File.write(outfile, <<~EOF)
    created: #{post_created}
    edited:
    title: #{post_title}
    slug: #{post_slug}
    draft: False
    tags: #{post_tags.join("\n")}
    #{body}
    <!--
    vim: ft=markdown
    -->
  EOF
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment