require 'date'
require 'fileutils'
require 'json'
require 'net/http'
require 'rubygems/text'
require 'uri'

include Gem::Text
# Directory scanned for notes to migrate.
SRC = "./".freeze
# Directory the migrated notes are written to.
DST = "./migrated".freeze
# Matches a colon followed by a run of letters, hyphens or underscores and
# captures the name without the colon. A closing colon is not required.
ORGTAG = /:([a-zA-Z\-_]+)/
# Renders a note as a plain-text metadata header followed by the body.
#
# Title and author are read from the string keys ("title", "authors") and
# fall back to the symbol keys (:title, :author), so notes whose metadata was
# stored under either form render with a title and author.
#
# @param data [Hash] note metadata; reads "title"/:title, "subtitle",
#   "authors"/:author, "publisher", "publishedDate", :identifier, :tags, :id
#   and :body
# @return [String] the rendered note, without a trailing newline
def render(data)
  title = data["title"] || data[:title]
  author = mla_authors(data["authors"]) || data[:author]

  lines = ["title: #{title}"]
  # The subtitle line is only emitted when a subtitle is present.
  lines << "subtitle: #{data["subtitle"]}" if data["subtitle"]
  lines.push(
    "author: #{author}",
    "publisher: #{data["publisher"]}",
    "year: #{data["publishedDate"]}",
    "identifier: #{data[:identifier]}",
    "tags: #{data[:tags]}",
    "id: #{data[:id]}",
    data[:body].to_s
  )
  lines.join("\n")
end
# Builds a note id from a timestamp as fourteen digits: YYYYMMDDHHMMSS.
#
# @param datetime [#strftime] a Date, DateTime or Time
# @return [String] e.g. "20200102030405"
def id(datetime)
  datetime.strftime("%Y%m%d%H%M%S")
end
# Strips leading and trailing whitespace from a note body.
#
# @param string [String, nil] raw body text; nil is treated as empty
# @return [String] the trimmed body
def clean_body(string)
  string.to_s.strip
end
# Joins author names into one comma-separated string.
#
# @param authors [Array<String>, String, nil] author names; a single String
#   is treated as a one-element list
# @return [String, nil] the joined names, or nil when +authors+ is nil
def mla_authors(authors)
  Array(authors).join(", ") if authors
end
# Turns a title into a lowercase, dash-separated slug for use in a filename.
#
# Every run of characters other than ASCII letters and digits becomes a single
# dash. Dashes at either end are removed so the slug never starts or ends
# with a separator.
#
# @param string [String] human-readable title
# @return [String] the slug, e.g. "dune-by-frank-herbert"
def clean_title(string)
  string
    .downcase
    .gsub(/[^a-z0-9]+/, "-")
    .gsub(/\A-|-\z/, "")
end
# Returns the first http or https URL found in a piece of text.
#
# @param string [String, nil] text to search; nil is treated as empty
# @return [String, nil] the first URL that is entirely an http(s) URI, or nil
def extract_link(string)
  # Built once per call rather than once per candidate URL.
  http_url = /\A#{URI.regexp(%w[http https])}\z/
  URI.extract(string.to_s).find { |url| url.match?(http_url) }
end
# Converts free text (for example a book category) into a hashtag.
#
# Every run of characters other than ASCII letters becomes a single
# underscore, underscores at either end are dropped, and the result is
# lowercased and prefixed with '#'.
#
# @param str [String] source text
# @return [String] the hashtag, e.g. "#science_fiction"
def string_to_tag(str)
  slug = str
         .gsub(/[^a-zA-Z]+/, "_")
         .gsub(/\A_|_\z/, "")
         .downcase
  "##{slug}"
end
# Walks +folder+ recursively and migrates every Markdown / plain-text book
# note it finds into DST.
#
# Entries named "migrated", ".DS_Store" or "migrate_booknotes.rb" are skipped.
# Sub-directories are descended into. Every other entry is announced on stdout
# and, when its name ends in .md or .txt, migrated.
#
# @param folder [String] directory to scan; when its base name starts with a
#   four-digit year, note dates are moved into that year
# @return [Array<String>] the entry names of +folder+
def process_folder(folder)
  Dir.children(folder).each do |filename|
    next if ["migrated", ".DS_Store", "migrate_booknotes.rb"].include?(filename)

    # Resolve against +folder+ rather than the working directory so that
    # nested folders are detected and descended into correctly.
    path = File.join(folder, filename)
    if File.directory?(path)
      process_folder(path)
    else
      puts "Processing #{filename}…"
      migrate_note(folder, filename) if filename.match?(/\.(?:md|txt)\z/)
    end
  end
end

# Migrates one note: derives its id, date and tags, enriches the metadata from
# the Google Books volumes API, and writes the rendered note into DST.
def migrate_note(folder, filename)
  path = File.join(folder, filename)
  content = File.read(path)

  # Only folders whose name starts with four digits carry a target year.
  year = File.basename(folder)[/\A\d{4}/]&.to_i
  date = note_date(path, content, year)
  data = { id: id(date), date: date.strftime("%a, %e %b %Y %T") }

  # ":tag" markers become hashtags; the colon is then dropped from the body.
  tags = content.scan(ORGTAG).flatten.map { |tag| "##{tag.tr('-', '_').downcase}" }
  tags.push("#book")
  body = content.gsub(ORGTAG, '\1')

  # The filename without its extension doubles as the search query.
  query = File.basename(filename, ".*")
  title, author = split_title_author(query)

  book = closest_book(query, title)
  if book
    data.merge!(book.fetch("volumeInfo", {}))
    data[:identifier] = data.dig("industryIdentifiers", 0, "identifier")
    tags.concat(Array(data["categories"]).map { |category| string_to_tag(category) })
  else
    # render and the output filename read the string keys; the symbol keys
    # are kept as well for anything that still looks them up.
    data["title"] = data[:title] = title
    data["authors"] = [author] if author
    data[:author] = author
  end

  data[:tags] = tags.uniq.join(", ")
  data[:body] = clean_body(body)

  label = [data["title"], mla_authors(data["authors"])].compact.join(" by ")
  FileUtils.mkdir_p(DST)
  File.write(File.join(DST, "#{data[:id]}-#{clean_title(label)}.md"), render(data))
end

# Picks the note's date (from its text, else from the file) and, when +year+
# is given, shifts it into that year by whole months.
def note_date(path, content, year)
  date = date_in_text(content) || file_timestamp(path)
  return date if year.nil? || date.year == year

  puts "Adjusting date... #{date.year}"
  date >> (12 * (year - date.year))
end

# Returns the first date Date.parse can find in +text+, or nil when none parses.
def date_in_text(text)
  # Current versions of the date library reject input longer than 128
  # characters unless the limit is lifted explicitly.
  Date.parse(text, true, Date::ITALY, limit: nil)
rescue StandardError
  begin
    # Retry without the keyword for date libraries that predate +limit:+.
    Date.parse(text)
  rescue StandardError
    nil
  end
end

# Returns the file's creation time as a DateTime, using the modification time
# on platforms where File.birthtime is not implemented.
def file_timestamp(path)
  File.birthtime(path).to_datetime
rescue NotImplementedError
  File.mtime(path).to_datetime
end

# Splits "Title by Author" on the last " by "; the author is nil when absent.
def split_title_author(query)
  title, separator, author = query.rpartition(" by ")
  separator.empty? ? [query.strip, nil] : [title.strip, author.strip]
end

# Queries the Google Books volumes API and returns the result whose title is
# closest (by Levenshtein distance) to +title+, or nil when nothing was found.
def closest_book(query, title)
  uri = URI("https://www.googleapis.com/books/v1/volumes?q=#{URI.encode_www_form_component(query)}")
  # "items" is absent from the response when the search has no results.
  items = JSON.parse(Net::HTTP.get_response(uri).body)["items"] || []
  items.min_by { |item| levenshtein_distance(item.dig("volumeInfo", "title").to_s, title) }
end
# Run the migration over the source directory.
process_folder(SRC)
# (Web-page footer captured along with the script; not part of the program.)
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment