|
require 'date'
require 'open-uri'
require 'uri'

require 'nokogiri'
require 'voog_api'

require './voog_assets_uploader.rb'
|
|
|
# Matches <img> tags and captures the value of their double-quoted src attribute.
# The trailing slash is optional so both `<img ... />` and `<img ...>` are found,
# and the match is case-insensitive (`<IMG SRC="...">`).
IMG_REGEX = /<img.*?src="(.*?)".*?\/?>/i
|
|
|
# Site-specific URL for all media assets.
# Find this from the assets panel.
@voog_media_host = '//media.voog.com/0000/0000/0000/photos'

# Path of the exported XML file the articles are read from
@xml_filename = 'DATA.xml'

# Voog site host and API token. Both may be overridden through the VOOG_HOST and
# VOOG_API_TOKEN environment variables so real credentials need not be written
# into this file; the placeholders below are used when the variables are unset.
# NOTE(review): presumably consumed by the API client set up in
# voog_assets_uploader.rb - confirm.
@voog_host = ENV.fetch('VOOG_HOST', 'MYSITE.voog.com')
@voog_token = ENV.fetch('VOOG_API_TOKEN', 'SUPERSECRET')

# Path of the blog page that the created articles are attached to
@blog_path = 'blog'

# Hosts that are replaced with @voog_media_host and whose assets are re-uploaded to Voog
@replaceable_hosts = [
  'PREVIOUSSITE.com',
  '...'
]

# Folder for the downloaded files
@assets_folder = 'wp_content'
|
|
|
# Tweak these to match the XML structure

# Returns the text of the item's <title> element.
# Raises NoMethodError when the item has no such element.
def get_title(item)
  item.at_css('title').text
end
|
|
|
# Returns the text of the item's `content:encoded` element with every newline
# turned into a <br/> tag (see newlines_to_brs).
def get_body(item)
  newlines_to_brs(item.at_xpath('content:encoded').text)
end
|
|
|
# Returns the text of the item's `excerpt:encoded` element, or an empty string
# when the item has no excerpt element at all.
def get_excerpt(item)
  excerpt_node = item.at_xpath('excerpt:encoded')
  excerpt_node ? excerpt_node.text : ''
end
|
|
|
# Returns the text of the item's `wp:post_name` element; the main script uses
# it as the path of the created article.
def get_path(item)
  item.at_xpath('wp:post_name').text
end
|
|
|
# Returns the item's <pubDate> formatted as DD.MM.YYYY.
# The element's text is passed to Date.parse explicitly instead of relying on
# the node object being implicitly convertible to a String.
# Raises Date::Error (an ArgumentError) when the text is not a parseable date.
def get_date(item)
  Date.parse(item.at_css('pubDate').text).strftime('%d.%m.%Y')
end
|
|
|
# Returns the sorted, de-duplicated list of image URLs (src attribute values
# captured by IMG_REGEX) found in the item's body.
def get_assets(item)
  get_body(item).scan(IMG_REGEX).flatten.uniq.sort
end
|
|
|
# Utilities
|
|
|
# Converts all literal newlines to <br/> tags.
# Windows line endings (\r\n) count as a single newline, so no stray carriage
# return is left in front of the inserted tag.
def newlines_to_brs(str)
  str.gsub(/\r?\n/, '<br/>')
end
|
|
|
# Returns everything except the filename itself, i.e. the URL up to (but not
# including) the last '/'-separated segment. Returns an empty string when the
# asset has no directory part or is empty.
def get_asset_prefix(asset)
  # A range is used instead of slice(0, length - 1): the latter returns nil
  # for an empty segment list and would crash on join.
  asset.split('/')[0...-1].join('/')
end
|
|
|
# Find all image URLs that are hosted on the original host.
# Collects the asset URLs of every given item, removes duplicates and keeps only
# those whose host part (the URL without a leading http:// or https://) starts
# with one of @replaceable_hosts.
def get_downloadable_asset_urls(items)
  items
    .flat_map { |item| get_assets(item) }
    .uniq
    .select do |url|
      # Strip the scheme only at the very start of the URL
      host_and_path = url.sub(%r{\Ahttps?://}, '')
      @replaceable_hosts.any? { |host| host_and_path.start_with?(host) }
    end
end
|
|
|
# Download all files from given urls to defined directory.
#
# urls      - list of asset URLs; the last path segment is used as the filename
# directory - existing local directory the files are written to
#
# Files that already exist in the directory are skipped. A failed download is
# reported and the remaining URLs are still processed. Nothing is downloaded
# (and a message is printed) when the directory does not exist.
def download_files!(urls, directory)
  unless Dir.exist?(directory)
    puts "Could not download files. Target directory '#{directory}' is missing"
    return
  end

  puts "=== Downloading #{urls.size} assets"

  urls.each.with_index(1) do |url, index|
    url_parts = url.split('/')
    # Store the file under its decoded name, but request it with the name
    # percent-encoded again so special characters survive the round trip.
    filename = URI::DEFAULT_PARSER.unescape(url_parts.pop)
    full_url = (url_parts + [URI::DEFAULT_PARSER.escape(filename)]).join('/')
    target_path = "#{directory}/#{filename}"

    if File.exist?(target_path)
      puts "--> #{index}: #{filename} already downloaded. - (SKIPPING)"
      next
    end

    puts "--> #{index}: Downloading #{url}..."
    begin
      # Fetch first and write afterwards: a failed download must not leave an
      # empty file behind that later runs would treat as already downloaded.
      contents = URI.open(full_url, 'rb', &:read)
      File.binwrite(target_path, contents)
    rescue => e
      puts "Could not download #{url}! (#{e.message.inspect})"
    end
  end
end
|
|
|
# Script entry point: the migration only runs when this file is executed
# directly, not when it is loaded from another script.
# NOTE(review): `client` and `upload_assets` are not defined in this file -
# presumably they come from voog_assets_uploader.rb; confirm.
if __FILE__ == $PROGRAM_NAME
  # Parse the XML file and fetch the articles (block form closes the file)
  doc = File.open(@xml_filename) { |file| Nokogiri::XML(file) }
  items = doc.css('item')

  # Download files that should be migrated to Voog
  download_files!(get_downloadable_asset_urls(items), @assets_folder)

  # Upload them to Voog
  upload_assets(@assets_folder)

  # Construct Voog articles from parsed info
  blog = client.pages.find { |page| page.path == @blog_path }

  if blog && !items.empty?
    items.each do |item|
      title = get_title(item)
      body = get_body(item)
      excerpt = get_excerpt(item)
      path = get_path(item)
      date = get_date(item)

      puts "Creating article '#{title}'..."

      # Replace all original paths with Voog's media paths.
      # Each distinct folder prefix is replaced once, longest first: replacing
      # a shorter prefix first would rewrite the start of a longer nested one
      # and leave its remaining sub-folder in the resulting URL.
      replaceable_urls = get_downloadable_asset_urls([item])
                         .map { |url| get_asset_prefix(url) }
                         .uniq
                         .sort_by { |prefix| -prefix.length }
      puts " Replacing URLs:\n " + replaceable_urls.join("\n ")
      replaceable_urls.each { |prefix| body.gsub!(prefix, @voog_media_host) }

      # Create the article; a failure is reported and the next item is processed
      begin
        client.create_article({
          autosaved_title: title,
          autosaved_excerpt: excerpt,
          autosaved_body: body,
          path: path,
          created_at: date,
          updated_at: date,
          publishing: true,
          published_at: date,
          page_id: blog.id,
          language_id: blog.language_id
        })
        puts " OK!"
      rescue => e
        puts " Something went wrong! #{e.message.inspect}"
      end
    end
  else
    puts "Blog '/#{@blog_path}' not found!" if blog.nil?
    puts "No items found in XML file!" if items.empty?
  end
end
|
|