mikkpr/README.markdown

## README.markdown

      
    Raw
  

              README.markdown
            
          
    Migrating Wordpress articles to Voog

Requirements


Ruby (2.0 and newer)
gem install voog_api
gem install mime-types
gem install nokogiri

What to do?


Make sure you have enough free space to temporarily hold all the necessary image assets before re-uploading them to Voog
Make sure the voog_assets_uploader.rb and wordpressxml_to_voog_articles.rb files are in the same folder
Create a new wp_content folder
Acquire a Wordpress XML dump file with all the articles you wish to migrate to Voog and set its name on line 13 in wordpressxml_to_voog_articles.rb
Create a Voog site (if you haven't already) and generate an API token ([Read more here](https://www.voog.com/support/guides/stats-and-maintenance/how-to-generate-an-api-token))
Set the host and token variables in wordpressxml_to_voog_articles.rb, lines 14 and 15.
Inspect any image assets on your Voog site to find out the media server URL for your site's assets, e.g. //media.voog.com/0000/0000/0000/photos, set that on line 11.
Create the blog page on your site where you want the articles moved, set that page's path on line 16.
On line 19, there's a list of domains where the WP articles' content might be hosted. This is used to find which URLs are left intact and which ones are downloaded and re-uploaded to Voog to be hosted there. Minimally, the old WP site's domain should be in the list.
On line 28 and onward, all get_* methods look for specific values at specific selectors in the XML. This might also need tweaking, depending on the XML structure — look at the XML file and see if the selector strings match with the expected values.
If everything looks fine, run the script with ruby wordpressxml_to_voog_articles.rb


## voog_assets_uploader.rb
# This script uploads files to your Voog site over API.
#
# Run this script:
#
#   ruby voog_assets_uploader.rb
#
# Required gems:
#   gem install voog_api
#   gem install mime-types
#
# More about Voog API: http://www.voog.com/developers/api/
require 'voog_api' # v0.0.11 or newer
require 'mime/types'

# Your Voog site host (placeholder; replace with your site's hostname)
# NOTE(review): the articles script uses 'MYSITE.voog.com' as its placeholder - confirm the intended domain.
@voog_host = 'MYSITE.voog.computer'
# Your Voog API token
# Read more: http://www.voog.com/support/guides/developers/developer-account-basics#generate-api-token
@voog_token = 'SUPERSECRET'
# Folder whose files are uploaded when this script is run directly
@assets_folder = 'wp_content'

# Upload every file in +dir+ whose name contains a dot to your Voog site.
# Files whose name matches an existing asset are skipped.
#
# dir - path of the local folder holding the files to upload.
#
# Returns the list of candidate file paths, or nil when +dir+ is missing.
def upload_assets(dir)
  # Dir.exists? was removed in Ruby 3.2; Dir.exist? is the supported spelling.
  unless Dir.exist?(dir)
    puts "Could not upload files. Source directory '#{dir}' is missing"
    return
  end

  files = Dir.glob(File.join(dir, '*.*'))
  puts "Processing #{files.size} files:\n"

  files.each.with_index(1) do |file, index|
    filename = File.basename(file)
    if file_by_name(filename)
      puts "#{index}: #{file} - (SKIPPING)"
      next
    end

    # Pause periodically between uploads: 5s on every 10th file and a
    # further 10s on every 100th.
    sleep 5 if index % 10 == 0
    sleep 10 if index % 100 == 0

    size = File.size(file)
    mime_type = MIME::Types.type_for(file).first
    content_type = mime_type ? mime_type.content_type : 'application/octet-stream'
    puts "#{index}: #{file} - (#{size} - #{content_type})..."
    asset = client.create_asset(filename: filename, size: size, content_type: content_type)

    conn = Faraday.new { |f| f.adapter :net_http }
    conn.headers[:x_amz_acl] = 'public-read'
    conn.headers[:content_type] = content_type
    conn.headers[:content_length] = size.to_s

    # The file body goes to the upload URL returned by create_asset; the asset
    # is confirmed through the API only after that PUT succeeds.
    response = conn.put(asset.upload_url, Faraday::UploadIO.new(file, content_type))

    if response.success?
      client.confirm_asset(asset.id)
    else
      puts "Error on uploading to S3: #{response.body.inspect}"
    end
  end
end

# Memoized list of existing assets, fetched through the API client on first use.
def assets
  return @assets if @assets

  @assets = client.assets
end

# Looks up an existing asset by filename; returns nil when there is none.
# The filename => asset index is built once from `assets` and then reused.
def file_by_name(filename)
  unless @file_by_name
    index = {}
    assets.each { |asset| index[asset.filename] = asset }
    @file_by_name = index
  end

  @file_by_name[filename]
end

# Memoized Voog API client built from @voog_host and @voog_token.
def client
  return @client if @client

  @client = Voog::Client.new(
    @voog_host,
    @voog_token,
    protocol: :http,
    auto_paginate: true,
    raise_on_error: true
  )
end

# Drops the memoized asset list, filename index and API client.
def clear_cache!
  @assets = @file_by_name = @client = nil
end

# Upload only when this file is executed directly, not when it is required.
if $PROGRAM_NAME == __FILE__
  clear_cache!
  upload_assets(@assets_folder)
end

## wordpressxml_to_voog_articles.rb
require 'nokogiri'
require 'date'
require 'voog_api'
require './voog_assets_uploader.rb'
require 'open-uri'

# Captures the src attribute value of self-closing <img ... /> tags
IMG_REGEX = /\<img.*?src="(.*?)".*?\/\>/

# Site-specific URL for all media assets
# Find this from the assets panel
@voog_media_host = '//media.voog.com/0000/0000/0000/photos'

# WordPress XML dump file to read the articles from
@xml_filename = 'DATA.xml'
# Voog site host and API token (placeholders; replace with your own)
@voog_host = 'MYSITE.voog.com'
@voog_token = 'SUPERSECRET'
# Path of the Voog blog page that receives the articles
@blog_path = 'blog'

# Hosts that are replaced with @voog_media_host and whose assets are re-uploaded to Voog
@replaceable_hosts = [
  'PREVIOUSSITE.com',
  '...'
]

# Folder for the downloaded files
@assets_folder = 'wp_content'

# Tweak these to match the XML structure
# Text of the item's <title> element.
def get_title(item)
  title_node = item.at_css('title')
  title_node.text
end

# Text of the item's content:encoded element, with newlines turned into <br/> tags.
def get_body(item)
  raw_body = item.at_xpath('content:encoded').text
  newlines_to_brs(raw_body)
end

# Text of the item's excerpt:encoded element.
def get_excerpt(item)
  excerpt_node = item.at_xpath('excerpt:encoded')
  excerpt_node.text
end

# Text of the item's wp:post_name element, used as the article path.
def get_path(item)
  slug_node = item.at_xpath('wp:post_name')
  slug_node.text
end

# The item's pubDate, parsed and formatted as DD.MM.YYYY.
def get_date(item)
  # Pass the node's text explicitly, like the other get_* helpers, instead of
  # relying on Date.parse implicitly converting the node to a String.
  Date.parse(item.at_css('pubDate').text).strftime('%d.%m.%Y')
end

# Sorted, de-duplicated list of the image src values found in the item's body.
def get_assets(item)
  image_urls = get_body(item).scan(IMG_REGEX).flatten
  image_urls.uniq.sort
end

# Utilities

# Replaces every literal newline character with a <br/> tag.
def newlines_to_brs(str)
  str.gsub(/\n/) { '<br/>' }
end

# Returns everything except the filename itself, i.e. the URL without its
# last '/'-separated segment. Returns '' when there is nothing before the
# filename (e.g. 'file.png' or an empty string).
def get_asset_prefix(asset)
  # A range slice yields [] for an empty list; slice(0, -1) returned nil there
  # and made the following join raise NoMethodError.
  asset.split('/')[0...-1].join('/')
end

# Collects the unique image URLs of the given items that, once any http:// or
# https:// is removed, start with one of @replaceable_hosts.
def get_downloadable_asset_urls(items)
  candidates = items.flat_map { |item| get_assets(item) }.uniq

  candidates.select do |url|
    bare_url = url.gsub(/https?:\/\//, '')
    @replaceable_hosts.any? { |host| bare_url.start_with?(host) }
  end
end

# Download all files from the given urls into +directory+.
#
# The local filename is the URL's last path segment, percent-decoded. Files
# that already exist in +directory+ are skipped. A failed download is reported
# and does not stop the remaining ones.
#
# urls      - list of absolute URL strings.
# directory - existing local folder to save the files into.
#
# Returns +urls+, or nil when +directory+ is missing.
# Requires Ruby 2.5+ (URI.open from open-uri).
def download_files!(urls, directory)
  # Dir.exists?/File.exists? were removed in Ruby 3.2.
  unless Dir.exist?(directory)
    puts "Could not download files. Target directory '#{directory}' is missing"
    return
  end

  puts "=== Downloading #{urls.size} assets"
  # URI.decode/URI.encode were removed in Ruby 3.0; the RFC 2396 parser
  # provides the same escaping rules.
  parser = URI::RFC2396_Parser.new

  urls.each.with_index(1) do |url, index|
    url_parts = url.split('/')
    filename = parser.unescape(url_parts.pop)
    full_url = (url_parts + [parser.escape(filename)]).join('/')
    target = File.join(directory, filename)

    if File.exist?(target)
      puts "--> #{index}: #{filename} already downloaded. - (SKIPPING)"
      next
    end

    puts "--> #{index}: Downloading #{url}..."
    begin
      # Fetch first and create the local file only afterwards, so a failed
      # download cannot leave an empty file that later runs would skip as
      # "already downloaded" and upload.
      data = URI.open(full_url, 'rb', &:read)
      File.binwrite(target, data)
    rescue => e
      puts "Could not download #{url}! (#{e.message.inspect})"
    end
  end
end

# Migration entry point: runs only when this file is executed directly.
if __FILE__ == $0
  # Parse the XML file and fetch the articles. The block form closes the file
  # handle once Nokogiri has read it.
  doc = File.open(@xml_filename) { |file| Nokogiri::XML(file) }
  items = doc.css('item')

  # Download files that should be migrated to Voog
  download_files!(get_downloadable_asset_urls(items), @assets_folder)

  # Upload them to Voog
  upload_assets(@assets_folder)

  # Find the blog page that will receive the articles
  blog = client.pages.find { |page| page.path == @blog_path }

  puts "Blog '/#{@blog_path}' not found!" if blog.nil?
  puts "No items found in XML file!" if items.empty?

  if blog && !items.empty?
    items.each do |item|
      title = get_title(item)
      body = get_body(item)
      excerpt = get_excerpt(item)
      path = get_path(item)
      date = get_date(item)

      puts "Creating article '#{title}'..."

      # Replace all original paths with Voog's media paths. Longer prefixes go
      # first so that a prefix which is itself the start of another one cannot
      # rewrite (and thereby break) the longer URL before it is handled.
      replaceable_urls = get_downloadable_asset_urls([item])
        .map { |url| get_asset_prefix(url) }
        .uniq
        .sort_by { |prefix| -prefix.length }
      puts "  Replacing URLs:\n  " + replaceable_urls.join("\n  ")
      replaceable_urls.each { |prefix| body.gsub!(prefix, @voog_media_host) }

      # Create the article; a failure is reported and the loop continues
      begin
        client.create_article({
          autosaved_title: title,
          autosaved_excerpt: excerpt,
          autosaved_body: body,
          path: path,
          created_at: date,
          updated_at: date,
          publishing: true,
          published_at: date,
          page_id: blog.id,
          language_id: blog.language_id
        })
        puts "  OK!"
      rescue => e
        puts "  Something went wrong! #{e.message.inspect}"
      end
    end
  end
end
	# This script uploads files to your Voog site over API.
	#
	# Run this script:
	#
	# ruby voog_assets_uploader.rb
	#
	# Required gems:
	# gem install voog_api
	# gem install mime-types
	#
	# More about Voog API: http://www.voog.com/developers/api/
	require 'voog_api' # v0.0.11 or newer
	require 'mime/types'

	# Your Voog site host (placeholder; replace with your site's hostname)
	@voog_host = 'MYSITE.voog.computer'
	# Your Voog API token
	# Read more: http://www.voog.com/support/guides/developers/developer-account-basics#generate-api-token
	@voog_token = 'SUPERSECRET'
	# Folder whose files are uploaded when this script is run directly
	@assets_folder = 'wp_content'

	# Upload every file in +dir+ whose name contains a dot to your Voog site.
	# Files whose name matches an existing asset are skipped.
	#
	# dir - path of the local folder holding the files to upload.
	#
	# Returns the list of candidate file paths, or nil when +dir+ is missing.
	def upload_assets(dir)
	  # Dir.exists? was removed in Ruby 3.2; Dir.exist? is the supported spelling.
	  unless Dir.exist?(dir)
	    puts "Could not upload files. Source directory '#{dir}' is missing"
	    return
	  end

	  # The glob pattern is '*.*'; a bare '.' would match only the folder itself.
	  files = Dir.glob(File.join(dir, '*.*'))
	  puts "Processing #{files.size} files:\n"

	  files.each.with_index(1) do |file, index|
	    filename = File.basename(file)
	    if file_by_name(filename)
	      puts "#{index}: #{file} - (SKIPPING)"
	      next
	    end

	    # Pause periodically between uploads: 5s on every 10th file and a
	    # further 10s on every 100th.
	    sleep 5 if index % 10 == 0
	    sleep 10 if index % 100 == 0

	    size = File.size(file)
	    mime_type = MIME::Types.type_for(file).first
	    content_type = mime_type ? mime_type.content_type : 'application/octet-stream'
	    puts "#{index}: #{file} - (#{size} - #{content_type})..."
	    asset = client.create_asset(filename: filename, size: size, content_type: content_type)

	    conn = Faraday.new { |f| f.adapter :net_http }
	    conn.headers[:x_amz_acl] = 'public-read'
	    conn.headers[:content_type] = content_type
	    conn.headers[:content_length] = size.to_s

	    # The file body goes to the upload URL returned by create_asset; the
	    # asset is confirmed through the API only after that PUT succeeds.
	    response = conn.put(asset.upload_url, Faraday::UploadIO.new(file, content_type))

	    if response.success?
	      client.confirm_asset(asset.id)
	    else
	      puts "Error on uploading to S3: #{response.body.inspect}"
	    end
	  end
	end

	# Memoized list of existing assets, fetched through the API client on first use.
	def assets
	  @assets ||= client.assets
	end

	# Looks up an existing asset by filename; returns nil when there is none.
	# The filename => asset index is built once from `assets` and then reused.
	def file_by_name(filename)
	  @file_by_name ||= assets.each_with_object({}) { |e, h| h[e.filename] = e }
	  @file_by_name[filename]
	end

	# Memoized Voog API client built from @voog_host and @voog_token.
	def client
	  @client ||= Voog::Client.new(@voog_host, @voog_token, protocol: :http, auto_paginate: true, raise_on_error: true)
	end

	# Drops the memoized asset list, filename index and API client.
	def clear_cache!
	  @assets = @file_by_name = @client = nil
	end

	# Upload only when this file is executed directly, not when it is required.
	if $PROGRAM_NAME == __FILE__
	  clear_cache!
	  upload_assets(@assets_folder)
	end
	require 'nokogiri'
	require 'date'
	require 'voog_api'
	require './voog_assets_uploader.rb'
	require 'open-uri'

	# Captures the src attribute value of self-closing <img ... /> tags.
	# The lazy `.*?` quantifiers are required: without the `*` the pattern only
	# matches when src directly follows "<img" and the URL is at most one character.
	IMG_REGEX = /\<img.*?src="(.*?)".*?\/\>/

	# Site-specific URL for all media assets
	# Find this from the assets panel
	@voog_media_host = '//media.voog.com/0000/0000/0000/photos'

	# WordPress XML dump file to read the articles from
	@xml_filename = 'DATA.xml'
	# Voog site host and API token (placeholders; replace with your own)
	@voog_host = 'MYSITE.voog.com'
	@voog_token = 'SUPERSECRET'
	# Path of the Voog blog page that receives the articles
	@blog_path = 'blog'

	# Hosts that are replaced with @voog_media_host and whose assets are re-uploaded to Voog
	@replaceable_hosts = [
	  'PREVIOUSSITE.com',
	  '...'
	]

	# Folder for the downloaded files
	@assets_folder = 'wp_content'

	# Tweak these to match the XML structure
	# Text of the item's <title> element.
	def get_title(item)
	  title_node = item.at_css('title')
	  title_node.text
	end

	# Text of the item's content:encoded element, with newlines turned into <br/> tags.
	def get_body(item)
	  raw_body = item.at_xpath('content:encoded').text
	  newlines_to_brs(raw_body)
	end

	# Text of the item's excerpt:encoded element.
	def get_excerpt(item)
	  excerpt_node = item.at_xpath('excerpt:encoded')
	  excerpt_node.text
	end

	# Text of the item's wp:post_name element, used as the article path.
	def get_path(item)
	  slug_node = item.at_xpath('wp:post_name')
	  slug_node.text
	end

	# The item's pubDate, parsed and formatted as DD.MM.YYYY.
	def get_date(item)
	  # Pass the node's text explicitly, like the other get_* helpers, instead of
	  # relying on Date.parse implicitly converting the node to a String.
	  Date.parse(item.at_css('pubDate').text).strftime('%d.%m.%Y')
	end

	# Sorted, de-duplicated list of the image src values found in the item's body.
	def get_assets(item)
	  image_urls = get_body(item).scan(IMG_REGEX).flatten
	  image_urls.uniq.sort
	end

	# Utilities

	# Replaces every literal newline character with a <br/> tag.
	def newlines_to_brs(str)
	  str.gsub(/\n/) { '<br/>' }
	end

	# Returns everything except the filename itself, i.e. the URL without its
	# last '/'-separated segment. Returns '' when there is nothing before the
	# filename (e.g. 'file.png' or an empty string).
	def get_asset_prefix(asset)
	  # A range slice yields [] for an empty list; slice(0, -1) returned nil there
	  # and made the following join raise NoMethodError.
	  asset.split('/')[0...-1].join('/')
	end

	# Collects the unique image URLs of the given items that, once any http:// or
	# https:// is removed, start with one of @replaceable_hosts.
	def get_downloadable_asset_urls(items)
	  candidates = items.flat_map { |item| get_assets(item) }.uniq

	  candidates.select do |url|
	    bare_url = url.gsub(/https?:\/\//, '')
	    @replaceable_hosts.any? { |host| bare_url.start_with?(host) }
	  end
	end

	# Download all files from the given urls into +directory+.
	#
	# The local filename is the URL's last path segment, percent-decoded. Files
	# that already exist in +directory+ are skipped. A failed download is reported
	# and does not stop the remaining ones.
	#
	# urls      - list of absolute URL strings.
	# directory - existing local folder to save the files into.
	#
	# Returns +urls+, or nil when +directory+ is missing.
	# Requires Ruby 2.5+ (URI.open from open-uri).
	def download_files!(urls, directory)
	  # Dir.exists?/File.exists? were removed in Ruby 3.2.
	  unless Dir.exist?(directory)
	    puts "Could not download files. Target directory '#{directory}' is missing"
	    return
	  end

	  puts "=== Downloading #{urls.size} assets"
	  # URI.decode/URI.encode were removed in Ruby 3.0; the RFC 2396 parser
	  # provides the same escaping rules.
	  parser = URI::RFC2396_Parser.new

	  urls.each.with_index(1) do |url, index|
	    url_parts = url.split('/')
	    filename = parser.unescape(url_parts.pop)
	    full_url = (url_parts + [parser.escape(filename)]).join('/')
	    target = File.join(directory, filename)

	    if File.exist?(target)
	      puts "--> #{index}: #{filename} already downloaded. - (SKIPPING)"
	      next
	    end

	    puts "--> #{index}: Downloading #{url}..."
	    begin
	      # Fetch first and create the local file only afterwards, so a failed
	      # download cannot leave an empty file that later runs would skip as
	      # "already downloaded" and upload.
	      data = URI.open(full_url, 'rb', &:read)
	      File.binwrite(target, data)
	    rescue => e
	      puts "Could not download #{url}! (#{e.message.inspect})"
	    end
	  end
	end

	# Migration entry point: runs only when this file is executed directly.
	if __FILE__ == $0
	  # Parse the XML file and fetch the articles. The block form closes the file
	  # handle once Nokogiri has read it.
	  doc = File.open(@xml_filename) { |file| Nokogiri::XML(file) }
	  items = doc.css('item')

	  # Download files that should be migrated to Voog
	  download_files!(get_downloadable_asset_urls(items), @assets_folder)

	  # Upload them to Voog
	  upload_assets(@assets_folder)

	  # Find the blog page that will receive the articles
	  blog = client.pages.find { |page| page.path == @blog_path }

	  puts "Blog '/#{@blog_path}' not found!" if blog.nil?
	  puts "No items found in XML file!" if items.empty?

	  if blog && !items.empty?
	    items.each do |item|
	      title = get_title(item)
	      body = get_body(item)
	      excerpt = get_excerpt(item)
	      path = get_path(item)
	      date = get_date(item)

	      puts "Creating article '#{title}'..."

	      # Replace all original paths with Voog's media paths. Longer prefixes go
	      # first so that a prefix which is itself the start of another one cannot
	      # rewrite (and thereby break) the longer URL before it is handled.
	      replaceable_urls = get_downloadable_asset_urls([item])
	        .map { |url| get_asset_prefix(url) }
	        .uniq
	        .sort_by { |prefix| -prefix.length }
	      puts " Replacing URLs:\n " + replaceable_urls.join("\n ")
	      replaceable_urls.each { |prefix| body.gsub!(prefix, @voog_media_host) }

	      # Create the article; a failure is reported and the loop continues
	      begin
	        client.create_article({
	          autosaved_title: title,
	          autosaved_excerpt: excerpt,
	          autosaved_body: body,
	          path: path,
	          created_at: date,
	          updated_at: date,
	          publishing: true,
	          published_at: date,
	          page_id: blog.id,
	          language_id: blog.language_id
	        })
	        puts " OK!"
	      rescue => e
	        puts " Something went wrong! #{e.message.inspect}"
	      end
	    end
	  end
	end