Skip to content

Instantly share code, notes, and snippets.

@tanelj
Created February 19, 2016 13:13
Show Gist options
  • Save tanelj/4b96fc290de530117966 to your computer and use it in GitHub Desktop.
Save tanelj/4b96fc290de530117966 to your computer and use it in GitHub Desktop.
Migration scripts that import articles from a Joomla database to your Voog website: http://www.voog.com
# This script downloads all assets linked in articles in a Joomla database.
#
# It finds all asset URLs in the article contents (stored in the "jos_content" table) and downloads them.
#
# NB! Modify "Configuration" section variables before running this script.
#
# Run this script to see instructions:
#
# ruby joomla_assets_downloader.rb help
#
# Required gems:
# gem install sequel (4.31.0)
# gem install mysql2
#
# More about Voog API: http://www.voog.com/developers/api/
require 'rubygems'
require 'sequel'
require 'mysql2'
require 'open-uri'
# === Configuration
# Base URL of the Joomla website. It is prepended to relative "images/..." asset paths.
@host = 'http://www.mycompany.com'
# Folder the assets are downloaded into. The directory must already exist.
@assets_folder = 'joomla_assets'
# Migration config: "source_category_id" is the value of the "catid" column in the
# "jos_content" table for the articles to migrate.
@migration_conf = [
{
# target_blog_path: 'news',
source_category_id: 25,
}
]
# Name of the Joomla database
@database_name = 'my_joomla_database'
# Set up the connection to the Joomla database
DB = Sequel.mysql2(@database_name, host: 'localhost', user: 'root', password: '')
# === End configuration
# Get the list of asset URLs used by migratable articles.
#
# Scans the "introtext" and "fulltext" columns of every published (state = 1)
# article whose "catid" is listed in @migration_conf and collects:
#   * the "src" attribute of every <img> tag
#   * every "href" attribute that points into the local "images/" folder
#
# Relative "images/..." paths are prefixed with @host.
#
# Returns a sorted Array of unique URL Strings.
def get_assets_list
  category_ids = @migration_conf.map { |conf| conf[:source_category_id] }
  # "[^>]*?" keeps the match inside a single tag, so several images on the
  # same line are all found and the tag does not have to be self-closing.
  img_src_pattern = /<img\b[^>]*?\ssrc="(.*?)"/i
  local_href_pattern = /\ href="(images\/.*?)"/

  assets = DB[:jos_content].where(catid: category_ids, state: 1).all.flat_map do |article|
    [article[:introtext], article[:fulltext]].flat_map do |html|
      # to_s guards against NULL columns.
      html.to_s.scan(img_src_pattern) + html.to_s.scan(local_href_pattern)
    end
  end

  # Prepend the host where it is missing. Deduplicate only after that, so a
  # relative and an absolute reference to the same file are listed once.
  assets.flatten.reject(&:empty?).map do |asset|
    asset.start_with?('images/') ? "#{@host}/#{asset}" : asset
  end.uniq.sort
end
# Get the list of asset folders used by the articles.
# It can be used as input of the Voog importer script for conversion (@assets_folder_map).
#
# For every folder on the Joomla site both the absolute form ("#{@host}/folder")
# and the relative form ("folder") are returned. Folders of assets hosted
# elsewhere are returned as they are (no @host prefix is added to them).
#
# Returns an Array of Strings: the host-prefixed local folders first, then all
# folders, each group in reverse alphabetical order so that a nested folder
# always comes before its parent.
def get_replaceable_strings
  host_prefix = "#{@host}/"
  folders = get_assets_list.map do |url|
    # Strip the host only when it really is the prefix of the URL.
    path = url.start_with?(host_prefix) ? url[host_prefix.length..-1] : url
    # Drop the file name, keep the folder part.
    path.split('/')[0...-1].join('/')
  end
  # An empty folder (asset in the site root) would turn into the replace
  # pattern '"/' and match every root-relative link, so it is left out.
  folders = folders.reject(&:empty?).uniq.sort.reverse
  # Folders that already carry a scheme or start with "//" point to another host.
  local_folders = folders.reject { |folder| folder =~ %r{\A(?:[a-z][a-z0-9+.\-]*:)?//}i }
  local_folders.map { |folder| "#{@host}/#{folder}" } + folders
end
# Download all files from the given urls to the given directory.
#
# urls      - Array of URL Strings.
# directory - path of an existing directory the files are written to.
#
# Each file is stored under the (percent-decoded) last segment of its URL.
# A failed download is reported on stdout and does not stop the remaining
# downloads. Nothing is downloaded when the directory does not exist.
def download_files!(urls, directory)
  unless Dir.exist?(directory)
    puts "Could not download files. Target directory '#{directory}' is missing"
    return
  end

  # URI.encode / URI.decode were removed in Ruby 3.0; the RFC 2396 parser
  # provides the same escaping rules.
  parser = URI::RFC2396_Parser.new
  puts "=== Downloading #{urls.size} assets"
  urls.each.with_index(1) do |url, index|
    puts "--> #{index}: Downloading #{url}..."
    begin
      url_parts = url.split('/')
      filename = parser.unescape(url_parts.pop)
      # Re-encode the file name so that names with spaces etc. form a valid URL.
      full_url = (url_parts + [parser.escape(filename)]).join('/')
      # Fetch first and write afterwards, so a failed download does not leave
      # an empty file behind.
      data = URI.open(full_url, 'rb', &:read)
      # File.basename keeps the target inside the directory even when the
      # decoded name contains path separators.
      File.binwrite(File.join(directory, File.basename(filename)), data)
    rescue => e
      puts "Could not download #{url}! (#{e.message.inspect})"
    end
  end
end
# Command line entry point: the first argument selects the action,
# anything else (including "help") prints the usage text.
if $PROGRAM_NAME == __FILE__
  command = ARGV.first
  if command == 'show-list'
    puts "=== Assets in articles"
    puts get_assets_list.join("\n")
  elsif command == 'show-used-folders'
    puts "=== Used asset folders\n\n"
    puts "@assets_folder_map = ["
    puts get_replaceable_strings.map { |e| " '#{e}'"}.join(",\n")
    puts "]"
  elsif command == 'download'
    puts "=== Download used assets to '#{@assets_folder}' folder"
    download_files!(get_assets_list, @assets_folder)
  else
    usage_lines = [
      "\nUsage:\n",
      " Get list of assets used by migratable articles (in \"src\" and \"img\" attributes)",
      " ruby joomla_assets_downloader.rb show-list\n\n",
      " Get list of used local (in Joomla site) asset folders",
      " ruby joomla_assets_downloader.rb show-used-folders\n\n",
      " Download all files from to defined directory (#{@assets_folder})",
      " ruby joomla_assets_downloader.rb download\n\n",
      " Show this help message",
      " ruby joomla_assets_downloader.rb help\n\n"
    ]
    puts(usage_lines)
  end
end

Migrate from Joomla to Voog

Here are some scripts that you can use to migrate your articles from a Joomla database to your Voog site.

Step 1: download assets

Edit joomla_assets_downloader.rb file and setup correct parameters.

Create a folder joomla_assets

Run script to download all used files and to see some configuration options needed for migration script:

ruby joomla_assets_downloader.rb download

Get required configuration variable for migration script joomla_to_voog_migrator.rb:

ruby joomla_assets_downloader.rb show-used-folders

Step 2: upload assets to your Voog site

Configure the uploader script voog_assets_uploader.rb to set correct folder and Voog API keys and run it:

ruby voog_assets_uploader.rb

Step 3: run article migration

Get required configuration variable for migration script:

ruby joomla_assets_downloader.rb show-used-folders

Edit migration script joomla_to_voog_migrator.rb configuration and run it:

joomla_to_voog_migrator.rb
# This script imports articles from a Joomla database to a Voog site.
#
# It loads requested articles from Joomla database (table "jos_content")
# and imports them to Voog site.
#
# It also converts referred image urls to Voog structure and fixes some broken HTML tags.
#
# NB! All non-image links get a wrong URL and should be changed manually after the migration
# from "/photos" to "/files".
#
# NB! All images should be uploaded to your Voog site BEFORE running this script.
#
# Use "joomla_assets_downloader.rb" script to download all requested assets.
# You can download it from: https://gist.github.com/tanelj/fadf23ea1f020dfbed5f
#
# ruby joomla_assets_downloader.rb download
#
# Use "voog_assets_uploader.rb" script to upload assets to your Voog site.
#
# NB! Modify "Configuration" section variables before running this script.
#
# You can get correct values for "@assets_folder_map" from output of this script:
#
# ruby joomla_assets_downloader.rb show-used-folders
#
# Run this script:
#
# ruby joomla_to_voog_migrator.rb
#
# Required gems:
# gem install sequel (tested 4.31.0)
# gem install mysql2
# gem install nokogiri
# gem install voog_api (min v0.0.11)
#
# More about Voog API: http://www.voog.com/developers/api/
require 'rubygems'
require 'sequel'
require 'mysql2'
require 'voog_api'
require 'nokogiri'
# === Configuration
# Host of the Joomla website
# NOTE(review): not referenced by the code visible in this script — confirm it is still needed.
@host = 'http://www.mycompany.com'
# Your Voog site host
@voog_host = 'mycompany.voog.com'
# Your Voog API token
# Read more: http://www.voog.com/support/guides/developers/developer-account-basics#generate-api-token
@voog_token = 'xxxxxxxxxxxx'
# Image prefix for assets in the Voog site. Every folder listed in @assets_folder_map
# is replaced with this prefix.
@images_prefix = '/photos'
# If you know your Voog site images prefix then use this instead.
# You can find it out by uploading an image to your Voog site and checking its URL.
# @images_prefix = '//media.voog.com/0000/0000/0001/photos'
# Migration config: "source_category_id" is the value of the "catid" column in the
# "jos_content" table for the articles to migrate.
# "target_blog_path" is the path of the blog page in your Voog site (it must already exist).
@migration_conf = [
{
target_blog_path: 'news-test',
source_category_id: 25,
}
]
# Name of the Joomla database
@database_name = 'my_joomla_database'
# Set up the connection to the Joomla database
DB = Sequel.mysql2(@database_name, host: 'localhost', user: 'root', password: '')
# Map for fixing asset references in the new site.
# Use "ruby joomla_assets_downloader.rb show-used-folders" to generate this list.
# Nested folders are listed before their parent folders.
@assets_folder_map = [
'http://www.mycompany.ee/images/news/thumbnails',
'http://www.mycompany.ee/images/news',
'http://www.mycompany.ee/images/thumbnails/thumbnails',
'http://www.mycompany.ee/images/thumbnails',
'http://www.mycompany.ee/images',
'images/news/thumbnails',
'images/news',
'images/thumbnails/thumbnails',
'images/thumbnails',
'images'
]
# Patterns that "cleanup_text" removes from the article text
# (document-level tags: doctype, html, head, body).
@cleanup_strings = [
/\<!DOCTYPE.*\>/i,
/\<html.*\>/i,
/\<\/html\>/i,
/\<head.*\>.*?\<\/head\>/m,
/\<body.*\>/i,
/\<\/body\>/i,
]
# === End configuration
# Clean up text: remove everything matched by the @cleanup_strings patterns,
# turn carriage returns into newlines, empty out whitespace-only lines,
# collapse runs of three or more newlines into two and strip the result.
#
# source_text - String to clean; it is not modified.
#
# Returns the cleaned String.
def cleanup_text(source_text)
  stripped = @cleanup_strings.inject(source_text.dup) do |remaining, pattern|
    remaining.gsub(pattern, '')
  end
  unix_newlines = stripped.to_s.gsub(/\r/, "\n")
  blank_lines_emptied = unix_newlines.gsub(/^[[:space:]]*$/, "\n")
  blank_lines_emptied.gsub(/\n{3,}/, "\n\n").strip
end
# Clean up text and let Nokogiri repair broken or partial HTML tags.
#
# The text is parsed as an HTML fragment. When the parsed fragment has a
# "body" element its inner HTML is used, otherwise the serialized fragment.
# Carriage returns are then turned into newlines, whitespace-only lines are
# emptied, runs of three or more newlines collapse into two and the result
# is stripped.
#
# source_text - String with the HTML to clean.
#
# Returns the cleaned HTML String.
def cleanup_text_with_nokogiri(source_text)
  fragment = Nokogiri::HTML::DocumentFragment.parse(source_text.dup)
  body = fragment.at('body')
  html = body ? body.inner_html : fragment.to_s
  unix_newlines = html.to_s.gsub(/\r/, "\n")
  blank_lines_emptied = unix_newlines.gsub(/^[[:space:]]*$/, "\n")
  blank_lines_emptied.gsub(/\n{3,}/, "\n\n").strip
end
# Convert local links in "href" and image "src" attributes to be suitable for the target site.
#
# Every double-quoted attribute value that starts with one of the folders in
# @assets_folder_map gets that folder replaced with @images_prefix.
#
# source_text - String with the article HTML (nil is treated as an empty text);
#               it is not modified.
#
# Returns a new String with the converted links.
def convert_links(source_text)
  text = source_text.to_s.dup
  # Replace the longest folders first. Otherwise a parent folder such as
  # "images" would rewrite the beginning of "images/news/..." and leave the
  # rest of the nested path in place, whatever the order of the map is.
  @assets_folder_map.sort_by { |folder| -folder.length }.each do |folder|
    text.gsub!("\"#{folder}/", "\"#{@images_prefix}/")
  end
  text
end
# Return the Voog API client. It is built from @voog_host and @voog_token on
# the first call and reused afterwards.
# NOTE(review): the client is configured with "protocol: :http" — presumably
# the API token then travels unencrypted; confirm whether :https can be used.
def client
  return @client if @client

  @client = Voog::Client.new(@voog_host, @voog_token, protocol: :http, auto_paginate: true, raise_on_error: true)
end
# Base query for the Joomla database.
# It selects records from the content table ("jos_content") and keeps only
# the published ones ("state = 1"). The query object is built once and reused.
def database_base_query
  @database_base_query || (@database_base_query = DB[:jos_content].where(state: 1))
end
# Migrate all articles defined in @migration_conf from the Joomla database to the Voog site.
#
# For every config entry the target blog page is looked up by its path. When
# it exists, every published article of the source category is created under
# that blog, oldest first, with its intro text and full text cleaned up and
# their links converted. Progress and errors are printed to stdout.
def migrate_articles!
  puts "=== Migrating articles"
  pause_every = 50
  @migration_conf.each do |conf|
    blog_path = conf[:target_blog_path]
    puts "\n\n--- Processing articles for '#{blog_path}'"
    puts "Fetching information for target blog page..."
    blog = client.pages(path: blog_path, content_type: 'blog').first
    unless blog
      puts "ERROR: Blog page was not found in target server"
      next
    end

    puts "Migrating articles to #{blog.public_url}"
    articles = database_base_query.where(catid: conf[:source_category_id]).order(:created)
    puts "Total: #{articles.count}"
    articles.all.each_with_index do |item, offset|
      index = offset + 1
      puts "--> #{index}: (ID=#{item[:id]}) #{item[:title]} (#{item[:alias]})"
      # Take a short break after every full batch of articles.
      sleep 5 if (index % pause_every).zero?
      article = client.create_article(
        page_id: blog.id,
        path: item[:alias],
        autosaved_title: item[:title],
        autosaved_excerpt: convert_links(cleanup_text_with_nokogiri(item[:introtext])),
        autosaved_body: convert_links(cleanup_text_with_nokogiri(item[:fulltext])),
        publishing: true,
        created_at: item[:created].strftime('%d.%m.%Y')
      )
      puts(article ? "<-- #{article.public_url}" : "ERROR: Something went wrong")
    end
  end
end
# Start the migration. This call is unconditional, so it runs whenever the script is loaded.
migrate_articles!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment