Skip to content

Instantly share code, notes, and snippets.

@tjh
Created January 31, 2012 16:07
  • Star 60 You must be signed in to star a gist
  • Fork 18 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save tjh/1711329 to your computer and use it in GitHub Desktop.
Convert all Rails table column collation and character set
#!/usr/bin/env ruby
# Put this file in the root of your Rails project,
# then run it to output the SQL needed to change all
# your tables and columns to the same character set
# and collation.
#
# > ruby character_set_and_collation.rb
DATABASE = ''
CHARACTER_SET = 'utf8'
COLLATION = 'utf8_general_ci'
schema = File.open('db/schema.rb', 'r').read
rows = schema.split("\n")
table_name = nil
rows.each do |row|
if row =~ /create_table/
table_name = row.match(/create_table "(.+)"/)[1]
puts "ALTER TABLE `#{DATABASE}`.`#{table_name}` DEFAULT CHARACTER SET #{CHARACTER_SET} COLLATE #{COLLATION};"
elsif row =~ /t\.string/
field_name = row.match(/"(.+)"/)[1]
puts "ALTER TABLE `#{DATABASE}`.`#{table_name}` CHANGE COLUMN `#{field_name}` `#{field_name}` CHARACTER SET #{CHARACTER_SET} COLLATE #{COLLATION};"
elsif row =~ /t\.text/
field_name = row.match(/"(.+)"/)[1]
puts "ALTER TABLE `#{DATABASE}`.`#{table_name}` CHANGE COLUMN `#{field_name}` `#{field_name}` CHARACTER SET #{CHARACTER_SET} COLLATE #{COLLATION};"
end
end
@aganov
Copy link

aganov commented Feb 17, 2016

# rails generate migration change_to_utf8mb_encoding

class ChangeToUtf8mbEncoding < ActiveRecord::Migration
  # def execute(text)
  #   puts text
  # end

  def db
    ActiveRecord::Base.connection
  end

  def up
    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb4;"
    db.tables.each do |table|
      execute "ALTER TABLE `#{table}` CHARACTER SET = utf8mb4;"
      db.columns(table).each do |column|
        case column.sql_type
        when "text"
          execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
        when /varchar\(([0-9]+)\)/i
          # InnoDB has a maximum index length of 767 bytes, so for utf8 or utf8mb4
          # columns, you can index a maximum of 255 or 191 characters, respectively.
          # If you currently have utf8 columns with indexes longer than 191 characters,
          # you will need to index a smaller number of characters.
          indexed_column = db.indexes(table).any? { |index| index.columns.include?(column.name) }
          sql_type = (indexed_column && $1.to_i > 191) ? "VARCHAR(191)" : column.sql_type.upcase
          execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{sql_type} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
        end
      end
    end
  end
end

@Dariusch
Copy link

Dariusch commented May 9, 2016

small update to keep default values and null: false/true

# rails generate migration change_to_utf8mb_encoding

class ChangeToUtf8mbEncoding < ActiveRecord::Migration
  def db
    ActiveRecord::Base.connection
  end

  def up
    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb4;"
    db.tables.each do |table|
      execute "ALTER TABLE `#{table}` CHARACTER SET = utf8mb4;"
      db.columns(table).each do |column|
        case column.sql_type
        when "text"
          execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
        when /varchar\(([0-9]+)\)/i
          # InnoDB has a maximum index length of 767 bytes, so for utf8 or utf8mb4
          # columns, you can index a maximum of 255 or 191 characters, respectively.
          # If you currently have utf8 columns with indexes longer than 191 characters,
          # you will need to index a smaller number of characters.
          indexed_column = db.indexes(table).any? { |index| index.columns.include?(column.name) }
          sql_type  = (indexed_column && $1.to_i > 191) ? 'VARCHAR(191)' : column.sql_type.upcase
          default   = (column.default.blank?) ? '' : "DEFAULT '#{column.default}'"
          null      = (column.null) ? '' : 'NOT NULL'
          execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{sql_type} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci #{default} #{null};"
        end
      end
    end
  end
end

@zuf
Copy link

zuf commented Sep 30, 2016

Convert not only TEXT fields but all *TEXT fields (longtext, mediumtext, etc)
Also cut length of all VARCHAR to 191, not only indexed (which may not detected correctly).

class MigrateToUtf8mb4 < ActiveRecord::Migration[5.0]

  def db
    ActiveRecord::Base.connection
  end

  def up
    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb4;"
    db.tables.each do |table|
      execute "ALTER TABLE `#{table}` CHARACTER SET = utf8mb4;"
      db.columns(table).each do |column|
        case column.sql_type
          when  /([a-z]*)text/i
            execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{$1.upcase}TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
          when /varchar\(([0-9]+)\)/i
            # InnoDB has a maximum index length of 767 bytes, so for utf8 or utf8mb4
            # columns, you can index a maximum of 255 or 191 characters, respectively.
            # If you currently have utf8 columns with indexes longer than 191 characters,
            # you will need to index a smaller number of characters.
            indexed_column = db.indexes(table).any? { |index| index.columns.include?(column.name) }

            sql_type  = ($1.to_i > 191) ? 'VARCHAR(191)' : column.sql_type.upcase
            default   = (column.default.blank?) ? '' : "DEFAULT '#{column.default}'"
            null      = (column.null) ? '' : 'NOT NULL'
            execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{sql_type} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci #{default} #{null};"
        end
      end
    end
  end

end

@mosheka
Copy link

mosheka commented Oct 13, 2016

Few fixes and warnings:

  1. In the last version you dropped the indexed_column. You should not cut VARCHAR o/w (Other option is changing the index itself)
  2. This code should not be run on production server w/ large tables, as it may copy large tables few times (for each ALTER) that may result in a long downtime of you system(if your table are really large you should use pt-online-schema-change for MySQL case)
class MigrateToUtf8mb4 < ActiveRecord::Migration[5.0]

  def db
    ActiveRecord::Base.connection
  end

  def up
    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb4;"
    db.tables.each do |table|
      execute "ALTER TABLE `#{table}` CHARACTER SET = utf8mb4;"
      db.columns(table).each do |column|
        case column.sql_type
          when  /([a-z]*)text/i
            execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{$1.upcase}TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
          when /varchar\(([0-9]+)\)/i
            # InnoDB has a maximum index length of 767 bytes, so for utf8 or utf8mb4
            # columns, you can index a maximum of 255 or 191 characters, respectively.
            # If you currently have utf8 columns with indexes longer than 191 characters,
            # you will need to index a smaller number of characters.
            indexed_column = db.indexes(table).any? { |index| index.columns.include?(column.name) }

            sql_type  = (indexed_column && $1.to_i > 191) ? 'VARCHAR(191)' : column.sql_type.upcase
            default   = (column.default.blank?) ? '' : "DEFAULT '#{column.default}'"
            null      = (column.null) ? '' : 'NOT NULL'
            execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{sql_type} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci #{default} #{null};"
        end
      end
    end
  end
end

@jules2689
Copy link

class MigrateToUtf8mb4 < ActiveRecord::Migration[5.0]

  def db
    ActiveRecord::Base.connection
  end

  def up
    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb4;"
    db.tables.each do |table|
      next if %w(ar_internal_metadata schema_migrations).include?(table)
      execute "ALTER TABLE `#{table}` CHARACTER SET = utf8mb4;"
      db.columns(table).each do |column|
        case column.sql_type
          when  /([a-z]*)text/i
            execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{$1.upcase}TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
          when /varchar\(([0-9]+)\)/i
            # InnoDB has a maximum index length of 767 bytes, so for utf8 or utf8mb4
            # columns, you can index a maximum of 255 or 191 characters, respectively.
            # If you currently have utf8 columns with indexes longer than 191 characters,
            # you will need to index a smaller number of characters.
            indexed_column = db.indexes(table).any? { |index| index.columns.include?(column.name) }

            sql_type  = (indexed_column && $1.to_i > 191) ? 'VARCHAR(191)' : column.sql_type.upcase
            default   = (column.default.blank?) ? '' : "DEFAULT '#{column.default}'"
            null      = (column.null) ? '' : 'NOT NULL'
            execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{sql_type} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci #{default} #{null};"
        end
      end
    end
  end
end

I added the line next if %w(ar_internal_metadata schema_migrations).include?(table) as there is no need to change Rails' tables. Also, I was having issues with index sizes.

@jfly
Copy link

jfly commented Apr 4, 2017

Unfortunately, the loop over db.tables includes a mysql view of ours, which doesn't allow changing its character set:

Mysql2::Error: 'wca_development.rails_persons' is not BASE TABLE: ALTER TABLE `rails_persons` CHARACTER SET = utf8mb4;

Apparently the .tables method will change in Rails 5.1 to not include views:

irb(main):001:0> ActiveRecord::Base.connection.tables.include?("rails_persons")
DEPRECATION WARNING: #tables currently returns both tables and views. This behavior is deprecated and will be changed with Rails 5.1 to only return tables. Use #data_sources instead. (called from irb_binding at (irb):1)

But until then, I've had to add another "get me out of here" escape hatch like @jules2689 (I also ran into index size issues with the internal Rails tables): next if db.views.include?(table).

Other problems:

  • The migration ignored char columns, so I've changed it to support those in addition to varchar.
  • The migration ignored default empty string values (it was checking .blank? so I've changed it to check .nil?)
class ConvertUtf8ToUtf8mb4 < ActiveRecord::Migration[5.0]
  def db
    ActiveRecord::Base.connection
  end

  def up 
    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb4;"
    db.tables.each do |table|
      next if %w(ar_internal_metadata schema_migrations).include?(table)
      next if db.views.include?(table) # Skip views. This will not be necessary in Rails 5.1, when `db.tables` will change to only return actual tables.
      execute "ALTER TABLE `#{table}` CHARACTER SET = utf8mb4;"
      db.columns(table).each do |column|
        case column.sql_type 
          when /([a-z]*)text/i
            execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{$1.upcase}TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
          when /((?:var)?char)\(([0-9]+)\)/i
            # InnoDB has a maximum index length of 767 bytes, so for utf8 or utf8mb4
            # columns, you can index a maximum of 255 or 191 characters, respectively.
            # If you currently have utf8 columns with indexes longer than 191 characters,
            # you will need to index a smaller number of characters.
            indexed_column = db.indexes(table).any? { |index| index.columns.include?(column.name) }

            sql_type = (indexed_column && $2.to_i > 191) ? "#{$1}(191)" : column.sql_type.upcase
            default = (column.default.nil?) ? '' : "DEFAULT '#{column.default}'"
            null = (column.null) ? '' : 'NOT NULL'
            execute "ALTER TABLE `#{table}` CHANGE `#{column.name}` `#{column.name}` #{sql_type} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci #{default} #{null};"
        end
      end
    end
end

@Simplify
Copy link

Thanks a lot for this, saved me some coding :)

If you are on MySQL 5.7, you probably don't need to shorten columns to 191 chars. Maximum index size is not limit of InnoDB, but row_format used on specific table.

SELECT @@innodb_default_row_format;

This should return dynamic. dynamic and compressed support indexes up to 3072 bytes, if innodb_large_prefix is on (1). That is default in 5.7.

SELECT @@innodb_large_prefix;

You can see row_format for every table with:

SHOW TABLE STATUS FROM <your-database-name>;

If you still have some tables with compact row format, you can change it with:

ALTER TABLE <table-name> ROW_FORMAT=DYNAMIC;

@brendon
Copy link

brendon commented Jun 14, 2018

@Simplify is right about column limits. I was worried about data truncation until I read this: https://dev.mysql.com/doc/refman/5.6/en/innodb-row-format-dynamic.html that explains that columns can store more than their limit when the row format is dynamic.

My final migration file was:

def change

    db = ActiveRecord::Base.connection

    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
    db.tables.each do |table|
      execute "ALTER TABLE `#{table}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"

      db.columns(table).each do |column|
        case column.sql_type
          when /([a-z]*)text/i
            default = (column.default.nil?) ? '' : "DEFAULT '#{column.default}'"
            null = (column.null) ? '' : 'NOT NULL'
            execute "ALTER TABLE `#{table}` MODIFY `#{column.name}` #{column.sql_type.upcase} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci #{default} #{null};"
          when /varchar\(([0-9]+)\)/i
            sql_type = column.sql_type.upcase
            default = (column.default.nil?) ? '' : "DEFAULT '#{column.default}'"
            null = (column.null) ? '' : 'NOT NULL'
            execute "ALTER TABLE `#{table}` MODIFY `#{column.name}` #{sql_type} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci #{default} #{null};"
        end
      end
    end

  end

That was run on a database with dynamic row formats and on MySQL 5.7.

Oh! and a trap for new players. Some of my tables were still MyISAM for some reason! This blew up the migrations around key length of course. Converting them to InnoDB did the trick.

@pkarjala
Copy link

pkarjala commented Aug 5, 2021

Just adding on, the default COLLATE for MySQL 8.0 is now utf8mb4_0900_ai_c. Scripts may need to be updated to support this.

See http://mysqlserverteam.com/new-collations-in-mysql-8-0-0/ for more information.

@Whitespace
Copy link

Unsure why CONVERT TO CHARACTER SET isn't used? It converts all the TEXT and VARCHAR to the proper data type, including converting them to a different type if the bytes change (for example VARCHAR => MEDIUMTEXT if the byte limit for VARCHAR would be hit). Seems reasonable vs reducing 255 => 191 characters.

Additionally there are blocking writes for every ALTER TABLE in the db.columns(table) block. You can use multiple MODIFY statements in a single ALTER TABLE.

class ChangeDefaultCharsetAndCollation < ActiveRecord::Migration[5.2]
  def up
    db = ActiveRecord::Base.connection

    # set defaults for new tables
    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"

    # migrate all tables to Barracuda InnoDB and utf8mb4
    db.tables.each do |table|
      execute "ALTER TABLE `#{table}` ROW_FORMAT=COMPRESSED, ALGORITHM=INPLACE, LOCK=NONE;"
      execute "ALTER TABLE `#{table}` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
    end
  end

  def down
    db = ActiveRecord::Base.connection

    # restore utf8mb3 defaults for new tables
    execute "ALTER DATABASE `#{db.current_database}` CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_ci;"

    # restore all tables to Antelope InnoDB and utf8mb3
    db.tables.each do |table|
      execute "ALTER TABLE `#{table}` CONVERT TO CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_ci;"
      execute "ALTER TABLE `#{table}` ROW_FORMAT=COMPACT, ALGORITHM=INPLACE, LOCK=NONE;"
    end
  end
end

@jerry2013
Copy link

CONVERT TO CHARACTER SET is great but MySQL warns against using it if the existing columns are latin1.

From https://dev.mysql.com/doc/refman/8.0/en/alter-table.html

The CONVERT TO operation converts column values between the original and named character sets. This is not what you want if you have a column in one character set (like latin1) but the stored values actually use some other, incompatible character set (like utf8mb4).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment