calexandrepcjr/charset_collation_conversion.sql

## charset_collation_conversion.sql
/*
  https://stackoverflow.com/questions/37364618/utf8mb4-unicode-ci-vs-utf8mb4-bin
  https://drupal.stackexchange.com/questions/166405/why-are-we-using-utf8mb4-general-ci-and-not-utf8mb4-unicode-ci/184528#184528?newreg=82d15487d4b84bbe978ce38e687baaa9
  https://pentahointegra.blogspot.com/2015/09/utf8generalci-vs-utf8unicodeci-what.html

  Note: in new versions of MySQL use utf8mb4, rather than utf8, which is the same
  UTF-8 data format with same performance but previously only accepted the first
  65,536 Unicode characters.

  Accuracy
  utf8mb4_unicode_ci is based on the Unicode standard for sorting and
  comparison, which sorts accurately in a very wide range of languages.
  utf8mb4_general_ci fails to implement all of the Unicode sorting rules,
  which will result in undesirable sorting in some situations, such as when using particular languages or characters.

  Performance
  utf8mb4_general_ci is faster at comparisons and sorting, because it takes a
  bunch of performance-related shortcuts.
  On modern servers, this performance boost will be all but negligible.
  It was devised in a time when servers had a tiny fraction of the CPU
  performance of today's computers.
  utf8mb4_unicode_ci, which uses the Unicode rules for sorting and comparison,
  employs a fairly complex algorithm for correct sorting in a wide range of
  languages and when using a wide range of special characters. These rules
  need to take into account language-specific conventions; not everybody sorts
  their characters in what we would call 'alphabetical order'.

  As far as Latin (ie "European") languages go, there is not much difference
  between the Unicode sorting and the simplified utf8mb4_general_ci sorting
  in MySQL, but there are still a few differences:

  For examples, the Unicode collation sorts "ß" like "ss", and "Œ" like "OE"
  as people using those characters would normally want, whereas
  utf8mb4_general_ci sorts them as single characters
  (presumably like "s" and "e" respectively).
  Some Unicode characters are defined as ignorable, which means they shouldn't
  count toward the sort order and the comparison should move on to the next
  character instead. utf8mb4_unicode_ci handles these properly.

  In non-latin languages, such as Asian languages or languages with different
  alphabets, there may be a lot more differences between Unicode sorting and
  the simplified utf8mb4_general_ci sorting. The suitability of utf8mb4_general_ci
  will depend heavily on the language used. For some languages, it'll be quite inadequate.

  WHAT IT DOES:
  Database charset/collation will be changed accordingly, together your entire tables
  and columns that handles charset inside database.

  IMPORTANT: This operation will not handle special table cases, as partitioned tables
  and other specific conditions of row/column that InnoDB offers.

  HOW TO USE:
  Enter in your database with USE **YOUR_DATABASE_NAME** and copy/paste the code below.
*/

ALTER DATABASE CHARACTER SET utf8mb4 COLLATE = utf8mb4_unicode_ci;

DROP PROCEDURE IF EXISTS change_tables_character_set;
CREATE PROCEDURE change_tables_character_set(in charset VARCHAR(64), in collation VARCHAR(64))
BEGIN
DECLARE done BOOLEAN DEFAULT false;
DECLARE tab_name VARCHAR(64);
DECLARE charset_cursor CURSOR FOR
    SELECT table_name FROM information_schema.tables
    WHERE table_schema = DATABASE() AND
    create_options = '';
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

SET foreign_key_checks = 0;

OPEN charset_cursor;
change_loop: LOOP
FETCH charset_cursor INTO tab_name;
IF done THEN
    LEAVE change_loop;
END IF;
SET @alter = concat('ALTER TABLE ', tab_name,
                    ' CONVERT TO CHARACTER SET ', charset,
                    ' COLLATE ', collation , ';');
SELECT @alter;
PREPARE stmt FROM @alter;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
END LOOP change_loop;
CLOSE charset_cursor;
SET foreign_key_checks = 1;
END;

DROP PROCEDURE IF EXISTS change_columns_character_set;
CREATE PROCEDURE change_columns_character_set(IN charset VARCHAR(64), IN collation VARCHAR(64))
BEGIN
    DECLARE done BOOLEAN DEFAULT false;
    DECLARE tab_name VARCHAR(64);
    DECLARE col_name VARCHAR(64);
    DECLARE col_type longtext;
    DECLARE charset_cursor CURSOR FOR
    SELECT table_name, column_name, column_type
         FROM information_schema.columns
         WHERE table_schema = DATABASE()
    	   AND table_name IN (
	       SELECT table_name
	       FROM information_schema.tables
    	       WHERE table_schema = DATABASE()
	       AND create_options = ''
	   )
    	   AND character_set_name IS NOT NULL;

    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

    SET foreign_key_checks = 0;

    OPEN charset_cursor;

    change_loop: LOOP
	FETCH charset_cursor INTO tab_name, col_name, col_type;
	IF done THEN
	    LEAVE change_loop;
	END IF;
	SET @alter = concat(
	    'ALTER TABLE `',
	    DATABASE(), '`.`',
	    tab_name,
	    '` MODIFY `',
	    col_name,
	    '` ',
	    col_type ,
	    ' CHARACTER SET ',
	    charset,
	    ' COLLATE ',
	    collation,
	    ';'
	);

	SELECT @alter;
	PREPARE stmt FROM @alter;
	EXECUTE stmt;
	DEALLOCATE PREPARE stmt;
    END LOOP change_loop;

    CLOSE charset_cursor;
    SET foreign_key_checks = 1;
END;

DROP PROCEDURE IF EXISTS change_character_set;
CREATE PROCEDURE change_character_set(IN charset VARCHAR(64), IN collation VARCHAR(64))
BEGIN
    CALL change_tables_character_set(charset, collation);
    CALL change_columns_character_set(charset, collation);
END;

CALL change_character_set('utf8mb4', 'utf8mb4_unicode_ci');
	/*
	https://stackoverflow.com/questions/37364618/utf8mb4-unicode-ci-vs-utf8mb4-bin
	https://drupal.stackexchange.com/questions/166405/why-are-we-using-utf8mb4-general-ci-and-not-utf8mb4-unicode-ci/184528#184528?newreg=82d15487d4b84bbe978ce38e687baaa9
	https://pentahointegra.blogspot.com/2015/09/utf8generalci-vs-utf8unicodeci-what.html

	Note: in new versions of MySQL use utf8mb4, rather than utf8, which is the same
	UTF-8 data format with same performance but previously only accepted the first
	65,536 Unicode characters.

	Accuracy
	utf8mb4_unicode_ci is based on the Unicode standard for sorting and
	comparison, which sorts accurately in a very wide range of languages.
	utf8mb4_general_ci fails to implement all of the Unicode sorting rules,
	which will result in undesirable sorting in some situations, such as when using particular languages or characters.

	Performance
	utf8mb4_general_ci is faster at comparisons and sorting, because it takes a
	bunch of performance-related shortcuts.
	On modern servers, this performance boost will be all but negligible.
	It was devised in a time when servers had a tiny fraction of the CPU
	performance of today's computers.
	utf8mb4_unicode_ci, which uses the Unicode rules for sorting and comparison,
	employs a fairly complex algorithm for correct sorting in a wide range of
	languages and when using a wide range of special characters. These rules
	need to take into account language-specific conventions; not everybody sorts
	their characters in what we would call 'alphabetical order'.

	As far as Latin (ie "European") languages go, there is not much difference
	between the Unicode sorting and the simplified utf8mb4_general_ci sorting
	in MySQL, but there are still a few differences:

	For examples, the Unicode collation sorts "ß" like "ss", and "Œ" like "OE"
	as people using those characters would normally want, whereas
	utf8mb4_general_ci sorts them as single characters
	(presumably like "s" and "e" respectively).
	Some Unicode characters are defined as ignorable, which means they shouldn't
	count toward the sort order and the comparison should move on to the next
	character instead. utf8mb4_unicode_ci handles these properly.

	In non-latin languages, such as Asian languages or languages with different
	alphabets, there may be a lot more differences between Unicode sorting and
	the simplified utf8mb4_general_ci sorting. The suitability of utf8mb4_general_ci
	will depend heavily on the language used. For some languages, it'll be quite inadequate.

	WHAT IT DOES:
	Database charset/collation will be changed accordingly, together your entire tables
	and columns that handles charset inside database.

	IMPORTANT: This operation will not handle special table cases, as partitioned tables
	and other specific conditions of row/column that InnoDB offers.

	HOW TO USE:
	Enter in your database with USE YOUR_DATABASE_NAME and copy/paste the code below.
	*/

	ALTER DATABASE CHARACTER SET utf8mb4 COLLATE = utf8mb4_unicode_ci;

	DROP PROCEDURE IF EXISTS change_tables_character_set;
	CREATE PROCEDURE change_tables_character_set(in charset VARCHAR(64), in collation VARCHAR(64))
	BEGIN
	DECLARE done BOOLEAN DEFAULT false;
	DECLARE tab_name VARCHAR(64);
	DECLARE charset_cursor CURSOR FOR
	SELECT table_name FROM information_schema.tables
	WHERE table_schema = DATABASE() AND
	create_options = '';
	DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

	SET foreign_key_checks = 0;

	OPEN charset_cursor;
	change_loop: LOOP
	FETCH charset_cursor INTO tab_name;
	IF done THEN
	LEAVE change_loop;
	END IF;
	SET @alter = concat('ALTER TABLE ', tab_name,
	' CONVERT TO CHARACTER SET ', charset,
	' COLLATE ', collation , ';');
	SELECT @alter;
	PREPARE stmt FROM @alter;
	EXECUTE stmt;
	DEALLOCATE PREPARE stmt;
	END LOOP change_loop;
	CLOSE charset_cursor;
	SET foreign_key_checks = 1;
	END;

	DROP PROCEDURE IF EXISTS change_columns_character_set;
	CREATE PROCEDURE change_columns_character_set(IN charset VARCHAR(64), IN collation VARCHAR(64))
	BEGIN
	DECLARE done BOOLEAN DEFAULT false;
	DECLARE tab_name VARCHAR(64);
	DECLARE col_name VARCHAR(64);
	DECLARE col_type longtext;
	DECLARE charset_cursor CURSOR FOR
	SELECT table_name, column_name, column_type
	FROM information_schema.columns
	WHERE table_schema = DATABASE()
	AND table_name IN (
	SELECT table_name
	FROM information_schema.tables
	WHERE table_schema = DATABASE()
	AND create_options = ''
	)
	AND character_set_name IS NOT NULL;

	DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

	SET foreign_key_checks = 0;

	OPEN charset_cursor;

	change_loop: LOOP
	FETCH charset_cursor INTO tab_name, col_name, col_type;
	IF done THEN
	LEAVE change_loop;
	END IF;
	SET @alter = concat(
	'ALTER TABLE `',
	DATABASE(), '`.`',
	tab_name,
	'` MODIFY `',
	col_name,
	'` ',
	col_type ,
	' CHARACTER SET ',
	charset,
	' COLLATE ',
	collation,
	';'
	);

	SELECT @alter;
	PREPARE stmt FROM @alter;
	EXECUTE stmt;
	DEALLOCATE PREPARE stmt;
	END LOOP change_loop;

	CLOSE charset_cursor;
	SET foreign_key_checks = 1;
	END;

	DROP PROCEDURE IF EXISTS change_character_set;
	CREATE PROCEDURE change_character_set(IN charset VARCHAR(64), IN collation VARCHAR(64))
	BEGIN
	CALL change_tables_character_set(charset, collation);
	CALL change_columns_character_set(charset, collation);
	END;

	CALL change_character_set('utf8mb4', 'utf8mb4_unicode_ci');