Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danielholmstrom/1264170 to your computer and use it in GitHub Desktop.
Save danielholmstrom/1264170 to your computer and use it in GitHub Desktop.
Fix double encoded UTF-8 in mysql with an SQL-query.
-- Example of how to fix double encoded UTF-8 in mysql with a single mysql query.
-- This example assumes that the data has been inserted with NAMES=latin1.
--
-- XXX: Do not rely on this to work without testing it. XXX
--
-- XXX: If this is run on a column with a UNIQUE index and the COLLATE is
-- case-insensitive this might fail. The reason for this is that double-encoded
-- upper and lower case chars are considered different but when encoded correctly
-- they will be considered the same.
DROP TABLE IF EXISTS double_encoded_utf8;
CREATE TABLE double_encoded_utf8(
`text` varchar(500) COLLATE utf8_swedish_ci
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_swedish_ci;
-- Insert text with NAMES=latin1
SET NAMES latin1;
INSERT INTO double_encoded_utf8 VALUES("!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ");
SELECT `text` AS '=== Test selected with NAMES=latin1' FROM double_encoded_utf8;
SELECT count(*) as '=== Number of matches before encoding' FROM double_encoded_utf8 WHERE `text`="!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
-- Switch to UTF8
SET NAMES utf8;
SELECT `text` AS '=== Test selected with NAMES=utf8 before converting' FROM double_encoded_utf8;
SELECT count(*) as '=== Number of matches before encoding' FROM double_encoded_utf8 WHERE `text`="!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
-- Convert
UPDATE double_encoded_utf8 SET `text`=convert(cast(cast(`text` AS CHAR CHARACTER SET latin1) AS BINARY) USING utf8);
SELECT `text` AS '=== Test selected with NAMES=utf8 after converting' FROM double_encoded_utf8;
SELECT count(*) as '=== Number of matches after encoding' FROM double_encoded_utf8 WHERE `text`="!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment