Skip to content

Instantly share code, notes, and snippets.

@cpatulea
Created April 30, 2024 04:54
Show Gist options
  • Save cpatulea/8446030f4a70e92fd66fa9a090c238c3 to your computer and use it in GitHub Desktop.
Save cpatulea/8446030f4a70e92fd66fa9a090c238c3 to your computer and use it in GitHub Desktop.
Foulab wiki: non-ASCII characters
with h as (
select *, convert(data using utf8) as data_utf8 from tiki_history
where data <> convert(convert(data using utf8) using ascii)
),
h1b as (
with recursive cte as (
select pageName, version, data_utf8, 0 as ascii, ' ' as c, 1 as pos
from h
union all
select pageName, version, data_utf8,
regexp_instr(substring(data_utf8, pos, 1), '[\r\n\t -~]'),
substring(data_utf8, pos, 1),
pos + 1
from cte
where pos <= char_length(data_utf8)
)
select pageName, version, ascii, c
from cte
where pos >= 2
and not ascii
)
-- select
-- hex(c),
-- c,
-- concat('https://laboratoires.foulab.org/w/tiki-pagehistory.php?page=', replace(pageName, ' ', '+'), '&preview=', version)
-- from h1b;
select
hex(c) as c_hex,
c,
count(*)
-- min(concat('https://laboratoires.foulab.org/w/tiki-pagehistory.php?page=', replace(pageName, ' ', '+'), '&preview=', version)) as url
from h1b group by c_hex, c order by c_hex;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment