This Julia gist builds character-set mapping dictionaries (8-bit code-page bytes to Unicode characters) from the Unicode mapping files published at ftp.unicode.org/Public/MAPPINGS/, and uses those dictionaries to decode text written in a given code page.
using HDF5, JLD | |
# https://www.dropbox.com/s/o4ys5z3p7ogircd/characterSetMappingTables.jld?dl=0 | |
# Load the pre-built mapping tables. Naming the variable makes it explicit which global
# this statement creates (the functions below depend on it) and avoids the no-argument
# form of `@load`, which has to open the file while the macro is being expanded.
@load "c:\\Users\\Martin\\Downloads\\MAPPINGS\\characterSetMappingTables.jld" characterSetMappingTables
"""
    printEncodings()

Print the name of every table stored in `characterSetMappingTables`, one name per
line, in sorted order.
"""
function printEncodings()
    encodingNames = collect(keys(characterSetMappingTables))
    sort!(encodingNames)
    foreach(println, encodingNames)
    return nothing
end
#printEncodings() | |
# Raw bytes of the input: the code units of a string, or the vector itself when the
# caller already holds byte data.
codepagebytes(text::AbstractString) = codeunits(text)
codepagebytes(bytes::AbstractVector{<:Integer}) = bytes

"""
    decodeText(inputString, mappingTable)

Decode `inputString` by looking up each of its bytes in `mappingTable` and joining
the resulting characters into a `String`.

# Arguments
- `inputString`: the encoded text, either a string (its code units are used as the
  bytes) or a vector of byte values.
- `mappingTable`: a dictionary-like object indexed by byte value that yields the
  decoded character, e.g. one entry of `characterSetMappingTables`.

# Throws
- `KeyError` if a byte of the input has no entry in `mappingTable`.
"""
function decodeText(inputString, mappingTable)
    return join(mappingTable[byte] for byte in codepagebytes(inputString))
end
#println(decodeText(inputString, characterSetMappingTables["vendors/micsft/windows/cp1250"])) |
using HDF5, JLD | |
# ftp://ftp.unicode.org/Public/MAPPINGS/ | |
# Root directory holding a local copy of ftp://ftp.unicode.org/Public/MAPPINGS/
# (written with a trailing separator).
mappingDirectory = "c:\\Users\\Martin\\Downloads\\MAPPINGS\\"

# Mapping files to convert, relative to `mappingDirectory`. The list is assembled one
# family at a time; the order of the entries inside each family is significant only in
# that it reproduces the original listing order.
mappingFiles = vcat(
    ["ISO8859\\8859-$(part).TXT"
     for part in [1, 10, 11, 13, 14, 15, 16, 2, 3, 4, 5, 6, 7, 8, 9]],
    ["VENDORS\\MICSFT\\MAC\\$(script).TXT"
     for script in ["CYRILLIC", "GREEK", "ICELAND", "LATIN2", "ROMAN", "TURKISH"]],
    ["VENDORS\\MICSFT\\PC\\CP$(page).TXT"
     for page in [437, 737, 775, 850, 852, 855, 857, 860,
                  861, 862, 863, 864, 865, 866, 869, 874]],
    ["VENDORS\\MICSFT\\WINDOWS\\CP$(page).TXT"
     for page in [1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 874]],
    ["VENDORS\\MISC\\KOI8-$(variant).TXT" for variant in ["R", "U"]],
)
"""
    parseFile(fileName)

Read a character-set mapping file and return a `Dict{UInt8,Char}` that maps each
byte value to its character.

Every line is split on whitespace. Lines with fewer than two fields, and lines whose
first field contains `#` (comments), are skipped. Otherwise the first field is parsed
as the byte value and the second as the code point; both are expected to be integer
literals such as `0x80` / `0x20AC`. When the second field contains `#` (for example
`#UNDEFINED`), the byte is mapped to a space (`0x20`).

# Throws
- `SystemError` if the file cannot be opened.
- `ArgumentError` / `OverflowError` if a field is not a valid integer literal or the
  first field does not fit in a byte.
"""
function parseFile(fileName)
    translateTable = Dict{UInt8,Char}()
    # The do-block form closes the file even if parsing a line throws.
    open(fileName, "r") do io
        for line in eachline(io)
            words = split(line)
            length(words) < 2 && continue
            occursin("#", words[1]) && continue
            # No code point given for this byte: fall back to a space.
            codePoint = occursin("#", words[2]) ? "0x20" : words[2]
            translateTable[parse(UInt8, words[1])] = Char(parse(UInt32, codePoint))
        end
    end
    return translateTable
end
# Build one byte -> character table per mapping file, keyed by the file's relative path
# in lower case, without the ".txt" extension and with "/" as the separator
# (e.g. "vendors/micsft/windows/cp1250").
characterSetMappingTables = Dict{String,Dict{UInt8,Char}}()
for mappingFile in mappingFiles
    tableName = replace(replace(lowercase(mappingFile), ".txt" => ""), "\\" => "/")
    # Two-argument joinpath also copes with a directory written without a trailing separator.
    fileName = joinpath(mappingDirectory, mappingFile)
    characterSetMappingTables[tableName] = parseFile(fileName)
end
# Persist the tables so the decoding script can load them without re-parsing.
@save "c:\\Users\\Martin\\Downloads\\MAPPINGS\\characterSetMappingTables.jld" characterSetMappingTables
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment