Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
This Julia gist builds character-set mapping dictionaries (8-bit code-page bytes to Unicode characters) from the mapping files published at ftp.unicode.org/Public/MAPPINGS/, and uses them to decode text written in a given code page.
using HDF5, JLD
# Presumably a downloadable copy of the .jld file loaded below (TODO confirm):
# https://www.dropbox.com/s/o4ys5z3p7ogircd/characterSetMappingTables.jld?dl=0
# Load the variables stored in the JLD file. The functions below expect this to bring
# `characterSetMappingTables` into scope; the generator script later in this file
# saves a variable of that name to this same path.
@load "c:\\Users\\Martin\\Downloads\\MAPPINGS\\characterSetMappingTables.jld"
# Print the name of every available mapping table, one name per line, in sorted
# order. Reads the global `characterSetMappingTables` dictionary.
function printEncodings()
    tableNames = collect(keys(characterSetMappingTables))
    sort!(tableNames)
    for tableName in tableNames
        println(tableName)
    end
end
#printEncodings()
# Decode text by translating each of its bytes through a lookup table.
#   inputString  - value convertible to a byte array (Array{Uint8,1})
#   mappingTable - lookup indexed by byte value, e.g. one entry of
#                  `characterSetMappingTables`
# Returns the looked-up values joined into one string. A byte that has no entry in
# `mappingTable` is not handled here; the lookup's own error propagates.
function decodeText(inputString, mappingTable)
    rawBytes = convert(Array{Uint8,1}, inputString)
    decodedChars = map(byte -> mappingTable[byte], rawBytes)
    return join(decodedChars)
end
#println(decodeText(inputString, characterSetMappingTables["vendors/micsft/windows/cp1250"]))
using HDF5, JLD
# ftp://ftp.unicode.org/Public/MAPPINGS/
# Folder holding the local copy of the mapping files (note the trailing separator).
mappingDirectory = "c:\\Users\\Martin\\Downloads\\MAPPINGS\\"

# Relative paths of the mapping files to convert, grouped by sub-folder. The `let`
# block keeps the helper lists local so that only `mappingFiles` becomes a global.
mappingFiles = let
    iso8859Parts = ["1", "10", "11", "13", "14", "15", "16",
                    "2", "3", "4", "5", "6", "7", "8", "9"]
    macNames = ["CYRILLIC", "GREEK", "ICELAND", "LATIN2", "ROMAN", "TURKISH"]
    pcCodePages = ["437", "737", "775", "850", "852", "855", "857", "860",
                   "861", "862", "863", "864", "865", "866", "869", "874"]
    windowsCodePages = ["1250", "1251", "1252", "1253", "1254",
                        "1255", "1256", "1257", "1258", "874"]
    miscNames = ["KOI8-R", "KOI8-U"]

    vcat(
        map(part -> "ISO8859\\8859-" * part * ".TXT", iso8859Parts),
        map(name -> "VENDORS\\MICSFT\\MAC\\" * name * ".TXT", macNames),
        map(page -> "VENDORS\\MICSFT\\PC\\CP" * page * ".TXT", pcCodePages),
        map(page -> "VENDORS\\MICSFT\\WINDOWS\\CP" * page * ".TXT", windowsCodePages),
        map(name -> "VENDORS\\MISC\\" * name * ".TXT", miscNames))
end
# Parse one character-set mapping file into a byte -> character lookup table.
#
# Every line is split on whitespace. Lines with fewer than two tokens, and lines
# whose first token contains "#", are skipped. Otherwise the first token is parsed
# as the byte value and the second as the Unicode code point. If the second token
# contains "#" (the byte is listed without a code point), the byte is mapped to a
# space (0x20).
#
#   fileName - path of the mapping file to read
# Returns a Dict{Uint8, Char}.
function parseFile(fileName)
    translateTable = Dict{Uint8, Char}()
    # The do-block form closes the file when the block exits, even if a line fails
    # to parse. The handle was previously never closed.
    open(fileName, "r") do mappingStream
        for line in eachline(mappingStream)
            words = split(line)
            if length(words) < 2 || contains(words[1], "#")
                continue
            end
            uni = contains(words[2], "#") ? "0x20" : words[2]
            translateTable[uint8(words[1])] = char(int(uni))
        end
    end
    return translateTable
end
# Build the table-name -> lookup-table dictionary for every listed mapping file,
# then save it to a JLD file.
characterSetMappingTables = Dict{ASCIIString, Dict{Uint8, Char}}()
for mappingFile in mappingFiles
    # Table name: the lower-cased relative path without ".txt", using forward
    # slashes, e.g. "VENDORS\\MISC\\KOI8-R.TXT" -> "vendors/misc/koi8-r".
    tableName = replace(replace(lowercase(mappingFile), ".txt", ""), "\\", "/")
    # Pass the components separately so joinpath inserts a separator when needed,
    # rather than relying on mappingDirectory ending in one.
    fileName = joinpath(mappingDirectory, mappingFile)
    characterSetMappingTables[tableName] = parseFile(fileName)
end
@save "c:\\Users\\Martin\\Downloads\\MAPPINGS\\characterSetMappingTables.jld" characterSetMappingTables
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment