Skip to content

Instantly share code, notes, and snippets.

@martinmev
Last active August 29, 2015 14:20
Show Gist options
  • Save martinmev/3b3b581627d2eaef4056 to your computer and use it in GitHub Desktop.
Save martinmev/3b3b581627d2eaef4056 to your computer and use it in GitHub Desktop.
This Julia gist generates character-set mapping dictionaries (8-bit characters to Unicode) from the mapping files published at ftp.unicode.org/Public/MAPPINGS/ and uses them to decode text from a given code page.
using HDF5, JLD
# A copy of the saved tables file (presumably the output of the generator script):
# https://www.dropbox.com/s/o4ys5z3p7ogircd/characterSetMappingTables.jld?dl=0
# Load the variables stored in the JLD file into the current scope. The code below
# reads `characterSetMappingTables`, which is expected to come from this file
# (the generator script saves a variable of that name to the same path).
@load "c:\\Users\\Martin\\Downloads\\MAPPINGS\\characterSetMappingTables.jld"
# Print the name of every available mapping table, one per line, in sorted order.
# Reads the global `characterSetMappingTables`; returns `nothing`.
function printEncodings()
    tableNames = collect(keys(characterSetMappingTables))
    sort!(tableNames)
    for tableName in tableNames
        println(tableName)
    end
    return nothing
end
#printEncodings()
"""
    decodeText(inputString, mappingTable)

Decode 8-bit encoded text into a Unicode string.

`inputString` supplies the raw bytes: either a string (its bytes are used as-is,
they need not be valid UTF-8) or a vector of bytes. `mappingTable` is indexed with
each byte (`UInt8`) and must yield the corresponding character, as the entries of
`characterSetMappingTables` do.

Returns the decoded `String`. A byte without an entry in `mappingTable` raises the
table's indexing error (a `KeyError` for a `Dict`).
"""
function decodeText(inputString, mappingTable)
    # `Vector{UInt8}(x)` replaces the removed `convert(Array{Uint8,1}, x)`: it yields
    # the raw bytes of a string and also accepts an existing byte vector.
    bytes = Vector{UInt8}(inputString)
    return join(mappingTable[b] for b in bytes)
end
#println(decodeText(inputString, characterSetMappingTables["vendors/micsft/windows/cp1250"]))
using HDF5, JLD
# The mapping files come from: ftp://ftp.unicode.org/Public/MAPPINGS/
# Local directory holding a copy of those files (written with a trailing separator).
mappingDirectory = "c:\\Users\\Martin\\Downloads\\MAPPINGS\\"
# Mapping files to process, as Windows-style paths relative to `mappingDirectory`:
# the ISO 8859 parts, the Microsoft MAC / PC / WINDOWS vendor tables, and KOI8-R/U.
mappingFiles = ["ISO8859\\8859-1.TXT",
"ISO8859\\8859-10.TXT",
"ISO8859\\8859-11.TXT",
"ISO8859\\8859-13.TXT",
"ISO8859\\8859-14.TXT",
"ISO8859\\8859-15.TXT",
"ISO8859\\8859-16.TXT",
"ISO8859\\8859-2.TXT",
"ISO8859\\8859-3.TXT",
"ISO8859\\8859-4.TXT",
"ISO8859\\8859-5.TXT",
"ISO8859\\8859-6.TXT",
"ISO8859\\8859-7.TXT",
"ISO8859\\8859-8.TXT",
"ISO8859\\8859-9.TXT",
"VENDORS\\MICSFT\\MAC\\CYRILLIC.TXT",
"VENDORS\\MICSFT\\MAC\\GREEK.TXT",
"VENDORS\\MICSFT\\MAC\\ICELAND.TXT",
"VENDORS\\MICSFT\\MAC\\LATIN2.TXT",
"VENDORS\\MICSFT\\MAC\\ROMAN.TXT",
"VENDORS\\MICSFT\\MAC\\TURKISH.TXT",
"VENDORS\\MICSFT\\PC\\CP437.TXT",
"VENDORS\\MICSFT\\PC\\CP737.TXT",
"VENDORS\\MICSFT\\PC\\CP775.TXT",
"VENDORS\\MICSFT\\PC\\CP850.TXT",
"VENDORS\\MICSFT\\PC\\CP852.TXT",
"VENDORS\\MICSFT\\PC\\CP855.TXT",
"VENDORS\\MICSFT\\PC\\CP857.TXT",
"VENDORS\\MICSFT\\PC\\CP860.TXT",
"VENDORS\\MICSFT\\PC\\CP861.TXT",
"VENDORS\\MICSFT\\PC\\CP862.TXT",
"VENDORS\\MICSFT\\PC\\CP863.TXT",
"VENDORS\\MICSFT\\PC\\CP864.TXT",
"VENDORS\\MICSFT\\PC\\CP865.TXT",
"VENDORS\\MICSFT\\PC\\CP866.TXT",
"VENDORS\\MICSFT\\PC\\CP869.TXT",
"VENDORS\\MICSFT\\PC\\CP874.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1250.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1251.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1252.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1253.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1254.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1255.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1256.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1257.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP1258.TXT",
"VENDORS\\MICSFT\\WINDOWS\\CP874.TXT",
"VENDORS\\MISC\\KOI8-R.TXT",
"VENDORS\\MISC\\KOI8-U.TXT"]
"""
    parseFile(fileName)

Parse an 8-bit character-set mapping file and return a `Dict{UInt8, Char}` mapping
each byte value to its Unicode character.

Every line is split on whitespace. The first field is read as the byte value and the
second as the Unicode code point (numbers such as `0x80` / `0x20AC`). Lines are
handled as follows:

- fewer than two fields: skipped;
- first field contains `#` (comment line): skipped;
- second field contains `#` (no code point given, the comment follows the byte
  directly): the byte is mapped to a space (U+0020).

Throws if the file cannot be opened or if a field is not a number in the valid range.
"""
function parseFile(fileName)
    translateTable = Dict{UInt8, Char}()
    # The do-block form closes the file even when parsing throws; the previous
    # `readlines(open(...))` never closed the handle.
    open(fileName, "r") do io
        for line in eachline(io)
            words = split(line)
            length(words) < 2 && continue
            occursin("#", words[1]) && continue
            uni = occursin("#", words[2]) ? "0x20" : words[2]
            # `parse` recognises the `0x` prefix, so the fields are read as hex.
            translateTable[parse(UInt8, words[1])] = Char(parse(UInt32, uni))
        end
    end
    return translateTable
end
# Build one byte => character table per mapping file. Each table is keyed by the
# file's relative path in lower case, without the ".txt" extension and with "/"
# as separator, e.g. "vendors/micsft/windows/cp1250".
characterSetMappingTables = Dict{String, Dict{UInt8, Char}}()
for mappingFile in mappingFiles
    # Pair-based `replace` is the supported form; the positional
    # `replace(s, pattern, replacement)` method was removed in Julia 1.0.
    tableName = replace(replace(lowercase(mappingFile), ".txt" => ""), "\\" => "/")
    # Let `joinpath` combine directory and file instead of concatenating first.
    fileName = joinpath(mappingDirectory, mappingFile)
    characterSetMappingTables[tableName] = parseFile(fileName)
end
# Persist the tables so that they can later be restored with `@load`.
@save "c:\\Users\\Martin\\Downloads\\MAPPINGS\\characterSetMappingTables.jld" characterSetMappingTables
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment