Last active
August 29, 2015 14:04
-
-
Save garborg/8df890d1603263b77793 to your computer and use it in GitHub Desktop.
Latin1Buffer type for Julia -- converts to UTF-8 on the fly -- quick workaround for importing Latin1-encoded files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
    Latin1Buffers

Provides `Latin1Buffer`, an `IO` wrapper that converts a Latin-1 (ISO-8859-1)
byte stream to UTF-8 on the fly, one byte at a time, as it is read.
"""
module Latin1Buffers

export Latin1Buffer

# Sentinel meaning "no continuation byte is pending".
# Safe because 0x00 is not a valid UTF-8 continuation byte (leading bits must be '10').
const EMPTY = 0x00

"""
    Latin1Buffer(io::IO)

Wrap `io`, whose contents are Latin-1 encoded, so that bytes read from the
wrapper form the UTF-8 encoding of the same text.

Only `read(::Latin1Buffer, UInt8)` and `eof(::Latin1Buffer)` are implemented
here; anything else relies on the generic `IO` fallbacks built on those two.

# Fields
- `io`: the wrapped Latin-1 stream.
- `continuation`: the second byte of a two-byte UTF-8 sequence that still has to
  be handed out, or `EMPTY` when nothing is pending.
"""
mutable struct Latin1Buffer{T<:IO} <: IO
    io::T
    continuation::UInt8
    # Concretely typed `io` lets the per-byte `read` below be statically dispatched.
    Latin1Buffer(io::T) where {T<:IO} = new{T}(io, EMPTY)
end

"""
    read(l1::Latin1Buffer, ::Type{UInt8})

Return the next byte of the UTF-8 encoding of the wrapped Latin-1 stream.

ASCII bytes (`< 0x80`) pass through unchanged. Any other Latin-1 byte becomes a
two-byte UTF-8 sequence: the lead byte is returned by this call and the
continuation byte is stored and returned by the next call.

Errors raised by reading the wrapped stream (e.g. at end of input) propagate.
"""
function Base.read(l1::Latin1Buffer, ::Type{UInt8})
    # Finish a two-byte sequence started by the previous call.
    if l1.continuation != EMPTY
        pending = l1.continuation
        l1.continuation = EMPTY
        return pending
    end
    c = read(l1.io, UInt8)
    # The first 128 code points are encoded identically in Latin-1 and UTF-8.
    c < 0x80 && return c
    # Code points 0x80-0xFF need two bytes in UTF-8: 110000xx 10xxxxxx.
    l1.continuation = 0x80 | (c & 0x3f)
    return 0xc0 | (c >> 6)
end

"""
    eof(l1::Latin1Buffer)

Return `true` only when no continuation byte is pending and the wrapped stream
is itself at end of file.
"""
Base.eof(l1::Latin1Buffer) = l1.continuation == EMPTY && eof(l1.io)

end # module Latin1Buffers

# EXAMPLE:
# Write every Latin-1 byte to a temporary file, then read it back as UTF-8.
using .Latin1Buffers
tempfile = tempname()
open(io -> write(io, 0x00:0xff), tempfile, "w")
converted = open(io -> read(Latin1Buffer(io), String), tempfile)
rm(tempfile; force=true)  # do not leave the scratch file behind
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment