Skip to content

Instantly share code, notes, and snippets.

@quinnj
Last active August 29, 2015 14:18
Show Gist options
  • Save quinnj/2054b2018a5fb32ee983 to your computer and use it in GitHub Desktop.
Save quinnj/2054b2018a5fb32ee983 to your computer and use it in GitHub Desktop.
# Validation Rules
# Each row must contain the same number of columns
# Each field must be correctly formatted for its type, which must be one of:
# Empty (missing value)
# DATE
# DATETIME
# INTEGER
# DOUBLE
# STRING
# Quoting: a field may be enclosed in f.quotechar; within the quotes, f.escapechar may be used to specify a literal f.quotechar, f.delim, or f.newline
module CSV
# CRLF (the two-character "\r\n" newline constant) is the only exported name.
export CRLF
# Version shim: for versions newer than 0.4.0-dev the Dates module is loaded from Base;
# otherwise the standalone Dates package is loaded and `AbstractString` is bound to
# `String` so the rest of the module can use the same name in both cases.
if VERSION > v"0.4.0-dev"
using Base.Dates
else
AbstractString = String
using Dates
end
# Short alias used throughout this module for "any string type".
typealias Str AbstractString
# Exception thrown when a file fails validation or its layout cannot be detected.
# `msg` holds the human-readable description.
#
# The field accepts any string type: messages in this module interpolate file paths
# and field values, which may contain non-ASCII characters. With an ASCII-only field
# the constructor itself would fail on such a message instead of reporting the error.
immutable CSVError <: Exception
    msg::AbstractString
end
# A fixed pair of characters, used to represent a two-character newline sequence.
immutable Chars
    one::Char   # first character of the pair
    two::Char   # second character of the pair
end

# Carriage return followed by line feed ("\r\n").
const CRLF = Chars('\r', '\n')

# A newline is either a single character or a two-character pair.
typealias OneOrTwoChars Union(Char, Chars)
# Read the next UTF-8 encoded character from `f` and store it in `c`.
# This is a thin wrapper over the C routine `ios_getutf8`, called on the stream's
# underlying `ios` handle; the routine's Cint result is returned unchanged
# (presumably a status code - the callers in this module ignore it).
@inline function read!(f::IOStream, c::Ref{Char})
    return ccall(:ios_getutf8, Cint, (Ptr{Void}, Ptr{Char}), f.ios, c)
end
# Description of a delimited text file: where it lives and how its rows and fields
# are laid out. Instances are normally built through the keyword constructor
# `File(fullpath, delim; ...)`.
immutable File
    fullpath::Str            # path that gets opened when the file is validated
    delim::Char              # character separating fields within a row
    newline::OneOrTwoChars   # row terminator: a single Char or a two-Char pair such as CRLF
    quotechar::Char          # character that opens and closes a quoted field
    escapechar::Char         # inside a quoted field, the character following it is taken literally
    headerrow::Int           # row number of the header line
    datarow::Int             # row number of the first data line
    footerskip::Int          # stored as given by the constructor
    cols::Int                # number of fields every row is expected to have
    types::Vector            # per-column type handed to `validatetype`
    formats::Vector{Str}     # per-column format handed to `validatetype`
end
# Build a `File` description for the delimited text file at `fullpath`.
#
# Positional arguments
#   fullpath   - path of an existing file
#   delim      - field delimiter (default ',')
# Keyword arguments
#   newline    - row terminator; the default '\ufeff' is a sentinel meaning
#                "detect it from the first line of the file"
#   quotechar  - character that opens/closes a quoted field (default '"')
#   escapechar - inside quotes, the character after it is taken literally (default '\\')
#   numcols    - number of columns; 0 (default) means "count the fields on the first line"
#   types      - per-column types; when empty, every column is treated as `Str`
#   formats    - per-column formats; when empty, each column gets ""
#   headerrow  - row number of the header (default 1)
#   datarow    - row number of the first data row (default 2); must be > headerrow
#   footerskip - stored on the resulting `File` unchanged
#
# Throws `ArgumentError` when `fullpath` is not a file or `datarow <= headerrow`, and
# `CSVError` when newline detection was requested but no newline was found.
function File(fullpath::Str,
              delim::Char=',';
              newline::OneOrTwoChars='\ufeff',
              quotechar::Char='"',
              escapechar::Char='\\',
              numcols::Int=0,
              types::Vector=[],
              formats::Vector=[],
              headerrow::Int=1,
              datarow::Int=2,
              footerskip::Int=0)
    # argument checks
    isfile(fullpath) || throw(ArgumentError("$fullpath is not a valid file"))
    datarow > headerrow || throw(ArgumentError("data row ($datarow) must come after header row ($headerrow)"))
    # detect newline?
    dn = newline == '\ufeff'
    # detect number of columns?
    dc = numcols == 0
    # these are assigned before the `try` so the detection loop updates these locals
    n = newline
    ncols = 1
    if dn || dc
        f = open(fullpath)
        try
            c = Ref{Char}()
            # scan the first line: count delimiters outside quotes, stop at the first newline
            while !eof(f)
                read!(f, c)
                if c.x == quotechar
                    # skip over the quoted field, honouring escaped characters
                    while !eof(f)
                        read!(f, c)
                        if c.x == escapechar
                            read!(f, c)
                        elseif c.x == quotechar
                            break
                        end
                    end
                elseif c.x == delim
                    ncols += 1
                elseif c.x == '\n'
                    n = '\n'
                    break
                elseif c.x == '\r'
                    # a '\r' is either a newline by itself or the first half of "\r\n"
                    read!(f, c)
                    n = c.x == '\n' ? CRLF : '\r'
                    break
                end
            end
        finally
            # the stream is only needed for detection; always release it
            close(f)
        end
        dn && n == '\ufeff' && throw(CSVError("couldn't detect a default newline, one of '\\r', '\\n', or \"\\r\\n\""))
    end
    newline = dn ? n : newline
    cols = dc ? ncols : numcols
    # TODO: column type detection is not implemented; columns default to Str below
    if isempty(types)
        types = Array(DataType, cols)
        fill!(types, Str)
    end
    if isempty(formats)
        formats = Array(Str, length(types))
        fill!(formats, "")
    end
    return File(fullpath, delim, newline, quotechar, escapechar, headerrow, datarow, footerskip, cols, types, formats)
end
# validatetype(value, T, f)
#
# Check that the raw field text `value` is acceptable for a column of type `T`;
# `f` is the column's format. Returns `nothing` when the value is acceptable.
# An empty `value` is always accepted: the validation rules of this module treat an
# empty field as a missing value.

# String columns accept any text.
validatetype{T<:Str}(value::Str, ::Type{T}, f) = nothing

# Numeric columns: the text must parse as a `T`, otherwise a `CSVError` is thrown.
function validatetype{T<:Real}(value::Str, ::Type{T}, f)
    isempty(value) && return
    t = tryparse(T, value)
    isnull(t) && throw(CSVError("$value is not a valid $T value"))
    return
end

# Date/time columns: the text must be constructible as a `T`. Whatever exception the
# `T` constructor raises for unparseable text is propagated unchanged.
function validatetype{T<:TimeType}(value::Str, ::Type{T}, f)
    isempty(value) && return
    if isa(f, Str) && isempty(f)
        # "" is the "no format given" default filled in by the File constructor;
        # fall back to the type's default string parsing instead of an empty format
        T(value)
    else
        T(value, f)
    end
    return
end
# checknewline(f, c, n)
#
# Return `true` when the character just read into `c` from stream `f` completes the
# newline `n`. For a single-character newline this is a plain comparison.
@inline checknewline(f, c::Ref{Char}, n::Char) = c.x::Char == n

# Two-character newline: when `c` holds the first character, look at the next one.
# If it is the second character, both have been consumed and `true` is returned.
# Otherwise the stream is rewound to just after the first character and `c` is
# restored, so the caller continues as if nothing had been read ahead.
@inline function checknewline(f, c::Ref{Char}, n::Chars)
    if c.x::Char == n.one
        mark(f)
        read!(f, c)
        c.x::Char == n.two && return true
        reset(f)
        c.x = n.one
    end
    return false
end

# Consume the remainder of a quoted field from `f`, up to and including the closing
# quote character `q`. The character following an escape character `e` is skipped
# without being interpreted.
function skipquoted!(f, c, q, e)
    while !eof(f)
        read!(f, c)
        if c.x == e
            read!(f, c)
        elseif c.x == q
            break
        end
    end
    return
end

# validateline!(file, f, c, i, buf, isheader, Val{false})
#
# Read one row from stream `f` and check only that it has `file.cols` fields.
# `c` is the scratch character reference used for reading and `i` is the row number
# used in the error message; `buf` and `isheader` are unused by this method.
# Throws `CSVError` when the field count differs from `file.cols`.
function validateline!(file, f, c, i, buf, isheader, ::Type{Val{false}})
    q = file.quotechar
    e = file.escapechar
    d = file.delim
    n = file.newline
    cols = file.cols
    fieldsfound = 1
    while !eof(f)
        read!(f, c)
        if c.x == q # if we run into a quote character, skip to the closing one
            skipquoted!(f, c, q, e)
        elseif c.x == d # if we've found a delimiter
            fieldsfound += 1
        elseif checknewline(f, c, n) # handles one- and two-character newlines
            break
        end
    end
    return fieldsfound == cols || throw(CSVError("error parsing line $i: expected $cols fields, only detected $fieldsfound"))
end

# validateline!(file, f, c, i, buf, isheader, Val{true})
#
# Read one row from stream `f`, checking the field count and - unless `isheader` is
# true - validating every field against its column type and format via `validatetype`.
# The text of the current field is accumulated in `buf`, which is emptied each time a
# field ends. Throws `CSVError` when the field count differs from `file.cols`;
# exceptions raised by `validatetype` propagate.
function validateline!(file, f, c, i, buf, isheader, ::Type{Val{true}})
    q = file.quotechar
    e = file.escapechar
    d = file.delim
    n = file.newline
    cols = file.cols
    types = file.types
    formats = file.formats
    # fields beyond this index have no declared type/format; they are not type-checked
    # and are reported through the field-count error below
    ntyped = min(length(types), length(formats))
    fieldsfound = 1
    while !eof(f)
        read!(f, c)
        if c.x == q # if we run into a quote character
            while !eof(f) # keep reading until we reach the closing q
                read!(f, c)
                if c.x == e
                    # keep the escape character, then fetch the escaped character;
                    # it is written exactly once by the write below
                    write(buf, c.x)
                    read!(f, c)
                elseif c.x == q
                    break
                end
                write(buf, c.x)
            end
        elseif c.x == d # if we've found a delimiter
            t = takebuf_string(buf)
            !isheader && fieldsfound <= ntyped && validatetype(t, types[fieldsfound], formats[fieldsfound])
            fieldsfound += 1
        elseif checknewline(f, c, n) # handles one- and two-character newlines
            break
        else
            write(buf, c.x)
        end
    end
    # the last field of the row is terminated by the newline (or end of file)
    t = takebuf_string(buf)
    !isheader && fieldsfound <= ntyped && validatetype(t, types[fieldsfound], formats[fieldsfound])
    return fieldsfound == cols || throw(CSVError("error parsing line $i: expected $cols fields, only detected $fieldsfound"))
end

# skiplinesto!(file, f, c, i, n)
#
# Starting at row number `i`, consume whole rows from `f` until row `n` is reached or
# the stream ends. Newlines inside quoted fields do not count as row breaks.
# Returns the row number the stream is positioned at.
function skiplinesto!(file, f, c, i, n)
    newline = file.newline
    q = file.quotechar
    e = file.escapechar
    while !eof(f)
        i == n && break
        read!(f, c)
        if c.x == q
            skipquoted!(f, c, q, e)
        elseif checknewline(f, c, newline)
            i += 1
        end
    end
    return i
end
# validate(file, buf=IOBuffer(); verbose=true, checktypes=true)
#
# Validate the file described by `file`: the header row and every row from
# `file.datarow` to the end of the file are checked with `validateline!`.
#   buf        - scratch buffer used to accumulate field text; if its size is fixed, it
#                needs to be at least as big as the largest expected field (value for a
#                single column) in the file
#   verbose    - print a progress line every 100000 rows
#   checktypes - when true, field values are checked against the column types as well
#                as the field count; when false, only the field count is checked
# Returns `nothing`; the first problem found is reported by the exception that
# `validateline!` throws. The file is closed again whether or not validation succeeds.
# NOTE: `file.footerskip` is not consulted here.
function validate(file::File, buf::IOBuffer=IOBuffer(); verbose::Bool=true, checktypes::Bool=true)
    # lift the flag to the type domain so the matching validateline! method is selected
    t = checktypes ? Val{true} : Val{false}
    f = open(file.fullpath)
    try
        c = Ref{Char}()
        i = skiplinesto!(file, f, c, 1, file.headerrow)
        validateline!(file, f, c, file.headerrow, buf, true, t)
        i += 1
        i = skiplinesto!(file, f, c, i, file.datarow)
        while !eof(f)
            validateline!(file, f, c, i, buf, false, t)
            verbose && i % 100000 == 0 && println("Validated $i rows...")
            i += 1
        end
    finally
        close(f)
    end
    return nothing
end
end # module
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment