Skip to content

Instantly share code, notes, and snippets.

@quinnj
Created October 15, 2015 22:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save quinnj/38ec81c770bed6291391 to your computer and use it in GitHub Desktop.
Save quinnj/38ec81c770bed6291391 to your computer and use it in GitHub Desktop.
Potential Julia tabular data structure
# Column-oriented table whose columns are grouped into one bucket per element type.
# The type parameter `T` is not referenced by any field below.
# NOTE(review): the constructors and accessors in this file are written against
# `DataTable`, and the 10-argument call in the constructor lines up with these
# fields in order — presumably this is the same type; confirm the Table/DataTable naming.
type Table{T} <: Source
# table schema; code in this file reads its size, `types`, `cols` and `header`
schema::Schema
# one entry per column; the constructor in this file stores the column's
# position inside its type-specific bucket here
index::Vector{Int}
# bucket of Int columns
ints::Vector{NullableVector{Int}}
# bucket of Float64 columns
floats::Vector{NullableVector{Float64}}
# bucket of PointerString columns
ptrstrings::Vector{NullableVector{PointerString}}
# bucket of UTF8String columns
strings::Vector{NullableVector{UTF8String}}
# bucket of Date columns
dates::Vector{NullableVector{Date}}
# bucket of DateTime columns
datetimes::Vector{NullableVector{DateTime}}
# bucket of columns with eltype Any (the constructor's fallback for all other types)
any::Vector{NullableVector{Any}}
other::Any # sometimes you just need to keep a reference around...
end
# Wrapper around a dictionary keyed by `Type{T}` whose values are vectors of
# nullable columns with element type `T`.
# Not referenced by any other definition in the visible source.
type D{T}
# maps `Type{T}` to a `Vector{NullableVector{T}}`
data::Dict{Type{T},Vector{NullableVector{T}}}
end
# Constructors
# Allocate an empty table for `schema`.
#
# For every schema column, a nullable column of `rows` elements is created and
# appended to the bucket matching the column's element type; `index[col]`
# records the position of column `col` inside that bucket.
#
# Arguments:
#   schema - provides `size(schema)` as `(rows, cols)` and the per-column
#            element types via `schema.types`
#   other  - passed through unchanged as the last constructor argument (default 0)
#
# Returns the result of the 10-argument `DataTable(...)` constructor call.
function DataTable(schema::Schema,other=0)
    rows, cols = size(schema)
    # one (initially empty) bucket per supported element type
    ints = NullableVector{Int}[]
    floats = NullableVector{Float64}[]
    ptrstrings = NullableVector{PointerString}[]
    strings = NullableVector{UTF8String}[]
    dates = NullableVector{Date}[]
    datetimes = NullableVector{DateTime}[]
    any = NullableVector{Any}[]
    index = Array(Int,cols)
    for col = 1:cols
        T = schema.types[col]
        if T == Int
            push!(ints,NullableArray(T, rows))
            index[col] = length(ints)
        elseif T == Float64
            push!(floats,NullableArray(T, rows))
            index[col] = length(floats)
        elseif T == PointerString
            # checked before the AbstractString branch so PointerString keeps its own bucket
            push!(ptrstrings,NullableArray(T, rows))
            index[col] = length(ptrstrings)
        elseif T <: AbstractString
            # every other string type is stored as UTF8String
            push!(strings,NullableArray(UTF8String, rows))
            index[col] = length(strings)
        elseif T == Date
            push!(dates,NullableArray(T, rows))
            index[col] = length(dates)
        elseif T == DateTime
            push!(datetimes,NullableArray(T, rows))
            index[col] = length(datetimes)
        else
            # The fallback bucket holds NullableVector{Any}; allocate with eltype
            # Any directly instead of relying on push! to convert a
            # NullableVector{T} into a NullableVector{Any}.
            push!(any,NullableArray(Any, rows))
            index[col] = length(any)
        end
    end
    return DataTable(schema,index,ints,floats,ptrstrings,strings,dates,datetimes,any,other)
end
# Build a table from a vector of column types and a row count by wrapping
# them in a `Schema` first; `other` is forwarded unchanged.
function DataTable(types::Vector{DataType},rows::Int,other=0)
    sch = Schema(types,rows)
    return DataTable(sch,other)
end
# Build a table from the schema reported by `source`.
function DataTable(source::Source)
    return DataTable(schema(source))
end
# Interface
# convert to DataFrame
# Convert a DataTable into a DataFrame: each column is fetched through
# `DataStreams.column` using its declared type, and the schema header entries
# become the column names.
function DataFrames.DataFrame(dt::DataStreams.DataTable)
    ncols = dt.schema.cols
    coltypes = DataStreams.types(dt)
    columns = Any[DataStreams.column(dt,k,coltypes[k]) for k = 1:ncols]
    header = Symbol[symbol(h) for h in dt.schema.header]
    return DataFrame(columns,header)
end
# column access
export column
# Return column `j` of `dt`, looked up in the bucket selected by the type `T`.
#
# Arguments:
#   dt - the table to read from
#   j  - column number; must satisfy 0 < j < length(dt.index)+1
#   T  - element type used to choose the `unsafe_column` method
#
# Throws ArgumentError when `j` is out of range.
function column(dt::DataTable, j, T)
    # The message must interpolate `j`: it is the only index in scope, and
    # referencing any other name would raise UndefVarError instead of the
    # intended ArgumentError.
    (0 < j < length(dt.index)+1) || throw(ArgumentError("column index $j out of range"))
    return unsafe_column(dt, j, T)
end
# Fetch column `j` from the bucket that matches the requested element type,
# without bounds checking (`@inbounds`); callers must validate `j` first.
# Dispatches on `Int` (not `Int64`) so the lookup agrees with the `ints` bucket,
# which the struct and constructor key on `Int`, on every platform word size.
@inline unsafe_column(dt::DataTable, j, ::Type{Int}) = (@inbounds col = dt.ints[dt.index[j]]; return col)
@inline unsafe_column(dt::DataTable, j, ::Type{Float64}) = (@inbounds col = dt.floats[dt.index[j]]; return col)
@inline unsafe_column(dt::DataTable, j, ::Type{PointerString}) = (@inbounds col = dt.ptrstrings[dt.index[j]]; return col)
# any other AbstractString subtype is read from the `strings` bucket
@inline unsafe_column{T<:AbstractString}(dt::DataTable, j, ::Type{T}) = (@inbounds col = dt.strings[dt.index[j]]; return col)
@inline unsafe_column(dt::DataTable, j, ::Type{Date}) = (@inbounds col = dt.dates[dt.index[j]]; return col)
@inline unsafe_column(dt::DataTable, j, ::Type{DateTime}) = (@inbounds col = dt.datetimes[dt.index[j]]; return col)
# fallback: every other type is read from the `any` bucket
@inline unsafe_column(dt::DataTable, j, T) = (@inbounds col = dt.any[dt.index[j]]; return col)
# cell indexing
# Cell access `dt[i, j]`: look up the declared type of column `j`, fetch that
# column through `column`, then index it with `i`.
function Base.getindex(dt::DataTable, i, j)
    coltype = types(dt)[j]
    return column(dt, j, coltype)[i]
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment