quinnj/gist:eb25a3f7493aab6cb450707c83f0b170

## gistfile1.txt
# Reproduction setup for the allocation spike seen in `CSV.parsetape!`.
# NOTE: the next line is Pkg REPL-mode syntax (press `]` at the julia> prompt); it asks Pkg
# for the `jq/lazystrings` revision of CSV and is not valid inside a plain .jl script.
] add CSV#jq/lazystrings
using CSV
# Input file: `randoms.csv`, one level above the directory that holds CSV's entry file.
source = joinpath(dirname(pathof(CSV)), "../randoms.csv")
# Parsing options. They are passed *positionally* to the internal `CSV.Header` constructor
# below, so the names here must stay in sync with the argument order of that call.
header = 1
normalizenames = false
datarow = -1
skipto = nothing
footerskip = 0
limit = typemax(Int64)
transpose = false
comment = nothing
use_mmap=true
ignoreemptylines=false
threaded=false
# Only the `id` column is selected.
select=[:id]
drop=nothing
missingstrings=String[]
missingstring=""
delim=nothing
ignorerepeated=false
quotechar='"'
openquotechar=nothing
closequotechar=nothing
escapechar='"'
dateformat=nothing
dateformats=nothing
decimal=UInt8('.')
truestrings=["true", "True", "TRUE"]
falsestrings=["false", "False", "FALSE"]
type=nothing
# Requests Int32 for key 1 (presumably the first column -- TODO confirm).
types=Dict(1=>Int32)
typemap=Dict{Type,Type}()
categorical=false
pool=false
lazystrings=false
strict=false
silencewarnings=false
debug=false
parsingdebug=false
# NOTE(review): the trailing positional `false` has no named variable in this script;
# check the `CSV.Header` signature on this branch for its meaning.
h = CSV.Header(source, header, normalizenames, datarow, skipto, footerskip, limit, transpose, comment, use_mmap, ignoreemptylines, threaded, select, drop, missingstrings, missingstring, delim, ignorerepeated, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, categorical, pool, lazystrings, strict, silencewarnings, debug, parsingdebug, false)
# Unpack the header fields into globals. This rebinds `types`, `pool` and `categorical`,
# which held the user-facing options above, so re-running the `CSV.Header` line afterwards
# in the same session would pass the header's values instead of the original options.
rowsguess, ncols, buf, len, datapos, options, coloptions, positions, types, flags, pool, categorical, customtypes = h.rowsguess, h.cols, h.buf, h.len, h.datapos, h.options, h.coloptions, h.positions, h.types, h.flags, h.pool, h.categorical, h.customtypes
# One `CSV.RefPool` slot per column, left uninitialized (`undef`).
refs = Vector{CSV.RefPool}(undef, ncols)
# Allocate the `tapes` that are handed to `CSV.parsetape!` below.
tapes = CSV.allocate(rowsguess, ncols, types, flags)

# here's where we parse the whole file and I currently see 3.43M allocations
# (the call is timed twice on purpose: the first @time includes compilation, so the second
# measurement is the one to read)
@time finalrows, pos = CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)
@time finalrows, pos = CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

# julia> @time finalrows, pos = CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)
# customtypes = Tuple{Tuple{SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}},Int32}}
#   0.091579 seconds (3.43 M allocations: 85.382 MiB, 8.28% gc time)
# (70000, 4545084)

# customtypes is a Tuple type of 2-Tuple types, like Tuple{Tuple{ArrayType, ElementType}...}
# it's used in parsecustom! to generate inline parsevalue! calls for each ArrayType=>ElementType pair (there will be one for each non-standard type a user requests in parsing)

# but code_typed looks great! the call to parsecustom! generated correctly, looks like it inlined and everything is dandy
# (@code_typed prints Julia's typed IR for exactly this call signature; `debuginfo=:source`
# adds the source-location annotations)
@code_typed debuginfo=:source CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

# for reference, we can compare the code_typed output for a hard-coded Time column

#      │ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:591 within `parserow'
# 45 ──│ %127 = π (%49, SentinelArrays.SentinelArray{Dates.Time,1,Dates.Time,Missing,Array{Dates.Time,1}})
# │    │ %128 = invoke CSV.parsevalue!(CSV.Time::Type{Dates.Time}, %48::UInt8, %127::SentinelArrays.SentinelArray{Dates.Time,1,Dates.Time,Missing,Array{Dates.Time,1}}, _5::Array{AbstractArray{T,1} where T,1}, _6::Array{UInt8,1}, %45::Int64, _8::Int64, _17::Parsers.Options{false,false,true,false,Missing,UInt8,Nothing}, %31::Int64, %43::Int64, _14::Array{Type,1}, _15::Array{UInt8,1})::Tuple{Int64,Int16}

# with our custom type generated output

# 52 ──││││ %153 = Base.arrayref(false, tapes, %43)::AbstractArray{T,1} where T
# │    │││└
# │    │││ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:549 within `macro expansion'
# │    │││ %154 = (%153 isa SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}})::Bool
# └────│││        goto #54 if not %154
#      │││ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:550 within `macro expansion'
# 53 ──│││ %156 = π (%153, SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}})
# │    │││ %157 = invoke CSV.parsevalue!(Int32::Type{Int32}, %48::UInt8, %156::SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}}, _5::Array{AbstractArray{T,1} where T,1}, _6::Array{UInt8,1}, %45::Int64, _8::Int64, _17::Parsers.Options{false,false,true,false,Missing,UInt8,Nothing}, %31::Int64, %43::Int64, _14::Array{Type,1}, _15::Array{UInt8,1})::Tuple{Int64,Int16}

# but code_llvm is sad, several jl_box_int64 that seem to account for all the allocations
# (jl_box_int64 is the runtime routine that wraps a raw Int64 in a boxed object, so each
# occurrence in the LLVM IR is a candidate allocation site)
@code_llvm debuginfo=:source CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

using Profile

# Profile the parse twice. Before each run, collect garbage twice and drop any previously
# recorded samples, so the samples printed at the end come from the second run only.
GC.gc()
GC.gc()
Profile.clear()
@profile CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

GC.gc()
GC.gc()
Profile.clear()
@profile CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

# reveals lots of allocations/boxing (C=true includes C frames in the report)
Profile.print(; C = true)

# from what I can tell, there's some issue with inference not being able to treat the generated `CSV.parsevalue!(Int32::Type{Int32}, ...)` call the same
# as the calls for our other hard-coded column types. I don't know if that's a constant-propagation thing, or dataflow analysis, or what. But even if you _don't_ call with
# a custom type, it still leads to the spike in allocations, because the `row`, `pos`, and possibly `code` variables all seem to get boxed through
# the various layers of `parsetape!`, `parserow`, and `parsevalue!`.

# I did try avoiding the generated function and just doing a macro with an unrolled `Base.@nexprs`, but that didn't seem to change anything at all.
	# Reproduction setup for the allocation spike seen in `CSV.parsetape!`.
	# NOTE: the next line is Pkg REPL-mode syntax (press `]` at the julia> prompt); it asks Pkg
	# for the `jq/lazystrings` revision of CSV and is not valid inside a plain .jl script.
	] add CSV#jq/lazystrings
	using CSV
	# Input file: `randoms.csv`, one level above the directory that holds CSV's entry file.
	source = joinpath(dirname(pathof(CSV)), "../randoms.csv")
	# Parsing options. They are passed *positionally* to the internal `CSV.Header` constructor
	# below, so the names here must stay in sync with the argument order of that call.
	header = 1
	normalizenames = false
	datarow = -1
	skipto = nothing
	footerskip = 0
	limit = typemax(Int64)
	transpose = false
	comment = nothing
	use_mmap=true
	ignoreemptylines=false
	threaded=false
	# Only the `id` column is selected.
	select=[:id]
	drop=nothing
	missingstrings=String[]
	missingstring=""
	delim=nothing
	ignorerepeated=false
	quotechar='"'
	openquotechar=nothing
	closequotechar=nothing
	escapechar='"'
	dateformat=nothing
	dateformats=nothing
	decimal=UInt8('.')
	truestrings=["true", "True", "TRUE"]
	falsestrings=["false", "False", "FALSE"]
	type=nothing
	# Requests Int32 for key 1 (presumably the first column -- TODO confirm).
	types=Dict(1=>Int32)
	typemap=Dict{Type,Type}()
	categorical=false
	pool=false
	lazystrings=false
	strict=false
	silencewarnings=false
	debug=false
	parsingdebug=false
	# NOTE(review): the trailing positional `false` has no named variable in this script;
	# check the `CSV.Header` signature on this branch for its meaning.
	h = CSV.Header(source, header, normalizenames, datarow, skipto, footerskip, limit, transpose, comment, use_mmap, ignoreemptylines, threaded, select, drop, missingstrings, missingstring, delim, ignorerepeated, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, categorical, pool, lazystrings, strict, silencewarnings, debug, parsingdebug, false)
	# Unpack the header fields into globals. This rebinds `types`, `pool` and `categorical`,
	# which held the user-facing options above, so re-running the `CSV.Header` line afterwards
	# in the same session would pass the header's values instead of the original options.
	rowsguess, ncols, buf, len, datapos, options, coloptions, positions, types, flags, pool, categorical, customtypes = h.rowsguess, h.cols, h.buf, h.len, h.datapos, h.options, h.coloptions, h.positions, h.types, h.flags, h.pool, h.categorical, h.customtypes
	# One `CSV.RefPool` slot per column, left uninitialized (`undef`).
	refs = Vector{CSV.RefPool}(undef, ncols)
	# Allocate the `tapes` that are handed to `CSV.parsetape!` below.
	tapes = CSV.allocate(rowsguess, ncols, types, flags)

	# here's where we parse the whole file and I currently see 3.43M allocations
	# (the call is timed twice on purpose: the first @time includes compilation, so the second
	# measurement is the one to read)
	@time finalrows, pos = CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)
	@time finalrows, pos = CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

	# julia> @time finalrows, pos = CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)
	# customtypes = Tuple{Tuple{SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}},Int32}}
	# 0.091579 seconds (3.43 M allocations: 85.382 MiB, 8.28% gc time)
	# (70000, 4545084)

	# customtypes is a Tuple type of 2-Tuple types, like Tuple{Tuple{ArrayType, ElementType}...}
	# it's used in parsecustom! to generate inline parsevalue! calls for each ArrayType=>ElementType pair (there will be one for each non-standard type a user requests in parsing)

	# but code_typed looks great! the call to parsecustom! generated correctly, looks like it inlined and everything is dandy
	# (@code_typed prints Julia's typed IR for exactly this call signature; `debuginfo=:source`
	# adds the source-location annotations)
	@code_typed debuginfo=:source CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

	# for reference, we can compare the code_typed output for a hard-coded Time column

	# │ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:591 within `parserow'
	# 45 ──│ %127 = π (%49, SentinelArrays.SentinelArray{Dates.Time,1,Dates.Time,Missing,Array{Dates.Time,1}})
	# │ │ %128 = invoke CSV.parsevalue!(CSV.Time::Type{Dates.Time}, %48::UInt8, %127::SentinelArrays.SentinelArray{Dates.Time,1,Dates.Time,Missing,Array{Dates.Time,1}}, _5::Array{AbstractArray{T,1} where T,1}, _6::Array{UInt8,1}, %45::Int64, _8::Int64, _17::Parsers.Options{false,false,true,false,Missing,UInt8,Nothing}, %31::Int64, %43::Int64, _14::Array{Type,1}, _15::Array{UInt8,1})::Tuple{Int64,Int16}

	# with our custom type generated output

	# 52 ──││││ %153 = Base.arrayref(false, tapes, %43)::AbstractArray{T,1} where T
	# │ │││└
	# │ │││ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:549 within `macro expansion'
	# │ │││ %154 = (%153 isa SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}})::Bool
	# └────│││ goto #54 if not %154
	# │││ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:550 within `macro expansion'
	# 53 ──│││ %156 = π (%153, SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}})
	# │ │││ %157 = invoke CSV.parsevalue!(Int32::Type{Int32}, %48::UInt8, %156::SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}}, _5::Array{AbstractArray{T,1} where T,1}, _6::Array{UInt8,1}, %45::Int64, _8::Int64, _17::Parsers.Options{false,false,true,false,Missing,UInt8,Nothing}, %31::Int64, %43::Int64, _14::Array{Type,1}, _15::Array{UInt8,1})::Tuple{Int64,Int16}

	# but code_llvm is sad, several jl_box_int64 that seem to account for all the allocations
	# (jl_box_int64 is the runtime routine that wraps a raw Int64 in a boxed object, so each
	# occurrence in the LLVM IR is a candidate allocation site)
	@code_llvm debuginfo=:source CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

	using Profile

	# Profile the parse twice. Before each run, collect garbage twice and drop any previously
	# recorded samples, so the samples printed at the end come from the second run only.
	GC.gc()
	GC.gc()
	Profile.clear()
	@profile CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

	GC.gc()
	GC.gc()
	Profile.clear()
	@profile CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes)

	# reveals lots of allocations/boxing (C=true includes C frames in the report)
	Profile.print(; C = true)

	# from what I can tell, there's some issue with inference not being able to treat the generated `CSV.parsevalue!(Int32::Type{Int32}, ...)` call the same
	# as the calls for our other hard-coded column types. I don't know if that's a constant-propagation thing, or dataflow analysis, or what. But even if you _don't_ call with
	# a custom type, it still leads to the spike in allocations, because the `row`, `pos`, and possibly `code` variables all seem to get boxed through
	# the various layers of `parsetape!`, `parserow`, and `parsevalue!`.

	# I did try avoiding the generated function and just doing a macro with an unrolled `Base.@nexprs`, but that didn't seem to change anything at all.