Skip to content

Instantly share code, notes, and snippets.

@Andersgee
Created July 18, 2022 19:12
Show Gist options
  • Save Andersgee/570d39d53fcda2a450a428256f4c2d23 to your computer and use it in GitHub Desktop.
Save Andersgee/570d39d53fcda2a450a428256f4c2d23 to your computer and use it in GitHub Desktop.
generating some stats with dataframes
#=
Trying out DataFrames.jl:
1. Read an .ldjson (line-delimited JSON) file.
2. Store the records in a DataFrame.
3. Compute some reasonably simple stats from it.
4. Write the stats to a new .ldjson file.
=#
using JSON3;
using DataFrames
"""
    read_ldjson(filename)

Read the whole file `filename` into a `String` and parse it with `JSON3.read` in
`jsonlines` mode. Returns whatever `JSON3.read` returns for that input.
"""
function read_ldjson(filename)
    text = read(filename, String)
    return JSON3.read(text; jsonlines=true)
end
"""
    dataframeFromLdjson(filename)

Parse the line-delimited JSON file `filename` with `read_ldjson` and return a
`DataFrame` with one column per key of the first record.

Column names come from the first record only; every record is then indexed with each
of those keys. When no records were parsed an empty `DataFrame()` is returned.
Progress messages are printed to stdout.
"""
function dataframeFromLdjson(filename)
    println("reading ldjson...")
    jsonarray = read_ldjson(filename)
    println("...done")
    println("putting in dataframe...")
    # Without any record there is no first row to take the column names from;
    # indexing `jsonarray[1]` would throw a BoundsError.
    if isempty(jsonarray)
        println("...done")
        return DataFrame()
    end
    ks = collect(keys(first(jsonarray)))
    df = DataFrame((Symbol(k) => getindex.(jsonarray, k) for k in ks)...)
    println("...done")
    return df
end
#column names
#["_id", "roadId", "imageId", "datasetId", "blggnngsd", "mätdatm", "brghtsk", "hastght", "dou2017", "Ådt_frd", "Ådt_tng", "Ådt_mtr", "vägbrdd", "vägnmmr", "vägktgr", "vägtyp", "längd", "blggnngst", "tackning", "spårdjp", "iri", "IRI_maint", "SP_maint", "län_nr", "kmmn_nr", "Ålder", "FrvntdL", "ÅtrstnL", "Tllstnl", "IndxKls", "IKls_1", "IKls_2", "IKls_3", "__v"]
"""
    getSubset(df, area_nr, symbol, symbolValue)

Return the rows of `df` where column `symbol` equals `symbolValue` and where either
`:län_nr` or `:kmmn_nr` equals `area_nr`.

Comparisons involving `missing` count as "no match", so rows with missing cells are
dropped instead of raising an error.
"""
function getSubset(df, area_nr, symbol, symbolValue)
    # `a == b` is `missing` when either side is missing, and a `missing` on the left
    # of `&&` / `||` throws a TypeError, so every comparison is coalesced to a Bool.
    same(a, b) = coalesce(a == b, false)
    keep(län_nr, kmmn_nr, x) =
        same(x, symbolValue) && (same(län_nr, area_nr) || same(kmmn_nr, area_nr))
    return subset(df, [:län_nr, :kmmn_nr, symbol] => ByRow(keep); skipmissing=true)
end
"""
    getAreaSubset(df, area_nr)

Return the rows of `df` where either `:län_nr` or `:kmmn_nr` equals `area_nr`.

Comparisons involving `missing` count as "no match", so rows with missing cells are
dropped instead of raising an error.
"""
function getAreaSubset(df, area_nr)
    # `missing || x` throws a TypeError, so each comparison is coalesced to a Bool
    # before the results are combined.
    same(a, b) = coalesce(a == b, false)
    inarea(län_nr, kmmn_nr) = same(län_nr, area_nr) || same(kmmn_nr, area_nr)
    return subset(df, [:län_nr, :kmmn_nr] => ByRow(inarea); skipmissing=true)
end
"""
    LengthStats(label, len, vbad, bad, fair, good, vgood)

A labelled record of seven values: a `String` label followed by six integer totals.
`sumToLengthStats` fills the totals with sums of the `:längd` column, one per
data frame it is given, in the field order listed here.
"""
struct LengthStats
    label::String  # name of the group these totals describe
    len::Int       # total over the whole group
    vbad::Int      # totals for the five sub-groups, in constructor order
    bad::Int
    fair::Int
    good::Int
    vgood::Int
end
"""
    dictFromStruct(s)

Build a `Dict` that maps every name in `propertynames(s)` to the value of the field of
`s` with that name.
"""
function dictFromStruct(s)
    names = propertynames(s)
    return Dict(name => getfield(s, name) for name in names)
end
"""
    sumLängdCol(df)

Return the sum of the `:längd` column of `df`, or `0` when that column is empty.
"""
function sumLängdCol(df)
    col = df[!, :längd]
    # `sum` over an empty vector throws when the element type has no `zero`
    # (for example `Any`), so an empty column is reported as 0 explicitly.
    return isempty(col) ? 0 : sum(col)
end
"""
    getLänKmmnNumbers(df)

Return one vector holding the distinct values of the `:län_nr` column followed by the
distinct values of the `:kmmn_nr` column of `df`. The two groups are deduplicated
independently, not against each other.
"""
function getLänKmmnNumbers(df)
    länNumbers = unique(df[!, :län_nr])
    kmmnNumbers = unique(df[!, :kmmn_nr])
    return vcat(länNumbers, kmmnNumbers)
end
"""
    sumToLengthStats(label, all, vbad, bad, fair, good, vgood)

Apply `sumLängdCol` to each of the six data frames, in argument order, and pack the
results together with `label` into a `LengthStats`.
"""
function sumToLengthStats(label, all, vbad, bad, fair, good, vgood)
    totals = map(sumLängdCol, (all, vbad, bad, fair, good, vgood))
    return LengthStats(label, totals...)
end
"""
    getSubsets(area_nr, symbol, symbolValue, allArea, IndxKls1, ..., IndxKls5)

Apply `getSubset(frame, area_nr, symbol, symbolValue)` to each of the six data frames
and return the six results as a tuple, in the same order as the arguments.
"""
function getSubsets(
    area_nr, symbol, symbolValue, allArea, IndxKls1, IndxKls2, IndxKls3, IndxKls4, IndxKls5
)
    frames = (allArea, IndxKls1, IndxKls2, IndxKls3, IndxKls4, IndxKls5)
    return map(frame -> getSubset(frame, area_nr, symbol, symbolValue), frames)
end
"""
    getStats(df, area_nr)

Collect the length statistics for one area number into a `Dict` with the keys
`"area_nr"`, `"all"`, `"brghtsk"` and `"vägtyp"`.

`"all"` holds the totals for the area as a whole plus one total per `:IndxKls` value
1-5. `"brghtsk"` and `"vägtyp"` each hold a five-element vector with the same totals
restricted to rows where that column equals 1, 2, 3, 4 and 5 respectively. Every
`LengthStats` is converted with `dictFromStruct`.
"""
function getStats(df, area_nr)
    # The area as a whole, followed by the area restricted to each IndxKls value.
    allArea = getAreaSubset(df, area_nr)
    classDfs = map(k -> getSubset(df, area_nr, :IndxKls, k), (1, 2, 3, 4, 5))
    areaDfs = (allArea, classDfs...)

    # Totals for each value 1-5 of `column`, labelled "<column><value>".
    function breakdown(column)
        return [
            dictFromStruct(
                sumToLengthStats(
                    string(column, value),
                    getSubsets(area_nr, column, value, areaDfs...)...,
                ),
            ) for value in 1:5
        ]
    end

    return Dict(
        "area_nr" => area_nr,
        "all" => dictFromStruct(sumToLengthStats("all", areaDfs...)),
        "brghtsk" => breakdown(:brghtsk),
        "vägtyp" => breakdown(:vägtyp),
    )
end
"""
    main(filename)

Load `filename` into a data frame, compute `getStats` for every number returned by
`getLänKmmnNumbers`, and write each result as one JSON line to `stats.ldjson`.

The output file is opened in append mode, so existing content is kept and new lines
are added after it. A progress line is printed to stdout for every area number.
"""
function main(filename)
    df = dataframeFromLdjson(filename)
    # Computed before the file is opened, so a failure here does not create the file.
    areaNumbers = getLänKmmnNumbers(df)
    open("stats.ldjson", "a") do io
        for area_nr in areaNumbers
            statsDict = getStats(df, area_nr)
            println("writing area_nr: ", area_nr)
            JSON3.write(io, statsDict)
            write(io, "\n")
        end
    end
end
# Script entry point: the dataset path is resolved against the current working
# directory (`abspath` does this for relative paths), then the stats are generated.
filename = abspath("../data/dataset.ldjson")
main(filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment