Created
July 18, 2022 19:12
-
-
Save Andersgee/570d39d53fcda2a450a428256f4c2d23 to your computer and use it in GitHub Desktop.
generating some stats with dataframes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#=
Trying out DataFrames:
1. Read an .ldjson (line-delimited JSON) file.
2. Store the records in a DataFrame.
3. Compute some reasonably simple stats from it.
4. Write the stats to a new .ldjson file.
=#
using JSON3; | |
using DataFrames | |
"""
    read_ldjson(filename)

Read the entire file at `filename` into a `String` and parse it with
`JSON3.read` in `jsonlines` mode. Returns whatever `JSON3.read` produces.
"""
function read_ldjson(filename)
    text = read(filename, String)
    return JSON3.read(text; jsonlines=true)
end
"""
    dataframeFromLdjson(filename)

Load the line-delimited JSON file at `filename` via `read_ldjson` and build a
`DataFrame` from it.

The column set is taken from the keys of the *first* record; every record is
then indexed with each of those keys, so a later record lacking one of them
raises an error from `getindex`.

Returns an empty `DataFrame` (no columns) when the file contains no records.
Progress messages are printed to standard output.
"""
function dataframeFromLdjson(filename)
    println("reading ldjson...")
    jsonarray = read_ldjson(filename)
    println("...done")

    # Without this guard, taking the first record of an empty array throws a
    # BoundsError before any DataFrame can be built.
    if isempty(jsonarray)
        println("putting in dataframe...")
        println("...done")
        return DataFrame()
    end

    ks = collect(keys(first(jsonarray)))
    println("putting in dataframe...")
    # One column per key: broadcast `getindex` over all records for that key.
    df = DataFrame((Symbol(k) => getindex.(jsonarray, k) for k in ks)...)
    println("...done")
    return df
end
# Column names found in the dataset records:
#["_id", "roadId", "imageId", "datasetId", "blggnngsd", "mätdatm", "brghtsk", "hastght", "dou2017", "Ådt_frd", "Ådt_tng", "Ådt_mtr", "vägbrdd", "vägnmmr", "vägktgr", "vägtyp", "längd", "blggnngst", "tackning", "spårdjp", "iri", "IRI_maint", "SP_maint", "län_nr", "kmmn_nr", "Ålder", "FrvntdL", "ÅtrstnL", "Tllstnl", "IndxKls", "IKls_1", "IKls_2", "IKls_3", "__v"] | |
"""
    getSubset(df, area_nr, symbol, symbolValue)

Return the rows of `df` whose column `symbol` equals `symbolValue` and whose
`:län_nr` or `:kmmn_nr` column equals `area_nr`.

Comparisons use `isequal`, which always returns a `Bool`. With `==`, a
`missing` cell yields `missing`, and using that in `&&`/`||` throws a
`TypeError` before `skipmissing=true` can take effect. Rows with `missing` in
a compared column are therefore simply not selected (unless the value being
searched for is itself `missing`).
"""
function getSubset(df, area_nr, symbol, symbolValue)
    function rowmatches(län_nr, kmmn_nr, x)
        inarea = isequal(län_nr, area_nr) || isequal(kmmn_nr, area_nr)
        return isequal(x, symbolValue) && inarea
    end
    return subset(df, [:län_nr, :kmmn_nr, symbol] => ByRow(rowmatches); skipmissing=true)
end
"""
    getAreaSubset(df, area_nr)

Return the rows of `df` whose `:län_nr` or `:kmmn_nr` column equals `area_nr`.

Comparisons use `isequal`, which always returns a `Bool`. With `==`, a
`missing` cell yields `missing`, and using that as an operand of `||` throws a
`TypeError` before `skipmissing=true` can take effect. Rows with `missing` in
both columns are simply not selected.
"""
function getAreaSubset(df, area_nr)
    inarea(län_nr, kmmn_nr) = isequal(län_nr, area_nr) || isequal(kmmn_nr, area_nr)
    return subset(df, [:län_nr, :kmmn_nr] => ByRow(inarea); skipmissing=true)
end
"""
    LengthStats(label, len, vbad, bad, fair, good, vgood)

Immutable record holding a text label plus six integer totals.

In this file it is filled by `sumToLengthStats`, which stores the summed
`:längd` column of an overall data frame in `len` and of five further data
frames in `vbad` through `vgood`.
"""
struct LengthStats
    label::String  # name of the group these totals describe
    len::Int       # total for the overall data frame
    vbad::Int      # total for the first of the five per-class data frames
    bad::Int       # total for the second
    fair::Int      # total for the third
    good::Int      # total for the fourth
    vgood::Int     # total for the fifth
end
"""
    dictFromStruct(s)

Build a `Dict` mapping every property name of `s` (as a `Symbol`) to the
value of the field with that name.
"""
function dictFromStruct(s)
    return Dict(Pair(name, getfield(s, name)) for name in propertynames(s))
end
"""
    sumLängdCol(df)

Return the sum of the `:längd` column of `df`, or `0` when the column is empty.

The explicit empty check matters because `sum` over an empty vector whose
element type has no `zero` (for example `Any`) throws instead of returning 0.
"""
function sumLängdCol(df)
    lengths = df[!, :längd]
    return isempty(lengths) ? 0 : sum(lengths)
end
"""
    getLänKmmnNumbers(df)

Return the distinct values found in the `:län_nr` and `:kmmn_nr` columns of
`df` as one vector: `:län_nr` values first, then `:kmmn_nr` values, each in
order of first appearance.

A value present in both columns is listed only once. Concatenating the two
per-column unique lists would list it twice, and the caller would then process
that number twice.
"""
function getLänKmmnNumbers(df)
    return union(df[!, :län_nr], df[!, :kmmn_nr])
end
"""
    sumToLengthStats(label, all, vbad, bad, fair, good, vgood)

Apply `sumLängdCol` to each of the six data frames, in argument order, and
pack the totals together with `label` into a `LengthStats`.
"""
function sumToLengthStats(label, all, vbad, bad, fair,good,vgood)
    totals = map(sumLängdCol, (all, vbad, bad, fair, good, vgood))
    return LengthStats(label, totals...)
end
"""
    getSubsets(area_nr, symbol, symbolValue, allArea, IndxKls1, IndxKls2, IndxKls3, IndxKls4, IndxKls5)

Apply `getSubset` with the same `area_nr`, `symbol` and `symbolValue` to each
of the six data frames and return the six results as a tuple, in argument
order.
"""
function getSubsets(area_nr, symbol, symbolValue, allArea, IndxKls1, IndxKls2,IndxKls3,IndxKls4,IndxKls5)
    narrow(frame) = getSubset(frame, area_nr, symbol, symbolValue)
    frames = (allArea, IndxKls1, IndxKls2, IndxKls3, IndxKls4, IndxKls5)
    return map(narrow, frames)
end
"""
    getStats(df, area_nr)

Collect length statistics for the area `area_nr`.

Six base data frames are built from `df`: all rows of the area, plus the rows
of the area with `:IndxKls` equal to 1 through 5. Their `:längd` totals form
the `"all"` entry. The same six frames are then narrowed to each value 1-5 of
`:brghtsk` and of `:vägtyp`, giving five entries per column labelled
`"brghtsk1"`…`"brghtsk5"` and `"vägtyp1"`…`"vägtyp5"`.

Returns a `Dict` with the keys `"area_nr"`, `"all"`, `"brghtsk"` and
`"vägtyp"`. Each stats entry is the `dictFromStruct` form of a `LengthStats`.
"""
function getStats(df, area_nr)
    # Whole area first, then one frame per IndxKls value 1..5.
    areaDfs = (
        getAreaSubset(df, area_nr),
        ntuple(k -> getSubset(df, area_nr, :IndxKls, k), 5)...,
    )

    # Sum the six frames and convert the resulting struct to a Dict.
    statsfor(label, frames) = dictFromStruct(sumToLengthStats(label, frames...))

    # Five entries for `column`, one per value 1..5, labelled "<column><value>".
    function breakdown(column)
        return [
            statsfor("$(column)$(value)", getSubsets(area_nr, column, value, areaDfs...))
            for value in 1:5
        ]
    end

    return Dict(
        "area_nr" => area_nr,
        "all" => statsfor("all", areaDfs),
        "brghtsk" => breakdown(:brghtsk),
        "vägtyp" => breakdown(:vägtyp),
    )
end
"""
    main(filename; outfile="stats.ldjson")

Load the line-delimited JSON dataset at `filename`, compute `getStats` for
every number returned by `getLänKmmnNumbers`, and write each result as one
JSON line to `outfile`.

# Keywords
- `outfile`: path of the output file. Defaults to `"stats.ldjson"` in the
  current working directory.

The output file is opened in append mode, so an existing file is extended
rather than overwritten; remove it first if a fresh file is wanted. A progress
line is printed to standard output for each area. Returns `nothing`.
"""
function main(filename; outfile="stats.ldjson")
    df = dataframeFromLdjson(filename)
    areaNumbers = getLänKmmnNumbers(df)
    # The `do` form closes the file even if computing or writing a record throws.
    open(outfile, "a") do io
        for area_nr in areaNumbers
            statsDict = getStats(df, area_nr)
            println("writing area_nr: $(area_nr)")
            JSON3.write(io, statsDict)
            write(io, "\n")
        end
    end
    return nothing
end
# Script entry point: the dataset path is resolved against the current working
# directory (`abspath` does this for relative paths), then the pipeline runs.
filename = abspath("../data/dataset.ldjson")
main(filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment