# karajan9/benchmarksgameplots.jl

## benchmarksgameplots.jl
# %%
using HTTP
using Gumbo
using DataFrames
using StatsBase


# %%
# Root URL of the per-benchmark performance pages; a page address is formed as
# `url * name * ".html"`.
url = "https://benchmarksgame-team.pages.debian.net" * "/benchmarksgame/performance/"
# Names of the benchmark pages that get scraped.
benchmarks = [
    "revcomp", "regexredux", "fannkuchredux", "mandelbrot", "pidigits",
    "nbody", "spectralnorm", "fasta", "knucleotide", "binarytrees",
]


# %%
# Scrape every benchmark page and collect one tuple per program that reports a
# numeric factor into `datatmp`.
datatmp = []
for bm in benchmarks
    @show bm
    response = HTTP.get(url * bm * ".html")
    page = parsehtml(String(response.body))
    pagebody = page.root[2]

    # the data table sits at a fixed position in the page; its first child is
    # skipped (presumably the header row -- confirm against the page markup)
    rows = children(pagebody[2][2][2][1])[2:end]
    for row in rows
        # rows without a numeric factor (failed script, bad output, ...) are
        # skipped; try/catch rather than tryparse because row[1][1] might not
        # even exist
        factor = try
            parse(Float64, string(row[1][1]))
        catch
            continue
        end

        # the best entry carries its language name in a <strong> element,
        # ordinary entries hold it as the first word of their single child
        namenodes = children(row[2][1][1])
        if length(namenodes) != 1 || namenodes[1] isa HTMLElement{:strong}
            best = true
            strongnodes = filter(n -> n isa HTMLElement{:strong}, namenodes)
            lang = string(strongnodes[1][1])
        else
            best = false
            lang = split(string(namenodes[1]))[1]
        end

        gz = parse(Int, string(row[5][1]))

        # plain tuples are pushed; column names are attached once the
        # DataFrame is built. The log10 values are stored as well because no
        # `log` option was found in DataVoyager.
        entry = (lang, factor, best, gz, log10(factor), log10(gz), bm, factor * gz)
        push!(datatmp, entry)
    end
end

# %%
# Build the table from the scraped tuples and attach the column names in the
# same order in which the tuple fields were pushed.
df = DataFrame(datatmp)
# `names!` was deprecated and later removed from DataFrames.jl; `rename!` with
# a vector of symbols is its replacement and renames all columns positionally.
rename!(df, [:lang, :factor, :best, :gz, :logfactor, :loggz, :benchmark,
             :factor_times_gz])

# The bold (best-entry) language names don't match the normal ones, and some
# names contain a bunch of non-ASCII chars which break stuff. Fixing everything
# afterwards by prefix was the simpler solution. The many Ruby versions are
# grouped together under a single name.
for (prefix, name) in [
    "Substrate" => "Substrate VM",
    "Python" => "Python 3",
    "Node" => "Node js",
    "Free" => "Pascal",
    "Pharo" => "Smalltalk",
    "VW" => "Smalltalk",
    "Matz's" => "Ruby",
    "Truffle" => "Ruby",
    "JRuby" => "Ruby",
]
    # every row whose language starts with `prefix` gets the canonical name
    df[startswith.(df.lang, prefix), :lang] .= name
end


# %%
# Exploration!
using DataVoyager
println(df)
Voyager()(df[df.best .== true, :])  # only the rows flagged as best
Voyager()(df)                       # every scraped row


# %%
using Plots

# start from an empty canvas; one scatter series per language is added below
plot(
    xlabel = "(gzipped) code size",
    ylabel = "factor vs. best (geom. mean)",
    legend = false,
)

best = df[df.best .== true, :]  # rows flagged as the best entry only
# one point per language: mean gzipped size against the geometric mean of the
# factor, both taken over the rows of that language in `best`
for l in unique(df.lang)
    sel = best[best.lang .== l, :]
    factor = geomean(sel.factor)
    gz = mean(sel.gz)
    @show l, factor, gz
    annotation = Plots.text(l, :bottom, 10)
    scatter!([gz], [log10(factor)]; label = l, ms = 5.0,
        series_annotations = [annotation])
end

plot!(xlims = (0, 1600), ylims = (0, log10(100)))
# manual ticks placed at log10 of the labelled values, otherwise it's hard to read
# NOTE(review): the points are already log10-transformed above, yet
# `yscale = :log10` is requested as well -- confirm the double log is intended.
ys = [1, 3, 5, 10, 30, 50, 100]
plot!(yticks = (log10.(ys), ys), yscale = :log10)
plot!(dpi = 300)
# savefig(plotsdir("size vs factor all langs.png"))
	# %%
	using HTTP
	using Gumbo
	using DataFrames
	using StatsBase


	# %%
	# NOTE(review): from here on the file repeats the script above -- confirm
	# whether this second copy is needed at all.
	# Root URL of the per-benchmark performance pages; a page address is formed
	# as `url * name * ".html"`.
	url = "https://benchmarksgame-team.pages.debian.net" * "/benchmarksgame/performance/"
	# Names of the benchmark pages that get scraped.
	benchmarks = [
	    "revcomp", "regexredux", "fannkuchredux", "mandelbrot", "pidigits",
	    "nbody", "spectralnorm", "fasta", "knucleotide", "binarytrees",
	]


	# %%
	# Scrape every benchmark page and collect one tuple per program that reports
	# a numeric factor into `datatmp`.
	# (The pipe operators in this section had been mangled to `\|>`, which is
	# not valid Julia; they are restored to `|>` here.)
	datatmp = []
	for bm in benchmarks
	    @show bm
	    res = HTTP.get(url * bm * ".html")
	    html = parsehtml(res.body |> String)
	    body = html.root[2]

	    # get the table with the data and iterate through the rows
	    table = children(body[2][2][2][1])[2:end]
	    for row in table
	        # this checks whether we have a factor or the script failed/bad
	        # output etc.; no tryparse because row[1][1] might not even exist
	        factor = try
	            parse(Float64, string(row[1][1]))
	        catch
	            continue
	        end

	        # different handling for the best entry (bold) and normal ones
	        langtmp = children(row[2][1][1])
	        if length(langtmp) == 1 && typeof(langtmp[1]) != HTMLElement{:strong}
	            best = false
	            lang = split(string(langtmp[1]))[1]
	        else
	            best = true
	            lang = filter(n -> typeof(n) === HTMLElement{:strong}, langtmp) |>
	                x -> x[1][1] |> string
	        end

	        gz = parse(Int, string(row[5][1]))

	        # just push everything in there, DataFrames will sort it out
	        # I haven't found the `log` option in DataVoyager, so just
	        # calculate it
	        push!(datatmp, (lang, factor, best, gz, log10(factor), log10(gz),
	                        bm, factor * gz))
	    end
	end

	# %%
	# Build the table from the scraped tuples and attach the column names in the
	# same order in which the tuple fields were pushed.
	df = DataFrame(datatmp)
	# `names!` was deprecated and later removed from DataFrames.jl; `rename!`
	# with a vector of symbols is its replacement and renames all columns
	# positionally.
	rename!(df, [:lang, :factor, :best, :gz, :logfactor, :loggz, :benchmark,
	             :factor_times_gz])

	# The bold (best-entry) language names don't match the normal ones, and some
	# names contain a bunch of non-ASCII chars which break stuff. Fixing
	# everything afterwards by prefix was the simpler solution. The many Ruby
	# versions are grouped together under a single name.
	for (prefix, name) in [
	    "Substrate" => "Substrate VM",
	    "Python" => "Python 3",
	    "Node" => "Node js",
	    "Free" => "Pascal",
	    "Pharo" => "Smalltalk",
	    "VW" => "Smalltalk",
	    "Matz's" => "Ruby",
	    "Truffle" => "Ruby",
	    "JRuby" => "Ruby",
	]
	    # every row whose language starts with `prefix` gets the canonical name
	    df[startswith.(df.lang, prefix), :lang] .= name
	end


	# %%
	# Exploration!
	using DataVoyager
	# Print the whole table, then open one Voyager window on the best-entry rows
	# and one on all rows. (`\|>` is not valid Julia; the pipes are restored.)
	df |> println
	df[df.best .== true, :] |> Voyager()
	df |> Voyager()


	# %%
	using Plots

	# start from an empty canvas; one scatter series per language is added below
	plot(xlabel = "(gzipped) code size", ylabel = "factor vs. best (geom. mean)",
	    legend = false)

	best = df[df.best .== true, :]  # only the best entry for each language
	# iterate through each language and calculate the mean over all benchmarks
	# (the loop header used the invalid `\|>`; the pipe is restored here)
	for l in df.lang |> unique
	    sel = best[best.lang .== l, :]
	    factor = geomean(sel.factor)
	    gz = mean(sel.gz)
	    @show l, factor, gz
	    scatter!([gz], [log10(factor)], label = l, ms = 5.0,
	        series_annotations = [Plots.text(l, :bottom, 10)])
	end

	plot!(xlims = (0, 1600), ylims = (0, log10(100)))
	# manual ticks, otherwise it's hard to read
	# NOTE(review): the points are already log10-transformed above, yet
	# `yscale = :log10` is requested as well -- confirm the double log is
	# intended.
	ys = [1, 3, 5, 10, 30, 50, 100]
	plot!(yticks = (log10.(ys), ys), yscale = :log10)
	plot!(dpi = 300)
	# savefig(plotsdir("size vs factor all langs.png"))