Skip to content

Instantly share code, notes, and snippets.

@hannes
Created February 26, 2018 11:59
Show Gist options
  • Save hannes/fbf26247d5879cffc9de2b31460c3d31 to your computer and use it in GitHub Desktop.
Save hannes/fbf26247d5879cffc9de2b31460c3d31 to your computer and use it in GitHub Desktop.
# Pkg.add("CSV")
Pkg.update()
using DataFrames, CSV
wr = true
ne = false
region = CSV.read("region.tbl", delim='|', header=["r_regionkey", "r_name", "r_comment"], weakrefstrings=wr, nullable=ne)
nation = CSV.read("nation.tbl", delim='|', header=["n_nationkey", "n_name", "n_regionkey", "n_comment"], weakrefstrings=wr, nullable=ne)
supplier = CSV.read("supplier.tbl", delim='|', header=["s_suppkey","s_name","s_address","s_nationkey","s_phone","s_acctbal","s_comment"], weakrefstrings=wr, nullable=ne)
customer = CSV.read("customer.tbl", delim='|', header=["c_custkey","c_name","c_address","c_nationkey","c_phone","c_acctbal","c_mktsegment","c_comment"], weakrefstrings=wr, nullable=ne)
part = CSV.read("part.tbl", delim='|', header=["p_partkey","p_name","p_mfgr","p_brand","p_type","p_size","p_container","p_retailprice","p_comment"], weakrefstrings=wr, nullable=ne)
partsupp = CSV.read("partsupp.tbl", delim='|', header=["ps_partkey","ps_suppkey","ps_availqty","ps_supplycost","ps_comment"], weakrefstrings=wr, nullable=ne)
orders = CSV.read("orders.tbl", delim='|', header=["o_orderkey","o_custkey","o_orderstatus","o_totalprice","o_orderdate","o_orderpriority","o_clerk","o_shippriority","o_comment"], weakrefstrings=wr, nullable=ne)
lineitem = CSV.read("lineitem.tbl", delim='|', header=["l_orderkey","l_partkey","l_suppkey","l_linenumber","l_quantity","l_extendedprice","l_discount","l_tax","l_returnflag","l_linestatus","l_shipdate","l_commitdate","l_receiptdate","l_shipinstruct","l_shipmode","l_comment"], weakrefstrings=wr, nullable=ne)
function q1()
gc_enable(false)
res = sort!(by(lineitem[lineitem[:l_shipdate] .<= Date("1998-09-01"), [:l_returnflag, :l_linestatus, :l_quantity, :l_extendedprice, :l_discount, :l_tax]], [:l_returnflag, :l_linestatus], df -> DataFrame(
sum_qty = sum(df[:l_quantity]),
sum_base_price = sum(df[:l_extendedprice]),
sum_disc_price = sum(df[:l_extendedprice] .* (1 - df[:l_discount])),
sum_charge = sum(df[:l_extendedprice] .* (1 - df[:l_discount]) .* (1 + df[:l_tax])),
avg_qty = mean(df[:l_quantity]),
avg_price = mean(df[:l_extendedprice]),
avg_disc = mean(df[:l_discount]),
count_order = nrow(df)
)), cols=[:l_returnflag, :l_linestatus])
gc_enable(true)
res
end
function q2()
gc_enable(false)
ps = partsupp[[:ps_partkey, :ps_suppkey, :ps_supplycost]]
p = part[part[:p_size] .== 15 .& map(x->ismatch(r".*BRASS$", x), part[:p_type]), [:p_partkey, :p_mfgr]]
psp = join(ps, p, on = :ps_partkey => :p_partkey)
sp = supplier[[:s_suppkey, :s_nationkey, :s_acctbal, :s_name, :s_address, :s_phone, :s_comment]]
psps = join(psp, sp, on = :ps_suppkey => :s_suppkey)
nr = join(nation, region, on = :n_regionkey => :r_regionkey)[[:n_nationkey, :n_name]]
pspsnr = join(psps, nr, on = :s_nationkey => :n_nationkey)[[:ps_partkey, :ps_supplycost, :p_mfgr, :n_name, :s_acctbal, :s_name, :s_address, :s_phone, :s_comment]]
aggr = by(pspsnr, :ps_partkey, df -> DataFrame(ps_supplycost = minimum(df[:ps_supplycost])))
sj = join(pspsnr, aggr, on = [:ps_partkey, :ps_supplycost])
res = head(sort(sj[[:s_acctbal, :s_name, :n_name, :ps_partkey, :p_mfgr, :s_address, :s_phone, :s_comment]],
cols=[order(:s_acctbal, rev=true), :n_name, :s_name, :ps_partkey]), 100)
gc_enable(true)
res
end
function q3()
gc_enable(false)
o = orders[orders[:o_orderdate] .<= Date("1995-03-15"), [:o_orderkey, :o_custkey, :o_orderdate, :o_shippriority]]
c = customer[customer[:c_mktsegment] .== "BUILDING", [:c_custkey, :c_mktsegment]]
oc = join(o, c, on = :o_custkey => :c_custkey)[[:o_orderkey, :o_orderdate, :o_shippriority]]
l = lineitem[lineitem[:l_shipdate] .> Date("1995-03-15"), [:l_orderkey, :l_extendedprice, :l_discount]]
loc = join(oc, l, on = :o_orderkey => :l_orderkey)
aggr = by(loc, [:o_orderkey, :o_orderdate, :o_shippriority], df -> DataFrame(
revenue = sum(df[:l_extendedprice] .* (1 - df[:l_discount]))
))
res = head(sort(aggr[[:o_orderkey, :revenue, :o_orderdate, :o_shippriority]],
cols=[order(:revenue, rev=true), :o_orderkey]), 10)
gc_enable(true)
res
end
function q4()
gc_enable(false)
l = lineitem[lineitem[:l_commitdate] .< lineitem[:l_receiptdate], [:l_orderkey]]
o = orders[(orders[:o_orderdate] .>= Date("1993-07-01")) .&
(orders[:o_orderdate] .< Date("1993-10-01")) , [:o_orderkey, :o_orderpriority]]
lo = unique(join(o, l, on = :o_orderkey => :l_orderkey))[[:o_orderpriority]]
res = sort(by(lo, :o_orderpriority, df -> DataFrame(order_count=nrow(df))), cols=[:o_orderpriority])
gc_enable(true)
res
end
function q5()
gc_enable(false)
nr = join(nation, region[region[:r_name] .== "ASIA", :],
on = :n_regionkey => :r_regionkey)[[:n_nationkey, :n_name]]
snr = join(supplier[[:s_suppkey, :s_nationkey]], nr, on = :s_nationkey => :n_nationkey)[[:s_suppkey, :s_nationkey, :n_name]]
lsnr = join(snr, lineitem[[:l_suppkey, :l_orderkey, :l_extendedprice, :l_discount]], on = :s_suppkey => :l_suppkey)[[:l_orderkey, :l_extendedprice, :l_discount, :n_name, :s_nationkey]]
o = orders[(orders[:o_orderdate] .>= Date("1994-01-01")) .&
(orders[:o_orderdate] .< Date("1995-01-01")), [:o_orderkey, :o_custkey]]
oc = join(o, customer[[:c_custkey, "c_nationkey"]], on = :o_custkey => :c_custkey)[[:o_orderkey, :c_nationkey]]
lsnroc = join(oc, lsnr, on = [:o_orderkey => :l_orderkey, :c_nationkey => :s_nationkey])[[:l_extendedprice, :l_discount, :n_name]]
res = sort(by(lsnroc, :n_name, df -> DataFrame(
revenue = sum(df[:l_extendedprice] .* (1 - df[:l_discount]))
)), cols=[order(:revenue, rev=true)])
gc_enable(true)
res
end
function q6()
gc_enable(false)
l = lineitem[(lineitem[:l_shipdate] .>= Date("1994-01-01")) .&
(lineitem[:l_shipdate] .< Date("1995-01-01")) .&
(lineitem[:l_discount] .>= 0.05) .&
(lineitem[:l_discount] .<= 0.07) .&
(lineitem[:l_quantity] .< 24), [:l_extendedprice, :l_discount]]
res = sum(l[:l_extendedprice] .* l[:l_discount])
gc_enable(true)
res
end
function q7()
gc_enable(false)
n = nation[(nation[:n_name] .== "GERMANY") .| (nation[:n_name] .== "FRANCE"), [:n_nationkey, :n_name]]
sn = rename(join(supplier[[:s_nationkey, :s_suppkey]], n, on = :s_nationkey => :n_nationkey), :n_name => :n1_name)[[:s_suppkey, :n1_name]]
cn = rename(join(customer[[:c_custkey, :c_nationkey]], n,
on = :c_nationkey => :n_nationkey), :n_name => :n2_name)
cno = join(orders, cn, on = :o_custkey => :c_custkey)[[:o_orderkey, :n2_name]]
l = lineitem[(lineitem[:l_shipdate] .>= Date("1995-01-01")) .&
(lineitem[:l_shipdate] .<= Date("1995-12-31")), [:l_orderkey, :l_suppkey, :l_shipdate, :l_extendedprice, :l_discount]]
cnol = join(cno, l, on=:o_orderkey => :l_orderkey)[[:l_suppkey, :l_shipdate, :l_extendedprice, :l_discount, :n2_name]]
cnolsn = join(cnol,  sn, on=:l_suppkey => :s_suppkey)
cnolsnf = cnolsn[((cnolsn[:n1_name] .== "FRANCE") .&
(cnolsn[:n2_name] .== "GERMANY")) .|
((cnolsn[:n1_name] .== "GERMANY") .&
(cnolsn[:n2_name] .== "FRANCE")), :]
cnolsnf[:l_year] = map(x -> Dates.year(x), cnolsnf[:l_shipdate])
res = sort(by(cnolsnf, [:n1_name, :n2_name, :l_year], df -> DataFrame(
revenue = sum(df[:l_extendedprice] .* (1 - df[:l_discount]))
)), cols=[:n1_name, :n2_name, :l_year])
gc_enable(true)
res
end
function q8()
gc_enable(false)
nr = join(nation, region[region[:r_name] .== "AMERICA", :], on = :n_regionkey => :r_regionkey)[[:n_nationkey]]
cnr = join(customer[[:c_custkey, :c_nationkey]], nr, on=:c_nationkey => :n_nationkey)[[:c_custkey]]
o = orders[(orders[:o_orderdate] .>= Date("1995-01-01")) .&
(orders[:o_orderdate] .<= Date("1996-12-31")) ,[:o_orderkey, :o_orderdate, :o_custkey]]
ocnr = join(o, cnr, on=:o_custkey => :c_custkey)[[:o_orderkey, :o_orderdate]]
locnr = join(lineitem[[:l_orderkey, :l_partkey, :l_suppkey, :l_extendedprice, :l_discount]], ocnr,
on = :l_orderkey => :o_orderkey)[[:l_partkey, :l_suppkey, :l_extendedprice, :l_discount, :o_orderdate]]
p = part[part[:p_type] .== "ECONOMY ANODIZED STEEL", [:p_partkey]]
locnrp = join(locnr, p, on=:l_partkey => :p_partkey)[[:l_suppkey, :l_extendedprice, :l_discount, :o_orderdate]]
locnrps = join(supplier[[:s_suppkey, :s_nationkey]], locnrp,
on=:s_suppkey => :l_suppkey)[[:l_extendedprice, :l_discount, :o_orderdate, :s_nationkey]]
locnrpsn = join(locnrps, nation[[:n_nationkey, :n_name]], on=:s_nationkey => :n_nationkey)
locnrpsn[:o_year] = map(x -> Dates.year(x), locnrpsn[:o_orderdate])
locnrpsn[:volume] = locnrpsn[:l_extendedprice] .* (1 - locnrpsn[:l_discount])
res = sort(by(locnrpsn, :o_year, df -> DataFrame(
mkt_share = sum(ifelse.(df[:n_name] .== "BRAZIL", df[:volume], 0)) /
sum(df[:volume])
)), cols=:o_year)
gc_enable(true)
res
end
function q9()
gc_enable(false)
p = part[map(x->ismatch(r".*green.*", x), part[:p_name]), [:p_partkey]]
psp = join(partsupp[[:ps_suppkey, :ps_partkey, :ps_supplycost]], p, on=:ps_partkey => :p_partkey)
sn = join(supplier[[:s_suppkey, :s_nationkey]], nation[[:n_nationkey, :n_name]], on=:s_nationkey => :n_nationkey)[[:s_suppkey, :n_name]]
pspsn = join(psp, sn, on=:ps_suppkey => :s_suppkey)
lpspsn = join(lineitem[[:l_suppkey, :l_partkey, :l_orderkey, :l_extendedprice, :l_discount, :l_quantity]], pspsn,
on=[:l_suppkey => :ps_suppkey, :l_partkey => :ps_partkey])[[:l_orderkey, :l_extendedprice, :l_discount, :l_quantity, :ps_supplycost, :n_name]]
lpspsno = join(lpspsn, orders[[:o_orderkey, :o_orderdate]],
on=:l_orderkey => :o_orderkey)[[:l_extendedprice, :l_discount, :l_quantity, :ps_supplycost, :n_name, :o_orderdate]]
lpspsno[:o_year] = map(x -> Dates.year(x), lpspsno[:o_orderdate])
lpspsno[:amount] = lpspsno[:l_extendedprice] .* (1 .- lpspsno[:l_discount]) .-
lpspsno[:ps_supplycost] .* lpspsno[:l_quantity]
res = sort(by(lpspsno[[:n_name, :o_year, :amount]], [:n_name, :o_year], df -> DataFrame(
sum_profit = sum(df[:amount])
)), cols = [:n_name, order(:o_year, rev = true)])
gc_enable(true)
res
end
function q10()
gc_enable(false)
l = lineitem[lineitem[:l_returnflag] .== "R", [:l_orderkey, :l_extendedprice, :l_discount]]
o = orders[(orders[:o_orderdate] .>= Date("1993-10-01")) .&
(orders[:o_orderdate] .< Date("1994-01-01")) ,[:o_orderkey, :o_custkey]]
lo = join(o, l, on = :o_orderkey => :l_orderkey)
lo[:volume] = lo[:l_extendedprice] .* (1 - lo[:l_discount])
lo_aggr = by(lo, :o_custkey, df -> DataFrame(revenue = sum(df[:volume])))
c = customer[[:c_custkey, :c_nationkey, :c_name, :c_acctbal, :c_phone, :c_address, :c_comment]]
loc = join(lo_aggr, c, on = :o_custkey => :c_custkey)
locn = join(loc, nation[[:n_nationkey, :n_name]], on = :c_nationkey => :n_nationkey)
res = head(sort(locn[[:o_custkey, :c_name, :revenue, :c_acctbal, :n_name, :c_address, :c_phone, :c_comment]], cols = order(:revenue, rev = true)), 20)
gc_enable(true)
res
end
function bench(f, n)
tic()
f()
r1 = toc()
tic()
f()
r2 = toc()
tic()
f()
r3 = toc()
tic()
f()
r4 = toc()
tic()
f()
r5 = toc()
CSV.write("julia.csv", DataFrame(exp="Julia", query=n, time_median_ms=median([r1, r2, r3, r4, r5])*1000); append=true)
end
bench(q1, 1)
bench(q2, 2)
bench(q3, 3)
bench(q4, 4)
bench(q5, 5)
bench(q6, 6)
bench(q7, 7)
bench(q8, 8)
bench(q9, 9)
bench(q10, 10)
import pandas as pd
import numpy as np
import timeit, csv
region = pd.read_csv("region.tbl", sep='|', names=["r_regionkey", "r_name", "r_comment"])
nation = pd.read_csv("nation.tbl", sep='|', names=["n_nationkey", "n_name", "n_regionkey", "n_comment"])
supplier = pd.read_csv("supplier.tbl", sep='|', names=["s_suppkey","s_name","s_address","s_nationkey","s_phone","s_acctbal","s_comment"])
customer = pd.read_csv("customer.tbl", sep='|', names=["c_custkey","c_name","c_address","c_nationkey","c_phone","c_acctbal","c_mktsegment","c_comment"], dtype={'c_mktsegment' : 'category'})
part = pd.read_csv("part.tbl", sep='|', names=["p_partkey","p_name","p_mfgr","p_brand","p_type","p_size","p_container","p_retailprice","p_comment"], dtype={'p_container' : 'category'})
partsupp = pd.read_csv("partsupp.tbl", sep='|', names=["ps_partkey","ps_suppkey","ps_availqty","ps_supplycost","ps_comment"])
orders = pd.read_csv("orders.tbl", sep='|', names=["o_orderkey","o_custkey","o_orderstatus","o_totalprice","o_orderdate","o_orderpriority","o_clerk","o_shippriority","o_comment"], dtype={'o_orderstatus' : 'category', 'o_orderpriority' : 'category'}, parse_dates=['o_orderdate'])
lineitem = pd.read_csv("lineitem.tbl", sep='|', names=["l_orderkey","l_partkey","l_suppkey","l_linenumber","l_quantity","l_extendedprice","l_discount","l_tax","l_returnflag","l_linestatus","l_shipdate","l_commitdate","l_receiptdate","l_shipinstruct","l_shipmode","l_comment"], dtype={'l_returnflag': 'category', 'l_linestatus': 'category'}, parse_dates=['l_shipdate', 'l_commitdate', 'l_receiptdate'])
def udf_disc_price(extended, discount):
return np.multiply(extended, np.subtract(1, discount))
def udf_charge(extended, discount, tax):
return np.multiply(extended, np.multiply(np.subtract(1, discount), np.add(1, tax)))
def q1():
df = lineitem[["l_shipdate", "l_returnflag", "l_linestatus", "l_quantity", "l_extendedprice", "l_discount", "l_tax"]][(lineitem['l_shipdate'] <= '1998-09-01')]
df['disc_price'] = udf_disc_price(df['l_extendedprice'], df['l_discount'])
df['charge'] = udf_charge(df['l_extendedprice'], df['l_discount'], df['l_tax'])
return df.groupby(['l_returnflag', 'l_linestatus'])\
.agg({'l_quantity': 'sum', 'l_extendedprice': 'sum', 'disc_price': 'sum', 'charge': 'sum',
'l_quantity': 'mean', 'l_extendedprice': 'mean', 'l_discount': 'mean', 'l_shipdate': 'count'})
def q2():
ps = partsupp[["ps_partkey", "ps_suppkey", "ps_supplycost"]]
p = part[["p_partkey", "p_mfgr", "p_size", "p_type"]][(part.p_size == 15) & (part.p_type.str.match(".*BRASS$"))][["p_partkey", "p_mfgr"]]
psp = ps.merge(p, left_on="ps_partkey", right_on="p_partkey")
s = supplier[["s_suppkey", "s_nationkey", "s_acctbal", "s_name", "s_address", "s_phone", "s_comment"]]
psps = psp.merge(s, left_on="ps_suppkey", right_on="s_suppkey")[["ps_partkey", "ps_supplycost", "p_mfgr", "s_nationkey", "s_acctbal", "s_name", "s_address", "s_phone", "s_comment"]]
nr = nation.merge(region[region.r_name == "EUROPE"], left_on="n_regionkey", right_on="r_regionkey")[["n_nationkey", "n_name"]]
pspsnr = psps.merge(nr, left_on="s_nationkey", right_on="n_nationkey")[["ps_partkey", "ps_supplycost", "p_mfgr", "n_name", "s_acctbal", "s_name", "s_address", "s_phone", "s_comment"]]
aggr = pspsnr.groupby("ps_partkey").agg({'ps_supplycost' : min}).reset_index()
sj = pspsnr.merge(aggr, left_on=["ps_partkey", "ps_supplycost"], right_on=["ps_partkey", "ps_supplycost"])
res = sj[["s_acctbal", "s_name", "n_name", "ps_partkey", "p_mfgr", "s_address", "s_phone", "s_comment"]].sort_values(["s_acctbal", "n_name", "s_name", "ps_partkey"], ascending=[False, True, True, True]).head(100)
return res
def q3():
o = orders[["o_orderkey", "o_custkey", "o_orderdate", "o_shippriority"]][orders.o_orderdate < "1995-03-15"][["o_orderkey", "o_custkey", "o_orderdate", "o_shippriority"]]
c = customer[["c_custkey", "c_mktsegment"]][customer.c_mktsegment == "BUILDING"][["c_custkey", "c_mktsegment"]]
oc = o.merge(c, left_on="o_custkey", right_on="c_custkey")[["o_orderkey", "o_orderdate", "o_shippriority"]]
l = lineitem[["l_orderkey", "l_extendedprice", "l_discount", "l_shipdate"]][lineitem.l_shipdate > "1995-03-15"][["l_orderkey", "l_extendedprice", "l_discount"]]
loc = l.merge(oc, left_on="l_orderkey", right_on="o_orderkey")
loc["volume"] = loc.l_extendedprice * (1 - loc.l_discount)
res = loc.groupby(["l_orderkey", "o_orderdate", "o_shippriority"]).agg({'volume' : sum}).reset_index()[["l_orderkey", "volume", "o_orderdate", "o_shippriority"]].sort_values(["volume", "o_orderdate"], ascending=[False, True]).head(10)
return res
def q4():
l = lineitem[["l_orderkey", "l_commitdate"]][lineitem.l_commitdate < lineitem.l_receiptdate][["l_orderkey"]]
o = orders[["o_orderkey", "o_orderpriority", "o_orderdate"]][(orders.o_orderdate >= "1993-07-01") & (orders.o_orderdate < "1993-10-01")][["o_orderkey", "o_orderpriority"]]
lo = l.merge(o, left_on="l_orderkey", right_on="o_orderkey").drop_duplicates()[["o_orderpriority"]]
res = lo.groupby("o_orderpriority").size().reset_index(name='counts').sort_values('o_orderpriority')
return res
def q5():
nr = nation.merge(region[region.r_name == "ASIA"], left_on="n_regionkey", right_on="r_regionkey")[["n_nationkey", "n_name"]]
snr = supplier[["s_suppkey", "s_nationkey"]].merge(nr, left_on="s_nationkey", right_on="n_nationkey")[["s_suppkey", "s_nationkey", "n_name"]]
lsnr = lineitem[["l_suppkey", "l_orderkey", "l_extendedprice", "l_discount"]].merge(snr, left_on="l_suppkey", right_on="s_suppkey")
o = orders[["o_orderkey", "o_custkey", "o_orderdate"]][(orders.o_orderdate >= "1994-01-01") & (orders.o_orderdate < "1995-01-01")][["o_orderkey", "o_custkey"]]
oc = o.merge(customer[["c_custkey", "c_nationkey"]], left_on="o_custkey", right_on="c_custkey")[["o_orderkey", "c_nationkey"]]
lsnroc = lsnr.merge(oc, left_on=["l_orderkey", "s_nationkey"], right_on=["o_orderkey", "c_nationkey"])[["l_extendedprice", "l_discount", "n_name"]]
lsnroc["volume"] = lsnroc.l_extendedprice * (1 - lsnroc.l_discount)
res = lsnroc.groupby("n_name").agg({'volume' : sum}).reset_index().sort_values("volume", ascending=False)
return res
def q6():
l = lineitem[["l_extendedprice", "l_discount", "l_shipdate", "l_quantity"]][
(lineitem.l_shipdate >= "1994-01-01") &
(lineitem.l_shipdate < "1995-01-01") &
(lineitem.l_discount >= 0.05) &
(lineitem.l_discount <= 0.07) &
(lineitem.l_quantity < 24)][["l_extendedprice", "l_discount"]]
res = (l.l_extendedprice * l.l_discount).sum()
return res
def q7():
sn = supplier[["s_nationkey", "s_suppkey"]].merge(nation[["n_nationkey", "n_name"]][(nation.n_name == "FRANCE") | (nation.n_name == "GERMANY")], left_on="s_nationkey", right_on="n_nationkey")[["s_suppkey", "n_name"]]
sn.columns = ["s_suppkey", "n1_name"]
cn = customer[["c_custkey", "c_nationkey"]].merge(nation[["n_nationkey", "n_name"]][(nation.n_name == "FRANCE") | (nation.n_name == "GERMANY")], left_on="c_nationkey", right_on="n_nationkey")[["c_custkey", "n_name"]]
cn.columns = ["c_custkey", "n2_name"]
cno = orders[["o_custkey", "o_orderkey"]].merge(cn, left_on="o_custkey", right_on="c_custkey")[["o_orderkey", "n2_name"]]
cnol = lineitem[["l_orderkey", "l_suppkey", "l_shipdate", "l_extendedprice", "l_discount"]][(lineitem.l_shipdate >= "1995-01-01") & (lineitem.l_shipdate <= "1996-12-31")][["l_orderkey", "l_suppkey", "l_shipdate", "l_extendedprice", "l_discount"]].merge(cno, left_on="l_orderkey", right_on="o_orderkey")[["l_suppkey", "l_shipdate", "l_extendedprice", "l_discount", "n2_name"]]
cnolsn = cnol.merge(sn, left_on="l_suppkey", right_on="s_suppkey")
cnolsn["volume"] = cnolsn.l_extendedprice * (1 - cnolsn.l_discount)
cnolsn["l_year"] = cnolsn.l_shipdate.dt.year
cnolsnf = cnolsn[((cnolsn.n1_name == "FRANCE") & (cnolsn.n2_name == "GERMANY")) | ((cnolsn.n1_name == "GERMANY") & (cnolsn.n2_name == "FRANCE"))]
res = cnolsnf[["n1_name", "n2_name", "l_year", "volume"]].groupby(["n1_name", "n2_name", "l_year"]).agg({'volume' : sum}).reset_index().sort_values(["n1_name", "n2_name", "l_year"])
return res
def q8():
nr = nation.merge(region[region.r_name == "AMERICA"], left_on="n_regionkey", right_on="r_regionkey")[["n_nationkey"]]
cnr = customer[["c_custkey", "c_nationkey"]].merge(nr, left_on="c_nationkey", right_on="n_nationkey")[["c_custkey"]]
ocnr = orders[["o_orderkey", "o_custkey", "o_orderdate"]][(orders.o_orderdate >= "1995-01-01") & (orders.o_orderdate <= "1996-12-31")].merge(cnr, left_on="o_custkey", right_on="c_custkey")[["o_orderkey", "o_orderdate"]]
locnr = lineitem[["l_orderkey", "l_partkey", "l_suppkey", "l_extendedprice", "l_discount"]].merge(ocnr, left_on="l_orderkey", right_on="o_orderkey")[["l_partkey", "l_suppkey", "l_extendedprice", "l_discount", "o_orderdate"]]
p = part[["p_partkey", "p_type"]][part.p_type == "ECONOMY ANODIZED STEEL"][["p_partkey"]]
locnrp = locnr.merge(p, left_on="l_partkey", right_on="p_partkey")[["l_suppkey", "l_extendedprice", "l_discount", "o_orderdate"]]
locnrps = locnrp.merge(supplier[["s_suppkey", "s_nationkey"]], left_on="l_suppkey", right_on="s_suppkey")[["l_extendedprice", "l_discount", "o_orderdate", "s_nationkey"]]
locnrpsn = locnrps.merge(nation[["n_nationkey", "n_name"]], left_on="s_nationkey", right_on="n_nationkey")[["l_extendedprice", "l_discount", "o_orderdate", "n_name"]]
locnrpsn["volume"] = locnrpsn.l_extendedprice * (1 - locnrpsn.l_discount)
locnrpsn["o_year"] = locnrpsn.o_orderdate.dt.year
res = locnrpsn[["o_year", "volume", "n_name"]].groupby("o_year").apply(lambda df : pd.DataFrame({
'mkt_share' : np.where(df.n_name == "BRAZIL", df.volume, 0).sum()
}, index=[0])).reset_index().sort_values("o_year")
return res
def q9():
p = part[["p_partkey", "p_name"]][part.p_name.str.match(".*green.*")][["p_partkey"]]
psp = partsupp[["ps_suppkey", "ps_partkey", "ps_supplycost"]].merge(p, left_on="ps_partkey", right_on="p_partkey")
sn = supplier[["s_suppkey", "s_nationkey"]].merge(nation[["n_nationkey", "n_name"]], left_on="s_nationkey", right_on="n_nationkey")[["s_suppkey", "n_name"]]
pspsn = psp.merge(sn, left_on="ps_suppkey", right_on="s_suppkey")
lpspsn = lineitem[["l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"]].merge(pspsn, left_on=["l_suppkey", "l_partkey"], right_on=["ps_suppkey", "ps_partkey"])[["l_orderkey", "l_extendedprice", "l_discount", "l_quantity", "ps_supplycost", "n_name"]]
olpspsn = orders[["o_orderkey", "o_orderdate"]].merge(lpspsn, left_on="o_orderkey", right_on="l_orderkey")[["l_extendedprice", "l_discount", "l_quantity", "ps_supplycost", "n_name", "o_orderdate"]]
olpspsn["amount"] = olpspsn.l_extendedprice * (1 - olpspsn.l_discount) - olpspsn.ps_supplycost * olpspsn.l_quantity
olpspsn["o_year"] = olpspsn.o_orderdate.dt.year
res = olpspsn[["n_name", "o_year", "amount"]].groupby(["n_name", "o_year"]).agg({'amount' : sum}).reset_index().sort_values(["n_name", "o_year" ], ascending=[True, False])
return res
def q10():
l = lineitem[["l_orderkey", "l_extendedprice", "l_discount", "l_returnflag"]][lineitem.l_returnflag == "R"][["l_orderkey", "l_extendedprice", "l_discount"]]
o = orders[["o_orderkey", "o_custkey", "o_orderdate"]][(orders.o_orderdate >= "1993-10-01") & (orders.o_orderdate < "1994-01-01")][["o_orderkey", "o_custkey"]]
lo = l.merge(o, left_on="l_orderkey", right_on="o_orderkey")[["l_extendedprice", "l_discount", "o_custkey"]]
lo["volume"] = lo.l_extendedprice * (1 - lo.l_discount)
lo_aggr = lo.groupby("o_custkey").agg({'volume' : sum}).reset_index()
c = customer[["c_custkey", "c_nationkey", "c_name", "c_acctbal", "c_phone", "c_address", "c_comment"]]
loc = lo_aggr.merge(c, left_on="o_custkey", right_on="c_custkey")
locn = loc.merge(nation[["n_nationkey", "n_name"]], left_on="c_nationkey", right_on="n_nationkey")
res = locn[["o_custkey", "c_name", "volume", "c_acctbal", "n_name", "c_address", "c_phone", "c_comment"]].sort_values("volume", ascending=False).head(20)
return res
#####
n = 5
f = open("pandas.csv", 'w')
writer = csv.writer(f)
def bench(q):
res = np.median(timeit.repeat("q%d()" % q, setup="from __main__ import q%d" % q, number=1, repeat=n))
print(res)
writer.writerow(["pandas", "%d" % q, "%f" % res])
f.flush()
bench(1)
bench(2)
bench(3)
bench(4)
bench(5)
bench(6)
bench(7)
bench(8)
bench(9)
bench(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment