Mark Mytherin

## cancel_workflows.py
import subprocess
import duckdb
import os
import pandas as pd
import argparse


# Command-line interface of the script.
# --max_workflows is parsed as an int so that a value given on the command line
# has the same type as the default (without `type=int` argparse would hand back
# a str for user-supplied values and an int for the default).
parser = argparse.ArgumentParser(description='Cancel all workflows related to a PR.')
parser.add_argument(
    '--max_workflows',
    dest='max_workflows',
    action='store',
    type=int,
    default=200,
    help='The maximum number of workflows to look at (starting from the latest)',
)

## anonymize.py
import os

def generate_integers(count, np_dtype, min_val=-1000000, max_val=1000000):
    """Return a 1-D NumPy array of `count` random integers with dtype `np_dtype`.

    Values are drawn with ``np.random.randint``, i.e. from the half-open
    interval ``[min_val, max_val)``: `max_val` itself is never returned.
    """
    # numpy is imported on first use rather than at module import time
    import numpy as np

    shape = (count,)
    values = np.random.randint(min_val, max_val, shape, np_dtype)
    return values

def generate_floats(count, min_val=0, max_val=1.0):
    """Return a 1-D NumPy array of `count` random floats.

    Values are drawn with ``np.random.uniform`` using `min_val` as the lower
    bound and `max_val` as the upper bound.
    """
    # numpy is imported on first use rather than at module import time
    import numpy as np

    shape = (count,)
    samples = np.random.uniform(min_val, max_val, shape)
    return samples

## Julia TPC-H Benchmarks v0.1
| Queries | DuckDB.jl (1T) | DF Scan (1T) | DuckDB.jl (8T) | DF Scan (8T) | Native (8T) |
|---------|----------------|--------------|----------------|--------------|-------------|
| Q01     | 2.45s          | 3.34s        | 0.35s          | 0.46s        | 0.34s       |
| Q02     | 0.22s          | 0.70s        | 0.40s          | 0.89s        | 0.07s       |
| Q03     | 1.07s          | 1.36s        | 1.17s          | 1.66s        | 0.27s       |
| Q04     | 1.32s          | 1.17s        | 0.99s          | 0.94s        | 0.57s       |
| Q05     | 1.04s          | 1.72s        | 1.18s          | 1.78s        | 0.22s       |
| Q06     | 0.47s          | 0.50s        | 0.07s          | 0.08s        | 0.07s       |
| Q07     | 1.73s          | 2.89s        | 1.79s          | 4.38s        | 0.47s       |
| Q08     | 1.40s          | 1.95s        | 1.44s          | 2.04s        | 0.30s       |

## cleverboysum.cpp
#include <cstdint>
#include <chrono>
#include <iostream>

using namespace std;

int64_t naive_sum(int64_t x) {
    int64_t sum = 0;
    for(int64_t i = 0; i < x; i++) {
        sum += i;

## gencov.sh
# Coverage run: instrumented debug build -> one unit test -> lcov capture -> HTML report.
lcov_rc=.github/workflows/lcovrc
cov_build=build/coverage

# Reset the execution counters, then take the "initial" capture.
lcov --config-file "$lcov_rc" --zerocounters --directory .
lcov --config-file "$lcov_rc" --capture --initial --directory . --base-directory . --no-external --output-file coverage.info

# Configure and build with --coverage inside a subshell, so the working
# directory of the rest of the script is unchanged.
mkdir -p "$cov_build"
(
    cd "$cov_build" &&
    cmake -E env CXXFLAGS="--coverage" cmake -DBUILD_PYTHON=1 -DBUILD_PARQUET_EXTENSION=1 -DENABLE_SANITIZER=0 -DCMAKE_BUILD_TYPE=Debug ../.. &&
    cmake --build .
)

# Run a single test file with the instrumented binary.
"$cov_build"/test/unittest test/sql/join/inner/test_join.test

# Capture the counters, then remove the patterns listed in lcov_exclude.
# NOTE(review): this capture writes to the same coverage.info as the --initial
# capture above, so the initial data is replaced rather than combined - confirm
# that is intended.
lcov --config-file "$lcov_rc" --directory . --base-directory . --no-external --capture --output-file coverage.info
lcov --config-file "$lcov_rc" --remove coverage.info $(< .github/workflows/lcov_exclude) -o lcov.info

# Render the HTML report and open its index page.
genhtml --ignore-errors source lcov.info --legend --title "commit SHA1" --output-directory="$cov_build"/coverage-html
open "$cov_build"/coverage-html/index.html

## benchmark.txt
TPC-H

| Query | DuckDB |    DataFusion    |
|-------|--------|------------------|
| Q01   | 0.05s  | 0.07s            |
| Q02   | 0.02s  | Unsupported      |
| Q03   | 0.03s  | 0.06s            |
| Q04   | 0.05s  | Unsupported      |
| Q05   | 0.03s  | 0.11s            |
| Q06   | 0.01s  | 0.01s            |

## gist:60ffdb9960588cb0a9e95c99ee0efb3f
M1 Mac clang++

master branch (throw exception instead of setting an error flag)
benchmark/micro/cast/cast_int32_int64_x.benchmark	1	0.053170
benchmark/micro/cast/cast_int32_int64_x.benchmark	2	0.053137
benchmark/micro/cast/cast_int32_int64_x.benchmark	3	0.053495
benchmark/micro/cast/cast_int32_int64_x.benchmark	4	0.055435
benchmark/micro/cast/cast_int32_int64_x.benchmark	5	0.052032
benchmark/micro/cast/cast_int64_int32_x.benchmark	1	0.168896
benchmark/micro/cast/cast_int64_int32_x.benchmark	2	0.168962

## shared_ptr_benchmark.cpp

#include <memory>
#include <vector>

#include <chrono>
#include <iostream>

using namespace std;

#define BENCH_COUNT 10000000

## duckdb-example.R
# Minimal DuckDB example: query a CSV/TSV file directly from R.
# install.packages("duckdb")
library(duckdb)

# open a connection to an in-memory database
db <- dbConnect(duckdb::duckdb())

# run a SELECT * query that reads from a CSV file at the specified path
df <- dbGetQuery(db, 'SELECT * FROM "test/sql/copy/csv/data/real/voter.tsv"')
head(df)

# close the connection and shut down the database instance once the result has
# been fetched, instead of leaving it open until garbage collection
dbDisconnect(db, shutdown = TRUE)

## benchmark-sqlite.py
import duckdb
import sqlite3
import time, random, string

base_tuples = 200*1000
repeat_insert = 8

def run_benchmark_strings(db, dbname, print_tuple_count, count_distinct=True):
	db.execute("""CREATE TABLE data(id INTEGER, dt INTEGER, shop TEXT, product TEXT, aa TEXT, bb INTEGER,
			customer TEXT);""")
	import subprocess
	import duckdb
	import os
	import pandas as pd
	import argparse


	parser = argparse.ArgumentParser(description='Cancel all workflows related to a PR.')
	parser.add_argument('--max_workflows', dest='max_workflows',
	action='store', help='The maximum number of workflows to look at (starting from the latest)', default=200)
	import os

	def generate_integers(count, np_dtype, min_val=-1000000, max_val=1000000):
	import numpy as np
	return np.random.randint(low=min_val, high=max_val, size=(count,), dtype=np_dtype)

	def generate_floats(count, min_val=0, max_val=1.0):
	import numpy as np
	return np.random.uniform(low=min_val, high=max_val, size=(count,))
	\| Queries \| DuckDB.jl (1T) \| DF Scan (1T) \| DuckDB.jl (8T) \| DF Scan (8T) \| Native (8T) \|
	\|---------\|----------------\|--------------\|----------------\|--------------\|-------------\|
	\| Q01 \| 2.45s \| 3.34s \| 0.35s \| 0.46s \| 0.34s \|
	\| Q02 \| 0.22s \| 0.70s \| 0.40s \| 0.89s \| 0.07s \|
	\| Q03 \| 1.07s \| 1.36s \| 1.17s \| 1.66s \| 0.27s \|
	\| Q04 \| 1.32s \| 1.17s \| 0.99s \| 0.94s \| 0.57s \|
	\| Q05 \| 1.04s \| 1.72s \| 1.18s \| 1.78s \| 0.22s \|
	\| Q06 \| 0.47s \| 0.50s \| 0.07s \| 0.08s \| 0.07s \|
	\| Q07 \| 1.73s \| 2.89s \| 1.79s \| 4.38s \| 0.47s \|
	\| Q08 \| 1.40s \| 1.95s \| 1.44s \| 2.04s \| 0.30s \|
	#include <cstdint>
	#include <chrono>
	#include <iostream>

	using namespace std;

	int64_t naive_sum(int64_t x) {
	int64_t sum = 0;
	for(int64_t i = 0; i < x; i++) {
	sum += i;
	lcov --config-file .github/workflows/lcovrc --zerocounters --directory .
	lcov --config-file .github/workflows/lcovrc --capture --initial --directory . --base-directory . --no-external --output-file coverage.info
	mkdir -p build/coverage
	(cd build/coverage && cmake -E env CXXFLAGS="--coverage" cmake -DBUILD_PYTHON=1 -DBUILD_PARQUET_EXTENSION=1 -DENABLE_SANITIZER=0 -DCMAKE_BUILD_TYPE=Debug ../.. && cmake --build .)
	build/coverage/test/unittest test/sql/join/inner/test_join.test
	lcov --config-file .github/workflows/lcovrc --directory . --base-directory . --no-external --capture --output-file coverage.info
	lcov --config-file .github/workflows/lcovrc --remove coverage.info $(< .github/workflows/lcov_exclude) -o lcov.info
	genhtml --ignore-errors source lcov.info --legend --title "commit SHA1" --output-directory=build/coverage/coverage-html
	open build/coverage/coverage-html/index.html
	TPC-H

	\| Query \| DuckDB \| DataFusion \|
	\|-------\|--------\|------------------\|
	\| Q01 \| 0.05s \| 0.07 \|
	\| Q02 \| 0.02s \| Unsupported \|
	\| Q03 \| 0.03s \| 0.06s \|
	\| Q04 \| 0.05s \| Unsupported \|
	\| Q05 \| 0.03s \| 0.11s \|
	\| Q06 \| 0.01s \| 0.01s \|
	M1 Mac clang++

	master branch (throw exception instead of setting an error flag)
	benchmark/micro/cast/cast_int32_int64_x.benchmark 1 0.053170
	benchmark/micro/cast/cast_int32_int64_x.benchmark 2 0.053137
	benchmark/micro/cast/cast_int32_int64_x.benchmark 3 0.053495
	benchmark/micro/cast/cast_int32_int64_x.benchmark 4 0.055435
	benchmark/micro/cast/cast_int32_int64_x.benchmark 5 0.052032
	benchmark/micro/cast/cast_int64_int32_x.benchmark 1 0.168896
	benchmark/micro/cast/cast_int64_int32_x.benchmark 2 0.168962

	#include <memory>
	#include <vector>

	#include <chrono>
	#include <iostream>

	using namespace std;

	#define BENCH_COUNT 10000000
	# install.packages("duckdb")
	library(duckdb)
	# open a connection to an in-memory database
	db <- dbConnect(duckdb::duckdb())
	# run a SELECT * query that reads from a CSV file at the specified path
	df <- dbGetQuery(db, 'SELECT * FROM "test/sql/copy/csv/data/real/voter.tsv"')
	head(df)
	import duckdb
	import sqlite3
	import time, random, string

	base_tuples = 200*1000
	repeat_insert = 8

	def run_benchmark_strings(db, dbname, print_tuple_count, count_distinct=True):
	db.execute("""CREATE TABLE data(id INTEGER, dt INTEGER, shop TEXT, product TEXT, aa TEXT, bb INTEGER,
	customer TEXT);""")