Skip to content

Instantly share code, notes, and snippets.

@andrwng
Created October 18, 2020 05:22
Show Gist options
  • Save andrwng/bae4a4696eabb501b3b69f8db263a745 to your computer and use it in GitHub Desktop.
Save andrwng/bae4a4696eabb501b3b69f8db263a745 to your computer and use it in GitHub Desktop.
import kudu
import pandas as pd
import pyarrow as pa
import time
def main():
    """Compare row-wise and columnar Kudu scans materialized as DataFrames.

    Connects to a Kudu master at 0.0.0.0:8764. If the benchmark table does
    not exist yet, it is created (one int64 primary key column ``key`` plus
    ``num_non_pks`` string columns, hash-partitioned on ``key`` into two
    buckets) and filled with ``num_rows`` rows. The table is then scanned
    ``num_trials`` times with a regular scanner and with a scanner that has
    the columnar layout enabled; the wall-clock time of each ``to_pandas()``
    call is accumulated and the per-trial average is printed.
    """
    client = kudu.connect("0.0.0.0", 8764)
    num_rows = 100000
    num_rows_per_batch = 1000
    num_non_pks = 100
    table_name = 'default.{}_non_keys'.format(num_non_pks)

    # NOTE(review): the source paste lost its indentation. Population is
    # placed under the table-creation branch because the table handle is
    # re-fetched before the scans below, and re-inserting the same keys on
    # every run would collide with existing rows -- confirm against the
    # original script.
    if not client.table_exists(table_name):
        builder = kudu.schema_builder()
        builder.add_column('key').type(kudu.int64).nullable(False)
        for col_idx in range(num_non_pks):
            builder.add_column('col{}'.format(col_idx)).type(kudu.string)
        builder.set_primary_keys(['key'])
        schema = builder.build()
        partitioning = kudu.client.Partitioning().add_hash_partitions(
            column_names=['key'], num_buckets=2)
        client.create_table(table_name, schema, partitioning)
        print("Created table {}".format(table_name))

        table = client.table(table_name)
        session = client.new_session()
        pending = 0
        # Distinct names for the row and column indices: reusing one name
        # for both loops would overwrite the row index before the batch
        # check and defeat the batching.
        for row_idx in range(num_rows):
            op = table.new_insert()
            op[0] = row_idx
            for col_idx in range(num_non_pks):
                op[1 + col_idx] = "string-{}".format(col_idx)
            session.apply(op)
            pending += 1
            # Flush once a full batch has been applied, not on every row.
            if pending == num_rows_per_batch:
                session.flush()
                pending = 0
        # Send any trailing partial batch.
        if pending:
            session.flush()

    num_trials = 5
    tuple_total_time = 0
    arrow_total_time = 0
    table = client.table(table_name)
    for _ in range(num_trials):
        # Both scanners are opened before the clock starts, so only the
        # to_pandas() calls are timed.
        scanner = table.scanner().open()
        col_scanner = table.scanner().enable_columnar_layout().open()
        start_tuples = time.perf_counter()
        scanner.to_pandas()
        start_arrow = time.perf_counter()
        col_scanner.to_pandas()
        end = time.perf_counter()
        tuple_total_time += (start_arrow - start_tuples)
        arrow_total_time += (end - start_arrow)

    print("Materialized tuple DataFrame with {} rows in {} secs, averaged across {} runs".format(
        num_rows, tuple_total_time / num_trials, num_trials))
    print("Materialized arrow DataFrame with {} rows in {} secs, averaged across {} runs".format(
        num_rows, arrow_total_time / num_trials, num_trials))
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment