Skip to content

Instantly share code, notes, and snippets.

@vogelsgesang
Created December 1, 2021 17:40
Show Gist options
  • Save vogelsgesang/1c1e72975e89565a547c45fc75a940c4 to your computer and use it in GitHub Desktop.
Save vogelsgesang/1c1e72975e89565a547c45fc75a940c4 to your computer and use it in GitHub Desktop.
Counterexample: how NOT to merge multiple Hyper files into a single file
from tableauhyperapi import HyperProcess, Connection, Telemetry, TableDefinition, TableName, SchemaName, Inserter, CreateMode
from glob import glob
from time import time
# Counterexample script: merges the `Extract.Extract` table of every
# "WorldIndicators_*.hyper" file into a single output file by pulling all
# rows into a Python list and then re-inserting them.
#
# NOTE: this approach is intentionally kept as-is because demonstrating it is
# the purpose of this file. Every row of every input file is held in
# `unioned_data` before anything is written to the output file.

input_pattern = "WorldIndicators_*.hyper"
# glob() returns matches in arbitrary order; sorting makes both the choice of
# the "first" file (used for the schema) and the output row order reproducible.
input_files = sorted(glob(input_pattern))
table_name = TableName('Extract', 'Extract')
output_file = "WorldIndicatorsMerged.hyper"

# Fail early with a clear message instead of an IndexError on input_files[0]
# after a Hyper process has already been started.
if not input_files:
    raise SystemExit(f"No input files matching {input_pattern!r} found")

# Start a new Hyper instance
start_time = time()
with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU, 'unionfiles_inefficient') as hyper:
    # Get the table's schema from the first input file.
    # We will just assume that all files have the exact same schema.
    with Connection(hyper.endpoint, database=input_files[0]) as connection:
        table_def = connection.catalog.get_table_definition(table_name)

    # Read the data out of all files
    # DON'T DO THIS! This is a counterexample which shows how not to write this code
    unioned_data = []
    for file in input_files:
        # Some poor-man's tracing, so we can see how we make progress
        print(f"{time() - start_time}: Reading {file}")
        # Connect to the Hyper file and read all data from it
        with Connection(hyper.endpoint, database=file) as connection:
            unioned_data += connection.execute_list_query(f"SELECT * FROM {table_name}")

    # Create the output file and insert the data
    print(f"{time() - start_time}: Inserting data...")
    with Connection(hyper.endpoint, database=output_file, create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
        # Reset the table_def.table_name
        table_def.table_name = table_name
        # Create the output table (the schema has to exist before the table)
        connection.catalog.create_schema(SchemaName(table_name.schema_name))
        connection.catalog.create_table(table_def)
        # Insert the data
        with Inserter(connection, table_def) as inserter:
            inserter.add_rows(unioned_data)
            inserter.execute()

print(f"{time() - start_time}: Done :)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment