Skip to content

Instantly share code, notes, and snippets.

@vogelsgesang
Created December 1, 2021 17:41
Show Gist options
  • Save vogelsgesang/e83260fd3e1429aefed99ad30a27f196 to your computer and use it in GitHub Desktop.
Save vogelsgesang/e83260fd3e1429aefed99ad30a27f196 to your computer and use it in GitHub Desktop.
Efficient merging of multiple Hyper files into a single file
from tableauhyperapi import HyperProcess, Connection, Telemetry, TableDefinition, TableName, SchemaName, Inserter, CreateMode
from glob import glob
from time import time
import os
input_files = glob("WorldIndicators_*.hyper")
table_name = TableName('Extract','Extract')
output_file = "WorldIndicatorsMerged.hyper"
# Let's delete the output file to make sure we can rerun this script
# even if the output already exists.
if os.path.exists(output_file):
os.remove(output_file)
# Start a new Hyper instance
start_time = time()
with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU, 'unionfiles_efficient') as hyper:
# Create a connection without any connected databases
with Connection(hyper.endpoint) as connection:
# Connect to all our input databases
for i, file in enumerate(input_files):
connection.catalog.attach_database(file, alias=f"input{i}")
print(f"{time() - start_time}: Attached all input databases...")
# Prepare the output database
connection.catalog.create_database(output_file)
connection.catalog.attach_database(output_file, alias="output")
print(table_name)
connection.catalog.create_schema(SchemaName("output", table_name.schema_name))
print(f"{time() - start_time}: Prepared output database")
# Build the CREATE TABLE AS command which unions all our inputs
union_query = ' UNION ALL\n'.join(
f'SELECT * FROM "input{i}".{table_name}' for i in range(len(input_files)))
create_table_sql = f'CREATE TABLE "output".{table_name} AS \n{union_query}'
# And execute it
connection.execute_command(create_table_sql)
print(f"{time() - start_time}: Done :)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment