-
-
Save vogelsgesang/e83260fd3e1429aefed99ad30a27f196 to your computer and use it in GitHub Desktop.
Efficient merging of multiple Hyper files into a single file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tableauhyperapi import HyperProcess, Connection, Telemetry, TableDefinition, TableName, SchemaName, Inserter, CreateMode | |
from glob import glob | |
from time import time | |
import os | |
input_files = glob("WorldIndicators_*.hyper") | |
table_name = TableName('Extract','Extract') | |
output_file = "WorldIndicatorsMerged.hyper" | |
# Let's delete the output file to make sure we can rerun this script | |
# even if the output already exists. | |
if os.path.exists(output_file): | |
os.remove(output_file) | |
# Start a new Hyper instance | |
start_time = time() | |
with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU, 'unionfiles_efficient') as hyper: | |
# Create a connection without any connected databases | |
with Connection(hyper.endpoint) as connection: | |
# Connect to all our input databases | |
for i, file in enumerate(input_files): | |
connection.catalog.attach_database(file, alias=f"input{i}") | |
print(f"{time() - start_time}: Attached all input databases...") | |
# Prepare the output database | |
connection.catalog.create_database(output_file) | |
connection.catalog.attach_database(output_file, alias="output") | |
print(table_name) | |
connection.catalog.create_schema(SchemaName("output", table_name.schema_name)) | |
print(f"{time() - start_time}: Prepared output database") | |
# Build the CREATE TABLE AS command which unions all our inputs | |
union_query = ' UNION ALL\n'.join( | |
f'SELECT * FROM "input{i}".{table_name}' for i in range(len(input_files))) | |
create_table_sql = f'CREATE TABLE "output".{table_name} AS \n{union_query}' | |
# And execute it | |
connection.execute_command(create_table_sql) | |
print(f"{time() - start_time}: Done :)") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment