-
-
Save vogelsgesang/1c1e72975e89565a547c45fc75a940c4 to your computer and use it in GitHub Desktop.
Counterexample showing how not to merge multiple Hyper files into a single file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tableauhyperapi import HyperProcess, Connection, Telemetry, TableDefinition, TableName, SchemaName, Inserter, CreateMode | |
from glob import glob | |
from time import time | |
# NOTE: This script is a deliberate counterexample of how NOT to merge Hyper
# files. It reads every row of every input file into one Python list and only
# then re-inserts the rows into the output file. The approach is kept on
# purpose; only robustness issues are addressed here.

# Sort the matches so that "the first input file" (which provides the schema)
# and the order of the merged rows do not depend on directory listing order.
input_files = sorted(glob("WorldIndicators_*.hyper"))
if not input_files:
    # Without this guard the script would start Hyper and then fail with an
    # opaque IndexError on input_files[0].
    raise SystemExit("No input files matching 'WorldIndicators_*.hyper' found")

table_name = TableName('Extract', 'Extract')
output_file = "WorldIndicatorsMerged.hyper"

# Start a new Hyper instance
start_time = time()
with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU, 'unionfiles_inefficient') as hyper:
    # Get the table's schema from the first input file.
    # We will just assume that all files have the exact same schema.
    with Connection(hyper.endpoint, database=input_files[0]) as connection:
        table_def = connection.catalog.get_table_definition(table_name)

    # Read the data out of all files
    # DON'T DO THIS! This is a counterexample which shows how not to write this code
    unioned_data = []
    for file in input_files:
        # Some poor-man's tracing, so we can see how we make progress
        print(f"{time() - start_time}: Reading {file}")
        # Connect to the Hyper file and read all data from it
        with Connection(hyper.endpoint, database=file) as connection:
            unioned_data += connection.execute_list_query(f"SELECT * FROM {table_name}")

    # Create the output file and insert the data
    print(f"{time() - start_time}: Inserting data...")
    with Connection(hyper.endpoint, database=output_file, create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
        # Reset the table_def.table_name
        table_def.table_name = table_name
        # Create the output table; its schema must exist before the table can be created in it
        connection.catalog.create_schema(SchemaName(table_name.schema_name))
        connection.catalog.create_table(table_def)
        # Insert the data
        with Inserter(connection, table_def) as inserter:
            inserter.add_rows(unioned_data)
            inserter.execute()
    print(f"{time() - start_time}: Done :)")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment