vogelsgesang/inefficient_merge.py Secret

## inefficient_merge.py
from tableauhyperapi import HyperProcess, Connection, Telemetry, TableDefinition, TableName, SchemaName, Inserter, CreateMode
from glob import glob
from time import time

# Merge the "Extract"."Extract" table of every WorldIndicators_*.hyper file in
# the current directory into a single WorldIndicatorsMerged.hyper file.
#
# DON'T DO THIS! This script is a deliberate counterexample which shows how
# not to write this code: every row of every input file is first collected in
# one Python list and only afterwards inserted into the output file.

# Sort the matches so that the file providing the schema and the order of the
# merged rows do not depend on the order in which the directory is listed.
input_files = sorted(glob("WorldIndicators_*.hyper"))
table_name = TableName('Extract','Extract')
output_file = "WorldIndicatorsMerged.hyper"

# Without any input there is no schema to copy; stop with a readable message
# instead of failing with an IndexError on input_files[0] further down.
if not input_files:
    raise SystemExit("No input files matching 'WorldIndicators_*.hyper' found")

# Start a new Hyper instance
start_time = time()
with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU, 'unionfiles_inefficient') as hyper:
    # Get the table's schema from the first input file.
    # We will just assume that all files have the exact same schema.
    with Connection(hyper.endpoint, database=input_files[0]) as connection:
        table_def = connection.catalog.get_table_definition(table_name)
    # Read the data out of all files
    # DON'T DO THIS! This is a counterexample which shows how not to write this code
    unioned_data = []
    for file in input_files:
        # Some poor-man's tracing, so we can see how we make progress
        print(f"{time() - start_time}: Reading {file}")
        # Connect to the Hyper file and read all data from it
        with Connection(hyper.endpoint, database=file) as connection:
            unioned_data += connection.execute_list_query(f"SELECT * FROM {table_name}")
    # Create the output file and insert the data
    print(f"{time() - start_time}: Inserting data...")
    with Connection(hyper.endpoint, database=output_file, create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
        # Reset the table_def.table_name
        # NOTE(review): presumably the definition read from the input file may
        # carry a name that is not valid for the output database - confirm.
        table_def.table_name = table_name
        # Create the output table; the schema has to exist before the table
        connection.catalog.create_schema(SchemaName(table_name.schema_name))
        connection.catalog.create_table(table_def)
        # Insert the data
        with Inserter(connection, table_def) as inserter:
            inserter.add_rows(unioned_data)
            inserter.execute()
    print(f"{time() - start_time}: Done :)")
	from tableauhyperapi import HyperProcess, Connection, Telemetry, TableDefinition, TableName, SchemaName, Inserter, CreateMode
	from glob import glob
	from time import time

	# Merge the "Extract"."Extract" table of every WorldIndicators_*.hyper file in
	# the current directory into a single WorldIndicatorsMerged.hyper file.
	#
	# DON'T DO THIS! This script is a deliberate counterexample which shows how
	# not to write this code: every row of every input file is first collected in
	# one Python list and only afterwards inserted into the output file.

	# Sort the matches so that the file providing the schema and the order of the
	# merged rows do not depend on the order in which the directory is listed.
	input_files = sorted(glob("WorldIndicators_*.hyper"))
	table_name = TableName('Extract','Extract')
	output_file = "WorldIndicatorsMerged.hyper"

	# Without any input there is no schema to copy; stop with a readable message
	# instead of failing with an IndexError on input_files[0] further down.
	if not input_files:
	    raise SystemExit("No input files matching 'WorldIndicators_*.hyper' found")

	# Start a new Hyper instance
	start_time = time()
	with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU, 'unionfiles_inefficient') as hyper:
	    # Get the table's schema from the first input file.
	    # We will just assume that all files have the exact same schema.
	    with Connection(hyper.endpoint, database=input_files[0]) as connection:
	        table_def = connection.catalog.get_table_definition(table_name)
	    # Read the data out of all files
	    # DON'T DO THIS! This is a counterexample which shows how not to write this code
	    unioned_data = []
	    for file in input_files:
	        # Some poor-man's tracing, so we can see how we make progress
	        print(f"{time() - start_time}: Reading {file}")
	        # Connect to the Hyper file and read all data from it
	        with Connection(hyper.endpoint, database=file) as connection:
	            unioned_data += connection.execute_list_query(f"SELECT * FROM {table_name}")
	    # Create the output file and insert the data
	    print(f"{time() - start_time}: Inserting data...")
	    with Connection(hyper.endpoint, database=output_file, create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
	        # Reset the table_def.table_name
	        # NOTE(review): presumably the definition read from the input file may
	        # carry a name that is not valid for the output database - confirm.
	        table_def.table_name = table_name
	        # Create the output table; the schema has to exist before the table
	        connection.catalog.create_schema(SchemaName(table_name.schema_name))
	        connection.catalog.create_table(table_def)
	        # Insert the data
	        with Inserter(connection, table_def) as inserter:
	            inserter.add_rows(unioned_data)
	            inserter.execute()
	    print(f"{time() - start_time}: Done :)")