Skip to content

Instantly share code, notes, and snippets.

@d33bs
Created July 3, 2024 14:22
Show Gist options
  • Save d33bs/99249d0f8dfcb9b1c04b624116484f7d to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import os
# Constants for the table dimensions and chunk size.
n_rows = 10000
n_cols = 5000
chunk_size = 1000

# Unique join keys (1..n_rows), shared by both tables.
keys = np.arange(1, n_rows + 1)

# First DataFrame: 'key' plus (n_cols - 1) random float columns,
# i.e. n_cols columns in total.
data1 = {'key': keys}
for i in range(1, n_cols):
    data1[f'col1_{i}'] = np.random.random(n_rows)
df1 = pd.DataFrame(data1)

# Second DataFrame: same layout, distinct column-name prefix.
data2 = {'key': keys}
for i in range(1, n_cols):
    data2[f'col2_{i}'] = np.random.random(n_rows)
df2 = pd.DataFrame(data2)

# Output directories for the chunked Parquet datasets;
# create them if they don't already exist.
output_dir1 = 'table1'
output_dir2 = 'table2'
os.makedirs(output_dir1, exist_ok=True)
os.makedirs(output_dir2, exist_ok=True)
# Function to save a DataFrame in chunks of rows as separate Parquet files.
def save_dataframe_in_chunks(df, output_dir, chunk_size):
    """Write ``df`` to ``output_dir`` as sequential Parquet files.

    Each file holds up to ``chunk_size`` rows and is named
    ``chunk_1.parquet``, ``chunk_2.parquet``, ... in row order; the final
    chunk may contain fewer than ``chunk_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to persist.
    output_dir : str
        Existing directory the chunk files are written into.
    chunk_size : int
        Maximum number of rows per output file (must be positive).
    """
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        # Convert via Arrow so pq.write_table controls the Parquet output.
        table = pa.Table.from_pandas(chunk)
        file_path = os.path.join(output_dir, f'chunk_{start // chunk_size + 1}.parquet')
        pq.write_table(table, file_path)
# Persist both DataFrames as chunked Parquet datasets.
for frame, directory in ((df1, output_dir1), (df2, output_dir2)):
    save_dataframe_in_chunks(frame, directory, chunk_size)
print("Parquet datasets created successfully in chunks!")
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
# Constants for the table dimensions.
n_rows = 10000
n_cols = 5000

# Unique join keys (1..n_rows), shared by both tables.
keys = np.arange(1, n_rows + 1)

# First DataFrame: 'key' plus (n_cols - 1) random float columns,
# i.e. n_cols columns in total.
data1 = {'key': keys}
for i in range(1, n_cols):
    data1[f'col1_{i}'] = np.random.random(n_rows)
df1 = pd.DataFrame(data1)

# Second DataFrame: same layout, distinct column-name prefix.
data2 = {'key': keys}
for i in range(1, n_cols):
    data2[f'col2_{i}'] = np.random.random(n_rows)
df2 = pd.DataFrame(data2)

# Convert each DataFrame to a PyArrow Table and write it out as a
# single (unchunked) Parquet file.
table1 = pa.Table.from_pandas(df1)
table2 = pa.Table.from_pandas(df2)
pq.write_table(table1, 'table1.parquet')
pq.write_table(table2, 'table2.parquet')

print("Parquet tables created successfully!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment