Skip to content

Instantly share code, notes, and snippets.

@d33bs
Created July 3, 2024 14:22
Show Gist options
  • Save d33bs/99249d0f8dfcb9b1c04b624116484f7d to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import os
# Constants for the table dimensions and chunk size.
n_rows = 10000
n_cols = 5000
chunk_size = 1000

# Unique join keys (1..n_rows), shared by both tables.
keys = np.arange(1, n_rows + 1)

# First DataFrame: 'key' plus (n_cols - 1) random float columns,
# i.e. n_cols columns in total.
data1 = {'key': keys}
for i in range(1, n_cols):
    data1[f'col1_{i}'] = np.random.random(n_rows)
df1 = pd.DataFrame(data1)

# Second DataFrame: same layout, distinct column-name prefix.
data2 = {'key': keys}
for i in range(1, n_cols):
    data2[f'col2_{i}'] = np.random.random(n_rows)
df2 = pd.DataFrame(data2)

# Output directories for the chunked Parquet datasets;
# create them if they don't already exist.
output_dir1 = 'table1'
output_dir2 = 'table2'
os.makedirs(output_dir1, exist_ok=True)
os.makedirs(output_dir2, exist_ok=True)
# Function to save a DataFrame in chunks of rows as separate Parquet files.
def save_dataframe_in_chunks(df, output_dir, chunk_size):
    """Write ``df`` to ``output_dir`` as sequential Parquet files.

    Each file holds up to ``chunk_size`` rows and is named
    ``chunk_1.parquet``, ``chunk_2.parquet``, ... in row order; the final
    chunk may contain fewer than ``chunk_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to persist.
    output_dir : str
        Existing directory the chunk files are written into.
    chunk_size : int
        Maximum number of rows per output file (must be positive).
    """
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        # Convert via Arrow so pq.write_table controls the Parquet output.
        table = pa.Table.from_pandas(chunk)
        file_path = os.path.join(output_dir, f'chunk_{start // chunk_size + 1}.parquet')
        pq.write_table(table, file_path)
# Persist both DataFrames as chunked Parquet datasets.
for frame, directory in ((df1, output_dir1), (df2, output_dir2)):
    save_dataframe_in_chunks(frame, directory, chunk_size)
print("Parquet datasets created successfully in chunks!")
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
# Constants for the table dimensions.
n_rows = 10000
n_cols = 5000

# Unique join keys (1..n_rows), shared by both tables.
keys = np.arange(1, n_rows + 1)

# First DataFrame: 'key' plus (n_cols - 1) random float columns,
# i.e. n_cols columns in total.
data1 = {'key': keys}
for i in range(1, n_cols):
    data1[f'col1_{i}'] = np.random.random(n_rows)
df1 = pd.DataFrame(data1)

# Second DataFrame: same layout, distinct column-name prefix.
data2 = {'key': keys}
for i in range(1, n_cols):
    data2[f'col2_{i}'] = np.random.random(n_rows)
df2 = pd.DataFrame(data2)

# Convert each DataFrame to a PyArrow Table and write it out as a
# single (unchunked) Parquet file.
table1 = pa.Table.from_pandas(df1)
table2 = pa.Table.from_pandas(df2)
pq.write_table(table1, 'table1.parquet')
pq.write_table(table2, 'table2.parquet')

print("Parquet tables created successfully!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment