Skip to content

Instantly share code, notes, and snippets.

@BexTuychiev
Created May 24, 2023 12:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BexTuychiev/af8076016de7eb9d65d2038a1dd701ef to your computer and use it in GitHub Desktop.
Save BexTuychiev/af8076016de7eb9d65d2038a1dd701ef to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import string
# Generate a large synthetic dataset and stream it to 'large_dataset.csv'.
#
# The file ends up with `num_rows` rows and 2 * `num_cols` columns:
# `num_cols` columns of uniform random floats in [0, 1) followed by
# `num_cols` columns of random uppercase ASCII letters. Rows are produced
# `chunk_size` at a time and flushed to disk in batches so the full dataset
# never has to be held in memory at once.

# Set the desired number of rows and columns
num_rows = 10_000_000
num_cols = 10
chunk_size = 100_000
# Number of generated chunks to buffer before each write to disk
flush_every = 10
output_path = 'large_dataset.csv'

# Loop-invariant values are built once, outside the generation loop
letters = list(string.ascii_uppercase)
numeric_names = [f'Numeric_{col}' for col in range(num_cols)]
categorical_names = [f'Categorical_{col}' for col in range(num_cols)]
column_names = numeric_names + categorical_names

# Chunks generated since the last flush. Collecting them in a list and
# concatenating once per flush avoids re-copying the buffer every iteration.
pending_chunks = []
header_written = False

# Generate and write the dataset in chunks
for i in range(0, num_rows, chunk_size):
    # The final chunk is clamped so the file holds exactly `num_rows` rows
    # even when `num_rows` is not a multiple of `chunk_size`.
    rows_in_chunk = min(chunk_size, num_rows - i)

    # Generate random numeric data
    numeric_data = np.random.rand(rows_in_chunk, num_cols)
    # Generate random categorical data
    categorical_data = np.random.choice(letters, (rows_in_chunk, num_cols))

    # Build each half as its own DataFrame and join them column-wise.
    # Putting floats and strings into a single NumPy array would coerce
    # every value to a string, losing the float dtype of the numeric columns.
    df_chunk = pd.concat(
        [
            pd.DataFrame(numeric_data, columns=numeric_names),
            pd.DataFrame(categorical_data, columns=categorical_names),
        ],
        axis=1,
    )
    pending_chunks.append(df_chunk)

    # Write the buffered chunks once enough have accumulated, and always
    # after the last chunk so nothing is left unwritten.
    is_last_chunk = (i + chunk_size) >= num_rows
    if is_last_chunk or len(pending_chunks) >= flush_every:
        pd.concat(pending_chunks, ignore_index=True).to_csv(
            output_path,
            index=False,
            # The first write truncates any file left over from an earlier
            # run; later writes append to it. Appending unconditionally would
            # stack a second header and dataset onto an existing file.
            mode='a' if header_written else 'w',
            header=not header_written,
        )
        header_written = True
        pending_chunks = []
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment