
@yuhr123
Created October 11, 2023 05:44
Scripts for testing the write performance of MinIO, S3FS-FUSE, and JuiceFS.
# --- Bulk small-file write benchmark (point -d at the mount under test) ---
import os
import time
import argparse

# Create a single file filled with random bytes
def create_local_file(file_number, local_directory, file_size_mb):
    local_file_name = f'sample_file_{file_number}.txt'
    local_file_path = os.path.join(local_directory, local_file_name)
    with open(local_file_path, 'wb') as file:
        file.write(os.urandom(file_size_mb * 1024 * 1024))

# Measure the total time taken to write all files
def write_to_local_files(num_files, local_directory, file_size_mb):
    start_time = time.time()
    for i in range(1, num_files + 1):
        create_local_file(i, local_directory, file_size_mb)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return elapsed_time

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create local files with specified options.")
    parser.add_argument("-n", "--num_files", type=int, default=10000, help="Number of files to create")
    parser.add_argument("-d", "--local_directory", type=str, default="/root/mnt-juicefs/", help="Local directory to create files in")
    parser.add_argument("-s", "--file_size_mb", type=int, default=1, help="Size of each file in megabytes")
    args = parser.parse_args()

    num_files = args.num_files
    local_directory = args.local_directory
    file_size_mb = args.file_size_mb

    write_time = write_to_local_files(num_files, local_directory, file_size_mb)
    print(f"Created {num_files} {file_size_mb}MB files in {local_directory}")
    print(f"Total time taken: {write_time:.2f} seconds")
# --- Sequential CSV append test against the JuiceFS FUSE mount ---
import timeit
import pandas as pd

# Write a dummy CSV file to the JuiceFS mount point
df = pd.DataFrame({"column1": ["new_value1"], "column2": ["new_value2"]})
df.to_csv("/root/mnt-juicefs/test-data.csv", index=False)

# Repeatedly read the CSV, append a row, and write it back through the mount
def process_juicefs():
    for i in range(100):
        print(i)
        # Read the existing data
        df = pd.read_csv('/root/mnt-juicefs/test-data.csv')
        # Append a new row
        new_df = pd.concat([df, pd.DataFrame([{"column1": f"value{i}", "column2": f"value{i}"}])], ignore_index=True)
        # Write the data back to the file
        new_df.to_csv('/root/mnt-juicefs/test-data.csv', index=False)

execution_time = timeit.timeit(process_juicefs, number=1)
print(f"Execution time: {execution_time:.2f} seconds")
# --- Sequential CSV append test via s3fs (presumably the JuiceFS S3 gateway) ---
import timeit
import fsspec
import s3fs
import pandas as pd

# Configure fsspec's default S3 credentials; the endpoint here is the
# gateway listening on 127.0.0.1:9000, serving the myjfs bucket
fsspec.config.conf = {
    "s3": {
        "key": "admin",
        "secret": "abc123abc",
        "client_kwargs": {
            "endpoint_url": "http://127.0.0.1:9000"
        }
    }
}
s3 = s3fs.S3FileSystem()

# Write a dummy CSV file to the myjfs bucket
df = pd.DataFrame({"column1": ["new_value1"], "column2": ["new_value2"]})
df.to_csv("s3://myjfs/test-data.csv", index=False)

# Repeatedly read the CSV, append a row, and write it back over S3
def process_s3():
    for i in range(100):
        print(i)
        # Read the existing data
        df = pd.read_csv('s3://myjfs/test-data.csv')
        # Append a new row
        new_df = pd.concat([df, pd.DataFrame([{"column1": f"value{i}", "column2": f"value{i}"}])], ignore_index=True)
        # Write the data back to the file
        new_df.to_csv('s3://myjfs/test-data.csv', index=False)

execution_time = timeit.timeit(process_s3, number=1)
print(f"Execution time: {execution_time:.2f} seconds")
# --- Sequential CSV append test via s3fs directly against MinIO ---
import timeit
import fsspec
import s3fs
import pandas as pd

# Configure fsspec's default S3 credentials; the endpoint here is the
# MinIO server at 172.16.254.18:9000
fsspec.config.conf = {
    "s3": {
        "key": "admin",
        "secret": "abc123abc",
        "client_kwargs": {
            "endpoint_url": "http://172.16.254.18:9000"
        }
    }
}
s3 = s3fs.S3FileSystem()

# Write a dummy CSV file to the test-minio bucket
df = pd.DataFrame({"column1": ["new_value1"], "column2": ["new_value2"]})
df.to_csv("s3://test-minio/test-data.csv", index=False)

# Repeatedly read the CSV, append a row, and write it back over S3
def process_s3():
    for i in range(100):
        print(i)
        # Read the existing data
        df = pd.read_csv('s3://test-minio/test-data.csv')
        # Append a new row
        new_df = pd.concat([df, pd.DataFrame([{"column1": f"value{i}", "column2": f"value{i}"}])], ignore_index=True)
        # Write the data back to the file
        new_df.to_csv('s3://test-minio/test-data.csv', index=False)

execution_time = timeit.timeit(process_s3, number=1)
print(f"Execution time: {execution_time:.2f} seconds")
# --- Sequential CSV append test against the S3FS-FUSE mount ---
import timeit
import pandas as pd

# Write a dummy CSV file to the S3FS-FUSE mount point
df = pd.DataFrame({"column1": ["new_value1"], "column2": ["new_value2"]})
df.to_csv("/root/mnt-s3fs/test-data.csv", index=False)

# Repeatedly read the CSV, append a row, and write it back through the mount
def process_s3fs():
    for i in range(100):
        print(i)
        # Read the existing data
        df = pd.read_csv('/root/mnt-s3fs/test-data.csv')
        # Append a new row
        new_df = pd.concat([df, pd.DataFrame([{"column1": f"value{i}", "column2": f"value{i}"}])], ignore_index=True)
        # Write the data back to the file
        new_df.to_csv('/root/mnt-s3fs/test-data.csv', index=False)

execution_time = timeit.timeit(process_s3fs, number=1)
print(f"Execution time: {execution_time:.2f} seconds")