@ashvardanian
Created March 3, 2024 18:35
Benchmark HuggingFace `datasets` library for parsing and preprocessing large textual files
import argparse
import time

from datasets import load_dataset, disable_caching

# Set up the argument parser
parser = argparse.ArgumentParser(description="Benchmark HuggingFace datasets library for a large textual file.")
parser.add_argument("file_path", type=str, help="Path to the textual file to be parsed and chunked.")
parser.add_argument("--sample_by", type=str, help="How to split - by line or by paragraph.", default="line")

# Parse command-line arguments
args = parser.parse_args()
file_path = args.file_path
sample_by = args.sample_by


# Function to benchmark dataset loading and chunking
def benchmark_datasets(file_path, chunk_size=10000):
    # Measure loading time
    start_time = time.time()
    dataset = load_dataset(
        "text",
        data_files=file_path,
        split="train",
        keep_in_memory=True,
        sample_by=sample_by,
    )
    loading_time = time.time() - start_time
    print(f"Time taken to load the dataset: {loading_time} seconds")

    # Measure chunking time
    start_time = time.time()
    _ = dataset.train_test_split(
        test_size=chunk_size,
        seed=42,
        shuffle=True,
    )
    chunking_time = time.time() - start_time
    print(f"Time taken to chunk the dataset into parts of size {chunk_size}: {chunking_time} seconds")

    # Return the total time taken
    return loading_time + chunking_time


# Call the benchmark function
disable_caching()
total_time = benchmark_datasets(file_path)
print(f"Total time taken: {total_time} seconds")
ashvardanian commented Mar 3, 2024

I was preparing some datasets for AI training and noticed that datasets by HuggingFace uses the conventional open mechanism to read the file and split it into chunks. I thought it could be significantly accelerated, so I started with a benchmark:

$ pip install --upgrade --force-reinstall datasets
$ python benchmark_huggingface_datasets.py xlsum.csv 
Generating train split: 1004598 examples [00:47, 21116.16 examples/s]
Time taken to load the dataset: 48.66838526725769 seconds
Time taken to chunk the dataset into parts of size 10000: 0.11466407775878906 seconds
Total time taken: 48.78304934501648 seconds

For the benchmarks I used a large CSV file with mixed UTF-8 content, typical of modern large-scale pre-training pipelines. I later patched the datasets library to use StringZilla, which resulted in significantly lower memory consumption and a 2.9x throughput improvement on AWS r7iz instances. That is with slow SSDs mounted over the network; performance on local SSDs, on something like a DGX-H100, should be even higher:

$ pip install -e .
$ python benchmark_huggingface_datasets.py xlsum.csv 
Generating train split: 1004598 examples [00:15, 64529.90 examples/s]
Time taken to load the dataset: 16.45028805732727 seconds
Time taken to chunk the dataset into parts of size 10000: 0.1291060447692871 seconds
Total time taken: 16.579394102096558 seconds
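
For reference, the core of the change is to memory-map the file and split it with StringZilla instead of iterating over Python file objects. Below is a minimal sketch of that idea, assuming the stringzilla Python package exposes Str, File, and splitlines as in its early-2024 releases; it is an illustration, not the actual patch from my fork:

from stringzilla import Str, File  # assumes the `stringzilla` PyPI package is installed

# Memory-map the file instead of reading it through Python's `open`
text = Str(File("xlsum.csv"))

# Split into lines without copying; the result is a collection of views into the mapped file
lines = text.splitlines()
print(f"Parsed {len(lines)} lines")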

I've already pushed the patches to my fork, and would love to contribute them to the upstream repository.


All the tests pass, but they leave a couple of important questions open. Python's default open(..., newline=None) uses universal newlines, where \n, \r, and \r\n are all converted to \n on the fly. I am not sure whether that is a good idea for a general-purpose dataset-preparation pipeline.
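
To make the question concrete, here is a small standard-library illustration of what universal newline translation does; the temporary file and its contents are made up for the example:

import os
import tempfile

# A file with mixed newline conventions
raw = b"alpha\nbeta\r\ngamma\rdelta"
path = os.path.join(tempfile.mkdtemp(), "mixed_newlines.txt")
with open(path, "wb") as f:
    f.write(raw)

# Text mode with newline=None (the default): \r and \r\n are translated to \n
with open(path, "r", newline=None) as f:
    print(f.read().splitlines())  # ['alpha', 'beta', 'gamma', 'delta']

# Binary mode preserves the original bytes, which is what a C-style reader would see
with open(path, "rb") as f:
    print(f.read())  # b'alpha\nbeta\r\ngamma\rdelta'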

I can simulate the same behavior for the "line" splitter (which I don't do yet). Adjusting it for the "paragraph" splitter would be harder. Should we stick exactly to the old Pythonic behavior, or stay closer to how C and other programming languages handle newlines?
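
One way to emulate the same behavior for paragraphs, sketched here in pure Python for illustration only (the patched code would presumably operate on StringZilla views rather than bytes), is to normalize the separators before splitting on blank lines:

def split_paragraphs_universal(data: bytes) -> list:
    # Normalize \r\n and bare \r to \n, mirroring Python's universal newlines,
    # then treat a blank line (\n\n) as the paragraph boundary
    normalized = data.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
    return [p for p in normalized.split(b"\n\n") if p]

print(split_paragraphs_universal(b"first para\r\n\r\nsecond para\n\nthird\rpara"))
# [b'first para', b'second para', b'third\npara']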
