Skip to content

Instantly share code, notes, and snippets.

@BramVanroy
Last active May 29, 2024 11:33
Show Gist options
  • Save BramVanroy/448b7508ddeec5e215c1a33d155aae64 to your computer and use it in GitHub Desktop.
Save BramVanroy/448b7508ddeec5e215c1a33d155aae64 to your computer and use it in GitHub Desktop.
Fast method of "first-fit-decreasing" packing benchmark. Around 5x faster than baseline. Baseline taken from https://huggingface.co/DiscoResearch/Llama3-German-8B#document-packing. Note that memory usage will be higher in the optimized version.
import gc
import numpy as np
import time
import pandas as pd
from tqdm import tqdm
def pack_documents_original(tokenized_documents, block_size: int = 8192, use_tqdm=True):
start_time = time.perf_counter()
sorted_docs = sorted(tokenized_documents, key=len, reverse=True)
bins = []
def find_bin(doc):
for b in bins:
if sum(len(d) for d in b) + len(doc) <= block_size:
return b
return None
for doc in tqdm(sorted_docs, disable=not use_tqdm, leave=False):
target_bin = find_bin(doc)
if target_bin is not None:
target_bin.append(doc)
else:
bins.append([doc])
end_time = time.perf_counter()
return bins, end_time - start_time
def pack_documents_optimized(tokenized_documents, block_size: int = 8192, use_tqdm=True):
start_time = time.perf_counter()
doc_lengths = np.array([len(doc) for doc in tokenized_documents])
sorted_indices = np.argsort(-doc_lengths)
bins = []
bin_lengths = []
def find_bin(_doc_length):
for i, b_length in enumerate(bin_lengths):
if b_length + _doc_length <= block_size:
return i
return None
for idx in tqdm(sorted_indices, disable=not use_tqdm, leave=False):
doc_length = doc_lengths[idx]
bin_idx = find_bin(doc_length)
if bin_idx is not None:
bins[bin_idx].append(idx)
bin_lengths[bin_idx] += doc_length
else:
bins.append([idx])
bin_lengths.append(doc_length)
packed_bins = [[tokenized_documents[idx] for idx in b] for b in bins]
end_time = time.perf_counter()
return packed_bins, end_time - start_time
def benchmark():
results = []
for num_documents in (1000, 10_000):
for max_doc_length in (2048, 4096, 8192):
original_times = []
optimized_times = []
result = {
"num_documents": num_documents,
"max_doc_length": max_doc_length,
}
for run_idx in range(1, 4):
# Generate `num_documents` random strings of length up to `max_doc_length`
tokenized_documents = [
"".join(list(np.random.choice(['a', 'b', 'c', 'd', ' '], size=np.random.randint(1, max_doc_length))))
for _ in range(num_documents)
]
packed_bins_optimized, time_optimized = pack_documents_optimized(tokenized_documents, max_doc_length)
gc.collect()
packed_bins_original, time_original = pack_documents_original(tokenized_documents, max_doc_length)
gc.collect()
# Ensure results are equivalent. Minor differences in sorting may have happened when two strings have
# the same length, but both approaches should yield the same results in terms of lengths of bins
grouped_lens_original = [sum(len(item) for item in _bin) for _bin in packed_bins_original]
grouped_lens_optimized = [sum(len(item) for item in _bin) for _bin in packed_bins_optimized]
assert grouped_lens_original == grouped_lens_optimized, "Results are not equivalent"
original_times.append(time_original)
optimized_times.append(time_optimized)
result[f"time_{run_idx}_original"] = time_original
result[f"run_{run_idx}_optimized"] = time_optimized
# Print mean and standard deviation of times
print(f"Number of documents: {num_documents}, max document length: {max_doc_length}")
print(f"Optimized approach mean time: {np.mean(optimized_times):.4f} seconds (std {np.std(optimized_times):.4f})")
print(f"Original approach mean time: {np.mean(original_times):.4f} seconds (std {np.std(original_times):.4f})")
print()
result["mean_time_original"] = np.mean(original_times)
result["std_time_original"] = np.std(original_times)
result["mean_time_optimized"] = np.mean(optimized_times)
result["std_time_optimized"] = np.std(optimized_times)
results.append(result)
df = pd.DataFrame(results)
df.to_csv("benchmark_results.csv", index=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df[["num_documents", "max_doc_length", "mean_time_original", "mean_time_optimized"]])
if __name__ == "__main__":
benchmark()
@BramVanroy
Copy link
Author

Results:

num_documents max_doc_length mean_time_original mean_time_optimized
1000 2048 0.0894 0.0219
1000 4096 0.0896 0.0181
1000 8192 0.0899 0.0179
10000 2048 9.0949 1.8311
10000 4096 9.2626 1.8398
10000 8192 9.2381 1.7905

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment