Created
March 29, 2023 16:04
-
-
Save finbarrtimbers/1728037381d27ebc7b4cdd828a6f1f9a to your computer and use it in GitHub Desktop.
Script to calculate tokens in bookcorpus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a modified version of https://github.com/karpathy/nanoGPT/blob/master/data/openwebtext/prepare.py.
import os | |
import requests | |
import tiktoken | |
import numpy as np | |
import tarfile | |
import glob | |
import shutil | |
# Download the bookcorpus archive, extract it, and concatenate the extracted
# epubtxt/*.txt files into a single input.txt next to this script.
# The whole step is skipped when input.txt already exists.
input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://t.co/J3EaSEgwW0'
    gzip_path = os.path.join(os.path.dirname(__file__), 'input.tar.gz')
    # The archive is binary data: stream the raw bytes to a file opened in
    # binary mode. Writing `response.text` through a text-mode handle would
    # decode/re-encode the payload and corrupt the tarball.
    with requests.get(data_url, stream=True) as response:
        # Fail loudly on an HTTP error instead of saving an error page as the archive.
        response.raise_for_status()
        with open(gzip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    # NOTE(review): extractall() on a downloaded archive trusts the member
    # paths inside it; consider tarfile's extraction filters (filter='data')
    # on Python versions that support them.
    # The extraction directory is relative to the current working directory,
    # not to this script's directory.
    with tarfile.open(gzip_path) as f:
        f.extractall('./bookcorpus')
    with open(input_file_path, 'wb') as outfile:
        # glob order is arbitrary; sort so the concatenated corpus (and hence
        # the train/val split taken from it) is reproducible across runs.
        for filename in sorted(glob.glob('./bookcorpus/epubtxt/*.txt')):
            with open(filename, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
# Load the concatenated corpus, split it 90% / 10% by character count into
# train and validation text, tokenize both, and write the token ids to
# train.bin / val.bin next to this script.
# Decode explicitly as UTF-8 rather than with the platform's default encoding,
# so the result does not depend on the machine's locale.
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
n = len(data)
split_at = int(n*0.9)  # first 90% of characters -> train, remainder -> val
train_data = data[:split_at]
val_data = data[split_at:]
# encode with tiktoken gpt2 bpe
enc = tiktoken.get_encoding("gpt2")
# encode_ordinary encodes the text while ignoring special tokens.
train_ids = enc.encode_ordinary(train_data)
val_ids = enc.encode_ordinary(val_data)
# Show a sample of the raw text and its token ids, then the token counts.
print(f'{train_data[:100]=}')
print(f'{train_ids[:100]=}')
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")
# export to bin files
# uint16 is sufficient here: the gpt2 vocabulary (50,257 tokens) fits below 2**16.
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
out_dir = os.path.dirname(__file__)
train_ids.tofile(os.path.join(out_dir, 'train.bin'))
val_ids.tofile(os.path.join(out_dir, 'val.bin'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment