Skip to content

Instantly share code, notes, and snippets.

@enijkamp
Created July 14, 2021 21:09
Show Gist options
  • Save enijkamp/e4898037ef106672a3dbfe1239d0662b to your computer and use it in GitHub Desktop.
Save enijkamp/e4898037ef106672a3dbfe1239d0662b to your computer and use it in GitHub Desktop.
bpe_ratio.py
import os
import io
import tempfile
import tensorflow as tf
import transformers
def write_to_file(writer, data):
    """Append ``data`` to ``writer`` as one serialized ``tf.train.Example``.

    Args:
        writer: Object exposing ``write(bytes)``; the caller in this file
            passes a ``tf.io.TFRecordWriter``.
        data: Iterable of integers stored as an int64 list under the
            feature key ``'text'``.
    """
    token_ids = tf.train.Int64List(value=data)
    features = tf.train.Features(
        feature={'text': tf.train.Feature(int64_list=token_ids)}
    )
    record = tf.train.Example(features=features)
    writer.write(record.SerializeToString())
def compression_ratio(n=int(2**12), compression='', text=None):
    """Return the TFRecord size of BPE token ids relative to raw UTF-8 text.

    The text is tokenized once with the pretrained ``'gpt2'`` fast tokenizer.
    The resulting token ids are written ``n`` times as TFRecord examples, and
    the size of that file is divided by the byte size of ``n`` copies of the
    UTF-8 encoded text.

    Args:
        n: Number of times the text is repeated on both sides of the ratio.
        compression: Passed unchanged as ``options`` to
            ``tf.io.TFRecordWriter``.
        text: Text to measure. When ``None`` (the default) a built-in sample
            paragraph is used.

    Returns:
        ``tfrecord_bytes / raw_utf8_bytes`` as a float.

    Raises:
        ZeroDivisionError: If the raw byte count is zero, i.e. ``n <= 0`` or
            ``text`` is empty.
    """
    if text is None:
        text = 'EleutherAI is a decentralized grassroots collective of volunteer researchers, engineers, and developers focused on AI alignment, scaling, and open source AI research. Founded in July of 2020, our flagship project is the GPT-Neo family of models designed to replicate those developed by OpenAI as GPT-3. Our Discord server is open and welcomes contributors.'

    tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
    token_ids = tokenizer.encode(text)

    # The raw size of ``n`` copies is just a product; there is no need to
    # materialize the repeated bytes. ``len(range(n))`` is 0 for n <= 0, the
    # same number of iterations a ``for _ in range(n)`` loop would perform.
    repeats = range(n)
    raw_bytes = len(text.encode()) * len(repeats)

    # Write into a temporary directory rather than reopening an already-open
    # NamedTemporaryFile by name, which is not portable (it fails on Windows).
    with tempfile.TemporaryDirectory() as tmp_dir:
        record_path = os.path.join(tmp_dir, 'data.tfrecords')
        with tf.io.TFRecordWriter(record_path, options=compression) as writer:
            for _ in repeats:
                write_to_file(writer, token_ids)
        # Measure only after the writer is closed so all records are flushed,
        # and before the directory (and the file in it) is removed.
        record_bytes = os.path.getsize(record_path)

    return float(record_bytes) / float(raw_bytes)
if __name__ == '__main__':
    # Script entry point: report the ratio with compression set to ''.
    ratio = compression_ratio(compression='')
    print(ratio)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment