Skip to content

Instantly share code, notes, and snippets.

@justheuristic
Last active April 11, 2022 16:23
Show Gist options
  • Save justheuristic/cfb1cb85314b985326b04535883020b1 to your computer and use it in GitHub Desktop.
Save justheuristic/cfb1cb85314b985326b04535883020b1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download data from source: https://drive.google.com/drive/folders/1IaD_SIIB-K3Sij_-JjWoPy_UrWqQRdjx\n",
"\n",
"Install OWT:\n",
"```bash\n",
"git clone https://github.com/jcpeterson/openwebtext\n",
"cd openwebtext\n",
"pip install -r requirements.txt\n",
"\n",
"```\n",
"\n",
"Extract raw files:\n",
"```bash\n",
"!7za x openwebtext.tar.xz\n",
"!tar -xf openwebtext.tar\n",
"```\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"ARCHIVES_PATH = \"./openwebtext\"\n",
"DOCUMENTS_PATH = \"./openwebtext_documents\"\n",
"NUM_VAL_FILES = 5_000\n",
"\n",
"!mkdir -p {DOCUMENTS_PATH}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "50adfc5326a74e86b52bba1cfef0ce16",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/20610 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "76fa7be89dc24581bb70914643a9e638",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/5000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b6ec2530fb0845228fdca31107808b6d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/8008769 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"IOPub message rate exceeded.\n",
"The notebook server will temporarily stop sending output\n",
"to the client in order to avoid crashing it.\n",
"To change this limit, set the config variable\n",
"`--NotebookApp.iopub_msg_rate_limit`.\n",
"\n",
"Current values:\n",
"NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
"NotebookApp.rate_limit_window=3.0 (secs)\n",
"\n"
]
}
],
"source": [
"import os, random\n",
"import tarfile\n",
"from tqdm.auto import tqdm\n",
"for name in tqdm(os.listdir(ARCHIVES_PATH)):\n",
" assert name.endswith('.xz')\n",
" with tarfile.open(os.path.join(ARCHIVES_PATH, name)) as tar:\n",
" tar.extractall(os.path.join(DOCUMENTS_PATH, name.rstrip('.xz')))\n",
" \n",
"all_files = [os.path.join(path, file)\n",
" for path, subdirs, files in os.walk(DOCUMENTS_PATH)\n",
" for file in files\n",
"]\n",
"random.Random(42).shuffle(all_files)\n",
"train_files, valid_files = all_files[:-NUM_VAL_FILES], all_files[-NUM_VAL_FILES:]\n",
"\n",
"with open(\"openwebtext.valid.raw\", 'w') as f_out:\n",
" for file in tqdm(valid_files):\n",
" chunk = open(file).read()\n",
" f_out.write(chunk)\n",
" f_out.write('\\n')\n",
"\n",
"\n",
"with open(\"openwebtext.train.raw\", 'w') as f_out:\n",
" for file in tqdm(train_files):\n",
" chunk = open(file).read()\n",
" f_out.write(chunk)\n",
" f_out.write('\\n')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-rw-r-- 1 jheuristic jheuristic 38G мар 31 02:02 openwebtext.train.raw\r\n",
"-rw-rw-r-- 1 jheuristic jheuristic 24M мар 30 19:53 openwebtext.valid.raw\r\n"
]
}
],
"source": [
"!ls -lht openwebtext.valid.raw openwebtext.train.raw"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install fairseq:\n",
"```bash\n",
"git clone https://github.com/pytorch/fairseq\n",
"cd fairseq && python setup.py develop\n",
"\n",
"```\n",
"\n",
"apply gpt2-style preprocessing:\n",
"```bash\n",
"\n",
"cd fairseq\n",
"\n",
"\n",
"\n",
"mkdir -p gpt2_bpe\n",
"wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json\n",
"wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe\n",
"wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt\n",
"\n",
"for SPLIT in valid train; do \\\n",
" python -m examples.roberta.multiprocessing_bpe_encoder \\\n",
" --encoder-json gpt2_bpe/encoder.json \\\n",
" --vocab-bpe gpt2_bpe/vocab.bpe \\\n",
" --inputs ../openwebtext.${SPLIT}.raw \\\n",
" --outputs ../openwebtext.${SPLIT}.bpe \\\n",
" --keep-empty \\\n",
" --workers 60; \\\n",
"done\n",
"\n",
"\n",
"fairseq-preprocess \\\n",
" --only-source \\\n",
" --srcdict gpt2_bpe/dict.txt \\\n",
" --trainpref ../openwebtext.train.bpe \\\n",
" --validpref ../openwebtext.valid.bpe \\\n",
" --destdir ../data-bin/openwebtext \\\n",
" --workers 60\n",
"\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "py38",
"language": "python",
"name": "py38"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment