-
-
Save justheuristic/cfb1cb85314b985326b04535883020b1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Download data from source: https://drive.google.com/drive/folders/1IaD_SIIB-K3Sij_-JjWoPy_UrWqQRdjx\n", | |
"\n", | |
"Install OWT:\n", | |
"```bash\n", | |
"git clone https://github.com/jcpeterson/openwebtext\n", | |
"cd openwebtext\n", | |
"pip install -r requirements.txt\n", | |
"\n", | |
"```\n", | |
"\n", | |
"Extract raw files:\n", | |
"```bash\n", | |
"!7za x openwebtext.tar.xz\n", | |
"!tar -xf openwebtext.tar\n", | |
"```\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ARCHIVES_PATH = \"./openwebtext\"\n", | |
"DOCUMENTS_PATH = \"./openwebtext_documents\"\n", | |
"NUM_VAL_FILES = 5_000\n", | |
"\n", | |
"!mkdir -p {DOCUMENTS_PATH}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "50adfc5326a74e86b52bba1cfef0ce16", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/20610 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "76fa7be89dc24581bb70914643a9e638", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/5000 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "b6ec2530fb0845228fdca31107808b6d", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/8008769 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"IOPub message rate exceeded.\n", | |
"The notebook server will temporarily stop sending output\n", | |
"to the client in order to avoid crashing it.\n", | |
"To change this limit, set the config variable\n", | |
"`--NotebookApp.iopub_msg_rate_limit`.\n", | |
"\n", | |
"Current values:\n", | |
"NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", | |
"NotebookApp.rate_limit_window=3.0 (secs)\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"import os, random\n", | |
"import tarfile\n", | |
"from tqdm.auto import tqdm\n", | |
"for name in tqdm(os.listdir(ARCHIVES_PATH)):\n", | |
" assert name.endswith('.xz')\n", | |
" with tarfile.open(os.path.join(ARCHIVES_PATH, name)) as tar:\n", | |
" tar.extractall(os.path.join(DOCUMENTS_PATH, name.rstrip('.xz')))\n", | |
" \n", | |
"all_files = [os.path.join(path, file)\n", | |
" for path, subdirs, files in os.walk(DOCUMENTS_PATH)\n", | |
" for file in files\n", | |
"]\n", | |
"random.Random(42).shuffle(all_files)\n", | |
"train_files, valid_files = all_files[:-NUM_VAL_FILES], all_files[-NUM_VAL_FILES:]\n", | |
"\n", | |
"with open(\"openwebtext.valid.raw\", 'w') as f_out:\n", | |
" for file in tqdm(valid_files):\n", | |
" chunk = open(file).read()\n", | |
" f_out.write(chunk)\n", | |
" f_out.write('\\n')\n", | |
"\n", | |
"\n", | |
"with open(\"openwebtext.train.raw\", 'w') as f_out:\n", | |
" for file in tqdm(train_files):\n", | |
" chunk = open(file).read()\n", | |
" f_out.write(chunk)\n", | |
" f_out.write('\\n')\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-rw-r-- 1 jheuristic jheuristic 38G мар 31 02:02 openwebtext.train.raw\r\n", | |
"-rw-rw-r-- 1 jheuristic jheuristic 24M мар 30 19:53 openwebtext.valid.raw\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -lht openwebtext.valid.raw openwebtext.train.raw" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Install fairseq:\n", | |
"```bash\n", | |
"git clone https://github.com/pytorch/fairseq\n", | |
"cd fairseq && python setup.py develop\n", | |
"\n", | |
"```\n", | |
"\n", | |
"apply gpt2-style preprocessing:\n", | |
"```bash\n", | |
"\n", | |
"cd fairseq\n", | |
"\n", | |
"\n", | |
"\n", | |
"mkdir -p gpt2_bpe\n", | |
"wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json\n", | |
"wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe\n", | |
"wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt\n", | |
"\n", | |
"for SPLIT in valid train; do \\\n", | |
" python -m examples.roberta.multiprocessing_bpe_encoder \\\n", | |
" --encoder-json gpt2_bpe/encoder.json \\\n", | |
" --vocab-bpe gpt2_bpe/vocab.bpe \\\n", | |
" --inputs ../openwebtext.${SPLIT}.raw \\\n", | |
" --outputs ../openwebtext.${SPLIT}.bpe \\\n", | |
" --keep-empty \\\n", | |
" --workers 60; \\\n", | |
"done\n", | |
"\n", | |
"\n", | |
"fairseq-preprocess \\\n", | |
" --only-source \\\n", | |
" --srcdict gpt2_bpe/dict.txt \\\n", | |
" --trainpref ../openwebtext.train.bpe \\\n", | |
" --validpref ../openwebtext.valid.bpe \\\n", | |
" --destdir ../data-bin/openwebtext \\\n", | |
" --workers 60\n", | |
"\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "py38", | |
"language": "python", | |
"name": "py38" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment