Skip to content

Instantly share code, notes, and snippets.

Last active April 11, 2022 16:23
Show Gist options
  • Save justheuristic/cfb1cb85314b985326b04535883020b1 to your computer and use it in GitHub Desktop.
Save justheuristic/cfb1cb85314b985326b04535883020b1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
"cells": [
"cell_type": "markdown",
"metadata": {},
"source": [
"Download data from source: https://skylion007.github.io/OpenWebTextCorpus/\n",
"Install OWT:\n",
"git clone https://github.com/jcpeterson/openwebtext\n",
"cd openwebtext\n",
"pip install -r requirements.txt\n",
"Extract raw files:\n",
"!7za x openwebtext.tar.xz\n",
"!tar -xf openwebtext.tar\n",
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Directory whose entries are the per-subset '.xz' archives (listed and unpacked by the next cell).\n",
"ARCHIVES_PATH = \"./openwebtext\"\n",
"# Directory the archives are unpacked into, one sub-directory per archive.\n",
"DOCUMENTS_PATH = \"./openwebtext_documents\"\n",
"# Number of extracted documents held out for the validation split.\n",
"NUM_VAL_FILES = 5_000\n",
"\n",
"# Pure-Python equivalent of `mkdir -p`: portable and safe for paths containing spaces.\n",
"os.makedirs(DOCUMENTS_PATH, exist_ok=True)"
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "50adfc5326a74e86b52bba1cfef0ce16",
"version_major": 2,
"version_minor": 0
"text/plain": [
" 0%| | 0/20610 [00:00<?, ?it/s]"
"metadata": {},
"output_type": "display_data"
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "76fa7be89dc24581bb70914643a9e638",
"version_major": 2,
"version_minor": 0
"text/plain": [
" 0%| | 0/5000 [00:00<?, ?it/s]"
"metadata": {},
"output_type": "display_data"
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b6ec2530fb0845228fdca31107808b6d",
"version_major": 2,
"version_minor": 0
"text/plain": [
" 0%| | 0/8008769 [00:00<?, ?it/s]"
"metadata": {},
"output_type": "display_data"
"name": "stderr",
"output_type": "stream",
"text": [
"IOPub message rate exceeded.\n",
"The notebook server will temporarily stop sending output\n",
"to the client in order to avoid crashing it.\n",
"To change this limit, set the config variable\n",
"Current values:\n",
"NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
"NotebookApp.rate_limit_window=3.0 (secs)\n",
"source": [
"import os, random\n",
"import tarfile\n",
"from tqdm.auto import tqdm\n",
"\n",
"SPLIT_SEED = 0  # fixed seed so the train/valid split is reproducible across runs\n",
"\n",
"# Unpack every archive in ARCHIVES_PATH into its own sub-directory of DOCUMENTS_PATH.\n",
"for name in tqdm(os.listdir(ARCHIVES_PATH)):\n",
"    assert name.endswith('.xz')\n",
"    # Slice the suffix off instead of str.rstrip('.xz'): rstrip removes any trailing\n",
"    # '.', 'x' or 'z' characters, so e.g. 'box.xz' would become 'bo'.\n",
"    stem = name[:-len('.xz')]\n",
"    # NOTE(review): extractall trusts member paths inside the archive; only run this\n",
"    # on archives obtained from a trusted source.\n",
"    with tarfile.open(os.path.join(ARCHIVES_PATH, name)) as tar:\n",
"        tar.extractall(os.path.join(DOCUMENTS_PATH, stem))\n",
"\n",
"# os.walk order depends on the filesystem, so sort first and then shuffle with a\n",
"# fixed seed: the same files end up in the validation split on every run.\n",
"all_files = sorted(os.path.join(path, file)\n",
"                   for path, subdirs, files in os.walk(DOCUMENTS_PATH)\n",
"                   for file in files)\n",
"random.Random(SPLIT_SEED).shuffle(all_files)\n",
"\n",
"# Use an explicit split index: all_files[:-0] would be empty when NUM_VAL_FILES == 0.\n",
"split_at = max(len(all_files) - NUM_VAL_FILES, 0)\n",
"train_files, valid_files = all_files[:split_at], all_files[split_at:]\n",
"\n",
"\n",
"def write_corpus(files, out_path):\n",
"    # Concatenate the given documents into out_path, writing one newline after each document.\n",
"    with open(out_path, 'w', encoding='utf-8') as f_out:\n",
"        for file in tqdm(files):\n",
"            # Close each input promptly; millions of files are read in this loop.\n",
"            with open(file, encoding='utf-8') as f_in:\n",
"                f_out.write(f_in.read())\n",
"            f_out.write('\\n')\n",
"\n",
"\n",
"write_corpus(valid_files, \"openwebtext.valid.raw\")\n",
"write_corpus(train_files, \"openwebtext.train.raw\")\n",
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-rw-r-- 1 jheuristic jheuristic 38G мар 31 02:02 openwebtext.train.raw\r\n",
"-rw-rw-r-- 1 jheuristic jheuristic 24M мар 30 19:53 openwebtext.valid.raw\r\n"
"source": [
"!ls -lht openwebtext.valid.raw openwebtext.train.raw"
"cell_type": "markdown",
"metadata": {},
"source": [
"Install fairseq:\n",
"git clone https://github.com/pytorch/fairseq\n",
"cd fairseq && python setup.py develop\n",
"apply gpt2-style preprocessing:\n",
"cd fairseq\n",
"mkdir -p gpt2_bpe\n",
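"wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json\n",
"wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe\n",
"wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt\n",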
"for SPLIT in valid train; do \\\n",
" python -m examples.roberta.multiprocessing_bpe_encoder \\\n",
" --encoder-json gpt2_bpe/encoder.json \\\n",
" --vocab-bpe gpt2_bpe/vocab.bpe \\\n",
" --inputs ../openwebtext.${SPLIT}.raw \\\n",
" --outputs ../openwebtext.${SPLIT}.bpe \\\n",
" --keep-empty \\\n",
" --workers 60; \\\n",
"done\n",
"fairseq-preprocess \\\n",
" --only-source \\\n",
" --srcdict gpt2_bpe/dict.txt \\\n",
" --trainpref ../openwebtext.train.bpe \\\n",
" --validpref ../openwebtext.valid.bpe \\\n",
" --destdir ../data-bin/openwebtext \\\n",
" --workers 60\n",
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"metadata": {
"kernelspec": {
"display_name": "py38",
"language": "python",
"name": "py38"
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
"nbformat": 4,
"nbformat_minor": 2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment