justheuristic/fairseq_owt.ipynb Secret

## fairseq_owt.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download data from source: https://drive.google.com/drive/folders/1IaD_SIIB-K3Sij_-JjWoPy_UrWqQRdjx\n",
    "\n",
    "Install OWT:\n",
    "```bash\n",
    "git clone https://github.com/jcpeterson/openwebtext\n",
    "cd openwebtext\n",
    "pip install -r requirements.txt\n",
    "\n",
    "```\n",
    "\n",
    "Extract raw files:\n",
    "```bash\n",
    "7za x openwebtext.tar.xz\n",
    "tar -xf openwebtext.tar\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Directory that holds the downloaded *.xz archives.\n",
    "ARCHIVES_PATH = \"./openwebtext\"\n",
    "# Directory the archives are unpacked into (one sub-directory per archive).\n",
    "DOCUMENTS_PATH = \"./openwebtext_documents\"\n",
    "# Number of extracted files held out for the validation split.\n",
    "NUM_VAL_FILES = 5_000\n",
    "\n",
    "# Pure-Python equivalent of `mkdir -p`: portable, and no shell interpolation of the path.\n",
    "os.makedirs(DOCUMENTS_PATH, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "50adfc5326a74e86b52bba1cfef0ce16",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/20610 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "76fa7be89dc24581bb70914643a9e638",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/5000 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b6ec2530fb0845228fdca31107808b6d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/8008769 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "IOPub message rate exceeded.\n",
      "The notebook server will temporarily stop sending output\n",
      "to the client in order to avoid crashing it.\n",
      "To change this limit, set the config variable\n",
      "`--NotebookApp.iopub_msg_rate_limit`.\n",
      "\n",
      "Current values:\n",
      "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
      "NotebookApp.rate_limit_window=3.0 (secs)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import os, random\n",
    "import tarfile\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "ARCHIVE_SUFFIX = '.xz'\n",
    "\n",
    "\n",
    "def concatenate_files(files, output_path):\n",
    "    \"\"\"Write the contents of every path in `files` to `output_path`, each followed by a newline.\"\"\"\n",
    "    with open(output_path, 'w', encoding='utf-8') as f_out:\n",
    "        for file in tqdm(files):\n",
    "            # Close each input right away instead of leaking one handle per document.\n",
    "            with open(file, encoding='utf-8') as f_in:\n",
    "                f_out.write(f_in.read())\n",
    "            f_out.write('\\n')\n",
    "\n",
    "\n",
    "for name in tqdm(os.listdir(ARCHIVES_PATH)):\n",
    "    assert name.endswith(ARCHIVE_SUFFIX), f'unexpected file in {ARCHIVES_PATH}: {name}'\n",
    "    # Slice the suffix off: str.rstrip('.xz') strips any trailing '.', 'x' or 'z'\n",
    "    # characters, so it would also eat the end of a stem such as 'linux.xz' -> 'linu'.\n",
    "    # NOTE(review): extractall() trusts the member paths stored in the archive; only run\n",
    "    # this on archives from a trusted source (or pass filter='data' where tarfile supports it).\n",
    "    with tarfile.open(os.path.join(ARCHIVES_PATH, name)) as tar:\n",
    "        tar.extractall(os.path.join(DOCUMENTS_PATH, name[:-len(ARCHIVE_SUFFIX)]))\n",
    "\n",
    "# Sort before shuffling: os.walk order is filesystem-dependent, so without it the\n",
    "# seeded shuffle would not give the same train/valid split on every machine.\n",
    "all_files = sorted(\n",
    "    os.path.join(path, file)\n",
    "    for path, subdirs, files in os.walk(DOCUMENTS_PATH)\n",
    "    for file in files\n",
    ")\n",
    "assert len(all_files) > NUM_VAL_FILES, 'not enough files to hold out a validation split'\n",
    "random.Random(42).shuffle(all_files)\n",
    "# Explicit split index: all_files[:-0] would be empty if NUM_VAL_FILES were 0.\n",
    "num_train = len(all_files) - NUM_VAL_FILES\n",
    "train_files, valid_files = all_files[:num_train], all_files[num_train:]\n",
    "\n",
    "concatenate_files(valid_files, \"openwebtext.valid.raw\")\n",
    "concatenate_files(train_files, \"openwebtext.train.raw\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-rw-rw-r-- 1 jheuristic jheuristic 38G мар 31 02:02 openwebtext.train.raw\r\n",
      "-rw-rw-r-- 1 jheuristic jheuristic 24M мар 30 19:53 openwebtext.valid.raw\r\n"
     ]
    }
   ],
   "source": [
    "# Long listing (human-readable sizes, newest first) of the two concatenated raw files.\n",
    "!ls -lht openwebtext.valid.raw openwebtext.train.raw"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install fairseq:\n",
    "```bash\n",
    "git clone https://github.com/pytorch/fairseq\n",
    "cd fairseq && python setup.py develop\n",
    "\n",
    "```\n",
    "\n",
    "Apply GPT-2-style BPE preprocessing:\n",
    "```bash\n",
    "\n",
    "cd fairseq\n",
    "\n",
    "\n",
    "\n",
    "mkdir -p gpt2_bpe\n",
    "wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json\n",
    "wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe\n",
    "wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt\n",
    "\n",
    "for SPLIT in valid train; do \\\n",
    "    python -m examples.roberta.multiprocessing_bpe_encoder \\\n",
    "        --encoder-json gpt2_bpe/encoder.json \\\n",
    "        --vocab-bpe gpt2_bpe/vocab.bpe \\\n",
    "        --inputs ../openwebtext.${SPLIT}.raw \\\n",
    "        --outputs ../openwebtext.${SPLIT}.bpe \\\n",
    "        --keep-empty \\\n",
    "        --workers 60; \\\n",
    "done\n",
    "\n",
    "\n",
    "fairseq-preprocess \\\n",
    "    --only-source \\\n",
    "    --srcdict gpt2_bpe/dict.txt \\\n",
    "    --trainpref ../openwebtext.train.bpe \\\n",
    "    --validpref ../openwebtext.valid.bpe \\\n",
    "    --destdir ../data-bin/openwebtext \\\n",
    "    --workers 60\n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py38",
   "language": "python",
   "name": "py38"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Download data from source: https://drive.google.com/drive/folders/1IaD_SIIB-K3Sij_-JjWoPy_UrWqQRdjx\n",
	"\n",
	"Install OWT:\n",
	"```bash\n",
	"git clone https://github.com/jcpeterson/openwebtext\n",
	"cd openwebtext\n",
	"pip install -r requirements.txt\n",
	"\n",
	"```\n",
	"\n",
	"Extract raw files:\n",
	"```bash\n",
	"7za x openwebtext.tar.xz\n",
	"tar -xf openwebtext.tar\n",
	"```\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"\n",
	"# Directory that holds the downloaded *.xz archives.\n",
	"ARCHIVES_PATH = \"./openwebtext\"\n",
	"# Directory the archives are unpacked into (one sub-directory per archive).\n",
	"DOCUMENTS_PATH = \"./openwebtext_documents\"\n",
	"# Number of extracted files held out for the validation split.\n",
	"NUM_VAL_FILES = 5_000\n",
	"\n",
	"# Pure-Python equivalent of `mkdir -p`: portable, and no shell interpolation of the path.\n",
	"os.makedirs(DOCUMENTS_PATH, exist_ok=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "50adfc5326a74e86b52bba1cfef0ce16",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"  0%|          | 0/20610 [00:00<?, ?it/s]"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "76fa7be89dc24581bb70914643a9e638",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"  0%|          | 0/5000 [00:00<?, ?it/s]"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "b6ec2530fb0845228fdca31107808b6d",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"  0%|          | 0/8008769 [00:00<?, ?it/s]"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"IOPub message rate exceeded.\n",
	"The notebook server will temporarily stop sending output\n",
	"to the client in order to avoid crashing it.\n",
	"To change this limit, set the config variable\n",
	"`--NotebookApp.iopub_msg_rate_limit`.\n",
	"\n",
	"Current values:\n",
	"NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
	"NotebookApp.rate_limit_window=3.0 (secs)\n",
	"\n"
	]
	}
	],
	"source": [
	"import os, random\n",
	"import tarfile\n",
	"from tqdm.auto import tqdm\n",
	"\n",
	"ARCHIVE_SUFFIX = '.xz'\n",
	"\n",
	"\n",
	"def concatenate_files(files, output_path):\n",
	"    \"\"\"Write the contents of every path in `files` to `output_path`, each followed by a newline.\"\"\"\n",
	"    with open(output_path, 'w', encoding='utf-8') as f_out:\n",
	"        for file in tqdm(files):\n",
	"            # Close each input right away instead of leaking one handle per document.\n",
	"            with open(file, encoding='utf-8') as f_in:\n",
	"                f_out.write(f_in.read())\n",
	"            f_out.write('\\n')\n",
	"\n",
	"\n",
	"for name in tqdm(os.listdir(ARCHIVES_PATH)):\n",
	"    assert name.endswith(ARCHIVE_SUFFIX), f'unexpected file in {ARCHIVES_PATH}: {name}'\n",
	"    # Slice the suffix off: str.rstrip('.xz') strips any trailing '.', 'x' or 'z'\n",
	"    # characters, so it would also eat the end of a stem such as 'linux.xz' -> 'linu'.\n",
	"    # NOTE(review): extractall() trusts the member paths stored in the archive; only run\n",
	"    # this on archives from a trusted source (or pass filter='data' where tarfile supports it).\n",
	"    with tarfile.open(os.path.join(ARCHIVES_PATH, name)) as tar:\n",
	"        tar.extractall(os.path.join(DOCUMENTS_PATH, name[:-len(ARCHIVE_SUFFIX)]))\n",
	"\n",
	"# Sort before shuffling: os.walk order is filesystem-dependent, so without it the\n",
	"# seeded shuffle would not give the same train/valid split on every machine.\n",
	"all_files = sorted(\n",
	"    os.path.join(path, file)\n",
	"    for path, subdirs, files in os.walk(DOCUMENTS_PATH)\n",
	"    for file in files\n",
	")\n",
	"assert len(all_files) > NUM_VAL_FILES, 'not enough files to hold out a validation split'\n",
	"random.Random(42).shuffle(all_files)\n",
	"# Explicit split index: all_files[:-0] would be empty if NUM_VAL_FILES were 0.\n",
	"num_train = len(all_files) - NUM_VAL_FILES\n",
	"train_files, valid_files = all_files[:num_train], all_files[num_train:]\n",
	"\n",
	"concatenate_files(valid_files, \"openwebtext.valid.raw\")\n",
	"concatenate_files(train_files, \"openwebtext.train.raw\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"-rw-rw-r-- 1 jheuristic jheuristic 38G мар 31 02:02 openwebtext.train.raw\r\n",
	"-rw-rw-r-- 1 jheuristic jheuristic 24M мар 30 19:53 openwebtext.valid.raw\r\n"
	]
	}
	],
	"source": [
	"# Long listing (human-readable sizes, newest first) of the two concatenated raw files.\n",
	"!ls -lht openwebtext.valid.raw openwebtext.train.raw"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Install fairseq:\n",
	"```bash\n",
	"git clone https://github.com/pytorch/fairseq\n",
	"cd fairseq && python setup.py develop\n",
	"\n",
	"```\n",
	"\n",
	"Apply GPT-2-style BPE preprocessing:\n",
	"```bash\n",
	"\n",
	"cd fairseq\n",
	"\n",
	"\n",
	"\n",
	"mkdir -p gpt2_bpe\n",
	"wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json\n",
	"wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe\n",
	"wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt\n",
	"\n",
	"for SPLIT in valid train; do \\\n",
	" python -m examples.roberta.multiprocessing_bpe_encoder \\\n",
	" --encoder-json gpt2_bpe/encoder.json \\\n",
	" --vocab-bpe gpt2_bpe/vocab.bpe \\\n",
	" --inputs ../openwebtext.${SPLIT}.raw \\\n",
	" --outputs ../openwebtext.${SPLIT}.bpe \\\n",
	" --keep-empty \\\n",
	" --workers 60; \\\n",
	"done\n",
	"\n",
	"\n",
	"fairseq-preprocess \\\n",
	" --only-source \\\n",
	" --srcdict gpt2_bpe/dict.txt \\\n",
	" --trainpref ../openwebtext.train.bpe \\\n",
	" --validpref ../openwebtext.valid.bpe \\\n",
	" --destdir ../data-bin/openwebtext \\\n",
	" --workers 60\n",
	"\n",
	"```"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "py38",
	"language": "python",
	"name": "py38"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}