Inference with pretraining-style data for an LLM
{
"cells": [
{
"cell_type": "markdown",
"id": "22f560bd-e9b8-4ddb-90be-fe08b817ff40",
"metadata": {},
"source": [
"Using 40GB A100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7e85863-26c6-41aa-ac9f-9c686e684bd4",
"metadata": {},
"outputs": [],
"source": [
"device = 'cuda:3'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "499b5ee2-102c-4151-8212-cb663456862b",
"metadata": {},
"outputs": [],
"source": [
"import datasets\n",
"import joblib\n",
"from transformers import GPTNeoXForCausalLM, AutoTokenizer\n",
"from joblib import Parallel, delayed\n",
"from itertools import islice\n",
"from transformers import DataCollatorForLanguageModeling\n",
"from torch.nn.utils.rnn import pad_packed_sequence\n",
"from transformers import (\n",
" LlamaForCausalLM,\n",
" LlamaTokenizer,\n",
" LlamaConfig,\n",
" default_data_collator,\n",
")\n",
"import torch\n",
"from random import randint\n",
"import gc"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5149fbbe-0e71-47de-87d3-be2a4b482c8e",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"true\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89b96b71-2a9d-445b-9dab-c13749727b4a",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9d74155b599c45e5afce210feb8f3358",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "93ae4af1fde24da9b269df2a4c7ca7d3",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/396 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "01907d96af6e44869423ff9964b3edb6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/2.11M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a326d777849c4c4bb5890cb61258fb04",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/99.0 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
"Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n",
"/home/joshl/miniconda3/envs/lit-gpt/lib/python3.9/site-packages/transformers/generation/utils.py:1355: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"'Hello, I am a newbie in the world of programming. I am trying to make a program'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = GPTNeoXForCausalLM.from_pretrained(\n",
" \"EleutherAI/pythia-12b-deduped\",\n",
" #revision=\"step3000\",\n",
" #cache_dir=\"./pythia-70m-deduped/step3000\",\n",
")\n",
"model.to(device=device, dtype=torch.bfloat16)\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\n",
" \"EleutherAI/pythia-12b-deduped\",\n",
" #revision=\"step3000\",\n",
" #cache_dir=\"./pythia-70m-deduped/step3000\",\n",
")\n",
"\n",
"inputs = tokenizer(\"Hello, I am\", return_tensors=\"pt\")\n",
"inputs = {k: v.to(device=device) for k, v in inputs.items()}\n",
"tokens = model.generate(**inputs)\n",
"tokenizer.decode(tokens[0])"
]
},
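{
"cell_type": "markdown",
"id": "3f1a9c2e-aaaa-4a6e-9b0d-0a1b2c3d4e5f",
"metadata": {},
"source": [
"Note: `from_pretrained` above materialises the weights in float32 on the host before the cast to bfloat16. A minimal alternative sketch, assuming a `transformers` version that supports the `torch_dtype` argument, which loads directly in bfloat16 and so roughly halves peak host memory for this 12B model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f1a9c2e-bbbb-4a6e-9b0d-0a1b2c3d4e5f",
"metadata": {},
"outputs": [],
"source": [
"# Load the checkpoint weights directly in bfloat16 instead of casting afterwards,\n",
"# avoiding a transient float32 copy of all 12B parameters in RAM.\n",
"model = GPTNeoXForCausalLM.from_pretrained(\n",
"    \"EleutherAI/pythia-12b-deduped\",\n",
"    torch_dtype=torch.bfloat16,\n",
").to(device=device)"
]
},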
{
"cell_type": "code",
"execution_count": 5,
"id": "5b685afb-8f36-4e26-a424-9b19a206758a",
"metadata": {},
"outputs": [],
"source": [
"dataset = datasets.load_dataset('wikipedia', '20220301.en', split='train', trust_remote_code=True)"
]
},
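{
"cell_type": "markdown",
"id": "7c2d4e6f-aaaa-4b1c-8d2e-1f2a3b4c5d6e",
"metadata": {},
"source": [
"The full `20220301.en` dump is large, and only a few hundred articles are used below. If the dataset script supports it (an assumption worth checking), a streaming load avoids materialising the whole dump locally; a sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c2d4e6f-bbbb-4b1c-8d2e-1f2a3b4c5d6e",
"metadata": {},
"outputs": [],
"source": [
"# Stream records on demand instead of downloading/preprocessing the whole dump;\n",
"# islice over a streaming split only touches the records actually read.\n",
"dataset = datasets.load_dataset(\n",
"    'wikipedia', '20220301.en', split='train',\n",
"    streaming=True, trust_remote_code=True,\n",
")"
]
},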
{
"cell_type": "code",
"execution_count": null,
"id": "4726ae79-7adb-4e74-8c4a-061ef78ba586",
"metadata": {},
"outputs": [],
"source": [
"def tok(text):\n",
" return tokenizer(text+'<|endoftext|>', return_tensors=\"pt\")['input_ids']\n",
" \n",
"n_docs = 300\n",
"tokens = Parallel(n_jobs=100)(delayed(tok)(t['text']) for t in islice(dataset, n_docs))"
]
},
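{
"cell_type": "markdown",
"id": "9e8f7a6b-aaaa-4c3d-9e0f-2a3b4c5d6e7f",
"metadata": {},
"source": [
"An alternative to the joblib pool: the fast (Rust-backed) tokenizer already parallelises when called on a list of texts (`TOKENIZERS_PARALLELISM` was enabled above), so the same list of `1×L` tensors can be built without spawning 100 worker processes. A sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e8f7a6b-bbbb-4c3d-9e0f-2a3b4c5d6e7f",
"metadata": {},
"outputs": [],
"source": [
"# Batch-encode all documents in one call; the fast tokenizer multithreads internally.\n",
"texts = [t['text'] + '<|endoftext|>' for t in islice(dataset, n_docs)]\n",
"encodings = tokenizer(texts)['input_ids']  # list of lists of token ids\n",
"tokens = [torch.tensor(ids).unsqueeze(0) for ids in encodings]"
]
},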
{
"cell_type": "code",
"execution_count": 35,
"id": "7995d4a8-f694-4797-bc5f-aef4e89b4eab",
"metadata": {},
"outputs": [],
"source": [
"# Indication of memory usage\n",
"max_length = 128\n",
"batch_size = 2"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "5ec844e5-f21f-4262-9367-86a3deefdcc5",
"metadata": {},
"outputs": [],
"source": [
"tokens_all = torch.concatenate([t.flatten() for t in tokens])"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "93e5fc60-b447-4515-9153-70dd66a1494e",
"metadata": {},
"outputs": [],
"source": [
"batch = []\n",
"for i in range(batch_size):\n",
" idx = randint(0, len(tokens_all)-1-max_length)\n",
" seq = tokens_all[idx:idx+max_length]\n",
" batch += [seq]\n",
"batch = torch.stack(batch).to(device=device) "
]
},
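{
"cell_type": "markdown",
"id": "1a2b3c4d-aaaa-4d5e-8f9a-3b4c5d6e7f8a",
"metadata": {},
"source": [
"The cell above draws random windows, so they can overlap and may straddle document boundaries (the `<|endoftext|>` tokens mark the joins). For an exhaustive pass rather than a random sample, the stream can instead be tiled into non-overlapping `max_length` chunks; a sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a2b3c4d-bbbb-4d5e-8f9a-3b4c5d6e7f8a",
"metadata": {},
"outputs": [],
"source": [
"# Deterministic alternative: tile the stream into non-overlapping max_length\n",
"# windows, dropping the ragged tail, then batch over them in order.\n",
"n_chunks = len(tokens_all) // max_length\n",
"chunks = tokens_all[:n_chunks * max_length].view(n_chunks, max_length)"
]
},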
{
"cell_type": "code",
"execution_count": 39,
"id": "9cbec62c-1f5d-4f8a-980f-c20fa2515e63",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor(2.6094, device='cuda:3', dtype=torch.bfloat16,\n",
" grad_fn=<NllLossBackward0>)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out = model(batch, labels=batch) # Automatically offset data\n",
"out.loss"
]
},
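{
"cell_type": "markdown",
"id": "5c6d7e8f-aaaa-4e6f-9a0b-4c5d6e7f8a9b",
"metadata": {},
"source": [
"Passing `labels=batch` makes the model shift the targets internally: the logit at position `i` is scored against the token at position `i+1`. A sketch of the equivalent manual computation, to make the offset explicit (it matches `out.loss` up to dtype rounding):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c6d7e8f-bbbb-4e6f-9a0b-4c5d6e7f8a9b",
"metadata": {},
"outputs": [],
"source": [
"import torch.nn.functional as F\n",
"\n",
"# Drop the last logit (nothing follows it) and the first label (nothing\n",
"# predicts it), then take the mean token-level cross-entropy.\n",
"logits = out.logits[:, :-1, :]\n",
"targets = batch[:, 1:]\n",
"F.cross_entropy(logits.reshape(-1, logits.size(-1)).float(), targets.reshape(-1))"
]
},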
{
"cell_type": "code",
"execution_count": 40,
"id": "52de44ee-4864-46b7-8e9e-3d84e343579a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Equivalent to uniformly choosing from 6.09375 options\n"
]
}
],
"source": [
"print(f'Equivalent to uniformly choosing from {2**out.loss} options')"
]
},
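{
"cell_type": "markdown",
"id": "8d9e0f1a-aaaa-4f7a-8b1c-5d6e7f8a9b0c",
"metadata": {},
"source": [
"For reference, the same quantity in bits: the loss is a mean negative log-likelihood in nats, so dividing by ln 2 gives bits per token."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d9e0f1a-bbbb-4f7a-8b1c-5d6e7f8a9b0c",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"# Convert the per-token cross-entropy from nats to bits.\n",
"print(f'{out.loss.item() / math.log(2):.3f} bits per token')"
]
},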
{
"cell_type": "code",
"execution_count": 41,
"id": "5db659e3-45ad-4efa-b43f-4308907cc0b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"### Input\n",
" for J. K. Rowling, 78,770 for Roald Dahl and 75,841 for J. R. R. Tolkien. In 2015, the Christie estate claimed And Then There Were None was \"the best-selling crime novel of all time\", with approximately 100 million sales, also making it one of the highest-selling books of all time. More than two million copies of her books were sold in English in 2020.\n",
"\n",
"Legacy \n",
"In 2016, one hundred years after Christie wrote her first detective story, the Royal Mail released six stamps in her honour, featuring The Mysterious Affair at is to move and work in this symbol space, being capable of being in, and operating in but one box at a time.... a box is to admit of but two possible conditions, i.e., being empty or unmarked, and having a single mark in it, say a vertical stroke.\n",
"\n",
"\"One box is to be singled out and called the starting point....a specific problem is to be given in symbolic form by a finite number of boxes [i.e., INPUT] being marked with a stroke. Likewise, the answer [i.e., OUTPUT] is to be given in symbolic form by\n",
"### Output\n",
" the.C. Rowling, and, who, Jald Dahl, 78,000 for J. R. R. Tolkien.\n",
" the, the top's sold thatr There Were None was thethe most-selling Ag novel of all time\". with over 1, copies. and claiming it the of the best-selling books of all time.\n",
" than 100- copies of the books are sold in the in the.\n",
"\n",
"Inacy\n",
"\n",
"In the, the of years after the's her first novel novel, The BBC Mint issued a stamps featuring a honour. including her Murdersterious Affair at Sty the be the to with the worldism. and able of moving in the on moving in, not place at a time.\n",
" single that a be of being one dimensions states, either.e. the empty or fulloccupied, or being a mark symbol on it. and, cross line.\n",
"\n",
"TheThe of at to be distinguishedled out from distinguished a'point, The The second point is to be stated to the form, a single number of symbols,i.e., symbolsPUTS and filled in a single,\n",
", a solution isi.e., OUTPUT] is to be given by the form by a\n"
]
}
],
"source": [
"input_ = tokenizer.decode(batch.flatten().tolist())\n",
"output_ = tokenizer.decode(out.logits.argmax(axis=-1).flatten().tolist())\n",
"print(\"### Input\")\n",
"print(input_)\n",
"print(\"### Output\")\n",
"print(output_)"
]
},
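{
"cell_type": "markdown",
"id": "2b3c4d5e-aaaa-4a8b-9c2d-6e7f8a9b0c1d",
"metadata": {},
"source": [
"The argmax decode above is not shifted, so the printed output reads as a noisy echo of the input displaced by one position (each logit predicts the *next* token). A sketch of top-1 next-token accuracy with the shift applied explicitly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b3c4d5e-bbbb-4a8b-9c2d-6e7f8a9b0c1d",
"metadata": {},
"outputs": [],
"source": [
"# Fraction of positions where the model's most likely token matches the\n",
"# actual next token (the logit at position i predicts the token at i+1).\n",
"preds = out.logits[:, :-1, :].argmax(dim=-1)\n",
"acc = (preds == batch[:, 1:]).float().mean()\n",
"print(f'Top-1 next-token accuracy: {acc.item():.3f}')"
]
}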
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}