fpgaminer/DatasetTest.ipynb Secret

## DatasetTest.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up logging\n",
    "import logging\n",
    "logging.basicConfig(\n",
    "        format=\"%(asctime)s - %(levelname)s - %(name)s -   %(message)s\",\n",
    "        datefmt=\"%m/%d/%Y %H:%M:%S\",\n",
    "        level=logging.INFO,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make deterministic\n",
    "from mingpt.utils import set_seed\n",
    "set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.nn import functional as F"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "from torch.utils.data import Dataset\n",
    "\n",
    "class CharDataset(Dataset):\n",
    "\n",
    "    def __init__(self, data, block_size):\n",
    "        chars = sorted(list(set(data)))\n",
    "        data_size, vocab_size = len(data), len(chars)\n",
    "        print('data has %d characters, %d unique.' % (data_size, vocab_size))\n",
    "        \n",
    "        self.stoi = { ch:i for i,ch in enumerate(chars) }\n",
    "        self.itos = { i:ch for i,ch in enumerate(chars) }\n",
    "        self.block_size = block_size\n",
    "        self.vocab_size = vocab_size\n",
    "        self.data = data\n",
    "    \n",
    "    def __len__(self):\n",
    "        return math.ceil(len(self.data) / (self.block_size + 1))\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        # we're actually going to \"cheat\" and pick a spot in the dataset at random\n",
    "        i = np.random.randint(0, len(self.data) - (self.block_size + 1))\n",
    "        x = torch.tensor([i], dtype=torch.long)\n",
    "        y = torch.tensor([i], dtype=torch.long)\n",
    "        return x, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "block_size = 128 # spatial extent of the model for its context"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data has 1115394 characters, 65 unique.\n"
     ]
    }
   ],
   "source": [
    "# you can download this file at https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt\n",
    "text = open('input.txt', 'r').read() # don't worry we won't run out of file handles\n",
    "train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor([121958]), tensor([121958]))"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 671155],\n",
      "        [ 131932],\n",
      "        [ 259178],\n",
      "        [ 110268],\n",
      "        [ 732180],\n",
      "        [1103462],\n",
      "        [ 137337],\n",
      "        [ 999890]])\n",
      "\n",
      "tensor([[ 671155],\n",
      "        [ 131932],\n",
      "        [ 259178],\n",
      "        [ 110268],\n",
      "        [ 732180],\n",
      "        [1103462],\n",
      "        [ 137337],\n",
      "        [ 999890]])\n",
      "\n",
      "tensor([[ 671155],\n",
      "        [ 131932],\n",
      "        [ 259178],\n",
      "        [ 110268],\n",
      "        [ 732180],\n",
      "        [1103462],\n",
      "        [ 137337],\n",
      "        [ 999890]])\n",
      "\n",
      "tensor([[ 671155],\n",
      "        [ 131932],\n",
      "        [ 259178],\n",
      "        [ 110268],\n",
      "        [ 732180],\n",
      "        [1103462],\n",
      "        [ 137337],\n",
      "        [ 999890]])\n",
      "\n",
      "tensor([[ 912756],\n",
      "        [ 175203],\n",
      "        [ 278167],\n",
      "        [  41090],\n",
      "        [ 329365],\n",
      "        [1113396],\n",
      "        [ 787201],\n",
      "        [ 327069]])\n",
      "\n",
      "tensor([[ 912756],\n",
      "        [ 175203],\n",
      "        [ 278167],\n",
      "        [  41090],\n",
      "        [ 329365],\n",
      "        [1113396],\n",
      "        [ 787201],\n",
      "        [ 327069]])\n",
      "\n",
      "tensor([[ 912756],\n",
      "        [ 175203],\n",
      "        [ 278167],\n",
      "        [  41090],\n",
      "        [ 329365],\n",
      "        [1113396],\n",
      "        [ 787201],\n",
      "        [ 327069]])\n",
      "\n",
      "tensor([[ 912756],\n",
      "        [ 175203],\n",
      "        [ 278167],\n",
      "        [  41090],\n",
      "        [ 329365],\n",
      "        [1113396],\n",
      "        [ 787201],\n",
      "        [ 327069]])\n",
      "\n",
      "tensor([[791743],\n",
      "        [103355],\n",
      "        [184779],\n",
      "        [989436],\n",
      "        [486232],\n",
      "        [917040],\n",
      "        [156730],\n",
      "        [654811]])\n",
      "\n",
      "tensor([[791743],\n",
      "        [103355],\n",
      "        [184779],\n",
      "        [989436],\n",
      "        [486232],\n",
      "        [917040],\n",
      "        [156730],\n",
      "        [654811]])\n",
      "\n",
      "tensor([[791743],\n",
      "        [103355],\n",
      "        [184779],\n",
      "        [989436],\n",
      "        [486232],\n",
      "        [917040],\n",
      "        [156730],\n",
      "        [654811]])\n",
      "\n",
      "tensor([[791743],\n",
      "        [103355],\n",
      "        [184779],\n",
      "        [989436],\n",
      "        [486232],\n",
      "        [917040],\n",
      "        [156730],\n",
      "        [654811]])\n",
      "\n",
      "tensor([[527035],\n",
      "        [648143],\n",
      "        [ 65725],\n",
      "        [ 84654],\n",
      "        [953277],\n",
      "        [591723],\n",
      "        [319030],\n",
      "        [555839]])\n",
      "\n",
      "tensor([[527035],\n",
      "        [648143],\n",
      "        [ 65725],\n",
      "        [ 84654],\n",
      "        [953277],\n",
      "        [591723],\n",
      "        [319030],\n",
      "        [555839]])\n",
      "\n",
      "tensor([[527035],\n",
      "        [648143],\n",
      "        [ 65725],\n",
      "        [ 84654],\n",
      "        [953277],\n",
      "        [591723],\n",
      "        [319030],\n",
      "        [555839]])\n",
      "\n",
      "tensor([[527035],\n",
      "        [648143],\n",
      "        [ 65725],\n",
      "        [ 84654],\n",
      "        [953277],\n",
      "        [591723],\n",
      "        [319030],\n",
      "        [555839]])\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from torch.utils.data.dataloader import DataLoader\n",
    "\n",
    "loader = DataLoader(train_dataset, batch_size=8, num_workers=4)\n",
    "\n",
    "debug_iters = 16\n",
    "\n",
    "for it, (x, y) in enumerate(loader):\n",
    "    \n",
    "    if it == debug_iters:\n",
    "        break\n",
    "    \n",
    "    print(x)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"# set up logging\n",
	"import logging\n",
	"logging.basicConfig(\n",
	" format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n",
	" datefmt=\"%m/%d/%Y %H:%M:%S\",\n",
	" level=logging.INFO,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# make deterministic\n",
	"from mingpt.utils import set_seed\n",
	"set_seed(42)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import torch\n",
	"import torch.nn as nn\n",
	"from torch.nn import functional as F"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"import math\n",
	"from torch.utils.data import Dataset\n",
	"\n",
	"class CharDataset(Dataset):\n",
	"\n",
	" def __init__(self, data, block_size):\n",
	" chars = sorted(list(set(data)))\n",
	" data_size, vocab_size = len(data), len(chars)\n",
	" print('data has %d characters, %d unique.' % (data_size, vocab_size))\n",
	" \n",
	" self.stoi = { ch:i for i,ch in enumerate(chars) }\n",
	" self.itos = { i:ch for i,ch in enumerate(chars) }\n",
	" self.block_size = block_size\n",
	" self.vocab_size = vocab_size\n",
	" self.data = data\n",
	" \n",
	" def __len__(self):\n",
	" return math.ceil(len(self.data) / (self.block_size + 1))\n",
	"\n",
	" def __getitem__(self, idx):\n",
	" # we're actually going to \"cheat\" and pick a spot in the dataset at random\n",
	" i = np.random.randint(0, len(self.data) - (self.block_size + 1))\n",
	" x = torch.tensor([i], dtype=torch.long)\n",
	" y = torch.tensor([i], dtype=torch.long)\n",
	" return x, y"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"block_size = 128 # spatial extent of the model for its context"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"data has 1115394 characters, 65 unique.\n"
	]
	}
	],
	"source": [
	"# you can download this file at https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt\n",
	"text = open('input.txt', 'r').read() # don't worry we won't run out of file handles\n",
	"train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(tensor([121958]), tensor([121958]))"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"train_dataset[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"tensor([[ 671155],\n",
	" [ 131932],\n",
	" [ 259178],\n",
	" [ 110268],\n",
	" [ 732180],\n",
	" [1103462],\n",
	" [ 137337],\n",
	" [ 999890]])\n",
	"\n",
	"tensor([[ 671155],\n",
	" [ 131932],\n",
	" [ 259178],\n",
	" [ 110268],\n",
	" [ 732180],\n",
	" [1103462],\n",
	" [ 137337],\n",
	" [ 999890]])\n",
	"\n",
	"tensor([[ 671155],\n",
	" [ 131932],\n",
	" [ 259178],\n",
	" [ 110268],\n",
	" [ 732180],\n",
	" [1103462],\n",
	" [ 137337],\n",
	" [ 999890]])\n",
	"\n",
	"tensor([[ 671155],\n",
	" [ 131932],\n",
	" [ 259178],\n",
	" [ 110268],\n",
	" [ 732180],\n",
	" [1103462],\n",
	" [ 137337],\n",
	" [ 999890]])\n",
	"\n",
	"tensor([[ 912756],\n",
	" [ 175203],\n",
	" [ 278167],\n",
	" [ 41090],\n",
	" [ 329365],\n",
	" [1113396],\n",
	" [ 787201],\n",
	" [ 327069]])\n",
	"\n",
	"tensor([[ 912756],\n",
	" [ 175203],\n",
	" [ 278167],\n",
	" [ 41090],\n",
	" [ 329365],\n",
	" [1113396],\n",
	" [ 787201],\n",
	" [ 327069]])\n",
	"\n",
	"tensor([[ 912756],\n",
	" [ 175203],\n",
	" [ 278167],\n",
	" [ 41090],\n",
	" [ 329365],\n",
	" [1113396],\n",
	" [ 787201],\n",
	" [ 327069]])\n",
	"\n",
	"tensor([[ 912756],\n",
	" [ 175203],\n",
	" [ 278167],\n",
	" [ 41090],\n",
	" [ 329365],\n",
	" [1113396],\n",
	" [ 787201],\n",
	" [ 327069]])\n",
	"\n",
	"tensor([[791743],\n",
	" [103355],\n",
	" [184779],\n",
	" [989436],\n",
	" [486232],\n",
	" [917040],\n",
	" [156730],\n",
	" [654811]])\n",
	"\n",
	"tensor([[791743],\n",
	" [103355],\n",
	" [184779],\n",
	" [989436],\n",
	" [486232],\n",
	" [917040],\n",
	" [156730],\n",
	" [654811]])\n",
	"\n",
	"tensor([[791743],\n",
	" [103355],\n",
	" [184779],\n",
	" [989436],\n",
	" [486232],\n",
	" [917040],\n",
	" [156730],\n",
	" [654811]])\n",
	"\n",
	"tensor([[791743],\n",
	" [103355],\n",
	" [184779],\n",
	" [989436],\n",
	" [486232],\n",
	" [917040],\n",
	" [156730],\n",
	" [654811]])\n",
	"\n",
	"tensor([[527035],\n",
	" [648143],\n",
	" [ 65725],\n",
	" [ 84654],\n",
	" [953277],\n",
	" [591723],\n",
	" [319030],\n",
	" [555839]])\n",
	"\n",
	"tensor([[527035],\n",
	" [648143],\n",
	" [ 65725],\n",
	" [ 84654],\n",
	" [953277],\n",
	" [591723],\n",
	" [319030],\n",
	" [555839]])\n",
	"\n",
	"tensor([[527035],\n",
	" [648143],\n",
	" [ 65725],\n",
	" [ 84654],\n",
	" [953277],\n",
	" [591723],\n",
	" [319030],\n",
	" [555839]])\n",
	"\n",
	"tensor([[527035],\n",
	" [648143],\n",
	" [ 65725],\n",
	" [ 84654],\n",
	" [953277],\n",
	" [591723],\n",
	" [319030],\n",
	" [555839]])\n",
	"\n"
	]
	}
	],
	"source": [
	"from torch.utils.data.dataloader import DataLoader\n",
	"\n",
	"loader = DataLoader(train_dataset, batch_size=8, num_workers=4)\n",
	"\n",
	"debug_iters = 16\n",
	"\n",
	"for it, (x, y) in enumerate(loader):\n",
	" \n",
	" if it == debug_iters:\n",
	" break\n",
	" \n",
	" print(x)\n",
	" print()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}