Skip to content

Instantly share code, notes, and snippets.

@fpgaminer
Created August 24, 2020 21:40
Show Gist options
  • Save fpgaminer/7737a9377e3379fe17dc5bb83d4db69c to your computer and use it in GitHub Desktop.
Save fpgaminer/7737a9377e3379fe17dc5bb83d4db69c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# set up logging\n",
"import logging\n",
"logging.basicConfig(\n",
" format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n",
" datefmt=\"%m/%d/%Y %H:%M:%S\",\n",
" level=logging.INFO,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# make deterministic\n",
"from mingpt.utils import set_seed\n",
"set_seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"from torch.nn import functional as F"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"from torch.utils.data import Dataset\n",
"\n",
"class CharDataset(Dataset):\n",
"\n",
" def __init__(self, data, block_size):\n",
" chars = sorted(list(set(data)))\n",
" data_size, vocab_size = len(data), len(chars)\n",
" print('data has %d characters, %d unique.' % (data_size, vocab_size))\n",
" \n",
" self.stoi = { ch:i for i,ch in enumerate(chars) }\n",
" self.itos = { i:ch for i,ch in enumerate(chars) }\n",
" self.block_size = block_size\n",
" self.vocab_size = vocab_size\n",
" self.data = data\n",
" \n",
" def __len__(self):\n",
" return math.ceil(len(self.data) / (self.block_size + 1))\n",
"\n",
" def __getitem__(self, idx):\n",
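" # we're actually going to \"cheat\" and pick a spot in the dataset at random\n",
" # NOTE: this draw uses np.random and runs inside the DataLoader worker processes.\n",
" # With num_workers=4 the recorded output of the loader cell shows every batch of indices\n",
" # repeated four times in a row, i.e. each worker appears to start from the same NumPy RNG state.\n",
" # Seed NumPy per worker (e.g. via worker_init_fn) or use torch's RNG if distinct samples are needed.\n",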
" i = np.random.randint(0, len(self.data) - (self.block_size + 1))\n",
" x = torch.tensor([i], dtype=torch.long)\n",
" y = torch.tensor([i], dtype=torch.long)\n",
" return x, y"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"block_size = 128 # spatial extent of the model for its context"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data has 1115394 characters, 65 unique.\n"
]
}
],
"source": [
"# you can download this file at https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt\n",
"text = open('input.txt', 'r').read() # don't worry we won't run out of file handles\n",
"train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(tensor([121958]), tensor([121958]))"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 671155],\n",
" [ 131932],\n",
" [ 259178],\n",
" [ 110268],\n",
" [ 732180],\n",
" [1103462],\n",
" [ 137337],\n",
" [ 999890]])\n",
"\n",
"tensor([[ 671155],\n",
" [ 131932],\n",
" [ 259178],\n",
" [ 110268],\n",
" [ 732180],\n",
" [1103462],\n",
" [ 137337],\n",
" [ 999890]])\n",
"\n",
"tensor([[ 671155],\n",
" [ 131932],\n",
" [ 259178],\n",
" [ 110268],\n",
" [ 732180],\n",
" [1103462],\n",
" [ 137337],\n",
" [ 999890]])\n",
"\n",
"tensor([[ 671155],\n",
" [ 131932],\n",
" [ 259178],\n",
" [ 110268],\n",
" [ 732180],\n",
" [1103462],\n",
" [ 137337],\n",
" [ 999890]])\n",
"\n",
"tensor([[ 912756],\n",
" [ 175203],\n",
" [ 278167],\n",
" [ 41090],\n",
" [ 329365],\n",
" [1113396],\n",
" [ 787201],\n",
" [ 327069]])\n",
"\n",
"tensor([[ 912756],\n",
" [ 175203],\n",
" [ 278167],\n",
" [ 41090],\n",
" [ 329365],\n",
" [1113396],\n",
" [ 787201],\n",
" [ 327069]])\n",
"\n",
"tensor([[ 912756],\n",
" [ 175203],\n",
" [ 278167],\n",
" [ 41090],\n",
" [ 329365],\n",
" [1113396],\n",
" [ 787201],\n",
" [ 327069]])\n",
"\n",
"tensor([[ 912756],\n",
" [ 175203],\n",
" [ 278167],\n",
" [ 41090],\n",
" [ 329365],\n",
" [1113396],\n",
" [ 787201],\n",
" [ 327069]])\n",
"\n",
"tensor([[791743],\n",
" [103355],\n",
" [184779],\n",
" [989436],\n",
" [486232],\n",
" [917040],\n",
" [156730],\n",
" [654811]])\n",
"\n",
"tensor([[791743],\n",
" [103355],\n",
" [184779],\n",
" [989436],\n",
" [486232],\n",
" [917040],\n",
" [156730],\n",
" [654811]])\n",
"\n",
"tensor([[791743],\n",
" [103355],\n",
" [184779],\n",
" [989436],\n",
" [486232],\n",
" [917040],\n",
" [156730],\n",
" [654811]])\n",
"\n",
"tensor([[791743],\n",
" [103355],\n",
" [184779],\n",
" [989436],\n",
" [486232],\n",
" [917040],\n",
" [156730],\n",
" [654811]])\n",
"\n",
"tensor([[527035],\n",
" [648143],\n",
" [ 65725],\n",
" [ 84654],\n",
" [953277],\n",
" [591723],\n",
" [319030],\n",
" [555839]])\n",
"\n",
"tensor([[527035],\n",
" [648143],\n",
" [ 65725],\n",
" [ 84654],\n",
" [953277],\n",
" [591723],\n",
" [319030],\n",
" [555839]])\n",
"\n",
"tensor([[527035],\n",
" [648143],\n",
" [ 65725],\n",
" [ 84654],\n",
" [953277],\n",
" [591723],\n",
" [319030],\n",
" [555839]])\n",
"\n",
"tensor([[527035],\n",
" [648143],\n",
" [ 65725],\n",
" [ 84654],\n",
" [953277],\n",
" [591723],\n",
" [319030],\n",
" [555839]])\n",
"\n"
]
}
],
"source": [
"from torch.utils.data.dataloader import DataLoader\n",
"\n",
"loader = DataLoader(train_dataset, batch_size=8, num_workers=4)\n",
"\n",
"debug_iters = 16\n",
"\n",
"for it, (x, y) in enumerate(loader):\n",
" \n",
" if it == debug_iters:\n",
" break\n",
" \n",
" print(x)\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment