-
-
Save fpgaminer/7737a9377e3379fe17dc5bb83d4db69c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# set up logging\n", | |
"import logging\n", | |
"logging.basicConfig(\n", | |
" format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", | |
" datefmt=\"%m/%d/%Y %H:%M:%S\",\n", | |
" level=logging.INFO,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# make deterministic\n", | |
"from mingpt.utils import set_seed\n", | |
"set_seed(42)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import torch\n", | |
"import torch.nn as nn\n", | |
"from torch.nn import functional as F" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import math\n", | |
"from torch.utils.data import Dataset\n", | |
"\n", | |
"class CharDataset(Dataset):\n", | |
"\n", | |
" def __init__(self, data, block_size):\n", | |
" chars = sorted(list(set(data)))\n", | |
" data_size, vocab_size = len(data), len(chars)\n", | |
" print('data has %d characters, %d unique.' % (data_size, vocab_size))\n", | |
" \n", | |
" self.stoi = { ch:i for i,ch in enumerate(chars) }\n", | |
" self.itos = { i:ch for i,ch in enumerate(chars) }\n", | |
" self.block_size = block_size\n", | |
" self.vocab_size = vocab_size\n", | |
" self.data = data\n", | |
" \n", | |
" def __len__(self):\n", | |
" return math.ceil(len(self.data) / (self.block_size + 1))\n", | |
"\n", | |
" def __getitem__(self, idx):\n", | |
" # we're actually going to \"cheat\" and pick a spot in the dataset at random\n", | |
" i = np.random.randint(0, len(self.data) - (self.block_size + 1))\n", | |
" x = torch.tensor([i], dtype=torch.long)\n", | |
" y = torch.tensor([i], dtype=torch.long)\n", | |
" return x, y" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"block_size = 128 # spatial extent of the model for its context" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"data has 1115394 characters, 65 unique.\n" | |
] | |
} | |
], | |
"source": [ | |
"# you can download this file at https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt\n", | |
"text = open('input.txt', 'r').read() # don't worry we won't run out of file handles\n", | |
"train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(tensor([121958]), tensor([121958]))" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train_dataset[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"tensor([[ 671155],\n", | |
" [ 131932],\n", | |
" [ 259178],\n", | |
" [ 110268],\n", | |
" [ 732180],\n", | |
" [1103462],\n", | |
" [ 137337],\n", | |
" [ 999890]])\n", | |
"\n", | |
"tensor([[ 671155],\n", | |
" [ 131932],\n", | |
" [ 259178],\n", | |
" [ 110268],\n", | |
" [ 732180],\n", | |
" [1103462],\n", | |
" [ 137337],\n", | |
" [ 999890]])\n", | |
"\n", | |
"tensor([[ 671155],\n", | |
" [ 131932],\n", | |
" [ 259178],\n", | |
" [ 110268],\n", | |
" [ 732180],\n", | |
" [1103462],\n", | |
" [ 137337],\n", | |
" [ 999890]])\n", | |
"\n", | |
"tensor([[ 671155],\n", | |
" [ 131932],\n", | |
" [ 259178],\n", | |
" [ 110268],\n", | |
" [ 732180],\n", | |
" [1103462],\n", | |
" [ 137337],\n", | |
" [ 999890]])\n", | |
"\n", | |
"tensor([[ 912756],\n", | |
" [ 175203],\n", | |
" [ 278167],\n", | |
" [ 41090],\n", | |
" [ 329365],\n", | |
" [1113396],\n", | |
" [ 787201],\n", | |
" [ 327069]])\n", | |
"\n", | |
"tensor([[ 912756],\n", | |
" [ 175203],\n", | |
" [ 278167],\n", | |
" [ 41090],\n", | |
" [ 329365],\n", | |
" [1113396],\n", | |
" [ 787201],\n", | |
" [ 327069]])\n", | |
"\n", | |
"tensor([[ 912756],\n", | |
" [ 175203],\n", | |
" [ 278167],\n", | |
" [ 41090],\n", | |
" [ 329365],\n", | |
" [1113396],\n", | |
" [ 787201],\n", | |
" [ 327069]])\n", | |
"\n", | |
"tensor([[ 912756],\n", | |
" [ 175203],\n", | |
" [ 278167],\n", | |
" [ 41090],\n", | |
" [ 329365],\n", | |
" [1113396],\n", | |
" [ 787201],\n", | |
" [ 327069]])\n", | |
"\n", | |
"tensor([[791743],\n", | |
" [103355],\n", | |
" [184779],\n", | |
" [989436],\n", | |
" [486232],\n", | |
" [917040],\n", | |
" [156730],\n", | |
" [654811]])\n", | |
"\n", | |
"tensor([[791743],\n", | |
" [103355],\n", | |
" [184779],\n", | |
" [989436],\n", | |
" [486232],\n", | |
" [917040],\n", | |
" [156730],\n", | |
" [654811]])\n", | |
"\n", | |
"tensor([[791743],\n", | |
" [103355],\n", | |
" [184779],\n", | |
" [989436],\n", | |
" [486232],\n", | |
" [917040],\n", | |
" [156730],\n", | |
" [654811]])\n", | |
"\n", | |
"tensor([[791743],\n", | |
" [103355],\n", | |
" [184779],\n", | |
" [989436],\n", | |
" [486232],\n", | |
" [917040],\n", | |
" [156730],\n", | |
" [654811]])\n", | |
"\n", | |
"tensor([[527035],\n", | |
" [648143],\n", | |
" [ 65725],\n", | |
" [ 84654],\n", | |
" [953277],\n", | |
" [591723],\n", | |
" [319030],\n", | |
" [555839]])\n", | |
"\n", | |
"tensor([[527035],\n", | |
" [648143],\n", | |
" [ 65725],\n", | |
" [ 84654],\n", | |
" [953277],\n", | |
" [591723],\n", | |
" [319030],\n", | |
" [555839]])\n", | |
"\n", | |
"tensor([[527035],\n", | |
" [648143],\n", | |
" [ 65725],\n", | |
" [ 84654],\n", | |
" [953277],\n", | |
" [591723],\n", | |
" [319030],\n", | |
" [555839]])\n", | |
"\n", | |
"tensor([[527035],\n", | |
" [648143],\n", | |
" [ 65725],\n", | |
" [ 84654],\n", | |
" [953277],\n", | |
" [591723],\n", | |
" [319030],\n", | |
" [555839]])\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"from torch.utils.data.dataloader import DataLoader\n", | |
"\n", | |
"loader = DataLoader(train_dataset, batch_size=8, num_workers=4)\n", | |
"\n", | |
"debug_iters = 16\n", | |
"\n", | |
"for it, (x, y) in enumerate(loader):\n", | |
" \n", | |
" if it == debug_iters:\n", | |
" break\n", | |
" \n", | |
" print(x)\n", | |
" print()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment