Skip to content

Instantly share code, notes, and snippets.

@yang-zhang
Created December 24, 2018 03:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yang-zhang/5f013c0f9c55c41aba67f1e21af1456a to your computer and use it in GitHub Desktop.
Save yang-zhang/5f013c0f9c55c41aba67f1e21af1456a to your computer and use it in GitHub Desktop.
fastai-RNNLearner-getpreds-debug
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true,
"scrolled": true
},
"cell_type": "code",
"source": "%load_ext autoreload\n%autoreload 2\n\nimport inspect\nfrom fastai.text import * \nimport fastai; fastai.__version__",
"execution_count": 1,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 1,
"data": {
"text/plain": "'1.0.39.dev0'"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## current"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "path = untar_data(URLs.IMDB_SAMPLE)\n\ndf = pd.read_csv(path/'texts.csv')\n\n# Language model data\ndata_lm = TextLMDataBunch.from_csv(path, 'texts.csv')",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Classifier model data\ndata_clas = TextClasDataBunch.from_csv(path, 'texts.csv', vocab=data_lm.train_ds.vocab, bs=32)",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "learn = language_model_learner(data_lm, drop_mult=0.5)\nlearn.save_encoder('ft_enc')",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "learn = text_classifier_learner(data_clas, drop_mult=0.5)\nlearn.load_encoder('ft_enc')",
"execution_count": 5,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`get_preds` works for `DatasetType.Valid`"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pred_val, y_val=learn.get_preds(DatasetType.Valid, ordered=True)\n\npred_val.shape, y_val.shape",
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 6,
"data": {
"text/plain": "(torch.Size([201, 2]), torch.Size([201]))"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "assert np.alltrue(learn.data.valid_ds.y.items==y_val.numpy())",
"execution_count": 7,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`get_preds` works for `DatasetType.Train` for `ordered=False`"
},
{
"metadata": {
"trusted": true,
"scrolled": true
},
"cell_type": "code",
"source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=False)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 8,
"data": {
"text/plain": "(torch.Size([784, 2]), torch.Size([784]), 799)"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`get_preds` does not work for `DatasetType.Train` for `ordered=True`"
},
{
"metadata": {
"trusted": true,
"scrolled": true
},
"cell_type": "code",
"source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=True)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
"execution_count": 9,
"outputs": [
{
"output_type": "error",
"ename": "RuntimeError",
"evalue": "index 798 is out of bounds for dimension 0 with size 784",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-64196d3861cb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpred_trn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_trn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_preds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDatasetType\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mordered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mpred_trn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_trn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_ds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/data/git/fastai/fastai/text/learner.py\u001b[0m in \u001b[0;36mget_preds\u001b[0;34m(self, ds_type, with_loss, n_batch, pbar, ordered)\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0msampler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0mreverse_sampler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margsort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msampler\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 83\u001b[0;31m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 84\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: index 798 is out of bounds for dimension 0 with size 784"
]
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## This change generate the desired result, but is there a real solution?"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Code change in `TextClasDataBunch.create`"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(inspect.getsource(TextClasDataBunch.create))",
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"text": " @classmethod\n def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs=64, pad_idx=1, pad_first=True,\n no_check:bool=False, **kwargs) -> DataBunch:\n \"Function that transform the `datasets` in a `DataBunch` for classification.\"\n datasets = cls._init_ds(train_ds, valid_ds, test_ds)\n collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first)\n train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs//2)\n # I removed >>>\n # train_dl = DataLoader(datasets[0], batch_size=bs//2, sampler=train_sampler, drop_last=True, **kwargs)\n # <<< I removed\n # I added >>> \n train_dl = DataLoader(datasets[0], batch_size=bs//2, sampler=train_sampler, drop_last=False, **kwargs)\n # <<< I added >>> \n dataloaders = [train_dl]\n for ds in datasets[1:]:\n lengths = [len(t) for t in ds.x.items]\n sampler = SortSampler(ds.x, key=lengths.__getitem__)\n dataloaders.append(DataLoader(ds, batch_size=bs, sampler=sampler, **kwargs))\n return cls(*dataloaders, path=path, collate_fn=collate_fn, no_check=no_check)\n\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Code change in `SortishSampler`"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(inspect.getsource(SortishSampler))",
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": "class SortishSampler(Sampler):\n \"Go through the text data by order of length with a bit of randomness.\"\n\n def __init__(self, data_source:NPArrayList, key:KeyFunc, bs:int):\n self.data_source,self.key,self.bs = data_source,key,bs\n\n def __len__(self) -> int: return len(self.data_source)\n\n def __iter__(self):\n \n # I added >>> \n np.random.seed(42)\n # <<< I added\n idxs = np.random.permutation(len(self.data_source))\n sz = self.bs*50\n ck_idx = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]\n sort_idx = np.concatenate([sorted(s, key=self.key, reverse=True) for s in ck_idx])\n sz = self.bs\n ck_idx = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]\n max_ck = np.argmax([self.key(ck[0]) for ck in ck_idx]) # find the chunk with the largest key,\n ck_idx[0],ck_idx[max_ck] = ck_idx[max_ck],ck_idx[0] # then make sure it goes first.\n sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([],dtype=np.int)\n sort_idx = np.concatenate((ck_idx[0], sort_idx))\n return iter(sort_idx)\n\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "path = untar_data(URLs.IMDB_SAMPLE)\n\ndf = pd.read_csv(path/'texts.csv')\n\n# Language model data\ndata_lm = TextLMDataBunch.from_csv(path, 'texts.csv')",
"execution_count": 13,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Classifier model data\ndata_clas = TextClasDataBunch.from_csv(path, 'texts.csv', vocab=data_lm.train_ds.vocab, bs=32)",
"execution_count": 14,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "learn = language_model_learner(data_lm, drop_mult=0.5)\nlearn.save_encoder('ft_enc')",
"execution_count": 15,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "learn = text_classifier_learner(data_clas, drop_mult=0.5)\nlearn.load_encoder('ft_enc')",
"execution_count": 16,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pred_val, y_val=learn.get_preds(DatasetType.Valid, ordered=True)\n\npred_val.shape, y_val.shape",
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 17,
"data": {
"text/plain": "(torch.Size([201, 2]), torch.Size([201]))"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "assert np.alltrue(learn.data.valid_ds.y.items==y_val.numpy())",
"execution_count": 18,
"outputs": []
},
{
"metadata": {
"trusted": true,
"scrolled": true
},
"cell_type": "code",
"source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=False)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 20,
"data": {
"text/plain": "(torch.Size([799, 2]), torch.Size([799]), 799)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true,
"scrolled": true
},
"cell_type": "code",
"source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=True)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
"execution_count": 21,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 21,
"data": {
"text/plain": "(torch.Size([799, 2]), torch.Size([799]), 799)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "assert np.alltrue(learn.data.train_ds.y.items==y_trn.numpy())",
"execution_count": 23,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "conda-env-fastaidev-py",
"display_name": "Python [conda env:fastaidev]",
"language": "python"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"base_numbering": 1,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
},
"language_info": {
"name": "python",
"version": "3.7.1",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "fastai-RNNLearner-getpreds-debug",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment