yang-zhang/fastai-RNNLearner-getpreds-debug.ipynb

## fastai-RNNLearner-getpreds-debug.ipynb
{
  "cells": [
    {
      "metadata": {
        "trusted": true,
        "scrolled": true
      },
      "cell_type": "code",
      "source": "%load_ext autoreload\n%autoreload 2\n\nimport inspect\nfrom fastai.text import * \nimport fastai; fastai.__version__",
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 1,
          "data": {
            "text/plain": "'1.0.39.dev0'"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "## current"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "path = untar_data(URLs.IMDB_SAMPLE)\n\ndf = pd.read_csv(path/'texts.csv')\n\n# Language model data\ndata_lm = TextLMDataBunch.from_csv(path, 'texts.csv')",
      "execution_count": 2,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "# Classifier model data\ndata_clas = TextClasDataBunch.from_csv(path, 'texts.csv', vocab=data_lm.train_ds.vocab, bs=32)",
      "execution_count": 3,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "learn = language_model_learner(data_lm, drop_mult=0.5)\nlearn.save_encoder('ft_enc')",
      "execution_count": 4,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "learn = text_classifier_learner(data_clas, drop_mult=0.5)\nlearn.load_encoder('ft_enc')",
      "execution_count": 5,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "`get_preds` works for `DatasetType.Valid`"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "pred_val, y_val=learn.get_preds(DatasetType.Valid, ordered=True)\n\npred_val.shape, y_val.shape",
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 6,
          "data": {
            "text/plain": "(torch.Size([201, 2]), torch.Size([201]))"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "assert np.alltrue(learn.data.valid_ds.y.items==y_val.numpy())",
      "execution_count": 7,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "`get_preds` works for `DatasetType.Train` for `ordered=False`"
    },
    {
      "metadata": {
        "trusted": true,
        "scrolled": true
      },
      "cell_type": "code",
      "source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=False)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 8,
          "data": {
            "text/plain": "(torch.Size([784, 2]), torch.Size([784]), 799)"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "`get_preds` does not work for `DatasetType.Train` for `ordered=True`"
    },
    {
      "metadata": {
        "trusted": true,
        "scrolled": true
      },
      "cell_type": "code",
      "source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=True)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "error",
          "ename": "RuntimeError",
          "evalue": "index 798 is out of bounds for dimension 0 with size 784",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-9-64196d3861cb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpred_trn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_trn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_preds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDatasetType\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mordered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mpred_trn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_trn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_ds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/data/git/fastai/fastai/text/learner.py\u001b[0m in \u001b[0;36mget_preds\u001b[0;34m(self, ds_type, with_loss, n_batch, pbar, ordered)\u001b[0m\n\u001b[1;32m     81\u001b[0m             \u001b[0msampler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     82\u001b[0m             \u001b[0mreverse_sampler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margsort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msampler\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 83\u001b[0;31m             \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     84\u001b[0m             \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     85\u001b[0m         \u001b[0;32mreturn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mRuntimeError\u001b[0m: index 798 is out of bounds for dimension 0 with size 784"
          ]
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "## This change generate the desired result, but is there a real solution?"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Code change in `TextClasDataBunch.create`"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "print(inspect.getsource(TextClasDataBunch.create))",
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": "    @classmethod\n    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs=64, pad_idx=1, pad_first=True,\n               no_check:bool=False, **kwargs) -> DataBunch:\n        \"Function that transform the `datasets` in a `DataBunch` for classification.\"\n        datasets = cls._init_ds(train_ds, valid_ds, test_ds)\n        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first)\n        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs//2)\n        # I removed >>>\n        # train_dl = DataLoader(datasets[0], batch_size=bs//2, sampler=train_sampler, drop_last=True, **kwargs)\n        # <<< I removed\n        # I added >>> \n        train_dl = DataLoader(datasets[0], batch_size=bs//2, sampler=train_sampler, drop_last=False, **kwargs)\n        # <<< I added >>> \n        dataloaders = [train_dl]\n        for ds in datasets[1:]:\n            lengths = [len(t) for t in ds.x.items]\n            sampler = SortSampler(ds.x, key=lengths.__getitem__)\n            dataloaders.append(DataLoader(ds, batch_size=bs, sampler=sampler, **kwargs))\n        return cls(*dataloaders, path=path, collate_fn=collate_fn, no_check=no_check)\n\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Code change in `SortishSampler`"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "print(inspect.getsource(SortishSampler))",
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "text": "class SortishSampler(Sampler):\n    \"Go through the text data by order of length with a bit of randomness.\"\n\n    def __init__(self, data_source:NPArrayList, key:KeyFunc, bs:int):\n        self.data_source,self.key,self.bs = data_source,key,bs\n\n    def __len__(self) -> int: return len(self.data_source)\n\n    def __iter__(self):\n        \n        # I added >>> \n        np.random.seed(42)\n        # <<< I added\n        idxs = np.random.permutation(len(self.data_source))\n        sz = self.bs*50\n        ck_idx = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]\n        sort_idx = np.concatenate([sorted(s, key=self.key, reverse=True) for s in ck_idx])\n        sz = self.bs\n        ck_idx = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]\n        max_ck = np.argmax([self.key(ck[0]) for ck in ck_idx])  # find the chunk with the largest key,\n        ck_idx[0],ck_idx[max_ck] = ck_idx[max_ck],ck_idx[0]     # then make sure it goes first.\n        sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([],dtype=np.int)\n        sort_idx = np.concatenate((ck_idx[0], sort_idx))\n        return iter(sort_idx)\n\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "path = untar_data(URLs.IMDB_SAMPLE)\n\ndf = pd.read_csv(path/'texts.csv')\n\n# Language model data\ndata_lm = TextLMDataBunch.from_csv(path, 'texts.csv')",
      "execution_count": 13,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "# Classifier model data\ndata_clas = TextClasDataBunch.from_csv(path, 'texts.csv', vocab=data_lm.train_ds.vocab, bs=32)",
      "execution_count": 14,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "learn = language_model_learner(data_lm, drop_mult=0.5)\nlearn.save_encoder('ft_enc')",
      "execution_count": 15,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "learn = text_classifier_learner(data_clas, drop_mult=0.5)\nlearn.load_encoder('ft_enc')",
      "execution_count": 16,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "pred_val, y_val=learn.get_preds(DatasetType.Valid, ordered=True)\n\npred_val.shape, y_val.shape",
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 17,
          "data": {
            "text/plain": "(torch.Size([201, 2]), torch.Size([201]))"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "assert np.alltrue(learn.data.valid_ds.y.items==y_val.numpy())",
      "execution_count": 18,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "scrolled": true
      },
      "cell_type": "code",
      "source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=False)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 20,
          "data": {
            "text/plain": "(torch.Size([799, 2]), torch.Size([799]), 799)"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true,
        "scrolled": true
      },
      "cell_type": "code",
      "source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=True)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 21,
          "data": {
            "text/plain": "(torch.Size([799, 2]), torch.Size([799]), 799)"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "assert np.alltrue(learn.data.train_ds.y.items==y_trn.numpy())",
      "execution_count": 23,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "conda-env-fastaidev-py",
      "display_name": "Python [conda env:fastaidev]",
      "language": "python"
    },
    "toc": {
      "nav_menu": {},
      "number_sections": true,
      "sideBar": true,
      "skip_h1_title": false,
      "base_numbering": 1,
      "title_cell": "Table of Contents",
      "title_sidebar": "Contents",
      "toc_cell": false,
      "toc_position": {},
      "toc_section_display": true,
      "toc_window_display": true
    },
    "language_info": {
      "name": "python",
      "version": "3.7.1",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "gist": {
      "id": "",
      "data": {
        "description": "fastai-RNNLearner-getpreds-debug",
        "public": true
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
	{
	"cells": [
	{
	"metadata": {
	"trusted": true,
	"scrolled": true
	},
	"cell_type": "code",
	"source": "%load_ext autoreload\n%autoreload 2\n\nimport inspect\nfrom fastai.text import * \nimport fastai; fastai.__version__",
	"execution_count": 1,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 1,
	"data": {
	"text/plain": "'1.0.39.dev0'"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "## current"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "path = untar_data(URLs.IMDB_SAMPLE)\n\ndf = pd.read_csv(path/'texts.csv')\n\n# Language model data\ndata_lm = TextLMDataBunch.from_csv(path, 'texts.csv')",
	"execution_count": 2,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "# Classifier model data\ndata_clas = TextClasDataBunch.from_csv(path, 'texts.csv', vocab=data_lm.train_ds.vocab, bs=32)",
	"execution_count": 3,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "learn = language_model_learner(data_lm, drop_mult=0.5)\nlearn.save_encoder('ft_enc')",
	"execution_count": 4,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "learn = text_classifier_learner(data_clas, drop_mult=0.5)\nlearn.load_encoder('ft_enc')",
	"execution_count": 5,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "`get_preds` works for `DatasetType.Valid`"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "pred_val, y_val=learn.get_preds(DatasetType.Valid, ordered=True)\n\npred_val.shape, y_val.shape",
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 6,
	"data": {
	"text/plain": "(torch.Size([201, 2]), torch.Size([201]))"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "assert np.alltrue(learn.data.valid_ds.y.items==y_val.numpy())",
	"execution_count": 7,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "`get_preds` works for `DatasetType.Train` for `ordered=False`"
	},
	{
	"metadata": {
	"trusted": true,
	"scrolled": true
	},
	"cell_type": "code",
	"source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=False)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
	"execution_count": 8,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 8,
	"data": {
	"text/plain": "(torch.Size([784, 2]), torch.Size([784]), 799)"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "`get_preds` does not work for `DatasetType.Train` for `ordered=True`"
	},
	{
	"metadata": {
	"trusted": true,
	"scrolled": true
	},
	"cell_type": "code",
	"source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=True)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
	"execution_count": 9,
	"outputs": [
	{
	"output_type": "error",
	"ename": "RuntimeError",
	"evalue": "index 798 is out of bounds for dimension 0 with size 784",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-9-64196d3861cb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpred_trn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_trn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_preds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDatasetType\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mordered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mpred_trn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_trn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_ds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m/data/git/fastai/fastai/text/learner.py\u001b[0m in \u001b[0;36mget_preds\u001b[0;34m(self, ds_type, with_loss, n_batch, pbar, ordered)\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0msampler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0mreverse_sampler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margsort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msampler\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 83\u001b[0;31m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 84\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mpreds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mreverse_sampler\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mRuntimeError\u001b[0m: index 798 is out of bounds for dimension 0 with size 784"
	]
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "## This change generate the desired result, but is there a real solution?"
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "Code change in `TextClasDataBunch.create`"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "print(inspect.getsource(TextClasDataBunch.create))",
	"execution_count": 11,
	"outputs": [
	{
	"output_type": "stream",
	"text": " @classmethod\n def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs=64, pad_idx=1, pad_first=True,\n no_check:bool=False, kwargs) -> DataBunch:\n \"Function that transform the `datasets` in a `DataBunch` for classification.\"\n datasets = cls._init_ds(train_ds, valid_ds, test_ds)\n collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first)\n train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs//2)\n # I removed >>>\n # train_dl = DataLoader(datasets[0], batch_size=bs//2, sampler=train_sampler, drop_last=True, kwargs)\n # <<< I removed\n # I added >>> \n train_dl = DataLoader(datasets[0], batch_size=bs//2, sampler=train_sampler, drop_last=False, kwargs)\n # <<< I added >>> \n dataloaders = [train_dl]\n for ds in datasets[1:]:\n lengths = [len(t) for t in ds.x.items]\n sampler = SortSampler(ds.x, key=lengths.__getitem__)\n dataloaders.append(DataLoader(ds, batch_size=bs, sampler=sampler, kwargs))\n return cls(*dataloaders, path=path, collate_fn=collate_fn, no_check=no_check)\n\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "Code change in `SortishSampler`"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "print(inspect.getsource(SortishSampler))",
	"execution_count": 12,
	"outputs": [
	{
	"output_type": "stream",
	"text": "class SortishSampler(Sampler):\n \"Go through the text data by order of length with a bit of randomness.\"\n\n def __init__(self, data_source:NPArrayList, key:KeyFunc, bs:int):\n self.data_source,self.key,self.bs = data_source,key,bs\n\n def __len__(self) -> int: return len(self.data_source)\n\n def __iter__(self):\n \n # I added >>> \n np.random.seed(42)\n # <<< I added\n idxs = np.random.permutation(len(self.data_source))\n sz = self.bs*50\n ck_idx = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]\n sort_idx = np.concatenate([sorted(s, key=self.key, reverse=True) for s in ck_idx])\n sz = self.bs\n ck_idx = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]\n max_ck = np.argmax([self.key(ck[0]) for ck in ck_idx]) # find the chunk with the largest key,\n ck_idx[0],ck_idx[max_ck] = ck_idx[max_ck],ck_idx[0] # then make sure it goes first.\n sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([],dtype=np.int)\n sort_idx = np.concatenate((ck_idx[0], sort_idx))\n return iter(sort_idx)\n\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "path = untar_data(URLs.IMDB_SAMPLE)\n\ndf = pd.read_csv(path/'texts.csv')\n\n# Language model data\ndata_lm = TextLMDataBunch.from_csv(path, 'texts.csv')",
	"execution_count": 13,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "# Classifier model data\ndata_clas = TextClasDataBunch.from_csv(path, 'texts.csv', vocab=data_lm.train_ds.vocab, bs=32)",
	"execution_count": 14,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "learn = language_model_learner(data_lm, drop_mult=0.5)\nlearn.save_encoder('ft_enc')",
	"execution_count": 15,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "learn = text_classifier_learner(data_clas, drop_mult=0.5)\nlearn.load_encoder('ft_enc')",
	"execution_count": 16,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "pred_val, y_val=learn.get_preds(DatasetType.Valid, ordered=True)\n\npred_val.shape, y_val.shape",
	"execution_count": 17,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 17,
	"data": {
	"text/plain": "(torch.Size([201, 2]), torch.Size([201]))"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "assert np.alltrue(learn.data.valid_ds.y.items==y_val.numpy())",
	"execution_count": 18,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true,
	"scrolled": true
	},
	"cell_type": "code",
	"source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=False)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
	"execution_count": 20,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 20,
	"data": {
	"text/plain": "(torch.Size([799, 2]), torch.Size([799]), 799)"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true,
	"scrolled": true
	},
	"cell_type": "code",
	"source": "pred_trn, y_trn=learn.get_preds(DatasetType.Train, ordered=True)\npred_trn.shape, y_trn.shape, len(learn.data.train_ds)",
	"execution_count": 21,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 21,
	"data": {
	"text/plain": "(torch.Size([799, 2]), torch.Size([799]), 799)"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "assert np.alltrue(learn.data.train_ds.y.items==y_trn.numpy())",
	"execution_count": 23,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"kernelspec": {
	"name": "conda-env-fastaidev-py",
	"display_name": "Python [conda env:fastaidev]",
	"language": "python"
	},
	"toc": {
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"base_numbering": 1,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": true
	},
	"language_info": {
	"name": "python",
	"version": "3.7.1",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	},
	"gist": {
	"id": "",
	"data": {
	"description": "fastai-RNNLearner-getpreds-debug",
	"public": true
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}