{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#hide\n#skip\n! [ -e /content ] && pip install -Uqq fastai # upgrade fastai on colab",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\nfrom fastai.torch_basics import *\nfrom fastai.data.all import *\nfrom fastai.text.core import *",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#hide\nfrom nbdev.showdoc import *",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#default_exp text.data\n#default_cls_lvl 3",
"execution_count": 4,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Text data\n\n> Functions and transforms to help gather text data in a `Datasets`"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Backwards\n\nReversing the text can provide higher accuracy with an ensemble with a forward model. All that is needed is a `type_tfm` that will reverse the text as it is brought in:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\ndef reverse_text(x): return x.flip(0)",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "t = tensor([0,1,2])\nr = reverse_text(t)\ntest_eq(r, tensor([2,1,0]))",
"execution_count": 6,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Numericalizing"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Numericalization is the step in which we convert tokens to integers. The first step is to build a correspondence token to index that is called a vocab."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\ndef make_vocab(count, min_freq=3, max_vocab=60000, special_toks=None):\n \"Create a vocab of `max_vocab` size from `Counter` `count` with items present more than `min_freq`\"\n vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]\n special_toks = ifnone(special_toks, defaults.text_spec_tok)\n for o in reversed(special_toks): #Make sure all special tokens are in the vocab\n if o in vocab: vocab.remove(o)\n vocab.insert(0, o)\n vocab = vocab[:max_vocab]\n return vocab + [f'xxfake' for i in range(0, 8-len(vocab)%8)]",
"execution_count": 7,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "If there are more than `max_vocab` tokens, the ones kept are the most frequent.\n\n> Note: For performance when using mixed precision, the vocabulary is always made of size a multiple of 8, potentially by adding `xxfake` tokens."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])\ntest_eq(set([x for x in make_vocab(count) if not x.startswith('xxfake')]), \n set(defaults.text_spec_tok + 'a'.split()))\ntest_eq(len(make_vocab(count))%8, 0)\ntest_eq(set([x for x in make_vocab(count, min_freq=1) if not x.startswith('xxfake')]), \n set(defaults.text_spec_tok + 'a b c d'.split()))\ntest_eq(set([x for x in make_vocab(count,max_vocab=12, min_freq=1) if not x.startswith('xxfake')]), \n set(defaults.text_spec_tok + 'a b c'.split()))",
"execution_count": 8,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\nclass TensorText(TensorBase): pass\nclass LMTensorText(TensorText): pass\n\nTensorText.__doc__ = \"Semantic type for a tensor representing text\"\nLMTensorText.__doc__ = \"Semantic type for a tensor representing text in language modeling\"",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\nclass Numericalize(Transform):\n \"Reversible transform of tokenized texts to numericalized ids\"\n def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None, pad_tok=None):\n store_attr('vocab,min_freq,max_vocab,special_toks,pad_tok')\n self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab)})\n\n def setups(self, dsets):\n if dsets is None: return\n if self.vocab is None:\n count = dsets.counter if getattr(dsets, 'counter', None) is not None else Counter(p for o in dsets for p in o)\n if self.special_toks is None and hasattr(dsets, 'special_toks'):\n self.special_toks = dsets.special_toks\n self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab, special_toks=self.special_toks)\n self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})\n\n def encodes(self, o): return TensorText(tensor([self.o2i [o_] for o_ in o]))\n def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != self.pad_tok)",
"execution_count": 10,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "If no `vocab` is passed, one is created at setup from the data, using `make_vocab` with `min_freq` and `max_vocab`."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "start = 'This is an example of text'\nnum = Numericalize(min_freq=1)\nnum.setup(L(start.split(), 'this is another text'.split()))\ntest_eq(set([x for x in num.vocab if not x.startswith('xxfake')]), \n set(defaults.text_spec_tok + 'This is an example of text this another'.split()))\ntest_eq(len(num.vocab)%8, 0)\nt = num(start.split())\n\ntest_eq(t, tensor([11, 9, 12, 13, 14, 10]))\ntest_eq(num.decode(t), start.split())",
"execution_count": 11,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "num = Numericalize(min_freq=2)\nnum.setup(L('This is an example of text'.split(), 'this is another text'.split()))\ntest_eq(set([x for x in num.vocab if not x.startswith('xxfake')]), \n set(defaults.text_spec_tok + 'is text'.split()))\ntest_eq(len(num.vocab)%8, 0)\nt = num(start.split())\ntest_eq(t, tensor([0, 9, 0, 0, 0, 10]))\ntest_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text'.split())",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#hide\ndf = pd.DataFrame({'texts': ['This is an example of text', 'this is another text']})\ntl = TfmdLists(df, [attrgetter('text'), Tokenizer.from_df('texts'), Numericalize(min_freq=2)])\ntest_eq(tl, [tensor([2, 8, 9, 10, 0, 0, 0, 11]), tensor([2, 9, 10, 0, 11])])",
"execution_count": 13,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": ""
},
"metadata": {}
},
{
"output_type": "stream",
"text": "/opt/conda/lib/python3.8/site-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n return array(a, dtype, copy=False, order=order)\n",
"name": "stderr"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## LM_DataLoader -"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\ndef _maybe_first(o): return o[0] if isinstance(o, tuple) else o",
"execution_count": 14,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\ndef _get_tokenizer(ds):\n tok = getattr(ds, 'tokenizer', None)\n if isinstance(tok, Tokenizer): return tok\n if isinstance(tok, (list,L)):\n for t in tok:\n if isinstance(t, Tokenizer): return t",
"execution_count": 15,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\ndef _get_lengths(ds):\n tok = _get_tokenizer(ds)\n if tok is None: return\n return tok.get_lengths(ds.items)",
"execution_count": 16,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\n#TODO: add backward\n@log_args(but_as=TfmdDL.__init__)\n@delegates()\nclass LMDataLoader(TfmdDL):\n \"A `DataLoader` suitable for language modeling\"\n def __init__(self, dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, **kwargs):\n self.items = ReindexCollection(dataset, cache=cache, tfm=_maybe_first)\n self.seq_len = seq_len\n if lens is None: lens = _get_lengths(dataset)\n if lens is None: lens = [len(o) for o in self.items]\n self.lens = ReindexCollection(lens, idxs=self.items.idxs)\n # The \"-1\" is to allow for final label, we throw away the end that's less than bs\n corpus = round_multiple(sum(lens)-1, bs, round_down=True)\n self.bl = corpus//bs #bl stands for batch length\n self.n_batches = self.bl//(seq_len) + int(self.bl%seq_len!=0)\n self.last_len = self.bl - (self.n_batches-1)*seq_len\n self.make_chunks()\n super().__init__(dataset=dataset, bs=bs, num_workers=num_workers, **kwargs)\n self.n = self.n_batches*bs\n\n def make_chunks(self): self.chunks = Chunks(self.items, self.lens)\n def shuffle_fn(self,idxs):\n self.items.shuffle()\n self.make_chunks()\n return idxs\n\n def create_item(self, seq):\n if seq>=self.n: raise IndexError\n sl = self.last_len if seq//self.bs==self.n_batches-1 else self.seq_len\n st = (seq%self.bs)*self.bl + (seq//self.bs)*self.seq_len\n txt = self.chunks[st : st+sl+1]\n return LMTensorText(txt[:-1]),txt[1:]\n\n @delegates(TfmdDL.new)\n def new(self, dataset=None, seq_len=None, **kwargs):\n lens = self.lens.coll if dataset is None else None\n seq_len = self.seq_len if seq_len is None else seq_len\n return super().new(dataset=dataset, lens=lens, seq_len=seq_len, **kwargs)",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "show_doc(LMDataLoader, title_level=2)",
"execution_count": 18,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "<h2 id=\"LMDataLoader\" class=\"doc_header\"><code>class</code> <code>LMDataLoader</code><a href=\"\" class=\"source_link\" style=\"float:right\">[source]</a></h2>\n\n> <code>LMDataLoader</code>(**`dataset`**, **`lens`**=*`None`*, **`cache`**=*`2`*, **`bs`**=*`64`*, **`seq_len`**=*`72`*, **`num_workers`**=*`0`*, **`shuffle`**=*`False`*, **`verbose`**=*`False`*, **`do_setup`**=*`True`*, **`pin_memory`**=*`False`*, **`timeout`**=*`0`*, **`batch_size`**=*`None`*, **`drop_last`**=*`False`*, **`indexed`**=*`None`*, **`n`**=*`None`*, **`device`**=*`None`*, **`persistent_workers`**=*`False`*, **`wif`**=*`None`*, **`before_iter`**=*`None`*, **`after_item`**=*`None`*, **`before_batch`**=*`None`*, **`after_batch`**=*`None`*, **`after_iter`**=*`None`*, **`create_batches`**=*`None`*, **`create_item`**=*`None`*, **`create_batch`**=*`None`*, **`retain`**=*`None`*, **`get_idxs`**=*`None`*, **`sample`**=*`None`*, **`shuffle_fn`**=*`None`*, **`do_batch`**=*`None`*) :: [`TfmdDL`](/data.core.html#TfmdDL)\n\nA [`DataLoader`](/data.load.html#DataLoader) suitable for language modeling"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`dataset` should be a collection of numericalized texts for this to work. `lens` can be passed for optimizing the creation, otherwise, the `LMDataLoader` will do a full pass of the `dataset` to compute them. `cache` is used to avoid reloading items unnecessarily.\n\nThe `LMDataLoader` will concatenate all texts (maybe `shuffle`d) in one big stream, split it in `bs` contiguous sentences, then go through those `seq_len` at a time."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#hide\nbs,sl = 4,3\nints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22]).map(tensor)\ndl = LMDataLoader(ints, bs=bs, seq_len=sl)\nlist(dl)\ntest_eq(list(dl),\n [[tensor([[0, 1, 2], [5, 6, 7], [10, 11, 12], [15, 16, 17]]),\n tensor([[1, 2, 3], [6, 7, 8], [11, 12, 13], [16, 17, 18]])],\n [tensor([[3, 4], [8, 9], [13, 14], [18, 19]]),\n tensor([[4, 5], [9, 10], [14, 15], [19, 20]])]])",
"execution_count": 19,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "bs,sl = 4,3\nints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).map(tensor)",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dl = LMDataLoader(ints, bs=bs, seq_len=sl)\ntest_eq(list(dl),\n [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),\n tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],\n [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),\n tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])",
"execution_count": 21,
"outputs": []
},
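{
"metadata": {},
"cell_type": "markdown",
"source": "As a minimal sketch of the stream-splitting described above, the rows of the first batch are just consecutive chunks of the concatenated stream, offset from each other by the batch length `bl`:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#Minimal sketch: each row of a batch reads its own contiguous piece of the concatenated stream\nstream = torch.cat(list(ints))\nx0 = first(dl)[0]\ntest_eq(x0[0], stream[:sl])            # first row starts at the beginning of the stream\ntest_eq(x0[1], stream[dl.bl:dl.bl+sl]) # second row starts one batch-length further",
"execution_count": null,
"outputs": []
},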
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#hide\n#Check lens work\ndl = LMDataLoader(ints, lens=ints.map(len), bs=bs, seq_len=sl)\ntest_eq(list(dl),\n [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),\n tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],\n [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),\n tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])",
"execution_count": 22,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)\nfor x,y in dl: test_eq(x[:,1:], y[:,:-1])\n((x0,y0), (x1,y1)) = tuple(dl)\n#Second batch begins where first batch ended\ntest_eq(y0[:,-1], x1[:,0]) \ntest_eq(type(x0), LMTensorText)",
"execution_count": 23,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#hide\n#test new works\ndl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)\ndl1 = dl.new()\ntest_eq(dl1.seq_len, sl)\ndl2 = dl.new(seq_len=2)\ntest_eq(dl2.seq_len, 2)",
"execution_count": 24,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Showing -"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\n@typedispatch\ndef show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):\n if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))\n if trunc_at is not None: samples = L((s[0].truncate(trunc_at),*s[1:]) for s in samples)\n ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)\n display_df(pd.DataFrame(ctxs))\n return ctxs",
"execution_count": 25,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\n@typedispatch\ndef show_batch(x: LMTensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):\n samples = L((s[0].truncate(trunc_at), s[1].truncate(trunc_at)) for s in samples)\n return show_batch[TensorText](x, None, samples, ctxs=ctxs, max_n=max_n, trunc_at=None, **kwargs)",
"execution_count": 26,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Classification"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "For classification, we deal with the fact that texts don't all have the same length by using padding."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\ndef pad_input(samples, pad_idx=1, pad_fields=0, pad_first=False, backwards=False):\n \"Function that collect `samples` and adds padding\"\n pad_fields = L(pad_fields)\n max_len_l = pad_fields.map(lambda f: max([len(s[f]) for s in samples]))\n if backwards: pad_first = not pad_first\n def _f(field_idx, x):\n if field_idx not in pad_fields: return x\n idx = pad_fields.items.index(field_idx) #TODO: remove items if L.index is fixed\n sl = slice(-len(x), sys.maxsize) if pad_first else slice(0, len(x))\n pad = x.new_zeros(max_len_l[idx]-x.shape[0])+pad_idx\n x1 = torch.cat([pad, x] if pad_first else [x, pad])\n if backwards: x1 = x1.flip(0)\n return retain_type(x1, x)\n return [tuple(map(lambda idxx: _f(*idxx), enumerate(s))) for s in samples]",
"execution_count": 27,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`pad_idx` is used for the padding, and the padding is applied to the `pad_fields` of the samples. The padding is applied at the beginning if `pad_first` is `True`, and if `backwards` is added, the tensors are flipped."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0), \n [(tensor([1,2,3]),1), (tensor([4,5,0]),2), (tensor([6,0,0]), 3)])\ntest_eq(pad_input([(tensor([1,2,3]), (tensor([6]))), (tensor([4,5]), tensor([4,5])), (tensor([6]), (tensor([1,2,3])))], pad_idx=0, pad_fields=1), \n [(tensor([1,2,3]),(tensor([6,0,0]))), (tensor([4,5]),tensor([4,5,0])), ((tensor([6]),tensor([1, 2, 3])))])\ntest_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, pad_first=True), \n [(tensor([1,2,3]),1), (tensor([0,4,5]),2), (tensor([0,0,6]), 3)])\ntest_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, backwards=True), \n [(tensor([3,2,1]),1), (tensor([5,4,0]),2), (tensor([6,0,0]), 3)])\nx = test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, backwards=True), \n [(tensor([3,2,1]),1), (tensor([5,4,0]),2), (tensor([6,0,0]), 3)])",
"execution_count": 28,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#hide\n#Check retain type\nx = [(TensorText([1,2,3]),1), (TensorText([4,5]), 2), (TensorText([6]), 3)]\ny = pad_input(x, pad_idx=0)\nfor s in y: test_eq(type(s[0]), TensorText)",
"execution_count": 29,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\ndef pad_input_chunk(samples, pad_idx=1, pad_first=True, seq_len=72):\n \"Pad `samples` by adding padding by chunks of size `seq_len`\"\n max_len = max([len(s[0]) for s in samples])\n def _f(x):\n l = max_len - x.shape[0]\n pad_chunk = x.new_zeros((l//seq_len) * seq_len) + pad_idx\n pad_res = x.new_zeros(l % seq_len) + pad_idx\n x1 = torch.cat([pad_chunk, x, pad_res]) if pad_first else torch.cat([x, pad_res, pad_chunk])\n return retain_type(x1, x)\n return [(_f(s[0]), *s[1:]) for s in samples]",
"execution_count": 30,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "The difference with the base `pad_input` is that most of the padding is applied first (if `pad_first=True`) or at the end (if `pad_first=False`) but only by a round multiple of `seq_len`. The rest of the padding is applied to the end (or the beginning if `pad_first=False`). This is to work with `SequenceEncoder` with recurrent models."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "test_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),1), (tensor([1,2,3]), 2), (tensor([1,2]), 3)], pad_idx=0, seq_len=2), \n [(tensor([1,2,3,4,5,6]),1), (tensor([0,0,1,2,3,0]),2), (tensor([0,0,0,0,1,2]), 3)])\ntest_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),), (tensor([1,2,3]),), (tensor([1,2]),)], pad_idx=0, seq_len=2), \n [(tensor([1,2,3,4,5,6]),), (tensor([0,0,1,2,3,0]),), (tensor([0,0,0,0,1,2]),)])\ntest_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),), (tensor([1,2,3]),), (tensor([1,2]),)], pad_idx=0, seq_len=2, pad_first=False), \n [(tensor([1,2,3,4,5,6]),), (tensor([1,2,3,0,0,0]),), (tensor([1,2,0,0,0,0]),)])",
"execution_count": 31,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\ndef _default_sort(x): return len(x[0])\n\n@delegates(TfmdDL)\nclass SortedDL(TfmdDL):\n \"A `DataLoader` that goes throught the item in the order given by `sort_func`\"\n def __init__(self, dataset, sort_func=None, res=None, **kwargs):\n super().__init__(dataset, **kwargs)\n self.sort_func = _default_sort if sort_func is None else sort_func\n if res is None and self.sort_func == _default_sort: res = _get_lengths(dataset)\n self.res = [self.sort_func(self.do_item(i)) for i in range_of(self.dataset)] if res is None else res\n if len(self.res) > 0: self.idx_max = np.argmax(self.res)\n\n def get_idxs(self):\n idxs = super().get_idxs()\n if self.shuffle: return idxs\n return sorted(idxs, key=lambda i: self.res[i], reverse=True)\n\n def shuffle_fn(self,idxs):\n idxs = np.random.permutation(len(self.dataset))\n idx_max = np.where(idxs==self.idx_max)[0][0]\n idxs[0],idxs[idx_max] = idxs[idx_max],idxs[0]\n sz = self.bs*50\n chunks = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]\n chunks = [sorted(s, key=lambda i: self.res[i], reverse=True) for s in chunks]\n sort_idx = np.concatenate(chunks)\n\n sz = self.bs\n batches = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]\n sort_idx = np.concatenate(np.random.permutation(batches[1:-1])) if len(batches) > 2 else np.array([],dtype=np.int)\n sort_idx = np.concatenate((batches[0], sort_idx) if len(batches)==1 else (batches[0], sort_idx, batches[-1]))\n return iter(sort_idx)\n\n @delegates(TfmdDL.new)\n def new(self, dataset=None, **kwargs):\n if 'val_res' in kwargs and kwargs['val_res'] is not None: res = kwargs['val_res']\n else: res = self.res if dataset is None else None\n return super().new(dataset=dataset, res=res, **kwargs)",
"execution_count": 32,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`res` is the result of `sort_func` applied on all elements of the `dataset`. You can pass it if available to make the init much faster by avoiding an initial pass over the whole dataset. For example if sorting by text length (as in the default `sort_func`, called `_default_sort`) you should pass a list with the length of each element in `dataset` to `res` to take advantage of this speed-up. \n\nTo get the same init speed-up for the validation set, `val_res` (a list of text lengths for your validation set) can be passed to the `kwargs` argument of `SortedDL`. Below is an example to reduce the init time by passing a list of text lengths for both the training set and the validation set:\n\n```\n# Pass the training dataset text lengths to SortedDL\nsrtd_dl=partial(SortedDL, res = train_text_lens)\n\n# Pass the validation dataset text lengths \ndl_kwargs = [{},{'val_res': val_text_lens}]\n\n# init our Datasets \ndsets = Datasets(...) \n\n# init our Dataloaders\ndls = dsets.dataloaders(...,dl_type = srtd_dl, dl_kwargs = dl_kwargs)\n```\n\nIf `shuffle` is `True`, this will shuffle a bit the results of the sort to have items of roughly the same size in batches, but not in the exact sorted order."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "ds = [(tensor([1,2]),1), (tensor([3,4,5,6]),2), (tensor([7]),3), (tensor([8,9,10]),4)]\ndl = SortedDL(ds, bs=2, before_batch=partial(pad_input, pad_idx=0))\ntest_eq(list(dl), [(tensor([[ 3, 4, 5, 6], [ 8, 9, 10, 0]]), tensor([2, 4])), \n (tensor([[1, 2], [7, 0]]), tensor([1, 3]))])",
"execution_count": 33,
"outputs": []
},
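{
"metadata": {},
"cell_type": "markdown",
"source": "As a minimal sketch of the `res` speed-up described above, the same batches are obtained when the lengths (in dataset order) are precomputed and passed as `res`, which lets `SortedDL` skip its initial pass over `ds`:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#Minimal sketch: precomputed lengths passed as `res` give the same batches as above\nlens = [len(x) for x,_ in ds]\ndl = SortedDL(ds, bs=2, res=lens, before_batch=partial(pad_input, pad_idx=0))\ntest_eq(list(dl), [(tensor([[ 3,  4,  5,  6], [ 8,  9, 10,  0]]), tensor([2, 4])), \n                   (tensor([[1, 2], [7, 0]]), tensor([1, 3]))])",
"execution_count": null,
"outputs": []
},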
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "ds = [(tensor(range(random.randint(1,10))),i) for i in range(101)]\ndl = SortedDL(ds, bs=2, create_batch=partial(pad_input, pad_idx=-1), shuffle=True, num_workers=0)\nbatches = list(dl)\nmax_len = len(batches[0][0])\nfor b in batches: \n assert(len(b[0])) <= max_len \n test_ne(b[0][-1], -1)",
"execution_count": 34,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## TransformBlock for text"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "To use the data block API, you will need this build block for texts."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\nclass TextBlock(TransformBlock):\n \"A `TransformBlock` for texts\"\n @delegates(Numericalize.__init__)\n def __init__(self, tok_tfm, vocab=None, is_lm=False, seq_len=72, backwards=False, **kwargs):\n type_tfms = [tok_tfm, Numericalize(vocab, **kwargs)]\n if backwards: type_tfms += [reverse_text]\n return super().__init__(type_tfms=type_tfms,\n dl_type=LMDataLoader if is_lm else SortedDL,\n dls_kwargs={'seq_len': seq_len} if is_lm else {'before_batch': partial(pad_input_chunk, seq_len=seq_len)})\n\n @classmethod\n @delegates(Tokenizer.from_df, keep=True)\n def from_df(cls, text_cols, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, **kwargs):\n \"Build a `TextBlock` from a dataframe using `text_cols`\"\n return cls(Tokenizer.from_df(text_cols, **kwargs), vocab=vocab, is_lm=is_lm, seq_len=seq_len,\n backwards=backwards, min_freq=min_freq, max_vocab=max_vocab)\n\n @classmethod\n @delegates(Tokenizer.from_folder, keep=True)\n def from_folder(cls, path, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, **kwargs):\n \"Build a `TextBlock` from a `path`\"\n return cls(Tokenizer.from_folder(path, **kwargs), vocab=vocab, is_lm=is_lm, seq_len=seq_len,\n backwards=backwards, min_freq=min_freq, max_vocab=max_vocab)",
"execution_count": 35,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "For efficient tokenization, you probably want to use one of the factory methods. Otherwise, you can pass your custom `tok_tfm` that will deal with tokenization (if your texts are already tokenized, you can pass `noop`), a `vocab`, or leave it to be inferred on the texts using `min_freq` and `max_vocab`.\n\n`is_lm` indicates if we want to use texts for language modeling or another task, `seq_len` is only necessary to tune if `is_lm=False`, and is passed along to `pad_input_chunk`."
},
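{
"metadata": {},
"cell_type": "markdown",
"source": "As a minimal sketch of the pre-tokenized case mentioned above (with a hypothetical pair of token lists), passing `noop` as the `tok_tfm` skips tokenization and lets the vocab be inferred at setup:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#Minimal sketch (hypothetical toy data): already-tokenized texts with `noop` as `tok_tfm`\ntoks = L(['xxbos','hello','world'], ['xxbos','hello','there','world'])\ntb = TextBlock(noop, min_freq=1)\ntls = TfmdLists(toks, tb.type_tfms)\ntest_eq(type(tls[0]), TensorText)\ntest_eq(tls.decode(tls[0]), toks[0])",
"execution_count": null,
"outputs": []
},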
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "show_doc(TextBlock.from_df)",
"execution_count": 36,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "<h4 id=\"TextBlock.from_df\" class=\"doc_header\"><code>TextBlock.from_df</code><a href=\"__main__.py#L12\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n\n> <code>TextBlock.from_df</code>(**`text_cols`**, **`vocab`**=*`None`*, **`is_lm`**=*`False`*, **`seq_len`**=*`72`*, **`backwards`**=*`False`*, **`min_freq`**=*`3`*, **`max_vocab`**=*`60000`*, **`tok`**=*`None`*, **`rules`**=*`None`*, **`sep`**=*`' '`*, **`n_workers`**=*`16`*, **`mark_fields`**=*`None`*, **`res_col_name`**=*`'text'`*, **\\*\\*`kwargs`**)\n\nBuild a [`TextBlock`](/text.data.html#TextBlock) from a dataframe using `text_cols`"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Here is an example using a sample of IMDB stored as a CSV file:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "path = untar_data(URLs.IMDB_SAMPLE)\ndf = pd.read_csv(path/'texts.csv')\n\nimdb_clas = DataBlock(\n blocks=(TextBlock.from_df('text', seq_len=72), CategoryBlock),\n get_x=ColReader('text'), get_y=ColReader('label'), splitter=ColSplitter())\n\ndls = imdb_clas.dataloaders(df, bs=64)\ndls.show_batch(max_n=2)",
"execution_count": 37,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": ""
},
"metadata": {}
},
{
"output_type": "stream",
"text": "/opt/conda/lib/python3.8/site-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n return array(a, dtype, copy=False, order=order)\n",
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>category</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \\n\\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is</td>\n <td>negative</td>\n </tr>\n <tr>\n <th>1</th>\n <td>xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxbos xxup the xxup shop xxup</td>\n <td>positive</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`vocab`, `is_lm`, `seq_len`, `min_freq` and `max_vocab` are passed to the main init, the other argument to `Tokenizer.from_df`."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "show_doc(TextBlock.from_folder)",
"execution_count": 38,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "<h4 id=\"TextBlock.from_folder\" class=\"doc_header\"><code>TextBlock.from_folder</code><a href=\"__main__.py#L19\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n\n> <code>TextBlock.from_folder</code>(**`path`**, **`vocab`**=*`None`*, **`is_lm`**=*`False`*, **`seq_len`**=*`72`*, **`backwards`**=*`False`*, **`min_freq`**=*`3`*, **`max_vocab`**=*`60000`*, **`tok`**=*`None`*, **`rules`**=*`None`*, **`extensions`**=*`None`*, **`folders`**=*`None`*, **`output_dir`**=*`None`*, **`skip_if_exists`**=*`True`*, **`output_names`**=*`None`*, **`n_workers`**=*`16`*, **`encoding`**=*`'utf8'`*, **\\*\\*`kwargs`**)\n\nBuild a [`TextBlock`](/text.data.html#TextBlock) from a `path`"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`vocab`, `is_lm`, `seq_len`, `min_freq` and `max_vocab` are passed to the main init, the other argument to `Tokenizer.from_folder`."
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## TextDataLoaders -"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#export\nclass TextDataLoaders(DataLoaders):\n \"Basic wrapper around several `DataLoader`s with factory methods for NLP problems\"\n @classmethod\n @delegates(DataLoaders.from_dblock)\n def from_folder(cls, path, train='train', valid='valid', valid_pct=None, seed=None, vocab=None, text_vocab=None, is_lm=False,\n tok_tfm=None, seq_len=72, backwards=False, **kwargs):\n \"Create from imagenet style dataset in `path` with `train` and `valid` subfolders (or provide `valid_pct`)\"\n splitter = GrandparentSplitter(train_name=train, valid_name=valid) if valid_pct is None else RandomSplitter(valid_pct, seed=seed)\n blocks = [TextBlock.from_folder(path, text_vocab, is_lm, seq_len, backwards) if tok_tfm is None else TextBlock(tok_tfm, text_vocab, is_lm, seq_len, backwards)]\n if not is_lm: blocks.append(CategoryBlock(vocab=vocab))\n get_items = partial(get_text_files, folders=[train,valid]) if valid_pct is None else get_text_files\n dblock = DataBlock(blocks=blocks,\n get_items=get_items,\n splitter=splitter,\n get_y=None if is_lm else parent_label)\n return cls.from_dblock(dblock, path, path=path, seq_len=seq_len, **kwargs)\n\n @classmethod\n @delegates(DataLoaders.from_dblock)\n def from_df(cls, df, path='.', valid_pct=0.2, seed=None, text_col=0, label_col=1, label_delim=None, y_block=None,\n text_vocab=None, is_lm=False, valid_col=None, tok_tfm=None, seq_len=72, backwards=False, **kwargs):\n \"Create from `df` in `path` with `valid_pct`\"\n blocks = [TextBlock.from_df(text_col, text_vocab, is_lm, seq_len, backwards) if tok_tfm is None else TextBlock(tok_tfm, text_vocab, is_lm, seq_len, backwards)]\n if y_block is None and not is_lm:\n blocks.append(MultiCategoryBlock if is_listy(label_col) and len(label_col) > 1 else CategoryBlock)\n if y_block is not None and not is_lm: blocks += (y_block if is_listy(y_block) else [y_block])\n splitter = RandomSplitter(valid_pct, seed=seed) if valid_col is None else ColSplitter(valid_col)\n dblock = DataBlock(blocks=blocks,\n get_x=ColReader(\"text\"),\n get_y=None if is_lm else ColReader(label_col, label_delim=label_delim),\n splitter=splitter)\n return cls.from_dblock(dblock, df, path=path, seq_len=seq_len, **kwargs)\n\n @classmethod\n def from_csv(cls, path, csv_fname='labels.csv', header='infer', delimiter=None, **kwargs):\n \"Create from `csv` file in `path/csv_fname`\"\n df = pd.read_csv(Path(path)/csv_fname, header=header, delimiter=delimiter)\n return cls.from_df(df, path=path, **kwargs)\n\nTextDataLoaders.from_csv = delegates(to=TextDataLoaders.from_df)(TextDataLoaders.from_csv)",
"execution_count": 39,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "show_doc(TextDataLoaders, title_level=2)",
"execution_count": 40,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "<h2 id=\"TextDataLoaders\" class=\"doc_header\"><code>class</code> <code>TextDataLoaders</code><a href=\"\" class=\"source_link\" style=\"float:right\">[source]</a></h2>\n\n> <code>TextDataLoaders</code>(**\\*`loaders`**, **`path`**=*`'.'`*, **`device`**=*`None`*) :: [`DataLoaders`](/data.core.html#DataLoaders)\n\nBasic wrapper around several [`DataLoader`](/data.load.html#DataLoader)s with factory methods for NLP problems"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "You should not use the init directly but one of the following factory methods. All those factory methods accept as arguments:\n\n- `text_vocab`: the vocabulary used for numericalizing texts (if not passed, it's inferred from the data)\n- `tok_tfm`: if passed, uses this `tok_tfm` instead of the default\n- `seq_len`: the sequence length used for batch\n- `bs`: the batch size\n- `val_bs`: the batch size for the validation `DataLoader` (defaults to `bs`)\n- `shuffle_train`: if we shuffle the training `DataLoader` or not\n- `device`: the PyTorch device to use (defaults to `default_device()`)"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "show_doc(TextDataLoaders.from_folder)",
"execution_count": 41,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "<h4 id=\"TextDataLoaders.from_folder\" class=\"doc_header\"><code>TextDataLoaders.from_folder</code><a href=\"__main__.py#L4\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n\n> <code>TextDataLoaders.from_folder</code>(**`path`**, **`train`**=*`'train'`*, **`valid`**=*`'valid'`*, **`valid_pct`**=*`None`*, **`seed`**=*`None`*, **`vocab`**=*`None`*, **`text_vocab`**=*`None`*, **`is_lm`**=*`False`*, **`tok_tfm`**=*`None`*, **`seq_len`**=*`72`*, **`backwards`**=*`False`*, **`bs`**=*`64`*, **`val_bs`**=*`None`*, **`shuffle_train`**=*`True`*, **`device`**=*`None`*)\n\nCreate from imagenet style dataset in `path` with `train` and `valid` subfolders (or provide `valid_pct`)"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "If `valid_pct` is provided, a random split is performed (with an optional `seed`) by setting aside that percentage of the data for the validation set (instead of looking at the grandparents folder). If a `vocab` is passed, only the folders with names in `vocab` are kept.\n\nHere is an example on a sample of the IMDB movie review dataset:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#slow\npath = untar_data(URLs.IMDB)\ndls = TextDataLoaders.from_folder(path)\ndls.show_batch(max_n=3)",
"execution_count": 42,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>category</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero</td>\n <td>pos</td>\n </tr>\n <tr>\n <th>1</th>\n <td>xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad</td>\n <td>neg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad</td>\n <td>neg</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "show_doc(TextDataLoaders.from_df)",
"execution_count": 43,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "<h4 id=\"TextDataLoaders.from_df\" class=\"doc_header\"><code>TextDataLoaders.from_df</code><a href=\"__main__.py#L19\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n\n> <code>TextDataLoaders.from_df</code>(**`df`**, **`path`**=*`'.'`*, **`valid_pct`**=*`0.2`*, **`seed`**=*`None`*, **`text_col`**=*`0`*, **`label_col`**=*`1`*, **`label_delim`**=*`None`*, **`y_block`**=*`None`*, **`text_vocab`**=*`None`*, **`is_lm`**=*`False`*, **`valid_col`**=*`None`*, **`tok_tfm`**=*`None`*, **`seq_len`**=*`72`*, **`backwards`**=*`False`*, **`bs`**=*`64`*, **`val_bs`**=*`None`*, **`shuffle_train`**=*`True`*, **`device`**=*`None`*)\n\nCreate from `df` in `path` with `valid_pct`"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "`seed` can optionally be passed for reproducibility. `text_col`, `label_col` and optionally `valid_col` are indices or names of columns for texts/labels and the validation flag. `label_delim` can be passed for a multi-label problem if your labels are in one column, separated by a particular char. `y_block` should be passed to indicate your type of targets, in case the library did no infer it properly.\n\nHere are examples on subsets of IMDB:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dls = TextDataLoaders.from_df(df, path=path, text_col='text', label_col='label', valid_col='is_valid')\ndls.show_batch(max_n=3)",
"execution_count": 44,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": ""
},
"metadata": {}
},
{
"output_type": "stream",
"text": "/opt/conda/lib/python3.8/site-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n return array(a, dtype, copy=False, order=order)\n",
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>category</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \\n\\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is</td>\n <td>negative</td>\n </tr>\n <tr>\n <th>1</th>\n <td>xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxbos xxup the xxup shop xxup</td>\n <td>positive</td>\n </tr>\n <tr>\n <th>2</th>\n <td>xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad</td>\n <td>negative</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dls = TextDataLoaders.from_df(df, path=path, text_col='text', is_lm=True, valid_col='is_valid')\ndls.show_batch(max_n=3)",
"execution_count": 45,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": ""
},
"metadata": {}
},
{
"output_type": "stream",
"text": "/opt/conda/lib/python3.8/site-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n return array(a, dtype, copy=False, order=order)\n",
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>text_</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>xxbos i saw this movie on t.v . this afternoon and i ca n't see how anyone can sit through this piece of trash . xxmaj it 's not funny at all and it takes your xxup i.q . down a few xxunk . i know this movie is for kids , but that does n't mean the writers should take their intelligence for granted . i bet that writers were sitting</td>\n <td>i saw this movie on t.v . this afternoon and i ca n't see how anyone can sit through this piece of trash . xxmaj it 's not funny at all and it takes your xxup i.q . down a few xxunk . i know this movie is for kids , but that does n't mean the writers should take their intelligence for granted . i bet that writers were sitting around</td>\n </tr>\n <tr>\n <th>1</th>\n <td>who the killer is . \\n\\n xxmaj the film has many twists , and xxmaj bill xxmaj paxton directs xxunk by keeping us guessing without losing interest . xxmaj the acting is incredible . xxmaj the two young leads and xxmaj paxton work great together , looking like a normal family even though they are all involved in murder . xxmaj like i said there is the one implausibility involving xxmaj powers</td>\n <td>the killer is . \\n\\n xxmaj the film has many twists , and xxmaj bill xxmaj paxton directs xxunk by keeping us guessing without losing interest . xxmaj the acting is incredible . xxmaj the two young leads and xxmaj paxton work great together , looking like a normal family even though they are all involved in murder . xxmaj like i said there is the one implausibility involving xxmaj powers xxmaj</td>\n </tr>\n <tr>\n <th>2</th>\n <td>tension but the characters are n't developed enough to care . xxmaj then it rushes through a resolution of all the outstanding problems in about a minute of screen time leaving the viewer feeling like they have just wasted their time . xxbos xxmaj xxunk made here a very interesting movie . xxmaj it begins with the description of an almost - deaf young woman , in its working universe as a</td>\n <td>but the characters are n't developed enough to care . xxmaj then it rushes through a resolution of all the outstanding problems in about a minute of screen time leaving the viewer feeling like they have just wasted their time . xxbos xxmaj xxunk made here a very interesting movie . xxmaj it begins with the description of an almost - deaf young woman , in its working universe as a secretary</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {}
}
]
},
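{
"metadata": {},
"cell_type": "markdown",
"source": "For a multi-label problem where the labels sit in one column separated by a particular char, pass `label_delim` and indicate the target type with `y_block` (the library would otherwise infer a single-label `CategoryBlock`). Here is a minimal sketch on a hypothetical tiny dataframe:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#Minimal sketch (hypothetical data): multi-label targets in one space-separated column\nmulti_df = pd.DataFrame({\n    'text': ['this movie was great fun', 'slow , boring and way too long', 'great fun but way too long'],\n    'labels': ['positive funny', 'negative', 'positive funny negative'],\n    'is_valid': [False, False, True]})\ndls_multi = TextDataLoaders.from_df(multi_df, text_col='text', label_col='labels', label_delim=' ',\n                                    y_block=MultiCategoryBlock(), valid_col='is_valid', bs=2)\nx,y = dls_multi.one_batch()\ntest_eq(y.shape, (2,3)) # two samples, one-hot over the three labels seen in the training set",
"execution_count": null,
"outputs": []
},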
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "show_doc(TextDataLoaders.from_csv)",
"execution_count": 46,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "<h4 id=\"TextDataLoaders.from_csv\" class=\"doc_header\"><code>TextDataLoaders.from_csv</code><a href=\"__main__.py#L35\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n\n> <code>TextDataLoaders.from_csv</code>(**`path`**, **`csv_fname`**=*`'labels.csv'`*, **`header`**=*`'infer'`*, **`delimiter`**=*`None`*, **`valid_pct`**=*`0.2`*, **`seed`**=*`None`*, **`text_col`**=*`0`*, **`label_col`**=*`1`*, **`label_delim`**=*`None`*, **`y_block`**=*`None`*, **`text_vocab`**=*`None`*, **`is_lm`**=*`False`*, **`valid_col`**=*`None`*, **`tok_tfm`**=*`None`*, **`seq_len`**=*`72`*, **`backwards`**=*`False`*, **`bs`**=*`64`*, **`val_bs`**=*`None`*, **`shuffle_train`**=*`True`*, **`device`**=*`None`*)\n\nCreate from `csv` file in `path/csv_fname`"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Opens the csv file with `header` and `delimiter`, then pass all the other arguments to `TextDataLoaders.from_df`."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dls = TextDataLoaders.from_csv(path=path, csv_fname='texts.csv', text_col='text', label_col='label', valid_col='is_valid')\ndls.show_batch(max_n=3)",
"execution_count": 47,
"outputs": [
{
"output_type": "error",
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/root/.fastai/data/imdb/texts.csv'",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-47-70e32d29f8e3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextDataLoaders\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcsv_fname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'texts.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'text'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalid_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'is_valid'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_n\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-39-a4cfb5ad68b3>\u001b[0m in \u001b[0;36mfrom_csv\u001b[0;34m(cls, path, csv_fname, header, delimiter, **kwargs)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfrom_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcsv_fname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'labels.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'infer'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdelimiter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;34m\"Create from `csv` file in `path/csv_fname`\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mcsv_fname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdelimiter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdelimiter\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 686\u001b[0m )\n\u001b[1;32m 687\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 688\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 689\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 690\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 452\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 453\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 454\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfp_or_buf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 455\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 946\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 947\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 948\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 950\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1178\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"c\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1179\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"c\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1180\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1181\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1182\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"python\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 2008\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"usecols\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0musecols\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2009\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2010\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2011\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munnamed_cols\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munnamed_cols\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2012\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/.fastai/data/imdb/texts.csv'"
]
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Export -"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#hide\nfrom nbdev.export import notebook2script\nnotebook2script()",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"jupytext": {
"split_at_heading": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.8.3",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "fastai/nbs/31_text.data.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}