@bfarzin
Last active August 11, 2019 13:39
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from fastai import *\n",
"from fastai.text import *\n",
"\n",
"import re\n",
"import sentencepiece as spm #https://github.com/google/sentencepiece"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sentencepiece tokenizer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data processing to DataBunch"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reading the texts"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip\n",
"PATH = Path('/home/farzin/rnn_python_code/wikitext-103-raw')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/farzin/anaconda3/envs/fastaiv1_dev/lib/python3.7/site-packages/ipykernel_launcher.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
}
],
"source": [
"#made up separator so it will not divide lines\n",
"#taking first 100 lines, just to run quickly and demonstrate functionality\n",
"all_texts_df = pd.read_csv(PATH/'wiki.train.raw',sep='%$#',header=None).head(100) \n",
"all_texts = all_texts_df.values.squeeze()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sentence Piece expects some extra tokens"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>= Valkyria Chronicles III =</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Senjō no Valkyria 3 : Unrecorded Chronicles ( ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The game began development in 2010 , carrying ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>It met with positive sales in Japan , and was ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>= = Gameplay = =</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 = Valkyria Chronicles III =\n",
"1 Senjō no Valkyria 3 : Unrecorded Chronicles ( ...\n",
"2 The game began development in 2010 , carrying ...\n",
"3 It met with positive sales in Japan , and was ...\n",
"4 = = Gameplay = ="
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_texts_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<function fastai.text.transform.fix_html(x: str) -> str>,\n",
" <function fastai.text.transform.replace_rep(t: str) -> str>,\n",
" <function fastai.text.transform.replace_wrep(t: str) -> str>,\n",
" <function fastai.text.transform.spec_add_spaces(t: str) -> str>,\n",
" <function fastai.text.transform.rm_useless_spaces(t: str) -> str>]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"defaults.text_pre_rules"
]
},
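{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick illustration of what the default pre-rules do, applied cumulatively to one arbitrary sample string (purely a sanity check, not part of the data pipeline)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#apply each default pre-rule in turn to a small sample and show the running result\n",
"sample = 'Senjō no  Valkyria 3 : Unrecorded Chronicles #1 <br />'\n",
"for rule in defaults.text_pre_rules:\n",
"    sample = rule(sample)\n",
"    print(f'{rule.__name__:20} {sample}')"
]
},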
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<function fix_html at 0x7fa78cbc0048>\n",
"<function replace_rep at 0x7fa78cbb4ea0>\n",
"<function replace_wrep at 0x7fa78cbb4f28>\n",
"<function spec_add_spaces at 0x7fa7a1307840>\n",
"<function rm_useless_spaces at 0x7fa78cbb4e18>\n"
]
}
],
"source": [
"raw_text = all_texts_df.iloc[:,0]\n",
"for rule in defaults.text_pre_rules:\n",
" print(rule)\n",
" raw_text = raw_text.apply(lambda x: rule(str(x)))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"all_texts_df['new_text'] = '<s>' + raw_text + '</s>' "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 <s>= Valkyria Chronicles III =</s>\n",
"1 <s>Senjō no Valkyria 3 : Unrecorded Chronicles...\n",
"2 <s>The game began development in 2010 , carryi...\n",
"3 <s>It met with positive sales in Japan , and w...\n",
"4 <s>= = Gameplay = =</s>\n",
"Name: new_text, dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_texts_df['new_text'].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save the file down so we can call the SWIG wrapped sentencepice app"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"formatted_text_file = 'wk103_text_example'"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"all_texts_df['new_text'].to_frame().to_csv(formatted_text_file, header=False,index=False,quotechar=' ')"
]
},
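{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check that the exported file has one line per document, wrapped in the `<s>`/`</s>` markers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!head -n2 {formatted_text_file}"
]
},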
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SentencePiece tokenizer wrapped appropriately"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# m.model and m.vocab created in the pwd\n",
"vocab_size = 500\n",
"model_prefix = 'wk103m_example'"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spm.SentencePieceTrainer.Train(f'--input={formatted_text_file}'\\\n",
" f' --model_prefix={model_prefix}'\\\n",
" f' --vocab_size={vocab_size}')\n",
"# f'--unk_piece={UNK} --bos_piece={BOS} --eos_id=-1 --pad_piece={PAD}')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1\t<unk>\t0\r\n",
" 2\t<s>\t0\r\n",
" 3\t</s>\t0\r\n",
" 4\t▁\t-2.23492\r\n",
" 5\ts\t-2.83873\r\n",
" 6\t▁the\t-3.45204\r\n",
" 7\t▁,\t-3.81317\r\n",
" 8\tt\t-3.81373\r\n",
" 9\te\t-3.93292\r\n",
" 10\ted\t-4.05182\r\n"
]
}
],
"source": [
"#Head of the Vocab file: Line nums indicate index of vocab\n",
"!head -n10 {model_prefix}.vocab | nl"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## load up the Processor\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load(f'{model_prefix}.model')"
]
},
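{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustrative round trip with the loaded processor on an arbitrary sentence: `EncodeAsPieces` splits the string into sub-word pieces and `DecodePieces` reassembles it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample = 'The game began development in 2010 .'\n",
"pieces = sp.EncodeAsPieces(sample)\n",
"print(pieces)\n",
"print(sp.DecodePieces(pieces))"
]
},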
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"itos = [] #{}\n",
"with open(f'{model_prefix}.vocab','r') as f:\n",
" for line_num,line in enumerate(f):\n",
" itos.append(line.split(\"\\t\")[0])\n",
"\n",
"class SPTokenizer(BaseTokenizer):\n",
" \"Wrapper around a SentncePiece tokenizer to make it a `BaseTokenizer`.\"\n",
" def __init__(self, model_prefix:str):\n",
" self.tok = spm.SentencePieceProcessor()\n",
" self.tok.load(f'{model_prefix}.model')\n",
"\n",
" def tokenizer(self, t:str) -> List[str]:\n",
" return self.tok.EncodeAsPieces(t)\n",
" \n",
"class CustomTokenizer():\n",
" '''Wrapper for SentencePiece toeknizer to fit into Fast.ai V1'''\n",
" def __init__(self,tok_func:Callable,model_prefix:str, pre_rules:ListRules=None):\n",
" self.tok_func,self.model_prefix = tok_func,model_prefix\n",
" self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules )\n",
" \n",
" def __repr__(self) -> str:\n",
" res = f'Tokenizer {self.tok_func.__name__} using `{self.model_prefix}` model with the following rules:\\n'\n",
" for rule in self.pre_rules: res += f' - {rule.__name__}\\n'\n",
" return res \n",
"\n",
" def process_text(self, t:str,tok:BaseTokenizer) -> List[str]:\n",
" \"Processe one text `t` with tokenizer `tok`.\"\n",
" for rule in self.pre_rules: t = rule(t) \n",
" toks = tok.tokenizer(t)\n",
" #post rules?\n",
" return toks \n",
" \n",
" def _process_all_1(self,texts:Collection[str]) -> List[List[str]]:\n",
" 'Process a list of `texts` in one process'\n",
" tok = self.tok_func(self.model_prefix)\n",
" return [self.process_text(t,tok) for t in texts]\n",
" \n",
" def process_all(self, texts:Collection[str]) -> List[List[str]]: \n",
" \"Process a list of `texts`.\" \n",
" return self._process_all_1(texts)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"mycust_tok = CustomTokenizer(SPTokenizer, model_prefix)"
]
},
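{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustrative check of the wrapper: the `__repr__` lists the pre-rules it applies, and `process_all` returns one list of sub-word pieces per input text."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(mycust_tok)\n",
"mycust_tok.process_all(['The game began development in 2010 .'])"
]
},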
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"#setup Vocab object for use in LM\n",
"sp_vocab = Vocab(itos)"
]
},
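{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustrative numericalization round trip: tokenize an arbitrary string with `SPTokenizer`, map the pieces to indices with `Vocab.numericalize`, then map back with `Vocab.textify`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"toks = SPTokenizer(model_prefix).tokenizer('The game began development in 2010 .')\n",
"nums = sp_vocab.numericalize(toks)\n",
"print(nums)\n",
"print(sp_vocab.textify(nums))"
]
},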
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### build DataBunchfrom tokenizer and Vocab"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"idx = np.random.permutation(len(all_texts))\n",
"cut = int(0.1 * len(idx))\n",
"train_df = pd.DataFrame({'text':all_texts[idx[cut:]], 'labels':[0] * (len(all_texts)-cut)}, columns=['labels','text'])\n",
"valid_df = pd.DataFrame({'text':all_texts[idx[:cut]], 'labels':[0] * cut}, columns=['labels','text'])\n",
"\n",
"train_df = train_df.dropna()\n",
"valid_df = valid_df.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"data = TextLMDataBunch.from_df(PATH, train_df, valid_df, \n",
" tokenizer=mycust_tok, vocab=sp_vocab, \n",
" text_cols='text', label_cols='labels')"
]
},
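{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick check that the `DataBunch` is carrying the SentencePiece vocab we passed in (assumes `data.vocab` is exposed as in fastai v1 text)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(data.vocab.itos), data.vocab.itos[:10]"
]
},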
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>idx</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>▁ ) ▁wa s ▁ place d ▁in ▁ ch ar ge ▁of ▁the ▁A rsenal ▁. ▁Du nning t on ▁pre s u m a b ly ▁return ed ▁to ▁his ▁ n a v al ▁du t ie s ▁ and ▁the ▁P on ch ar tra in ▁. ▁ x x b o s ▁H all ▁' s ▁c ar b ine s ▁2 6 7 ▁</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>▁the ▁ e ar ly ▁su m m er ▁of ▁186 3 ▁. ▁ x x b o s ▁A s ▁the ▁Nam ele s s ▁ o ffici all y ▁do ▁not ▁ex is t ▁, ▁the ▁up per ▁ e ch el on s ▁of ▁the ▁Gallia n ▁Arm y ▁exp l o it ▁the ▁con cept ▁of ▁p l a us i ble ▁ d en i</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>▁the ir ▁ l ead er ▁, ▁D a hau ▁. ▁A t ▁the ▁ s ame ▁time ▁, ▁ ele ment s ▁with in ▁Gallia n ▁Arm y ▁C om m and ▁move ▁to ▁ er a se ▁the ▁Nam ele s s ▁in ▁order ▁to ▁pro t ect ▁the ir ▁ o w n ▁in t ere st s ▁. ▁H ound ed ▁b y ▁bo th ▁</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>us ed ▁Kur t ▁of ▁T reas on ▁. ▁ x x b o s ▁P er h a p s ▁the ▁most ▁ ill u m in at ing ▁point s ▁of ▁the ▁abo ve ▁\" ▁Summar y ▁of ▁Work ▁\" ▁ and ▁ th o se ▁for ▁follow ing ▁month s ▁are ▁tha t ▁the ▁ st and ar d ▁ am munition ▁ma de ▁wa s ▁.</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>▁original ▁Li ttle ▁R ock ▁A rsenal ▁ and ▁one ▁of ▁the ▁ ol de st ▁building s ▁in ▁c ent r al ▁Arkansas ▁, ▁it ▁wa s ▁a lso ▁the ▁b ir th place ▁of ▁Gen eral ▁D o u g l a s ▁Mac Ar th ur ▁, ▁who ▁be c ame ▁the ▁su p re m e ▁command er ▁of ▁ U S ▁force s ▁in ▁the</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data.show_batch()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7 fasta.ai1 DEV",
"language": "python",
"name": "fastai1_dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@tanny411

tanny411 commented Jun 7, 2019

Any idea why I may be having this error?

KeyError                                  Traceback (most recent call last)

<ipython-input-90-08d0dd5c9f7c> in <module>()
----> 1 data.show_batch()

4 frames

/usr/local/lib/python3.6/dist-packages/fastai/text/transform.py in <listcomp>(.0)
    132     def textify(self, nums:Collection[int], sep=' ') -> List[str]:
    133         "Convert a list of `nums` to their tokens."
--> 134         return sep.join([self.itos[i] for i in nums]) if sep is not None else [self.itos[i] for i in nums]
    135 
    136     def __getstate__(self):

KeyError: tensor(219)

@bfarzin
Author

bfarzin commented Jun 7, 2019

I am not too sure. I have moved past this example and now have better code for the custom tokenizer (it can be saved, and it applies EncodeAsPieces, which returns the sub-word pieces rather than their IDs, i.e. the numericalization). See if this helps at all or if you get the same errors:

class SPTokenizer(BaseTokenizer):
    "Wrapper around a SentncePiece tokenizer to make it a `BaseTokenizer`."
    def __init__(self, model_prefix:str):
        self.tok = spm.SentencePieceProcessor()
        self.tok.load(f'{model_prefix}.model')

    def tokenizer(self, t:str) -> List[str]:
        return self.tok.EncodeAsPieces(t)
    
class CustomTokenizer():
    '''Wrapper for SentencePiece tokenizer to fit into Fast.ai V1'''
    def __init__(self,tok_func:Callable,model_prefix:str, pre_rules:ListRules=None):
        self.tok_func,self.model_prefix = tok_func,model_prefix
        self.pre_rules  = ifnone(pre_rules,  defaults.text_pre_rules )
        
    def __repr__(self) -> str:
        res = f'Tokenizer {self.tok_func.__name__} using `{self.model_prefix}` model with the following rules:\n'
        for rule in self.pre_rules: res += f' - {rule.__name__}\n'
        return res        

    def process_text(self, t:str,tok:BaseTokenizer) -> List[str]:
        "Processe one text `t` with tokenizer `tok`."
        for rule in self.pre_rules: t = rule(t)  
        toks = tok.tokenizer(t)
        #post rules?
        return toks 
    
    def _process_all_1(self,texts:Collection[str]) -> List[List[str]]:
        'Process a list of `texts` in one process'
        tok = self.tok_func(self.model_prefix)
        return [self.process_text(t,tok) for t in texts]
                                                                     
    def process_all(self, texts:Collection[str]) -> List[List[str]]: 
        "Process a list of `texts`."                                 
        return self._process_all_1(texts)
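
To illustrate the pieces-vs-IDs point, a quick sketch (assuming sp is a loaded SentencePieceProcessor, as in the gist above):

sample = 'The game began development in 2010'
print(sp.EncodeAsPieces(sample))  # sub-word strings, which fastai's Vocab then numericalizes
print(sp.EncodeAsIds(sample))     # SentencePiece's own IDs, not used in this setup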

@tanny411

tanny411 commented Jun 7, 2019

I believe this is how I should use it:
mycust_tok = CustomTokenizer(SPTokenizer,model_prefix)
But I still have the error. Can you help with the full modified code?

@bfarzin
Author

bfarzin commented Jun 7, 2019

itos was wrong also. I updated the example above.
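For reference, the updated itos is built as a plain list read straight from the .vocab file (this mirrors the cell in the notebook above; the line order gives each piece its index):

itos = []
with open(f'{model_prefix}.vocab', 'r') as f:
    for line in f:
        itos.append(line.split("\t")[0])  # first column of each line is the piece itself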

@tanny411

tanny411 commented Jun 7, 2019

Thank you so much. Sorry I didn't notice the change earlier. It works. Much appreciated.

@bfarzin
Author

bfarzin commented Jun 8, 2019

No problem. I am glad I cleaned it up for my own good!
