ohmeow/_TextProcessor-dev.ipynb Secret

## _TextProcessor-dev.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import html, pdb\n",
    "import os, re\n",
    "import dill as pickle\n",
    "from collections import Counter, defaultdict\n",
    "from itertools import chain\n",
    "from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "\n",
    "import spacy\n",
    "from spacy.symbols import ORTH\n",
    "spacy_en = spacy.load('en')\n",
    "spacy_es = spacy.load('es')\n",
    "\n",
    "# pandas and plotting config\n",
    "pd.set_option('display.max_rows', 1000)\n",
    "pd.set_option('display.max_columns', 1000)\n",
    "pd.set_option('display.max_colwidth', -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def num_cpus():\n",
    "    try:\n",
    "        return len(os.sched_getaffinity(0))\n",
    "    except AttributeError:\n",
    "        return os.cpu_count()\n",
    "    \n",
    "def partition(a, sz): \n",
    "    \"\"\"splits iterables a in equal parts of size sz\"\"\"\n",
    "    return [a[i:i+sz] for i in range(0, len(a), sz)]\n",
    "\n",
    "def partition_by_cores(a):\n",
    "    return partition(a, len(a)//num_cpus() + 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "code_folding": [
     0,
     38,
     105,
     138
    ]
   },
   "outputs": [],
   "source": [
    "class StringCleaner():\n",
    "    # get rid of multiple spaces\n",
    "    re_spaces = re.compile(r'  +')            \n",
    "    \n",
    "    def __init__(self, str_repls_dict={}, re_repls_dict={}, regrp_repls_dict={}):\n",
    "        self.str_repls = str_repls_dict\n",
    "        self.re_repls = re_repls_dict\n",
    "        self.regrp_repls = regrp_repls_dict\n",
    "        \n",
    "        \n",
    "    def clean(self, x):\n",
    "        # replace based on regexs (keeping case)\n",
    "        for k, v in self.re_repls.items(): x = StringCleaner.re_replace(k, v, x)\n",
    "        # replace based on strings\n",
    "        for k, v in self.str_repls.items(): x = x.replace(k, v)\n",
    "        # replace regexs based on regex groups (e.g., r'\\1 \\2 ')\n",
    "        for k, v in self.regrp_repls.items(): x = re.sub(k, v, x)\n",
    "            \n",
    "        x = StringCleaner.re_spaces.sub(' ', html.unescape(x))\n",
    "\n",
    "        return x\n",
    "    \n",
    "    \n",
    "    @staticmethod\n",
    "    def re_replace(word, replacement, text):\n",
    "        # does regex replace making the substitution the same case\n",
    "        def func(match):\n",
    "            g = match.group()\n",
    "            if g.islower(): return replacement.lower()\n",
    "            if g.istitle(): return replacement.title()\n",
    "            if g.isupper(): return replacement.upper()\n",
    "            return replacement      \n",
    "\n",
    "        return re.sub(word, func, text, flags=re.I)\n",
    "    \n",
    "    @staticmethod\n",
    "    def appos_re_repls():\n",
    "        # https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view : apostrophe lookup dict\n",
    "        repls = {\n",
    "            r\"\\baren't\\b\" : \"are not\",\n",
    "            r\"\\bcan't\\b\" : \"cannot\",\n",
    "            r\"\\bcouldn't\\b\" : \"could not\",\n",
    "            r\"\\bdidn't\\b\" : \"did not\",\n",
    "            r\"\\bdoesn't\\b\" : \"does not\",\n",
    "            r\"\\bdon't\\b\" : \"do not\",\n",
    "            r\"\\bhadn't\\b\" : \"had not\",\n",
    "            r\"\\bhasn't\\b\" : \"has not\",\n",
    "            r\"\\bhaven't\\b\" : \"have not\",\n",
    "            r\"\\bhe'd\\b\" : \"he would\",\n",
    "            r\"\\bhe'll\\b\" : \"he will\",\n",
    "            r\"\\bhe's\\b\" : \"he is\",\n",
    "            r\"\\bi'd\\b\" : \"I would\",\n",
    "            r\"\\bi'd\\b\" : \"I had\",\n",
    "            r\"\\bi'll\\b\" : \"I will\",\n",
    "            r\"\\bi'm\\b\" : \"I am\",\n",
    "            r\"\\bisn't\\b\" : \"is not\",\n",
    "            r\"\\bits\\b\" : \"it is\",\n",
    "            r\"\\bit's\\b\" : \"it is\",\n",
    "            r\"\\bit'll\\b\" : \"it will\",\n",
    "            r\"\\bi've\\b\" : \"I have\",\n",
    "            r\"\\blet's\\b\" : \"let us\",\n",
    "            r\"\\bmightn't\\b\" : \"might not\",\n",
    "            r\"\\bmustn't\\b\" : \"must not\",\n",
    "            r\"\\bshan't\\b\" : \"shall not\",\n",
    "            r\"\\bshe'd\\b\" : \"she would\",\n",
    "            r\"\\bshe'll\\b\" : \"she will\",\n",
    "            r\"\\bshe's\\b\" : \"she is\",\n",
    "            r\"\\bshouldn't\\b\" : \"should not\",\n",
    "            r\"\\bthat's\\b\" : \"that is\",\n",
    "            r\"\\bthere's\\b\" : \"there is\",\n",
    "            r\"\\bthey'd\\b\" : \"they would\",\n",
    "            r\"\\bthey'll\\b\" : \"they will\",\n",
    "            r\"\\bthey're\\b\" : \"they are\",\n",
    "            r\"\\bthey've\\b\" : \"they have\",\n",
    "            r\"\\bwe'd\\b\" : \"we would\",\n",
    "            r\"\\bwe're\\b\" : \"we are\",\n",
    "            r\"\\bweren't\\b\" : \"were not\",\n",
    "            r\"\\bwe've\\b\" : \"we have\",\n",
    "            r\"\\bwhat'll\\b\" : \"what will\",\n",
    "            r\"\\bwhat're\\b\" : \"what are\",\n",
    "            r\"\\bwhat's\\b\" : \"what is\",\n",
    "            r\"\\bwhat've\\b\" : \"what have\",\n",
    "            r\"\\bwhere's\\b\" : \"where is\",\n",
    "            r\"\\bwho'd\\b\" : \"who would\",\n",
    "            r\"\\bwho'll\\b\" : \"who will\",\n",
    "            r\"\\bwho're\\b\" : \"who are\",\n",
    "            r\"\\bwho's\\b\" : \"who is\",\n",
    "            r\"\\bwho've\\b\" : \"who have\",\n",
    "            r\"\\bwon't\\b\" : \"will not\",\n",
    "            r\"\\bwouldn't\\b\" : \"would not\",\n",
    "            r\"\\byou'd\\b\" : \"you would\",\n",
    "            r\"\\byou'll\\b\" : \"you will\",\n",
    "            r\"\\byou're\\b\" : \"you are\",\n",
    "            r\"\\byou've\\b\" : \"you have\",\n",
    "            r\"\\b're\\b\" : \" are\",\n",
    "            r\"\\bwasn't\\b\" : \"was not\",\n",
    "            r\"\\bwe'll\\b\" : \"will\",\n",
    "            r\"\\bdidn't\\b\" : \"did not\",\n",
    "            r\"\\btryin'\\b\" : \"trying\"\n",
    "        }\n",
    "        return repls\n",
    "    \n",
    "    @staticmethod\n",
    "    def emoji_str_repls():\n",
    "        # based on https://www.kaggle.com/prashantkikani/pooled-gru-with-preprocessing\n",
    "        repls = {\n",
    "            \"&lt;3\": \" love \",\n",
    "            \":]\" : \" happy \",\n",
    "            \"=)\" : \" happy \",\n",
    "            \"8)\": \" happy \",\n",
    "            \":-)\": \" happy \",\n",
    "            \":)\": \" happy \",\n",
    "            \"(-:\": \" happy \",\n",
    "            \"(:\": \" happy \",\n",
    "            \":&gt;\": \" happy \",\n",
    "            \":')\": \" happy \",\n",
    "            \"(:\" : \" happy \",\n",
    "            \":d\": \" laughing \",\n",
    "            \":dd\": \" laughing \",\n",
    "            \";-)\" : \" wink \",\n",
    "            \";)\": \" wink \",\n",
    "            \":p\": \" playful \",\n",
    "            \":o\" : \" surprise \",\n",
    "            \":-(\": \" sad \",\n",
    "            \":(\": \" sad \",\n",
    "            \"=(\" : \" sad \",\n",
    "            \"):\" : \" sad \",\n",
    "            \":/\": \" skeptical \",\n",
    "            \":s\": \" skeptical \",\n",
    "            \":-s\": \" skeptical \",\n",
    "            \"^^\": \" nervous \",\n",
    "            \"^_^\": \" nervous \",\n",
    "            \"-_-\" : \" shame \",\n",
    "        }\n",
    "        return repls\n",
    "        \n",
    "    @staticmethod\n",
    "    def weirdchar_str_repls():\n",
    "        repls = {\n",
    "            \"#39;\" : \"'\",   \n",
    "            'amp;' : '&',   \n",
    "            '#146;' : \"'\",   \n",
    "            'nbsp;' : ' ',   \n",
    "            '#36;' : '$',   \n",
    "            '\\\\n' : \"\\n\",   \n",
    "            'quot;' : \"'\",   \n",
    "            '’' : \"'\",   \n",
    "            \"´\" : \"'\",\n",
    "            \"`\" : \"'\",\n",
    "            '`' : \"'\", \n",
    "            '´' : \"'\", \n",
    "            '“' : '\"',   \n",
    "            '”' : '\"',   \n",
    "            '<br />' : \"\\n\",   \n",
    "            '\\\\\"' : '\"',   \n",
    "            '<unk>' : 'u_n',   \n",
    "            ' @.@ ' : '.',   \n",
    "            ' @-@ ' : '-',   \n",
    "            '\\\\' : ' \\\\ ',   \n",
    "            '•' : '-'\n",
    "        }\n",
    "        return repls\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "code_folding": [
     0
    ]
   },
   "outputs": [],
   "source": [
    "class Tokenizer():\n",
    "    # regexs\n",
    "    re_rep = re.compile(r'(\\S)(\\1{3,})')               # => repeated characters\n",
    "    re_word_rep = re.compile(r'(\\b\\w+\\W+)(\\1{3,})')    # => repeated words\n",
    "    re_br = re.compile(r'<\\s*br\\s*/?>', re.IGNORECASE) # => <br> tag\n",
    "    \n",
    "    def __init__(self, lang='en', special_toks:dict=None, \n",
    "                 annotate_caps=True, annotate_char_rep=True, annotate_word_rep=True):\n",
    "        \n",
    "        # load language specific spacy\n",
    "        self.tok = spacy.load(lang)\n",
    "        \n",
    "        # this is the default token set\n",
    "        self.special_toks = special_toks or {'unk': '_unk_', 'pad': '_pad_', 'bos': '_bos_', 'eos': '_eos_'}\n",
    "        \n",
    "        self.annotate_caps = annotate_caps\n",
    "        self.annotate_char_rep = annotate_char_rep\n",
    "        self.annotate_word_rep = annotate_word_rep\n",
    "        \n",
    "    @property\n",
    "    def special_toks(self): return self._special_toks\n",
    "    \n",
    "    @special_toks.setter\n",
    "    def special_toks(self, value:dict):\n",
    "        if ('unk' not in value or 'pad' not in value):\n",
    "            raise ValueError(\"Must include both 'unk' and 'pad' token replacements\")\n",
    "            \n",
    "        self._special_toks = value\n",
    "        for w in list(self._special_toks.values()):\n",
    "            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])\n",
    "             \n",
    "    def add_special_tok(self, tok=dict):\n",
    "        self.special_toks.update(tok)\n",
    "        \n",
    "        \n",
    "    def spacy_tok(self, x):\n",
    "        return [ t.text for t in self.tok.tokenizer(Tokenizer.replace_br(x)) ]\n",
    "    \n",
    "    def proc_text(self, s):\n",
    "        if (self.annotate_char_rep): s = re.sub(Tokenizer.re_rep, Tokenizer.replace_rep, s)\n",
    "        if (self.annotate_word_rep): s = re.sub(Tokenizer.re_word_rep, Tokenizer.replace_wrep, s)\n",
    "        if (self.annotate_caps): s = Tokenizer.annotate_caps(s)\n",
    "        \n",
    "        s = re.sub(r'([/#])', r' \\1 ', s)\n",
    "        s = re.sub(' {2,}', ' ', s)\n",
    "        \n",
    "        return self.spacy_tok(s)\n",
    "\n",
    "    \n",
    "    @classmethod\n",
    "    def replace_br(self, x): \n",
    "        return self.re_br.sub(\"\\n\", x)\n",
    "    \n",
    "    @classmethod\n",
    "    def replace_rep(cls, m):\n",
    "        TK_REP = 'tk_rep'\n",
    "        c, cc = m.groups()\n",
    "        return f' {TK_REP} {len(cc)+1} {c} '\n",
    "\n",
    "    @classmethod\n",
    "    def replace_wrep(cls, m):\n",
    "        TK_WREP = 'tk_wrep'\n",
    "        c, cc = m.groups()\n",
    "        return f' {TK_WREP} {len(cc.split())+1} {c} '\n",
    "\n",
    "    @classmethod\n",
    "    def annotate_caps(cls, ss):\n",
    "        TOK_UP = ' t_up '\n",
    "        res = []\n",
    "        prev='.'\n",
    "        \n",
    "        re_word = re.compile('\\w')\n",
    "        re_nonsp = re.compile('\\S')\n",
    "        \n",
    "        for s in re.findall(r'\\w+|\\W+', ss):\n",
    "            res += ([TOK_UP,s.lower()] if (s.isupper() and (len(s)>2)) else [s.lower()])\n",
    "        return ''.join(res)\n",
    "\n",
    "\n",
    "    @classmethod\n",
    "    def proc_all(cls, ss, lang, special_toks:dict=None):\n",
    "        tok = cls(lang, special_toks)\n",
    "        return [tok.proc_text(s) for s in ss]\n",
    "\n",
    "    @classmethod\n",
    "    def proc_all_mp(cls, ss, lang='en', special_toks:dict=None, ncpus = None):\n",
    "        ncpus = ncpus or num_cpus()//2\n",
    "        with ProcessPoolExecutor(ncpus) as e:\n",
    "            return sum(e.map(cls.proc_all, ss, [lang]*len(ss), [special_toks]*len(ss)), [])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "code_folding": [
     0
    ]
   },
   "outputs": [],
   "source": [
    "class Vocab:\n",
    "    def __init__(self, min_freq=1, max_size=None, special_toks:dict=None):\n",
    "        \n",
    "        self.min_freq = max(min_freq, 1)\n",
    "        self.max_size = max_size\n",
    "        self.itos = []\n",
    "        \n",
    "        self.special_toks = special_toks or {'unk': '_unk_', 'pad': '_pad_', 'bos': '_bos_', 'eos': '_eos_'}\n",
    "            \n",
    "        \n",
    "    @property\n",
    "    def special_toks(self): return self._special_toks\n",
    "    \n",
    "    @special_toks.setter\n",
    "    def special_toks(self, value:dict): \n",
    "        if ('unk' not in value or 'pad' not in value):\n",
    "            raise ValueError(\"Must include both 'unk' and 'pad' token replacements\")\n",
    "            \n",
    "        self._special_toks = value\n",
    "\n",
    "    def add_special_tok(self, tok=dict):\n",
    "        self.special_toks.update(tok)\n",
    "        \n",
    "    def get_special_idx(self, key):\n",
    "        return list(self.special_toks.keys()).index(key)\n",
    "            \n",
    "            \n",
    "    def build(self, tokens):\n",
    "        self.itos = list(self.special_toks.values())\n",
    "        self.max_size = None if self.max_size is None else self.max_size + len(self.itos)\n",
    "        \n",
    "        self.token_freqs = Counter(tokens)\n",
    "        for t in list(self.special_toks.values()): del self.token_freqs[t]\n",
    "            \n",
    "        # itos (sorted by freq, alpha)\n",
    "        sorted_freqs = sorted(self.token_freqs.most_common(self.max_size), key=lambda i: (-i[1], i[0]))\n",
    "        self.itos += [ t for t, c in sorted_freqs if c > self.min_freq ]\n",
    "        \n",
    "        # stoi\n",
    "        self.stoi = defaultdict(lambda: self.get_special_idx('unk'), { tok:i for i, tok in enumerate(self.itos) })\n",
    "        \n",
    "    def __len__(self):\n",
    "        return len(self.itos)\n",
    "    \n",
    "    def token_freq(self, token):\n",
    "        return self.token_freqs.get(token, 0)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "code_folding": [
     0
    ]
   },
   "outputs": [],
   "source": [
    "class TextProcessor():\n",
    "    \n",
    "    def __init__(self, txt_cols, lbl_cols=[], lbl_dtype=np.int64, lang='en', special_toks:dict=None,\n",
    "                 fld_tok='_xfld_', include_leading_nl=False, include_bos_tok=False, include_eos_tok=False,\n",
    "                 cleaner: StringCleaner=None, tokenizer: Tokenizer=None, vocab: Vocab=None):\n",
    "        \n",
    "        self.lang = lang\n",
    "        self.txt_cols, self.lbl_cols, self.lbl_dtype = txt_cols, lbl_cols, lbl_dtype\n",
    "        \n",
    "        self.fld_tok = fld_tok\n",
    "        self.special_toks = special_toks or {'unk': '_unk_', 'pad': '_pad_', 'bos': '_bos_', 'eos': '_eos_'}\n",
    "        if (self.fld_tok != None): self.special_toks.update({'fld': fld_tok})\n",
    "            \n",
    "        self.include_leading_nl = include_leading_nl\n",
    "        self.include_fld_tok = self.fld_tok != None\n",
    "        self.include_bos_tok = include_bos_tok\n",
    "        self.include_eos_tok = include_eos_tok\n",
    "        \n",
    "        # 1. define string cleaning/pre-processing \n",
    "        str_repls, re_repls = {** StringCleaner.weirdchar_str_repls()}, {}\n",
    "        self.cleaner = StringCleaner(str_repls, re_repls) if (cleaner == None) else cleaner\n",
    "        \n",
    "        # 2. define tokenizer\n",
    "        self.tokenizer = Tokenizer if (tokenizer == None) else tokenizer\n",
    "        \n",
    "        # 3. build vocab\n",
    "        self.vocab = Vocab(2, 60000, self.special_toks) if (vocab == None) else vocab\n",
    "        \n",
    "        \n",
    "    @property\n",
    "    def special_toks(self): return self._special_toks\n",
    "    \n",
    "    @special_toks.setter\n",
    "    def special_toks(self, value:dict):\n",
    "        if ('unk' not in value or 'pad' not in value):\n",
    "            raise ValueError(\"Must include both 'unk' and 'pad' token replacements\")\n",
    "            \n",
    "        self._special_toks = value\n",
    "        \n",
    "        \n",
    "    def fit(self, data, rebuild_vocab=False):\n",
    "        is_mult_datasets = isinstance(data, list)\n",
    "        \n",
    "        tok_docs, doc_lbls = self.__preprocess(data, is_mult_datasets)\n",
    "        \n",
    "        # build vocab if necessary\n",
    "        if (len(self.vocab) == 0 or rebuild_vocab == True): self.__build_vocab(tok_docs, is_mult_datasets)        \n",
    "        return tok_docs, doc_lbls\n",
    "    \n",
    "    def fit_transform(self, data, rebuild_vocab=False):\n",
    "        is_mult_datasets = isinstance(data, list)\n",
    "        \n",
    "        tok_docs, doc_lbls = self.fit(data, rebuild_vocab)\n",
    "        return self.__numericalize(tok_docs, is_mult_datasets), tok_docs, doc_lbls\n",
    "    \n",
    "    def transform(self, data, rebuild_vocab=False):\n",
    "        is_mult_datasets = isinstance(data, list)\n",
    "        \n",
    "        tok_docs, doc_lbls = self.__preprocess(data, is_mult_datasets)\n",
    "        \n",
    "        # build vocab if necessary\n",
    "        if (rebuild_vocab == True): self.__build_vocab(tok_docs, is_mult_datasets) \n",
    "        return self.__numericalize(tok_docs, is_mult_datasets), tok_docs, doc_lbls\n",
    "        \n",
    "    \n",
    "    # private methods\n",
    "    def __build_vocab(self, tok_docs, is_mult_datasets):\n",
    "        all_toks = []\n",
    "        if (is_mult_datasets):\n",
    "            for doc_set in tok_docs:\n",
    "                for doc in doc_set: all_toks += doc\n",
    "        else:\n",
    "            for doc in tok_docs: all_toks += doc\n",
    "\n",
    "        self.vocab.build(all_toks)\n",
    "        \n",
    "    def __numericalize(self, tok_docs, is_mult_datasets):\n",
    "        if (is_mult_datasets):\n",
    "            num_doc_sets = []\n",
    "            for doc_set in tok_docs:\n",
    "                num_doc_sets.append([[ self.vocab.stoi[tok] for tok in d] for d in doc_set ])\n",
    "            return num_doc_sets\n",
    "        else:\n",
    "            return [[ self.vocab.stoi[tok] for tok in d] for d in tok_docs ]\n",
    "    \n",
    "    def __preprocess(self, data, is_mult_datasets):\n",
    "        tok_docs, doc_lbls = [], []\n",
    "        \n",
    "        # data is a DataFrame\n",
    "        if (isinstance(data, pd.DataFrame)):\n",
    "            _tok_docs, _doc_lbls = self.__proc_df(data)\n",
    "            tok_docs += _tok_docs\n",
    "            doc_lbls += _doc_lbls     \n",
    "            \n",
    "            return tok_docs, doc_lbls\n",
    "        \n",
    "        # data is a pandas TextFileReader\n",
    "        if (isinstance(data, pd.io.parsers.TextFileReader)):\n",
    "            for i, df in enumerate(data):\n",
    "                _tok_docs, _doc_lbls = self.__proc_df(df)\n",
    "                tok_docs += _tok_docs\n",
    "                doc_lbls += _doc_lbls\n",
    "            \n",
    "            return tok_docs, doc_lbls\n",
    "        \n",
    "        # data is a list of DataFrames\n",
    "        if (isinstance(data, list) and isinstance(data[0], pd.DataFrame)):\n",
    "            for df in data:\n",
    "                tmp_tok_docs, tmp_doc_lbls = [], []\n",
    "                _tok_docs, _doc_lbls = self.__proc_df(df)\n",
    "                tmp_tok_docs += _tok_docs\n",
    "                tmp_doc_lbls += _doc_lbls\n",
    "                    \n",
    "            return tok_docs.append(tmp_tok_docs), doc_lbls.append(tmp_doc_lbls)\n",
    "            \n",
    "        # data is a list of pandas TextFileReaders\n",
    "        if (isinstance(data, list) and isinstance(data[0], pd.io.parsers.TextFileReader)):\n",
    "            for tfr in data:\n",
    "                tmp_tok_docs, tmp_doc_lbls = [], []\n",
    "                for i, df in enumerate(tfr):\n",
    "                    _tok_docs, _doc_lbls = self.__proc_df(df)\n",
    "                    tmp_tok_docs += _tok_docs\n",
    "                    tmp_doc_lbls += _doc_lbls\n",
    "                    \n",
    "                tok_docs.append(tmp_tok_docs)\n",
    "                doc_lbls.append(tmp_doc_lbls)\n",
    "                \n",
    "            return tok_docs, doc_lbls\n",
    "        \n",
    "              \n",
    "    def __proc_df(self, df):\n",
    "        n_txt_cols = len(self.txt_cols)\n",
    "        n_label_cols = len(self.lbl_cols)\n",
    "\n",
    "        doc_lbls = df[self.lbl_cols].values.astype(self.lbl_dtype) if (n_label_cols > 0) else []\n",
    "        \n",
    "        lead_tok = '\\n' if (self.include_leading_nl) else ''\n",
    "        bos_tok = self.special_toks[\"bos\"] if (self.include_eos_tok != None) else ''\n",
    "        fld_tok = f' {self.fld_tok} ' if (self.fld_tok != None) else ''\n",
    "        eos_tok = self.special_toks[\"eos\"] if (self.include_eos_tok != None) else ''\n",
    "\n",
    "        docs = f'{lead_tok}{bos_tok}{fld_tok}{\" 1 \" if (fld_tok != \"\") else \"\"}' + df[self.txt_cols[0]].astype(str)\n",
    "        for i, col in enumerate(self.txt_cols[1:]):\n",
    "            docs += f'{fld_tok}{i+2 if (fld_tok != \"\") else \"\"}' + df[col].astype(str)\n",
    "\n",
    "        docs = docs.apply(self.cleaner.clean).values.astype(str)\n",
    "        doc_toks = self.tokenizer.proc_all_mp(partition_by_cores(docs), self.lang, self.special_toks)\n",
    "\n",
    "        return doc_toks, list(doc_lbls)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATH = Path('data/verbatims')\n",
    "\n",
    "LM_PATH = PATH/'lm'\n",
    "CLS_PATH = PATH/'class'\n",
    "\n",
    "(LM_PATH/'models').mkdir(parents=True, exist_ok=True)\n",
    "(LM_PATH/'tmp').mkdir(exist_ok=True)\n",
    "\n",
    "(CLS_PATH/'models').mkdir(parents=True, exist_ok=True)\n",
    "(CLS_PATH/'tmp').mkdir(exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "# dataframe config\n",
    "verbatims_filename = 'verbatims.csv'\n",
    "\n",
    "txt_dtypes = { \n",
    "    'AnswerText': str, 'AnswerText_Cleaned': str, 'AnswerText_NonEnglish': str\n",
    "}\n",
    "sent_dtypes = { \n",
    "    'OverallSentiment': int, 'IsVeryPositive': int, 'IsPositive': int, 'IsVeryNegative': int, 'IsNegative' : int, \n",
    "    'IsSuggestion' : int, 'FeelsThreatened' : int, 'HasProfanity' : int, 'IsNonsense' : int \n",
    "}\n",
    "ent_dtypes = { \n",
    "    'HasPersonsName': int, 'HasOrgName': int, 'HasContactInfo': int\n",
    "}\n",
    "\n",
    "date_cols = ['LastTaggedOn']\n",
    "\n",
    "dtypes = {**txt_dtypes, **sent_dtypes, **ent_dtypes}\n",
    "\n",
    "\n",
    "# columns for text, labels, and classes\n",
    "TXT_COLS = ['AnswerText', 'AnswerText_Cleaned']\n",
    "\n",
    "LABELS_SENT = list(sent_dtypes.keys())\n",
    "LABELS_ENT = list(ent_dtypes.keys())\n",
    "LABELS = LABELS_SENT + LABELS_ENT\n",
    "\n",
    "CLASSES = [['Very Negative', 'Negative', 'Neutral', 'Positive', 'VeryPositive'], ['no', 'yes']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configure TextProcessor\n",
    "\n",
    "You can configure via a combination of using the class constructor's parameters and subclassing any of the dependent classes required to prepare the text for machine learning (e.g. StringCleaner, Tokenizer, Vocab) and overriding the appropriate methods.\n",
    "\n",
    "Here I'm adding a few regular expressions and their substition based on matched groups to my StringCleaner instance, as well as the \"weird character\" string replacements made available via a classmethod"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # try to handle places where a new sentence doesn't begin with a space (e.g., I like dogs.I like cats)\n",
    "# # without breaking apart things like urls and emails\n",
    "# re_sentend = re.compile(r'(?<!www)\\.((?!com|edu|org|net|m\\b)[a-zA-Z]+)(?!(@|\\.(com|edu|org|net)))\\b') \n",
    "\n",
    "# # separate hyphen|tilde if it is at beginning of letter/digit\n",
    "# re_hypword = re.compile(r'\\s(\\-+|~+)([a-zA-Z0-9])')\n",
    "\n",
    "# # ensure am|pm is considered it own token (7:00pm > 7:00 pm, 7am-10pm > 7 am - 10 pm))\n",
    "# re_ampm = re.compile(r'(\\d+)(am|pm|am\\-|pm\\-|a\\.m\\.|p\\.m\\.|a\\.m\\.\\-|p\\.m\\.\\-)')  \n",
    "\n",
    "regrp_repls = {\n",
    "    r'(\\d+)(am|pm|am\\-|pm\\-|a\\.m\\.|p\\.m\\.|a\\.m\\.\\-|p\\.m\\.\\-)' : r' \\1 \\2',\n",
    "    r'(?<!www)\\.((?!com|edu|org|net|m\\b)[a-zA-Z]+)(?!(@|\\.(com|edu|org|net)))\\b' : r'. \\1 ',\n",
    "    r'\\s(\\-+|~+)([a-zA-Z0-9])' : r' \\1 \\2'\n",
    "}\n",
    "\n",
    "str_repls = { ** StringCleaner.weirdchar_str_repls() }\n",
    "\n",
    "sc = StringCleaner(str_repls_dict=str_repls, regrp_repls_dict=regrp_repls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "tp = TextProcessor(['AnswerText'], cleaner=sc, include_leading_nl=True, include_bos_tok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Building a vocab based on both training and validation data\n",
    "\n",
    "We'll build the vocab of both LM training and validation datasets and get both the numericalized and tokenized documents.\n",
    "\n",
    "If you don't want the numericalized documents, you can just call `fit()` to build the vocab and get the tokenized data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunksize = 24000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 21.1 s, sys: 16.9 s, total: 38 s\n",
      "Wall time: 1min 53s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "trn_df = pd.read_csv(LM_PATH/'train.csv', chunksize=chunksize)\n",
    "val_df = pd.read_csv(LM_PATH/'test.csv', chunksize=chunksize)\n",
    "\n",
    "num_docs, tok_docs, _ = tp.fit_transform([trn_df, val_df])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27187, 359080, 359080, 39898, 39898)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tp.vocab), len(num_docs[0]), len(tok_docs[0]), len(num_docs[1]), len(tok_docs[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27187,\n",
       " ['_unk_', '_pad_', '_bos_', '_eos_', '_xfld_', '.', '1', '\\n', 'the', 'to'])"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = tp.vocab\n",
    "len(vocab), vocab.itos[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use an existing vocab to numericalize other documents\n",
    "\n",
    "Here we use the vocab learned from the LM datasets to numericalize the classification datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "trn_df = pd.read_csv(CLS_PATH/'train.csv', chunksize=chunksize)\n",
    "val_df = pd.read_csv(CLS_PATH/'test.csv', chunksize=chunksize)\n",
    "\n",
    "vocab = tp.vocab\n",
    "\n",
    "cls_tp = TextProcessor(['AnswerText'], LABELS_SENT, lbl_dtype=np.float32, vocab=vocab)\n",
    "trn_num_docs, trn_tok_docs, trn_doc_lbls = cls_tp.transform(trn_df)\n",
    "val_num_docs, val_tok_docs, val_doc_lbls = cls_tp.transform(val_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27187,\n",
       " ['_unk_', '_pad_', '_bos_', '_eos_', '_xfld_', '.', '1', '\\n', 'the', 'to'])"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(cls_tp.vocab), cls_tp.vocab.itos[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13692 1522 13692 1522\n",
      "(9,) (9,)\n"
     ]
    }
   ],
   "source": [
    "print(len(trn_num_docs), len(val_num_docs), len(trn_tok_docs), len(val_tok_docs))\n",
    "print(trn_doc_lbls[0].shape, val_doc_lbls[0].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['_bos_', '_xfld_', '1', 'maybe', 'more', 'information', 'on', 'rec', 'sports', 'like', 'dodgeball', 'and', 'all', 'those', 'other', 'leagues', '/', 'groups', '.']\n"
     ]
    }
   ],
   "source": [
    "print([ cls_tp.vocab.itos[i] for i in trn_num_docs[0] ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['_bos_', '_xfld_', '1', 'maybe', 'more', 'information', 'on', 'rec', 'sports', 'like', 'dodgeball', 'and', 'all', 'those', 'other', 'leagues', '/', 'groups', '.']\n"
     ]
    }
   ],
   "source": [
    "print(trn_tok_docs[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3. 0. 0. 0. 0. 1. 0. 0. 0.]\n"
     ]
    }
   ],
   "source": [
    "print(trn_doc_lbls[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Various ways you can use a TextProcessor instance\n",
    "\n",
    "You can fit{transform} a dataframe, a TextFileReader, a list of dataframes, or a list of TextFileReaders.  In the case of the later two options, each item returned will instead be a list that contains an item for every dataset included."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "_tp = TextProcessor(['AnswerText'])\n",
    "\n",
    "# using TextFileReader(s)\n",
    "trn_df = pd.read_csv(CLS_PATH/'train.csv', chunksize=chunksize)\n",
    "num_docs, tok_docs = _tp.fit(trn_df)\n",
    "\n",
    "trn_df = pd.read_csv(CLS_PATH/'train.csv', chunksize=chunksize)\n",
    "val_df = pd.read_csv(CLS_PATH/'test.csv', chunksize=chunksize)\n",
    "num_docs, tok_docs = _tp.fit([trn_df, val_df])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "_tp = TextProcessor(['AnswerText'])\n",
    "\n",
    "# using DataFrame(s)\n",
    "trn_df = pd.read_csv(CLS_PATH/'train.csv')\n",
    "num_docs, tok_docs = _tp.fit(trn_df)\n",
    "\n",
    "val_df = pd.read_csv(CLS_PATH/'test.csv')\n",
    "num_docs, tok_docs =_tp.fit([trn_df, val_df])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}