{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import pdb, math, requests, json\n",
"from pathlib import Path\n",
"from shutil import copyfile, copyfileobj\n",
"\n",
"from fastai.text import *\n",
"from fastprogress import master_bar, progress_bar\n",
"\n",
"from sacremoses import MosesTokenizer, MosesDetokenizer\n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"LANG = 'en'"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = Path('data')\n",
"\n",
"DUMP_DIR = DATA_DIR/'wiki_dumps'\n",
"EXTRACT_DIR = DATA_DIR/'wiki_extracts'/LANG\n",
"\n",
"WIKI_DIR = DATA_DIR/'wiki'/LANG\n",
"TRAINING_TEXT_DIR = WIKI_DIR/'txt'\n",
"TRAINING_CSV_PATH = WIKI_DIR/\"wiki.csv\"\n",
"\n",
"# ensure directories exist for training data\n",
"WIKI_DIR.mkdir(parents=True, exist_ok=True)\n",
"TRAINING_TEXT_DIR.mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# rules\n",
"\n",
"#def rm_escaped_quotes(t:str) -> str: return t.replace(\"\\\\'\",\"'\").replace('\\\\\"','\"')\n",
"def rm_empty_quotes(t:str) -> str: \n",
" return re.sub(\"[\\\"'][\\\"']+\",\"\",t)\n",
"\n",
"def rm_empty_lists(t:str) -> str:\n",
" t= re.sub(\"\\s*[,;][\\s,;]+\", \", \", t) # replace \", ,, ;\" or \",, ,\" or \", ,\" with \",\"\n",
" t= re.sub(\"\\([\\s,;]\",\"(\", t) # replace (, or ( , with (\n",
" t= re.sub(\"[\\s,;]\\)\",\")\", t) # replace ,) or ,) with )\n",
" #t= re.sub(\"\\([\\s,;]+\",\"(\", t) # replace (, or ( , with (\n",
" #t= re.sub(\"[\\s,;]+\\)\",\")\", t) # replace ,) or ,) with )\n",
" t= re.sub(\"\\([^\\d\\w\\r\\n]*\\)\", \"\", t) # remove () ( ) ( ) (, ) (, ) (,,, ; ) but leave any parantese with a letter or number\n",
" return t\n",
"\n",
"def lower(t:str) -> str: return t.lower()\n",
"\n",
"def trim(t:str) -> str: return t.strip()\n",
"\n",
"def extract_link_title(t:str) -> str: \n",
" \"the below evaluation order is important\"\n",
" def last_pip(m):\n",
" gt = m.group(1)\n",
" last = gt.rfind('|')\n",
" return gt if last==-1 else gt[last+1:]\n",
"\n",
" t=re.sub('\\[\\[(?:[^:]*:[^\\]]*)\\]\\]', '', t) #remove [[file:blabala]], [[image:blabala]] etc\n",
" t=re.sub('\\[\\[([^\\]\\[:]+)\\]\\]', last_pip, t) #replace [[samething|othering]] or [[otherthing]] with otherthing\n",
" return re.sub('(?:[\\]\\[])', \"\", t) #remove the remaning [[, [, ], ]] \n",
"\n",
"def rm_stray_tags(t:str) -> str: \n",
" #return re.sub(\"<[/\\w\\s]*>\", \"\", t) # remove (<ref> </ref> </ ref> < /ref > <ref /> <br> <nowiki> < / nowiki> < / > etc\n",
" return re.sub(\"<(style|script)[^<>]*>.*?</\\1>|</?[a-z][a-z0-9]*[^<>]*>|<!--.*?-->\",\"\",t) # remove any remaining html tags\n",
"\n",
"def spec_add_more_spaces(t:str) -> str:\n",
" \"replace this nonbreakling charater with space\"\n",
" return t.replace(\"\\xa0\",\"\")\n",
" #return re.sub(r'([,;%°\\*\\+-_:\\.\\(\\)/#$§£€•<>]\\?)', r' \\1 ', t)\n",
"\n",
"def my_replace_wrep(t:str) -> str:\n",
" def _replace_wrep(m:Collection[str]) -> str:\n",
" c,_ = m.groups()\n",
" n = len(m.string[m.start():m.end()].split(c))\n",
" return f' {TK_WREP}{n-1}{c} '\n",
" re_rep = re.compile(r'\\b(\\w+)\\s+(\\1\\b\\s*)+')\n",
" return re_rep.sub(_replace_wrep, t)\n",
" \n",
"def my_replace_rep(t:str) -> str:\n",
" \"Replace repetitions at the character level in `t`.\"\n",
" def _replace_rep(m:Collection[str]) -> str:\n",
" c,cc= m.groups()\n",
" return f' {TK_REP}{len(cc)+1}{c} '\n",
" re_rep = re.compile(r'(\\S)(\\1\\1\\1+)')\n",
" return re_rep.sub(_replace_rep, t)\n",
"\n",
"def count_alphas(t:str) -> str: \n",
" n_spaces = t.count(\" \")\n",
" n_alphas = len(re.findall('[A-Za-z\\\"“”‘’]', t)) \n",
" n_sep_symbols = len(re.findall('[,;:•—|\\\\/]', t))\n",
" n_hyphen_symbols = len(re.findall('[—]', t))\n",
" n_parentheses_symbols = len(re.findall('[\\(\\)]', t))\n",
"\n",
" return n_spaces, n_alphas, n_sep_symbols, n_hyphen_symbols, n_parentheses_symbols\n",
"\n",
"def only_alphas(t:str) -> str: return re.findall('[A-Za-z]', t)\n",
"\n",
"\n",
"# pre and post tokenization rules to run on text\n",
"pre_spm_rules = [\n",
" fix_html, extract_link_title, rm_stray_tags, rm_empty_lists, rm_empty_quotes,\n",
" my_replace_rep, my_replace_wrep, spec_add_more_spaces, rm_useless_spaces, trim\n",
"]\n",
"\n",
"post_spm_rules = [replace_all_caps, deal_caps]\n"
]
},
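{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check of the pre-tokenization rules on a made-up snippet. This is a hedged sketch: the `sample` string is invented, and `fix_html`/`rm_useless_spaces` are fastai `text.transform` rules pulled in by the star import above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functools import reduce\n",
"\n",
"# invented snippet with a wiki link, empty parens, stray tags, and repeated words/characters\n",
"sample = \"The [[Dominican Republic|the Dominican Republic]] ( , ) is is a country<ref></ref> !!!!\"\n",
"\n",
"# apply the same pre_spm_rules chain that wiki_json2train uses below\n",
"reduce(lambda t, rule: rule(t), pre_spm_rules, sample)"
]
},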
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### BaseWikiTokenizer\n",
"\n",
"Default is to break/join text on spaces (' '). Subclass to use Moses, Spacy, or whatever"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"class BaseWikiTokenizer():\n",
" def __init__(self, lang='en'):\n",
" self.lang = lang\n",
" \n",
" def tokenize(self, input_str):\n",
" return input_str.split(' ')\n",
"\n",
" def detokenize(self, toks):\n",
" return ' '.join(toks)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"class MosesWikiTokenizer(BaseWikiTokenizer):\n",
" def __init__(self, lang='en'):\n",
" super().__init__(lang)\n",
" self.moses_tok, self.moses_detok = MosesTokenizer(lang), MosesDetokenizer(lang)\n",
" \n",
" def tokenize(self, input_str):\n",
" return self.moses_tok.tokenize(input_str)\n",
"\n",
" def detokenize(self, toks):\n",
" return self.moses_detok.detokenize(toks) + '\\n'"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"class SpacyWikiTokenizer(BaseWikiTokenizer):\n",
" def __init__(self, lang='en_core_web_sm'):\n",
" super().__init__(lang)\n",
" self.spacy_tok = spacy.load(lang)\n",
" \n",
" def tokenize(self, input_str):\n",
" return [ tok.text for tok in self.spacy_tok(input_str) ]\n",
"\n",
" def detokenize(self, toks):\n",
" return super().detokenize(toks) + '\\n'"
]
},
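{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hedged usage sketch of the three tokenizers on a made-up sentence. It assumes sacremoses and the spaCy `en_core_web_sm` model are available (the tokenizer classes above already depend on them); the `sentence` string is just an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The Dominican Republic is a country located in the Caribbean.\"\n",
"\n",
"for tok in [BaseWikiTokenizer(LANG), MosesWikiTokenizer(LANG), SpacyWikiTokenizer()]:\n",
"    toks = tok.tokenize(sentence)\n",
"    print(type(tok).__name__, len(toks), tok.detokenize(toks))"
]
},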
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def wiki_json2train(root_dir, wiki_tokenizer=BaseWikiTokenizer('en'), min_toks_per_article=100, n_toks_per_file=2e7, \n",
" preproc_rules=pre_spm_rules, post_proc_rules=post_spm_rules,\n",
" include_bos=True, include_eos=True, include_fld=True, random_seed=42,\n",
" cols = ['id', 'text', 'title', 'url'], txt_cols=['title','text']):\n",
"\n",
" n_total_articles_processed = 0 # cumulative count of articles processed\n",
" n_total_tokens_processed = 0 # cumlative count of tokens processed\n",
" \n",
" n_writes = 0 # num of times data written to .csv and .txt\n",
" n_batch_articles_processed = 0 # num of articles processed before saving to .csv and .txt\n",
" n_batch_tokens_processed = 0 # num of tokens processed before saving to .csv and .txt\n",
" \n",
" # build cols for our training .csv\n",
" tok_cols = [ f'tokenized_{tc}' for tc in txt_cols ]\n",
" cols += tok_cols\n",
" cols += [f'tokenized_{tc}_tokens' for tc in txt_cols ] + ['total_tokens']\n",
" \n",
" bos_tok = f'{text.transform.BOS} ' if (include_bos) else ''\n",
" eos_tok = f'{text.transform.EOS}' if (include_eos) else ''\n",
" fld_tok = f'{text.transform.FLD}' if (include_fld) else ''\n",
" \n",
" \n",
" # used to append chunks of data to the .csv and create a .txt for this chunk\n",
" def save_chunks(new_df, file_num):\n",
" \n",
" # prepare data for .txt file\n",
" if (include_fld):\n",
" t = f'{bos_tok}{fld_tok} {1} ' + new_df[tok_cols[0]].astype(str) \n",
" else: \n",
" t = f'{bos_tok}' + new_df[tok_cols[0]].astype(str)\n",
"\n",
" for idx, col in enumerate(tok_cols[1:]):\n",
" t += (f' {fld_tok} {idx+2} ' if include_fld else ' ') + new_df[col].astype(str)\n",
" \n",
" if (include_eos): t = t + f' {eos_tok}'\n",
" \n",
" # create a new .txt file\n",
" f = TRAINING_TEXT_DIR/f'{file_num}.txt'\n",
" with f.open('w+', encoding='utf-8') as fw: \n",
" fw.write('\\n\\n'.join(t.values))\n",
"\n",
" # append data to .csv\n",
" new_df['text_file'] = f'{file_num}.txt'\n",
" new_df.to_csv(f_out, index=False, header=(n_writes == 0), mode='a', encoding='utf-8')\n",
" \n",
" \n",
" # used to add tokenized text and metadata to 'txt_cols' columns\n",
" def add_tokenized_data(row):\n",
" n_total_tokens = 0\n",
" n_tot_len = 0\n",
"\n",
" for idx, tc in enumerate(txt_cols):\n",
" n_tokens = 0\n",
" tokenized_paragraphs = []\n",
" \n",
" # apply any pre-processing rules\n",
" text = reduce(lambda t, rule: rule(t), preproc_rules, row[tc])\n",
" \n",
" paragraphs = text.split('\\n')\n",
" for paragraph in paragraphs:\n",
" tokenized = wiki_tokenizer.tokenize(paragraph.strip())\n",
"\n",
" if (len(tokenized) > 0):\n",
" # calculate length based on tokens\n",
" n_tokens += len([token for token in tokenized if token])\n",
" \n",
" # apply post-processing rules\n",
" tokenized = reduce(lambda t, rule: rule(t), post_proc_rules, tokenized)\n",
" \n",
" final_text = wiki_tokenizer.detokenize(tokenized)\n",
" tokenized_paragraphs.append(final_text)\n",
" \n",
" row[f'tokenized_{tc}'] = '\\n'.join(tokenized_paragraphs)\n",
" row[f'tokenized_{tc}_tokens'] = n_tokens\n",
" \n",
" n_total_tokens += n_tokens\n",
"\n",
" row[f'total_tokens'] = n_total_tokens\n",
" \n",
" return row\n",
" \n",
" \n",
" # process the raw wiki files\n",
" wiki_files = sorted([ f for f in root_dir.glob('*/**/*') if f.suffix == '' ])\n",
" \n",
" # make it random in a predictable way\n",
" np.random.seed(random_seed)\n",
" wiki_files = np.random.permutation(wiki_files)\n",
"\n",
" pb = progress_bar(wiki_files)\n",
" \n",
" batch_df = pd.DataFrame(columns=cols)\n",
" \n",
" with TRAINING_CSV_PATH.open(\"w\", encoding='utf-8') as f_out:\n",
" for wiki_file in pb:\n",
" # wiki json files are suffix-less\n",
" if (wiki_file.suffix): continue\n",
" \n",
" # lines=True because each line in the file represents a json document\n",
" df = pd.read_json(wiki_file, lines=True)\n",
"\n",
" # remove title from text if text starts with the title\n",
" df.text = df.apply(\n",
" lambda r: r.text[len(r.title):].strip() if r.text.startswith(str(r.title)) else r.text, axis=1)\n",
"\n",
" # add tokenized data, counts, etc...\n",
" df = df.apply(add_tokenized_data, axis=1)\n",
"\n",
" # ---- begin cleanup -----\n",
"\n",
" # 1. if the text is simply the title (e.g., text == title), remove entry\n",
" df.drop(df[df.text.str.strip() == df.title.str.strip()].index, inplace=True)\n",
"\n",
" # 2. if there is no content\n",
" df.drop(df[df.text.str.strip() == ''].index, inplace=True)\n",
"\n",
" # 3. if there are < min_toks_per_article (default=100) tokens\n",
" if (min_toks_per_article):\n",
" df.drop(df[df.total_tokens < min_toks_per_article].index, inplace=True)\n",
"\n",
" # ---- end cleanup -----\n",
"\n",
" # keep track of articles processed and batch/overall token counts\n",
" n_batch_articles_processed += len(df)\n",
" n_batch_tokens_processed += df['total_tokens'].sum() \n",
"\n",
" n_total_articles_processed += len(df)\n",
" n_total_tokens_processed += df['total_tokens'].sum() \n",
"\n",
"\n",
" # append to master dataframe\n",
" batch_df = batch_df.append(df, ignore_index=True, sort=True)\n",
" df = None\n",
"\n",
" # append data to .csv and create a new .txt file for SP training\n",
" if (n_batch_tokens_processed > n_toks_per_file):\n",
" save_chunks(batch_df, n_writes)\n",
"\n",
" batch_df = None; gc.collect()\n",
" batch_df = pd.DataFrame(columns=cols)\n",
"\n",
" n_writes += 1\n",
" n_batch_tokens_processed = 0\n",
" n_batch_articles_processed = 0\n",
" \n",
" pb.comment = (f'Articles processed: {n_total_articles_processed:,} | Tokens processed: {n_total_tokens_processed:,}')\n",
" \n",
" # capture any leftover data\n",
" if (len(batch_df) > 0):\n",
" save_chunks(batch_df, n_writes)\n",
" batch_df = None; gc.collect()\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Build the .txt files for SentencePiece to train on in /data/wiki/en/txt and the giant .csv with EVERYTHING in /data/wiki/en/wiki.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wiki_json2train(EXTRACT_DIR, wiki_tokenizer=BaseWikiTokenizer(), min_toks_per_article=100)"
]
},
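{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the next step: training a SentencePiece model on the chunked .txt files written above. This is an assumption about what comes next rather than part of this notebook's pipeline; the `sp-wiki-en` model prefix, `vocab_size=30000`, and the other flags are example values only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sentencepiece as spm\n",
"\n",
"# comma-separated list of the chunked .txt files produced by wiki_json2train\n",
"txt_files = ','.join(str(f) for f in sorted(TRAINING_TEXT_DIR.glob('*.txt')))\n",
"\n",
"# example settings only; tune vocab_size/character_coverage for your corpus\n",
"spm.SentencePieceTrainer.Train(\n",
"    f'--input={txt_files} --model_prefix={WIKI_DIR/\"sp-wiki-en\"} '\n",
"    f'--vocab_size=30000 --character_coverage=1.0 --model_type=unigram'\n",
")"
]
},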
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3343627\n"
]
}
],
"source": [
"updated_df = pd.read_csv(TRAINING_CSV_PATH, chunksize=24000)\n",
"\n",
"n_articles = 0\n",
"for df in updated_df: n_articles += len(df)\n",
" \n",
"print(n_articles)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>text</th>\n",
" <th>title</th>\n",
" <th>tokenized_text</th>\n",
" <th>tokenized_text_tokens</th>\n",
" <th>tokenized_title</th>\n",
" <th>tokenized_title_tokens</th>\n",
" <th>total_tokens</th>\n",
" <th>url</th>\n",
" <th>text_file</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>8060</td>\n",
" <td>The Dominican Republic (; ) is a country locat...</td>\n",
" <td>Dominican Republic</td>\n",
" <td>xxmaj the xxmaj dominican xxmaj republic is a ...</td>\n",
" <td>14935</td>\n",
" <td>xxmaj dominican xxmaj republic</td>\n",
" <td>2</td>\n",
" <td>14937</td>\n",
" <td>https://en.wikipedia.org/wiki?curid=8060</td>\n",
" <td>0.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8062</td>\n",
" <td>Section::::History.\\nFounded in 1917 as the ' ...</td>\n",
" <td>Deutsches Institut für Normung</td>\n",
" <td>xxmaj section xxrep4: xxmaj history.\\nxxmaj fo...</td>\n",
" <td>201</td>\n",
" <td>xxmaj deutsches xxmaj institut für xxmaj normung</td>\n",
" <td>4</td>\n",
" <td>205</td>\n",
" <td>https://en.wikipedia.org/wiki?curid=8062</td>\n",
" <td>0.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8063</td>\n",
" <td>The recorded history of the Dominican Republic...</td>\n",
" <td>History of the Dominican Republic</td>\n",
" <td>xxmaj the recorded history of the xxmaj domini...</td>\n",
" <td>13290</td>\n",
" <td>xxmaj history of the xxmaj dominican xxmaj rep...</td>\n",
" <td>5</td>\n",
" <td>13295</td>\n",
" <td>https://en.wikipedia.org/wiki?curid=8063</td>\n",
" <td>0.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8064</td>\n",
" <td>The Dominican Republic (Spanish: \"República Do...</td>\n",
" <td>Geography of the Dominican Republic</td>\n",
" <td>xxmaj the xxmaj dominican xxmaj republic (span...</td>\n",
" <td>527</td>\n",
" <td>xxmaj geography of the xxmaj dominican xxmaj r...</td>\n",
" <td>5</td>\n",
" <td>532</td>\n",
" <td>https://en.wikipedia.org/wiki?curid=8064</td>\n",
" <td>0.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8065</td>\n",
" <td>This article is about the demographic features...</td>\n",
" <td>Demographics of the Dominican Republic</td>\n",
" <td>xxmaj this article is about the demographic fe...</td>\n",
" <td>177</td>\n",
" <td>xxmaj demographics of the xxmaj dominican xxma...</td>\n",
" <td>5</td>\n",
" <td>182</td>\n",
" <td>https://en.wikipedia.org/wiki?curid=8065</td>\n",
" <td>0.txt</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id text \\\n",
"0 8060 The Dominican Republic (; ) is a country locat... \n",
"1 8062 Section::::History.\\nFounded in 1917 as the ' ... \n",
"2 8063 The recorded history of the Dominican Republic... \n",
"3 8064 The Dominican Republic (Spanish: \"República Do... \n",
"4 8065 This article is about the demographic features... \n",
"\n",
" title \\\n",
"0 Dominican Republic \n",
"1 Deutsches Institut für Normung \n",
"2 History of the Dominican Republic \n",
"3 Geography of the Dominican Republic \n",
"4 Demographics of the Dominican Republic \n",
"\n",
" tokenized_text tokenized_text_tokens \\\n",
"0 xxmaj the xxmaj dominican xxmaj republic is a ... 14935 \n",
"1 xxmaj section xxrep4: xxmaj history.\\nxxmaj fo... 201 \n",
"2 xxmaj the recorded history of the xxmaj domini... 13290 \n",
"3 xxmaj the xxmaj dominican xxmaj republic (span... 527 \n",
"4 xxmaj this article is about the demographic fe... 177 \n",
"\n",
" tokenized_title tokenized_title_tokens \\\n",
"0 xxmaj dominican xxmaj republic 2 \n",
"1 xxmaj deutsches xxmaj institut für xxmaj normung 4 \n",
"2 xxmaj history of the xxmaj dominican xxmaj rep... 5 \n",
"3 xxmaj geography of the xxmaj dominican xxmaj r... 5 \n",
"4 xxmaj demographics of the xxmaj dominican xxma... 5 \n",
"\n",
" total_tokens url text_file \n",
"0 14937 https://en.wikipedia.org/wiki?curid=8060 0.txt \n",
"1 205 https://en.wikipedia.org/wiki?curid=8062 0.txt \n",
"2 13295 https://en.wikipedia.org/wiki?curid=8063 0.txt \n",
"3 532 https://en.wikipedia.org/wiki?curid=8064 0.txt \n",
"4 182 https://en.wikipedia.org/wiki?curid=8065 0.txt "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"updated_df = pd.read_csv(TRAINING_CSV_PATH, chunksize=24000)\n",
"df = next(updated_df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Playground"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"EXTRACT_DIR\n",
"\n",
"articles = []\n",
"for wiki_file in (EXTRACT_DIR/'AA').iterdir():\n",
" with open(wiki_file, encoding='utf-8') as f_in:\n",
" for line in f_in:\n",
" j = json.loads(line)\n",
" if(len(j['text'].split(' ')) + len(j['title'].split(' ')) < 100): continue\n",
" articles.append(j)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(articles)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"article_txt = articles[0]['text']\n",
"article_txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"article_txt.split('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mt.tokenize('', return_str=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mt.tokenize(article_txt.strip(), return_str=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}