Skip to content

Instantly share code, notes, and snippets.

@angrymeir
Last active January 20, 2021 16:33
Show Gist options
  • Save angrymeir/b2657897bc50d4d3a134237a20a2a19f to your computer and use it in GitHub Desktop.
Save angrymeir/b2657897bc50d4d3a134237a20a2a19f to your computer and use it in GitHub Desktop.
Sequential PySS3.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Sequential PySS3.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"authorship_tag": "ABX9TyPp7Q+RyKWBGdoxpib5gfeH",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/angrymeir/b2657897bc50d4d3a134237a20a2a19f/sequential-pyss3.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cZIWcU0tesn2",
"outputId": "a1618b57-4b95-4b92-97c2-a9f0d6c1a0f0"
},
"source": [
"# `sklearn` on PyPI is a deprecated stub; the real distribution is `scikit-learn`.\n",
"# `%pip` installs into the running kernel's environment (unlike a bare `!pip3`).\n",
"# pyss3 and iterative-stratification are pinned to the versions this notebook was run with.\n",
"%pip install numpy scikit-learn pandas iterative-stratification==0.1.6 pyss3==0.6.3"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (1.19.5)\n",
"Requirement already satisfied: sklearn in /usr/local/lib/python3.6/dist-packages (0.0)\n",
"Collecting iterative-stratification\n",
" Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (1.1.5)\n",
"Collecting pyss3\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/67/d9/93197c8cfcb1c689f3ff9693256c33e51294d26f4af4d079708c4ff089b7/pyss3-0.6.3-py3-none-any.whl (2.0MB)\n",
"\u001b[K |████████████████████████████████| 2.0MB 4.1MB/s \n",
"\u001b[?25hRequirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from sklearn) (0.22.2.post1)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from iterative-stratification) (1.4.1)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2018.9)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from pyss3) (3.2.2)\n",
"Requirement already satisfied: tqdm>=4.8.4 in /usr/local/lib/python3.6/dist-packages (from pyss3) (4.41.1)\n",
"Requirement already satisfied: cython in /usr/local/lib/python3.6/dist-packages (from pyss3) (0.29.21)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from pyss3) (1.15.0)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sklearn) (1.0.0)\n",
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyss3) (2.4.7)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyss3) (1.3.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyss3) (0.10.0)\n",
"Installing collected packages: iterative-stratification, pyss3\n",
"Successfully installed iterative-stratification-0.1.6 pyss3-0.6.3\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "mMKs8ai7eqDd"
},
"source": [
"%matplotlib inline\n",
"\n",
"from pyss3 import SS3\n",
"from pyss3.util import Dataset, Evaluation, span\n",
"from pyss3.server import Live_Test\n",
"\n",
"from sklearn.metrics import accuracy_score"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "3l1QQffNojyX",
"outputId": "1b3c60fb-7e86-4717-fa1a-50db9081472b",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"# Resolve the installed package's location at run time instead of hard-coding the\n",
"# Python 3.6 dist-packages path, which does not exist on other runtimes.\n",
"import pyss3\n",
"\n",
"# For a package, `__file__` points at its `__init__.py`; quoted in case the path has spaces.\n",
"!cat \"{pyss3.__file__}\""
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"# -*- coding: utf-8 -*-\n",
"\"\"\"\n",
"This is the main module containing the implementation of the SS3 classifier.\n",
"\n",
"(Please, visit https://github.com/sergioburdisso/pyss3 for more info)\n",
"\"\"\"\n",
"from __future__ import print_function\n",
"import os\n",
"import re\n",
"import json\n",
"import errno\n",
"import numpy as np\n",
"\n",
"from io import open\n",
"from time import time\n",
"from tqdm import tqdm\n",
"from math import pow, tanh\n",
"from .util import is_a_collection, Print, VERBOSITY, Preproc as Pp\n",
"\n",
"# python 2 and 3 compatibility\n",
"from functools import reduce\n",
"from six.moves import xrange\n",
"\n",
"__version__ = \"0.6.3\"\n",
"\n",
"ENCODING = \"utf-8\"\n",
"\n",
"PARA_DELTR = \"\\n\"\n",
"SENT_DELTR = r\"\\.\"\n",
"WORD_DELTR = r\"\\s\"\n",
"WORD_REGEX = r\"\\w+(?:'\\w+)?\"\n",
"\n",
"STR_UNKNOWN, STR_MOST_PROBABLE = \"unknown\", \"most-probable\"\n",
"STR_OTHERS_CATEGORY = \"[others]\"\n",
"STR_UNKNOWN_CATEGORY = \"[unknown]\"\n",
"IDX_UNKNOWN_CATEGORY = -1\n",
"STR_UNKNOWN_WORD = ''\n",
"IDX_UNKNOWN_WORD = -1\n",
"STR_VANILLA, STR_XAI = \"vanilla\", \"xai\"\n",
"STR_GV, STR_NORM_GV, STR_NORM_GV_XAI = \"gv\", \"norm_gv\", \"norm_gv_xai\"\n",
"\n",
"STR_MODEL_FOLDER = \"ss3_models\"\n",
"STR_MODEL_EXT = \"ss3m\"\n",
"\n",
"VERBOSITY = VERBOSITY # to allow \"from pyss3 import VERBOSITY\"\n",
"\n",
"NAME = 0\n",
"VOCAB = 1\n",
"\n",
"NEXT = 0\n",
"FR = 1\n",
"CV = 2\n",
"SG = 3\n",
"GV = 4\n",
"LV = 5\n",
"EMPTY_WORD_INFO = [0, 0, 0, 0, 0, 0]\n",
"\n",
"NOISE_FR = 1\n",
"MIN_MAD_SD = .03\n",
"\n",
"\n",
"class SS3:\n",
" \"\"\"\n",
" The SS3 classifier class.\n",
"\n",
" The SS3 classifier was originally defined in Section 3 of\n",
" https://dx.doi.org/10.1016/j.eswa.2019.05.023\n",
" (preprint avialable here: https://arxiv.org/abs/1905.08772)\n",
"\n",
" :param s: the \"smoothness\"(sigma) hyperparameter value\n",
" :type s: float\n",
" :param l: the \"significance\"(lambda) hyperparameter value\n",
" :type l: float\n",
" :param p: the \"sanction\"(rho) hyperparameter value\n",
" :type p: float\n",
" :param a: the alpha hyperparameter value (i.e. all terms with a\n",
" confidence value (cv) less than alpha will be ignored during\n",
" classification)\n",
" :type a: float\n",
" :param name: the model's name (to save and load the model from disk)\n",
" :type name: str\n",
" :param cv_m: method used to compute the confidence value (cv) of each\n",
" term (word or n-grams), options are:\n",
" \"norm_gv_xai\", \"norm_gv\" and \"gv\" (default: \"norm_gv_xai\")\n",
" :type cv_m: str\n",
" :param sg_m: method used to compute the significance (sg) function, options\n",
" are: \"vanilla\" and \"xai\" (default: \"xai\")\n",
" :type sg_m: str\n",
" \"\"\"\n",
"\n",
" __name__ = \"model\"\n",
" __models_folder__ = STR_MODEL_FOLDER\n",
"\n",
" __s__ = .45\n",
" __l__ = .5\n",
" __p__ = 1\n",
" __a__ = .0\n",
"\n",
" __multilabel__ = False\n",
"\n",
" __l_update__ = None\n",
" __s_update__ = None\n",
" __p_update__ = None\n",
"\n",
" __cv_cache__ = None\n",
" __last_x_test__ = None\n",
" __last_x_test_idx__ = None\n",
"\n",
" __prun_floor__ = 10\n",
" __prun_trigger__ = 1000000\n",
" __prun_counter__ = 0\n",
"\n",
" __zero_cv__ = None\n",
"\n",
" __parag_delimiter__ = PARA_DELTR\n",
" __sent_delimiter__ = SENT_DELTR\n",
" __word_delimiter__ = WORD_DELTR\n",
" __word_regex__ = WORD_REGEX\n",
"\n",
" def __init__(\n",
" self, s=None, l=None, p=None, a=None,\n",
" name=\"\", cv_m=STR_NORM_GV_XAI, sg_m=STR_XAI\n",
" ):\n",
" \"\"\"\n",
" Class constructor.\n",
"\n",
" :param s: the \"smoothness\"(sigma) hyperparameter value\n",
" :type s: float\n",
" :param l: the \"significance\"(lambda) hyperparameter value\n",
" :type l: float\n",
" :param p: the \"sanction\"(rho) hyperparameter value\n",
" :type p: float\n",
" :param a: the alpha hyperparameter value (i.e. all terms with a\n",
" confidence value (cv) less than alpha will be ignored during\n",
" classification)\n",
" :type a: float\n",
" :param name: the model's name (to save and load the model from disk)\n",
" :type name: str\n",
" :param cv_m: method used to compute the confidence value (cv) of each\n",
" term (word or n-grams), options are:\n",
" \"norm_gv_xai\", \"norm_gv\" and \"gv\" (default: \"norm_gv_xai\")\n",
" :type cv_m: str\n",
" :param sg_m: method used to compute the significance (sg) function, options\n",
" are: \"vanilla\" and \"xai\" (default: \"xai\")\n",
" :type sg_m: str\n",
" :raises: ValueError\n",
" \"\"\"\n",
" self.__name__ = (name or self.__name__).lower()\n",
"\n",
" self.__s__ = self.__s__ if s is None else s\n",
" self.__l__ = self.__l__ if l is None else l\n",
" self.__p__ = self.__p__ if p is None else p\n",
" self.__a__ = self.__a__ if a is None else a\n",
"\n",
" try:\n",
" float(self.__s__ + self.__l__ + self.__p__ + self.__a__)\n",
" except BaseException:\n",
" raise ValueError(\"hyperparameter values must be numbers\")\n",
"\n",
" self.__categories_index__ = {}\n",
" self.__categories__ = []\n",
" self.__max_fr__ = []\n",
" self.__max_gv__ = []\n",
"\n",
" self.__index_to_word__ = {}\n",
" self.__word_to_index__ = {}\n",
"\n",
" if cv_m == STR_NORM_GV_XAI:\n",
" self.__cv__ = self.__cv_norm_gv_xai__\n",
" elif cv_m == STR_NORM_GV:\n",
" self.__cv__ = self.__cv_norm_gv__\n",
" elif cv_m == STR_GV:\n",
" self.__cv__ = self.__gv__\n",
"\n",
" if sg_m == STR_XAI:\n",
" self.__sg__ = self.__sg_xai__\n",
" elif sg_m == STR_VANILLA:\n",
" self.__sg__ = self.__sg_vanilla__\n",
"\n",
" self.__cv_mode__ = cv_m\n",
" self.__sg_mode__ = sg_m\n",
"\n",
" self.original_sumop_ngrams = self.summary_op_ngrams\n",
" self.original_sumop_sentences = self.summary_op_sentences\n",
" self.original_sumop_paragraphs = self.summary_op_paragraphs\n",
"\n",
" def __lv__(self, ngram, icat, cache=True):\n",
" \"\"\"Local value function.\"\"\"\n",
" if cache:\n",
" return self.__trie_node__(ngram, icat)[LV]\n",
" else:\n",
" try:\n",
" ilength = len(ngram) - 1\n",
" fr = self.__trie_node__(ngram, icat)[FR]\n",
" if fr > NOISE_FR:\n",
" max_fr = self.__max_fr__[icat][ilength]\n",
" local_value = (fr / float(max_fr)) ** self.__s__\n",
" return local_value\n",
" else:\n",
" return 0\n",
" except TypeError:\n",
" return 0\n",
" except IndexError:\n",
" return 0\n",
"\n",
" def __sn__(self, ngram, icat):\n",
" \"\"\"The sanction (sn) function.\"\"\"\n",
" m_values = [\n",
" self.__sg__(ngram, ic)\n",
" for ic in xrange(len(self.__categories__)) if ic != icat\n",
" ]\n",
"\n",
" c = len(self.__categories__)\n",
"\n",
" s = sum([min(v, 1) for v in m_values])\n",
"\n",
" try:\n",
" return pow((c - (s + 1)) / ((c - 1) * (s + 1)), self.__p__)\n",
" except ZeroDivisionError: # if c <= 1\n",
" return 1.\n",
"\n",
" def __sg_vanilla__(self, ngram, icat, cache=True):\n",
" \"\"\"The original significance (sg) function definition.\"\"\"\n",
" try:\n",
" if cache:\n",
" return self.__trie_node__(ngram, icat)[SG]\n",
" else:\n",
" ncats = len(self.__categories__)\n",
" l = self.__l__\n",
" lvs = [self.__lv__(ngram, ic) for ic in xrange(ncats)]\n",
" lv = lvs[icat]\n",
"\n",
" M, sd = mad(lvs, ncats)\n",
"\n",
" if not sd and lv:\n",
" return 1.\n",
" else:\n",
" return sigmoid(lv - M, l * sd)\n",
" except TypeError:\n",
" return 0.\n",
"\n",
" def __sg_xai__(self, ngram, icat, cache=True):\n",
" \"\"\"\n",
" A variation of the significance (sn) function.\n",
"\n",
" This version of the sg function adds extra checks to\n",
" improve visual explanations.\n",
" \"\"\"\n",
" try:\n",
" if cache:\n",
" return self.__trie_node__(ngram, icat)[SG]\n",
" else:\n",
" ncats = len(self.__categories__)\n",
" l = self.__l__\n",
"\n",
" lvs = [self.__lv__(ngram, ic) for ic in xrange(ncats)]\n",
" lv = lvs[icat]\n",
"\n",
" M, sd = mad(lvs, ncats)\n",
"\n",
" if l * sd <= MIN_MAD_SD:\n",
" sd = MIN_MAD_SD / l if l else 0\n",
"\n",
" # stopwords filter\n",
" stopword = (M > .2) or (\n",
" sum(map(lambda v: v > 0.09, lvs)) == ncats\n",
" )\n",
" if (stopword and sd <= .1) or (M >= .3):\n",
" return 0.\n",
"\n",
" if not sd and lv:\n",
" return 1.\n",
"\n",
" return sigmoid(lv - M, l * sd)\n",
" except TypeError:\n",
" return 0.\n",
"\n",
" def __gv__(self, ngram, icat, cache=True):\n",
" \"\"\"\n",
" The global value (gv) function.\n",
"\n",
" This is the original way of computing the confidence value (cv)\n",
" of a term.\n",
" \"\"\"\n",
" if cache:\n",
" return self.__trie_node__(ngram, icat)[GV]\n",
" else:\n",
" lv = self.__lv__(ngram, icat)\n",
" weight = self.__sg__(ngram, icat) * self.__sn__(ngram, icat)\n",
" return lv * weight\n",
"\n",
" def __cv_norm_gv__(self, ngram, icat, cache=True):\n",
" \"\"\"\n",
" Alternative way of computing the confidence value (cv) of terms.\n",
"\n",
" This variations normalizes the gv value and uses that value as the cv.\n",
" \"\"\"\n",
" try:\n",
" if cache:\n",
" return self.__trie_node__(ngram, icat)[CV]\n",
" else:\n",
" try:\n",
" cv = self.__gv__(ngram, icat)\n",
" return cv / self.__max_gv__[icat][len(ngram) - 1]\n",
" except (ZeroDivisionError, IndexError):\n",
" return .0\n",
"\n",
" except TypeError:\n",
" return 0\n",
"\n",
" def __cv_norm_gv_xai__(self, ngram, icat, cache=True):\n",
" \"\"\"\n",
" Alternative way of computing the confidence value (cv) of terms.\n",
"\n",
" This variations not only normalizes the gv value but also adds extra\n",
" checks to improve visual explanations.\n",
" \"\"\"\n",
" try:\n",
" if cache:\n",
" return self.__trie_node__(ngram, icat)[CV]\n",
" else:\n",
" try:\n",
" max_gv = self.__max_gv__[icat][len(ngram) - 1]\n",
" if (len(ngram) > 1):\n",
" # stopwords guard\n",
" n_cats = len(self.__categories__)\n",
" cats = xrange(n_cats)\n",
" sum_words_gv = sum([\n",
" self.__gv__([w], ic) for w in ngram for ic in cats\n",
" ])\n",
" if (sum_words_gv < .05):\n",
" return .0\n",
" elif len([\n",
" w for w in ngram\n",
" if self.__gv__([w], icat) >= .01\n",
" ]) == len(ngram):\n",
" gv = self.__gv__(ngram, icat)\n",
" return gv / max_gv + sum_words_gv\n",
" # return gv / max_gv * len(ngram)\n",
"\n",
" gv = self.__gv__(ngram, icat)\n",
" return gv / max_gv\n",
" except (ZeroDivisionError, IndexError):\n",
" return .0\n",
"\n",
" except TypeError:\n",
" return 0\n",
"\n",
" def __apply_fn__(self, fn, ngram, cat):\n",
" \"\"\"Private method used by gv, lv, sn, sg functions.\"\"\"\n",
" icat = self.get_category_index(cat)\n",
" if icat == IDX_UNKNOWN_CATEGORY:\n",
" raise InvalidCategoryError\n",
"\n",
" if ngram.strip() == '':\n",
" return 0\n",
"\n",
" ngram = [self.get_word_index(w)\n",
" for w in re.split(self.__word_delimiter__, ngram)\n",
" if w]\n",
" return fn(ngram, icat) if IDX_UNKNOWN_WORD not in ngram else 0\n",
"\n",
" def __summary_ops_are_pristine__(self):\n",
" \"\"\"Return True if summary operators haven't changed.\"\"\"\n",
" return self.original_sumop_ngrams == self.summary_op_ngrams and \\\n",
" self.original_sumop_sentences == self.summary_op_sentences and \\\n",
" self.original_sumop_paragraphs == self.summary_op_paragraphs\n",
"\n",
" def __classify_ngram__(self, ngram):\n",
" \"\"\"Classify the given n-gram.\"\"\"\n",
" cv = [\n",
" self.__cv__(ngram, icat)\n",
" for icat in xrange(len(self.__categories__))\n",
" ]\n",
" cv[:] = [(v if v > self.__a__ else 0) for v in cv]\n",
" return cv\n",
"\n",
" def __classify_sentence__(self, sent, prep, json=False, prep_func=None):\n",
" \"\"\"Classify the given sentence.\"\"\"\n",
" classify_trans = self.__classify_ngram__\n",
" categories = self.__categories__\n",
" cats = xrange(len(categories))\n",
" word_index = self.get_word_index\n",
" word_delimiter = self.__word_delimiter__\n",
" word_regex = self.__word_regex__\n",
"\n",
" if not json:\n",
" if prep or prep_func is not None:\n",
" prep_func = prep_func or Pp.clean_and_ready\n",
" sent = prep_func(sent)\n",
" sent_words = [\n",
" (w, w)\n",
" for w in re_split_keep(word_regex, sent)\n",
" if w\n",
" ]\n",
" else:\n",
" if prep or prep_func is not None:\n",
" sent_words = [\n",
" (w, Pp.clean_and_ready(w, dots=False) if prep_func is None else prep_func(w))\n",
" for w in re_split_keep(word_regex, sent)\n",
" if w\n",
" ]\n",
" else:\n",
" sent_words = [\n",
" (w, w)\n",
" for w in re_split_keep(word_regex, sent)\n",
" if w\n",
" ]\n",
"\n",
" if not sent_words:\n",
" sent_words = [(u'.', u'.')]\n",
"\n",
" sent_iwords = [word_index(w) for _, w in sent_words]\n",
" sent_len = len(sent_iwords)\n",
" sent_parsed = []\n",
" wcur = 0\n",
" while wcur < sent_len:\n",
" cats_ngrams_cv = [[0] for icat in cats]\n",
" cats_ngrams_offset = [[0] for icat in cats]\n",
" cats_ngrams_iword = [[-1] for icat in cats]\n",
" cats_max_cv = [.0 for icat in cats]\n",
"\n",
" for icat in cats:\n",
" woffset = 0\n",
" word_raw = sent_words[wcur + woffset][0]\n",
" wordi = sent_iwords[wcur + woffset]\n",
" word_info = categories[icat][VOCAB]\n",
"\n",
" if wordi in word_info:\n",
" cats_ngrams_cv[icat][0] = word_info[wordi][CV]\n",
" word_info = word_info[wordi][NEXT]\n",
" cats_ngrams_iword[icat][0] = wordi\n",
" cats_ngrams_offset[icat][0] = woffset\n",
"\n",
" # if it is a learned word (not unknown and seen for this category),\n",
" # then try to recognize learned n-grams too\n",
" if wordi != IDX_UNKNOWN_WORD and wordi in categories[icat][VOCAB]:\n",
" # while word or word delimiter (e.g. space)\n",
" while wordi != IDX_UNKNOWN_WORD or re.match(word_delimiter, word_raw):\n",
" woffset += 1\n",
" if wcur + woffset >= sent_len:\n",
" break\n",
"\n",
" word_raw = sent_words[wcur + woffset][0]\n",
" wordi = sent_iwords[wcur + woffset]\n",
"\n",
" # if word is a word:\n",
" if wordi != IDX_UNKNOWN_WORD:\n",
" # if this word belongs to this category\n",
" if wordi in word_info:\n",
" cats_ngrams_cv[icat].append(word_info[wordi][CV])\n",
" cats_ngrams_iword[icat].append(wordi)\n",
" cats_ngrams_offset[icat].append(woffset)\n",
" word_info = word_info[wordi][NEXT]\n",
" else:\n",
" break\n",
"\n",
" cats_max_cv[icat] = (max(cats_ngrams_cv[icat])\n",
" if cats_ngrams_cv[icat] else .0)\n",
"\n",
" max_gv = max(cats_max_cv)\n",
" use_ngram = True\n",
" if (max_gv > self.__a__):\n",
" icat_max_gv = cats_max_cv.index(max_gv)\n",
" ngram_max_gv = cats_ngrams_cv[icat_max_gv].index(max_gv)\n",
" offset_max_gv = cats_ngrams_offset[icat_max_gv][ngram_max_gv] + 1\n",
"\n",
" max_gv_sum_1_grams = max([\n",
" sum([\n",
" (categories[ic][VOCAB][wi][CV]\n",
" if wi in categories[ic][VOCAB]\n",
" else 0)\n",
" for wi\n",
" in cats_ngrams_iword[ic]\n",
" ])\n",
" for ic in cats\n",
" ])\n",
"\n",
" if (max_gv_sum_1_grams > max_gv):\n",
" use_ngram = False\n",
" else:\n",
" use_ngram = False\n",
"\n",
" if not use_ngram:\n",
" offset_max_gv = 1\n",
" icat_max_gv = 0\n",
" ngram_max_gv = 0\n",
"\n",
" sent_parsed.append(\n",
" (\n",
" u\"\".join([raw_word for raw_word, _ in sent_words[wcur:wcur + offset_max_gv]]),\n",
" cats_ngrams_iword[icat_max_gv][:ngram_max_gv + 1]\n",
" )\n",
" )\n",
" wcur += offset_max_gv\n",
"\n",
" get_word = self.get_word\n",
" if not json:\n",
" words_cvs = [classify_trans(seq) for _, seq in sent_parsed]\n",
" if words_cvs:\n",
" return self.summary_op_ngrams(words_cvs)\n",
" return self.__zero_cv__\n",
" else:\n",
" get_tip = self.__trie_node__\n",
" local_value = self.__lv__\n",
" info = [\n",
" {\n",
" \"token\": u\"→\".join(map(get_word, sequence)),\n",
" \"lexeme\": raw_sequence,\n",
" \"cv\": classify_trans(sequence),\n",
" \"lv\": [local_value(sequence, ic) for ic in cats],\n",
" \"fr\": [get_tip(sequence, ic)[FR] for ic in cats]\n",
" }\n",
" for raw_sequence, sequence in sent_parsed\n",
" ]\n",
" return {\n",
" \"words\": info,\n",
" \"cv\": self.summary_op_ngrams([v[\"cv\"] for v in info]),\n",
" \"wmv\": reduce(vmax, [v[\"cv\"] for v in info]) # word max value\n",
" }\n",
"\n",
" def __classify_paragraph__(self, parag, prep, json=False, prep_func=None):\n",
" \"\"\"Classify the given paragraph.\"\"\"\n",
" if not json:\n",
" sents_cvs = [\n",
" self.__classify_sentence__(sent, prep=prep, prep_func=prep_func)\n",
" for sent in re.split(self.__sent_delimiter__, parag)\n",
" if sent\n",
" ]\n",
" if sents_cvs:\n",
" return self.summary_op_sentences(sents_cvs)\n",
" return self.__zero_cv__\n",
" else:\n",
" info = [\n",
" self.__classify_sentence__(sent, prep=prep, prep_func=prep_func, json=True)\n",
" for sent in re_split_keep(self.__sent_delimiter__, parag)\n",
" if sent\n",
" ]\n",
" if info:\n",
" sents_cvs = [v[\"cv\"] for v in info]\n",
" cv = self.summary_op_sentences(sents_cvs)\n",
" wmv = reduce(vmax, [v[\"wmv\"] for v in info])\n",
" else:\n",
" cv = self.__zero_cv__\n",
" wmv = cv\n",
" return {\n",
" \"sents\": info,\n",
" \"cv\": cv,\n",
" \"wmv\": wmv # word max value\n",
" }\n",
"\n",
" def __trie_node__(self, ngram, icat):\n",
" \"\"\"Get the trie's node for this n-gram.\"\"\"\n",
" try:\n",
" word_info = self.__categories__[icat][VOCAB][ngram[0]]\n",
" for word in ngram[1:]:\n",
" word_info = word_info[NEXT][word]\n",
" return word_info\n",
" except BaseException:\n",
" return EMPTY_WORD_INFO\n",
"\n",
" def __get_category__(self, name):\n",
" \"\"\"\n",
" Given the category name, return the category data.\n",
"\n",
" If category name doesn't exist, creates a new one.\n",
" \"\"\"\n",
" try:\n",
" return self.__categories_index__[name]\n",
" except KeyError:\n",
" self.__max_fr__.append([])\n",
" self.__max_gv__.append([])\n",
" self.__categories_index__[name] = len(self.__categories__)\n",
" self.__categories__.append([name, {}]) # name, vocabulary\n",
" self.__zero_cv__ = (0,) * len(self.__categories__)\n",
" return self.__categories_index__[name]\n",
"\n",
" def __get_category_length__(self, icat):\n",
" \"\"\"\n",
" Return the category length.\n",
"\n",
" The category length is the total number of words seen during training.\n",
" \"\"\"\n",
" size = 0\n",
" vocab = self.__categories__[icat][VOCAB]\n",
" for word in vocab:\n",
" size += vocab[word][FR]\n",
" return size\n",
"\n",
" def __get_most_probable_category__(self):\n",
" \"\"\"Return the index of the most probable category.\"\"\"\n",
" sizes = []\n",
" for icat in xrange(len(self.__categories__)):\n",
" sizes.append((icat, self.__get_category_length__(icat)))\n",
" return sorted(sizes, key=lambda v: v[1])[-1][0]\n",
"\n",
" def __get_vocabularies__(self, icat, vocab, preffix, n_grams, output):\n",
" \"\"\"Get category list of n-grams with info.\"\"\"\n",
" senq_ilen = len(preffix)\n",
" get_name = self.get_word\n",
"\n",
" seq = preffix + [None]\n",
" if len(seq) > n_grams:\n",
" return\n",
"\n",
" for word in vocab:\n",
" seq[-1] = word\n",
" if (self.__cv__(seq, icat) > 0):\n",
" output[senq_ilen].append(\n",
" (\n",
" \"_\".join([get_name(wi) for wi in seq]),\n",
" vocab[word][FR],\n",
" self.__gv__(seq, icat),\n",
" self.__cv__(seq, icat)\n",
" )\n",
" )\n",
" self.__get_vocabularies__(\n",
" icat, vocab[word][NEXT], seq, n_grams, output\n",
" )\n",
"\n",
" def __get_category_vocab__(self, icat):\n",
" \"\"\"Get category list of n-grams ordered by confidence value.\"\"\"\n",
" category = self.__categories__[icat]\n",
" vocab = category[VOCAB]\n",
" w_seqs = ([w] for w in vocab)\n",
"\n",
" vocab_icat = (\n",
" (\n",
" self.get_word(wseq[0]),\n",
" vocab[wseq[0]][FR],\n",
" self.__lv__(wseq, icat),\n",
" self.__gv__(wseq, icat),\n",
" self.__cv__(wseq, icat)\n",
" )\n",
" for wseq in w_seqs if self.__gv__(wseq, icat) > self.__a__\n",
" )\n",
" return sorted(vocab_icat, key=lambda k: -k[-1])\n",
"\n",
" def __get_def_cat__(self, def_cat):\n",
" \"\"\"Given the `def_cat` argument, get the default category value.\"\"\"\n",
" if def_cat is not None and (def_cat not in [STR_MOST_PROBABLE, STR_UNKNOWN] and\n",
" self.get_category_index(def_cat) == IDX_UNKNOWN_CATEGORY):\n",
" raise ValueError(\n",
" \"the default category must be 'most-probable', 'unknown', or a category name\"\n",
" \" (current value is '%s').\" % str(def_cat)\n",
" )\n",
" def_cat = None if def_cat == STR_UNKNOWN else def_cat\n",
" return self.get_most_probable_category() if def_cat == STR_MOST_PROBABLE else def_cat\n",
"\n",
" def __get_next_iwords__(self, sent, icat):\n",
" \"\"\"Return the list of possible following words' indexes.\"\"\"\n",
" if not self.get_category_name(icat):\n",
" return []\n",
"\n",
" vocab = self.__categories__[icat][VOCAB]\n",
" word_index = self.get_word_index\n",
" sent = Pp.clean_and_ready(sent)\n",
" sent = [\n",
" word_index(w)\n",
" for w in sent.strip(\".\").split(\".\")[-1].split(\" \") if w\n",
" ]\n",
"\n",
" tips = []\n",
" for word in sent:\n",
" if word is None:\n",
" tips[:] = []\n",
" continue\n",
"\n",
" tips.append(vocab)\n",
"\n",
" tips[:] = (\n",
" tip[word][NEXT]\n",
" for tip in tips if word in tip and tip[word][NEXT]\n",
" )\n",
"\n",
" if len(tips) == 0:\n",
" return []\n",
"\n",
" next_words = tips[0]\n",
" next_nbr_words = float(sum([next_words[w][FR] for w in next_words]))\n",
" return sorted(\n",
" [\n",
" (\n",
" word1,\n",
" next_words[word1][FR],\n",
" next_words[word1][FR] / next_nbr_words\n",
" )\n",
" for word1 in next_words\n",
" ],\n",
" key=lambda k: -k[1]\n",
" )\n",
"\n",
" def __prune_cat_trie__(self, vocab, prune=False, min_n=None):\n",
" \"\"\"Prune the trie of the given category.\"\"\"\n",
" prun_floor = min_n or self.__prun_floor__\n",
" remove = []\n",
" for word in vocab:\n",
" if prune and vocab[word][FR] <= prun_floor:\n",
" vocab[word][NEXT] = None\n",
" remove.append(word)\n",
" else:\n",
" self.__prune_cat_trie__(vocab[word][NEXT], prune=True)\n",
"\n",
" for word in remove:\n",
" del vocab[word]\n",
"\n",
" def __prune_tries__(self):\n",
" \"\"\"Prune the trie of every category.\"\"\"\n",
" Print.info(\"pruning tries...\", offset=1)\n",
" for category in self.__categories__:\n",
" self.__prune_cat_trie__(category[VOCAB])\n",
" self.__prun_counter__ = 0\n",
"\n",
" def __cache_lvs__(self, icat, vocab, preffix):\n",
" \"\"\"Cache all local values.\"\"\"\n",
" for word in vocab:\n",
" sequence = preffix + [word]\n",
" vocab[word][LV] = self.__lv__(sequence, icat, cache=False)\n",
" self.__cache_lvs__(icat, vocab[word][NEXT], sequence)\n",
"\n",
" def __cache_gvs__(self, icat, vocab, preffix):\n",
" \"\"\"Cache all global values.\"\"\"\n",
" for word in vocab:\n",
" sequence = preffix + [word]\n",
" vocab[word][GV] = self.__gv__(sequence, icat, cache=False)\n",
" self.__cache_gvs__(icat, vocab[word][NEXT], sequence)\n",
"\n",
" def __cache_sg__(self, icat, vocab, preffix):\n",
" \"\"\"Cache all significance weight values.\"\"\"\n",
" for word in vocab:\n",
" sequence = preffix + [word]\n",
" vocab[word][SG] = self.__sg__(sequence, icat, cache=False)\n",
" self.__cache_sg__(icat, vocab[word][NEXT], sequence)\n",
"\n",
" def __cache_cvs__(self, icat, vocab, preffix):\n",
" \"\"\"Cache all confidence values.\"\"\"\n",
" for word in vocab:\n",
" sequence = preffix + [word]\n",
" vocab[word][CV] = self.__cv__(sequence, icat, False)\n",
" self.__cache_cvs__(icat, vocab[word][NEXT], sequence)\n",
"\n",
" def __update_max_gvs__(self, icat, vocab, preffix):\n",
" \"\"\"Update all maximum global values.\"\"\"\n",
" gv = self.__gv__\n",
" max_gvs = self.__max_gv__[icat]\n",
" sentence_ilength = len(preffix)\n",
"\n",
" sequence = preffix + [None]\n",
" for word in vocab:\n",
" sequence[-1] = word\n",
" sequence_gv = gv(sequence, icat)\n",
" if sequence_gv > max_gvs[sentence_ilength]:\n",
" max_gvs[sentence_ilength] = sequence_gv\n",
" self.__update_max_gvs__(icat, vocab[word][NEXT], sequence)\n",
"\n",
" def __update_needed__(self):\n",
" \"\"\"Return True if an update is needed, false otherwise.\"\"\"\n",
" return (self.__s__ != self.__s_update__ or\n",
" self.__l__ != self.__l_update__ or\n",
" self.__p__ != self.__p_update__)\n",
"\n",
" def __save_cat_vocab__(self, icat, path, n_grams):\n",
" \"\"\"Save the category vocabulary inside ``path``.\"\"\"\n",
" if n_grams == -1:\n",
" n_grams = 20 # infinite\n",
"\n",
" category = self.__categories__[icat]\n",
" cat_name = self.get_category_name(icat)\n",
" vocab = category[VOCAB]\n",
" vocabularies_out = [[] for _ in xrange(n_grams)]\n",
"\n",
" terms = [\"words\", \"bigrams\", \"trigrams\"]\n",
"\n",
" self.__get_vocabularies__(icat, vocab, [], n_grams, vocabularies_out)\n",
"\n",
" Print.info(\"saving '%s' vocab\" % cat_name)\n",
"\n",
" for ilen in xrange(n_grams):\n",
" if vocabularies_out[ilen]:\n",
" term = terms[ilen] if ilen <= 2 else \"%d-grams\" % (ilen + 1)\n",
" voc_path = os.path.join(\n",
" path, \"ss3_vocab_%s(%s).csv\" % (cat_name, term)\n",
" )\n",
" f = open(voc_path, \"w+\", encoding=ENCODING)\n",
" vocabularies_out[ilen].sort(key=lambda k: -k[-1])\n",
" f.write(u\"%s,%s,%s,%s\\n\" % (\"term\", \"fr\", \"gv\", \"cv\"))\n",
" for trans in vocabularies_out[ilen]:\n",
" f.write(u\"%s,%d,%f,%f\\n\" % tuple(trans))\n",
" f.close()\n",
" Print.info(\"\\t[ %s stored in '%s'\" % (term, voc_path))\n",
"\n",
" def __update_cv_cache__(self):\n",
" \"\"\"Update numpy darray confidence values cache.\"\"\"\n",
" if self.__cv_cache__ is None:\n",
" self.__cv_cache__ = np.zeros((len(self.__index_to_word__), len(self.__categories__)))\n",
" cv = self.__cv__\n",
" for term_idx, cv_vec in enumerate(self.__cv_cache__):\n",
" for cat_idx, _ in enumerate(cv_vec):\n",
" try:\n",
" cv_vec[cat_idx] = cv([term_idx], cat_idx)\n",
" except KeyError:\n",
" cv_vec[cat_idx] = 0\n",
"\n",
" def __predict_fast__(\n",
" self, x_test, def_cat=STR_MOST_PROBABLE, labels=True,\n",
" multilabel=False, proba=False, prep=True, leave_pbar=True\n",
" ):\n",
" \"\"\"A faster version of the `predict` method (using numpy).\"\"\"\n",
" if not def_cat or def_cat == STR_UNKNOWN:\n",
" def_cat = IDX_UNKNOWN_CATEGORY\n",
" elif def_cat == STR_MOST_PROBABLE:\n",
" def_cat = self.__get_most_probable_category__()\n",
" else:\n",
" def_cat = self.get_category_index(def_cat)\n",
" if def_cat == IDX_UNKNOWN_CATEGORY:\n",
" raise InvalidCategoryError\n",
"\n",
" # does the special \"[others]\" category exist? (only used in multilabel classification)\n",
" __other_idx__ = self.get_category_index(STR_OTHERS_CATEGORY)\n",
"\n",
" if self.__update_needed__():\n",
" self.update_values()\n",
"\n",
" if self.__cv_cache__ is None:\n",
" self.__update_cv_cache__()\n",
" self.__last_x_test__ = None # could have learned a new word (in `learn`)\n",
" cv_cache = self.__cv_cache__\n",
"\n",
" x_test_hash = list_hash(x_test)\n",
" if x_test_hash == self.__last_x_test__:\n",
" x_test_idx = self.__last_x_test_idx__\n",
" else:\n",
" self.__last_x_test__ = x_test_hash\n",
" self.__last_x_test_idx__ = [None] * len(x_test)\n",
" x_test_idx = self.__last_x_test_idx__\n",
" word_index = self.get_word_index\n",
" for doc_idx, doc in enumerate(tqdm(x_test, desc=\"Caching documents\",\n",
" leave=False, disable=Print.is_quiet())):\n",
" x_test_idx[doc_idx] = [\n",
" word_index(w)\n",
" for w\n",
" in re.split(self.__word_delimiter__, Pp.clean_and_ready(doc) if prep else doc)\n",
" if word_index(w) != IDX_UNKNOWN_WORD\n",
" ]\n",
"\n",
" y_pred = [None] * len(x_test)\n",
" for doc_idx, doc in enumerate(tqdm(x_test_idx, desc=\"Classification\",\n",
" leave=leave_pbar, disable=Print.is_quiet())):\n",
" if self.__a__ > 0:\n",
" doc_cvs = cv_cache[doc]\n",
" doc_cvs[doc_cvs <= self.__a__] = 0\n",
" pred_cv = np.add.reduce(doc_cvs, 0)\n",
" else:\n",
" pred_cv = np.add.reduce(cv_cache[doc], 0)\n",
"\n",
" if proba:\n",
" y_pred[doc_idx] = list(pred_cv)\n",
" continue\n",
"\n",
" if not multilabel:\n",
" if pred_cv.sum() == 0:\n",
" y_pred[doc_idx] = def_cat\n",
" else:\n",
" y_pred[doc_idx] = np.argmax(pred_cv)\n",
"\n",
" if labels:\n",
" if y_pred[doc_idx] != IDX_UNKNOWN_CATEGORY:\n",
" y_pred[doc_idx] = self.__categories__[y_pred[doc_idx]][NAME]\n",
" else:\n",
" y_pred[doc_idx] = STR_UNKNOWN_CATEGORY\n",
" else:\n",
" if pred_cv.sum() == 0:\n",
" if def_cat == IDX_UNKNOWN_CATEGORY:\n",
" y_pred[doc_idx] = []\n",
" else:\n",
" y_pred[doc_idx] = [self.get_category_name(def_cat) if labels else def_cat]\n",
" else:\n",
" r = sorted([(i, pred_cv[i])\n",
" for i in range(pred_cv.size)],\n",
" key=lambda e: -e[1])\n",
" if labels:\n",
" y_pred[doc_idx] = [self.get_category_name(cat_i)\n",
" for cat_i, _ in r[:kmean_multilabel_size(r)]]\n",
" else:\n",
" y_pred[doc_idx] = [cat_i for cat_i, _ in r[:kmean_multilabel_size(r)]]\n",
"\n",
" # if the special \"[others]\" category exists\n",
" if __other_idx__ != IDX_UNKNOWN_CATEGORY:\n",
" # if its among the predicted labels, remove (hide) it\n",
" if labels:\n",
" if STR_OTHERS_CATEGORY in y_pred[doc_idx]:\n",
" y_pred[doc_idx].remove(STR_OTHERS_CATEGORY)\n",
" else:\n",
" if __other_idx__ in y_pred[doc_idx]:\n",
" y_pred[doc_idx].remove(__other_idx__)\n",
"\n",
" return y_pred\n",
"\n",
" def summary_op_ngrams(self, cvs):\n",
" \"\"\"\n",
" Summary operator for n-gram confidence vectors.\n",
"\n",
" By default it returns the addition of all confidence\n",
" vectors. However, in case you want to use a custom\n",
" summary operator, this function must be replaced\n",
" as shown in the following example:\n",
"\n",
" >>> def my_summary_op(cvs):\n",
" >>> return cvs[0]\n",
" >>> ...\n",
" >>> clf = SS3()\n",
" >>> ...\n",
" >>> clf.summary_op_ngrams = my_summary_op\n",
"\n",
" Note that any function receiving a list of vectors and\n",
" returning a single vector could be used. In the above example\n",
" the summary operator is replaced by the user-defined\n",
" ``my_summary_op`` which ignores all confidence vectors\n",
" returning only the confidence vector of the first n-gram\n",
" (which besides being an illustrative example, makes no real sense).\n",
"\n",
" :param cvs: a list n-grams confidence vectors\n",
" :type cvs: list (of list of float)\n",
" :returns: a sentence confidence vector\n",
" :rtype: list (of float)\n",
" \"\"\"\n",
" return reduce(vsum, cvs)\n",
"\n",
" def summary_op_sentences(self, cvs):\n",
" \"\"\"\n",
" Summary operator for sentence confidence vectors.\n",
"\n",
" By default it returns the addition of all confidence\n",
" vectors. However, in case you want to use a custom\n",
" summary operator, this function must be replaced\n",
" as shown in the following example:\n",
"\n",
" >>> def dummy_summary_op(cvs):\n",
" >>> return cvs[0]\n",
" >>> ...\n",
" >>> clf = SS3()\n",
" >>> ...\n",
" >>> clf.summary_op_sentences = dummy_summary_op\n",
"\n",
" Note that any function receiving a list of vectors and\n",
" returning a single vector could be used. In the above example\n",
" the summary operator is replaced by the user-defined\n",
" ``dummy_summary_op`` which ignores all confidence vectors\n",
" returning only the confidence vector of the first sentence\n",
" (which besides being an illustrative example, makes no real sense).\n",
"\n",
" :param cvs: a list sentence confidence vectors\n",
" :type cvs: list (of list of float)\n",
" :returns: a paragraph confidence vector\n",
" :rtype: list (of float)\n",
" \"\"\"\n",
" return reduce(vsum, cvs)\n",
"\n",
" def summary_op_paragraphs(self, cvs):\n",
" \"\"\"\n",
" Summary operator for paragraph confidence vectors.\n",
"\n",
" By default it returns the addition of all confidence\n",
" vectors. However, in case you want to use a custom\n",
" summary operator, this function must be replaced\n",
" as shown in the following example:\n",
"\n",
" >>> def dummy_summary_op(cvs):\n",
" >>> return cvs[0]\n",
" >>> ...\n",
" >>> clf = SS3()\n",
" >>> ...\n",
" >>> clf.summary_op_paragraphs = dummy_summary_op\n",
"\n",
" Note that any function receiving a list of vectors and\n",
" returning a single vector could be used. In the above example\n",
" the summary operator is replaced by the user-defined\n",
" ``dummy_summary_op`` which ignores all confidence vectors\n",
" returning only the confidence vector of the first paragraph\n",
" (which besides being an illustrative example, makes no real sense).\n",
"\n",
" :param cvs: a list paragraph confidence vectors\n",
" :type cvs: list (of list of float)\n",
" :returns: the document confidence vector\n",
" :rtype: list (of float)\n",
" \"\"\"\n",
" return reduce(vsum, cvs)\n",
"\n",
" def get_name(self):\n",
" \"\"\"\n",
" Return the model's name.\n",
"\n",
" :returns: the model's name.\n",
" :rtype: str\n",
" \"\"\"\n",
" return self.__name__\n",
"\n",
" def set_name(self, name):\n",
" \"\"\"\n",
" Set the model's name.\n",
"\n",
" :param name: the model's name.\n",
" :type name: str\n",
" \"\"\"\n",
" self.__name__ = name\n",
"\n",
" def set_hyperparameters(self, s=None, l=None, p=None, a=None):\n",
" \"\"\"\n",
" Set hyperparameter values.\n",
"\n",
" :param s: the \"smoothness\" (sigma) hyperparameter\n",
" :type s: float\n",
" :param l: the \"significance\" (lambda) hyperparameter\n",
" :type l: float\n",
" :param p: the \"sanction\" (rho) hyperparameter\n",
" :type p: float\n",
" :param a: the alpha hyperparameter (i.e. all terms with a\n",
" confidence value (cv) less than alpha will be ignored during\n",
" classification)\n",
" :type a: float\n",
" \"\"\"\n",
" if s is not None:\n",
" self.set_s(s)\n",
" if l is not None:\n",
" self.set_l(l)\n",
" if p is not None:\n",
" self.set_p(p)\n",
" if a is not None:\n",
" self.set_a(a)\n",
"\n",
" def get_hyperparameters(self):\n",
" \"\"\"\n",
" Get hyperparameter values.\n",
"\n",
" :returns: a tuple with hyperparameters current values (s, l, p, a)\n",
" :rtype: tuple\n",
" \"\"\"\n",
" return self.__s__, self.__l__, self.__p__, self.__a__\n",
"\n",
" def set_model_path(self, path):\n",
" \"\"\"\n",
" Overwrite the default path from which the model will be loaded (or saved to).\n",
"\n",
" Note: be aware that the PySS3 Command Line tool looks for\n",
" a local folder called ``ss3_models`` to load models.\n",
" Therefore, the ``ss3_models`` folder will be always automatically\n",
" append to the given ``path`` (e.g. if ``path=\"my/path/\"``, it will\n",
" be converted into ``my/path/ss3_models``).\n",
"\n",
" :param path: the path\n",
" :type path: str\n",
" \"\"\"\n",
" self.__models_folder__ = os.path.join(path, STR_MODEL_FOLDER)\n",
"\n",
" def set_block_delimiters(self, parag=None, sent=None, word=None):\n",
" r\"\"\"Overwrite the default delimiters used to split input documents into blocks.\n",
"\n",
" delimiters are any regular expression from simple ones (e.g. ``\" \"``) to\n",
" more complex ones (e.g. ``r\"[^\\s\\w\\d]\"``).\n",
" Note: remember that there are certain reserved characters for regular expression,\n",
" for example, the dot (.), in which case use the backslash to indicate you're\n",
" referring the character itself and not its interpretation (e.g. ``\\.``)\n",
"\n",
" e.g.\n",
"\n",
" >>> ss3.set_block_delimiters(word=\"\\s\")\n",
" >>> ss3.set_block_delimiters(word=\"\\s\", parag=\"\\n\\n\")\n",
" >>> ss3.set_block_delimiters(parag=\"\\n---\\n\")\n",
" >>> ss3.set_block_delimiters(sent=\"\\.\")\n",
" >>> ss3.set_block_delimiters(word=\"\\|\")\n",
" >>> ss3.set_block_delimiters(word=\" \")\n",
"\n",
" :param parag: the paragraph new delimiter\n",
" :type parag: str\n",
" :param sent: the sentence new delimiter\n",
" :type sent: str\n",
" :param word: the word new delimiter\n",
" :type word: str\n",
" \"\"\"\n",
" if parag:\n",
" self.set_delimiter_paragraph(parag)\n",
" if sent:\n",
" self.set_delimiter_sentence(sent)\n",
" if word:\n",
" self.set_delimiter_word(word)\n",
"\n",
" def set_delimiter_paragraph(self, regex):\n",
" r\"\"\"\n",
" Set the delimiter used to split documents into paragraphs.\n",
"\n",
" Remember that there are certain reserved characters for regular expression,\n",
" for example, the dot (.), in which case use the backslash to indicate you're\n",
" referring the character itself and not its interpretation (e.g. ``\\.``)\n",
"\n",
" :param regex: the regular expression of the new delimiter\n",
" :type regex: str\n",
" \"\"\"\n",
" self.__parag_delimiter__ = regex\n",
"\n",
" def set_delimiter_sentence(self, regex):\n",
" r\"\"\"\n",
" Set the delimiter used to split documents into sentences.\n",
"\n",
" Remember that there are certain reserved characters for regular expression,\n",
" for example, the dot (.), in which case use the backslash to indicate you're\n",
" referring the character itself and not its interpretation (e.g. ``\\.``)\n",
"\n",
" :param regex: the regular expression of the new delimiter\n",
" :type regex: str\n",
" \"\"\"\n",
" self.__sent_delimiter__ = regex\n",
"\n",
" def set_delimiter_word(self, regex):\n",
" r\"\"\"\n",
" Set the delimiter used to split documents into words.\n",
"\n",
" Remember that there are certain reserved characters for regular expression,\n",
" for example, the dot (.), in which case use the backslash to indicate you're\n",
" referring the character itself and not its interpretation (e.g. ``\\.``)\n",
"\n",
" :param regex: the regular expression of the new delimiter\n",
" :type regex: str\n",
" \"\"\"\n",
" self.__word_delimiter__ = regex\n",
"\n",
" def set_s(self, value):\n",
" \"\"\"\n",
" Set the \"smoothness\" (sigma) hyperparameter value.\n",
"\n",
" :param value: the hyperparameter value\n",
" :type value: float\n",
" \"\"\"\n",
" self.__s__ = float(value)\n",
"\n",
" def get_s(self):\n",
" \"\"\"\n",
" Get the \"smoothness\" (sigma) hyperparameter value.\n",
"\n",
" :returns: the hyperparameter value\n",
" :rtype: float\n",
" \"\"\"\n",
" return self.__s__\n",
"\n",
" def set_l(self, value):\n",
" \"\"\"\n",
" Set the \"significance\" (lambda) hyperparameter value.\n",
"\n",
" :param value: the hyperparameter value\n",
" :type value: float\n",
" \"\"\"\n",
" self.__l__ = float(value)\n",
"\n",
" def get_l(self):\n",
" \"\"\"\n",
" Get the \"significance\" (lambda) hyperparameter value.\n",
"\n",
" :returns: the hyperparameter value\n",
" :rtype: float\n",
" \"\"\"\n",
" return self.__l__\n",
"\n",
" def set_p(self, value):\n",
" \"\"\"\n",
" Set the \"sanction\" (rho) hyperparameter value.\n",
"\n",
" :param value: the hyperparameter value\n",
" :type value: float\n",
" \"\"\"\n",
" self.__p__ = float(value)\n",
"\n",
" def get_p(self):\n",
" \"\"\"\n",
" Get the \"sanction\" (rho) hyperparameter value.\n",
"\n",
" :returns: the hyperparameter value\n",
" :rtype: float\n",
" \"\"\"\n",
" return self.__p__\n",
"\n",
" def set_a(self, value):\n",
" \"\"\"\n",
" Set the alpha hyperparameter value.\n",
"\n",
" All terms with a confidence value (cv) less than alpha\n",
" will be ignored during classification.\n",
"\n",
" :param value: the hyperparameter value\n",
" :type value: float\n",
" \"\"\"\n",
" self.__a__ = float(value)\n",
"\n",
" def get_a(self):\n",
" \"\"\"\n",
" Get the alpha hyperparameter value.\n",
"\n",
" :returns: the hyperparameter value\n",
" :rtype: float\n",
" \"\"\"\n",
" return self.__a__\n",
"\n",
" def get_categories(self, all=False):\n",
" \"\"\"\n",
" Get the list of category names.\n",
"\n",
" :returns: the list of category names\n",
" :rtype: list (of str)\n",
" \"\"\"\n",
" return [\n",
" self.get_category_name(ci)\n",
" for ci in range(len(self.__categories__))\n",
" if all or self.get_category_name(ci) != STR_OTHERS_CATEGORY\n",
" ]\n",
"\n",
" def get_most_probable_category(self):\n",
" \"\"\"\n",
" Get the name of the most probable category.\n",
"\n",
" :returns: the name of the most probable category\n",
" :rtype: str\n",
" \"\"\"\n",
" return self.get_category_name(self.__get_most_probable_category__())\n",
"\n",
" def get_ngrams_length(self):\n",
" \"\"\"\n",
" Return the length of longest learned n-gram.\n",
"\n",
" :returns: the length of longest learned n-gram.\n",
" :rtype: int\n",
" \"\"\"\n",
" return len(self.__max_fr__[0]) if len(self.__max_fr__) > 0 else 0\n",
"\n",
" def get_category_index(self, name):\n",
" \"\"\"\n",
" Given its name, return the category index.\n",
"\n",
" :param name: The category name\n",
" :type name: str\n",
" :returns: the category index (or ``IDX_UNKNOWN_CATEGORY``\n",
" if the category doesn't exist).\n",
" :rtype: int\n",
" \"\"\"\n",
" try:\n",
" return self.__categories_index__[name]\n",
" except KeyError:\n",
" return IDX_UNKNOWN_CATEGORY\n",
"\n",
" def get_category_name(self, index):\n",
" \"\"\"\n",
" Given its index, return the category name.\n",
"\n",
" :param index: The category index\n",
" :type index: int\n",
" :returns: the category name (or ``STR_UNKNOWN_CATEGORY``\n",
" if the category doesn't exist).\n",
" :rtype: str\n",
" \"\"\"\n",
" try:\n",
" if isinstance(index, list):\n",
" index = index[0]\n",
" return self.__categories__[index][NAME]\n",
" except IndexError:\n",
" return STR_UNKNOWN_CATEGORY\n",
"\n",
" def get_word_index(self, word):\n",
" \"\"\"\n",
" Given a word, return its index.\n",
"\n",
" :param name: a word\n",
" :type name: str\n",
" :returns: the word index (or ``IDX_UNKNOWN_WORD`` if the word doesn't exist).\n",
" :rtype: int\n",
" \"\"\"\n",
" try:\n",
" return self.__word_to_index__[word]\n",
" except KeyError:\n",
" return IDX_UNKNOWN_WORD\n",
"\n",
" def get_word(self, index):\n",
" \"\"\"\n",
" Given the index, return the word.\n",
"\n",
" :param index: the word index\n",
" :type index: int\n",
" :returns: the word (or ``STR_UNKNOWN_WORD`` if the word doesn't exist).\n",
" :rtype: int\n",
" :rtype: str\n",
" \"\"\"\n",
" return (\n",
" self.__index_to_word__[index]\n",
" if index in self.__index_to_word__ else STR_UNKNOWN_WORD\n",
" )\n",
"\n",
" def get_next_words(self, sent, cat, n=None):\n",
" \"\"\"\n",
" Given a sentence, return the list of ``n`` (possible) following words.\n",
"\n",
" :param sent: a sentence (e.g. \"an artificial\")\n",
" :type sent: str\n",
" :param cat: the category name\n",
" :type cat: str\n",
" :param n: the maximum number of possible answers\n",
" :type n: int\n",
" :returns: a list of tuples (word, frequency, probability)\n",
" :rtype: list (of tuple)\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" icat = self.get_category_index(cat)\n",
"\n",
" if icat == IDX_UNKNOWN_CATEGORY:\n",
" raise InvalidCategoryError\n",
"\n",
" guessedwords = [\n",
" (self.get_word(iword), fr, P)\n",
" for iword, fr, P in self.__get_next_iwords__(sent, icat) if fr\n",
" ]\n",
" if n is not None and guessedwords:\n",
" return guessedwords[:n]\n",
" return guessedwords\n",
"\n",
" def get_stopwords(self, sg_threshold=.01):\n",
" \"\"\"\n",
" Get the list of (recognized) stopwords.\n",
"\n",
" :param sg_threshold: significance (sg) value used as a threshold to\n",
" consider words as stopwords (i.e. words with\n",
" sg < ``sg_threshold`` for all categories will\n",
" be considered as \"stopwords\")\n",
" :type sg_threshold: float\n",
" :returns: a list of stopwords\n",
" :rtype: list (of str)\n",
" \"\"\"\n",
" if not self.__categories__:\n",
" return\n",
"\n",
" iwords = self.__index_to_word__\n",
" sg_threshold = float(sg_threshold or .01)\n",
" categories = self.__categories__\n",
" cats_len = len(categories)\n",
" sg = self.__sg__\n",
" stopwords = []\n",
" vocab = categories[0][VOCAB]\n",
"\n",
" for word0 in iwords:\n",
" word_sg = [\n",
" sg([word0], c_i)\n",
" for c_i in xrange(cats_len)\n",
" ]\n",
" word_cats_len = len([v for v in word_sg if v < sg_threshold])\n",
" if word_cats_len == cats_len:\n",
" stopwords.append(word0)\n",
"\n",
" stopwords = [\n",
" iwords[w0]\n",
" for w0, v\n",
" in sorted(\n",
" [\n",
" (w0, vocab[w0][FR] if w0 in vocab else 0)\n",
" for w0 in stopwords\n",
" ],\n",
" key=lambda k: -k[1]\n",
" )\n",
" ]\n",
"\n",
" return stopwords\n",
"\n",
" def save_model(self, path=None):\n",
" \"\"\"\n",
" Save the model to disk.\n",
"\n",
" if a ``path`` is not present, the default will be used (\"./\"),\n",
" However, if a ``path`` is given, it will not only used to save\n",
" the model but also will overwrite the default path calling the\n",
" ``SS3``'s ``set_model_path(path)`` method (see ``set_model_path``\n",
" method documentation for more detail).\n",
"\n",
" :param path: the path to save the model to\n",
" :type path: str\n",
"\n",
" :raises: OSError\n",
" \"\"\"\n",
" if path:\n",
" self.set_model_path(path)\n",
"\n",
" stime = time()\n",
" Print.info(\n",
" \"saving model (%s/%s.%s)...\"\n",
" %\n",
" (self.__models_folder__, self.__name__, STR_MODEL_EXT),\n",
" False\n",
" )\n",
" json_file_format = {\n",
" \"__a__\": self.__a__,\n",
" \"__l__\": self.__l__,\n",
" \"__p__\": self.__p__,\n",
" \"__s__\": self.__s__,\n",
" \"__max_fr__\": self.__max_fr__,\n",
" \"__max_gv__\": self.__max_gv__,\n",
" \"__categories__\": self.__categories__,\n",
" \"__categories_index__\": self.__categories_index__,\n",
" \"__index_to_word__\": self.__index_to_word__,\n",
" \"__word_to_index__\": self.__word_to_index__,\n",
" \"__cv_mode__\": self.__cv_mode__,\n",
" \"__sg_mode__\": self.__sg_mode__,\n",
" \"__multilabel__\": self.__multilabel__\n",
" }\n",
"\n",
" try:\n",
" os.makedirs(self.__models_folder__)\n",
" except OSError as ose:\n",
" if ose.errno == errno.EEXIST and os.path.isdir(self.__models_folder__):\n",
" pass\n",
" else:\n",
" raise\n",
"\n",
" json_file = open(\n",
" \"%s/%s.%s\" % (\n",
" self.__models_folder__,\n",
" self.__name__,\n",
" STR_MODEL_EXT\n",
" ), \"w\", encoding=ENCODING\n",
" )\n",
"\n",
" try: # python 3\n",
" json_file.write(json.dumps(json_file_format))\n",
" except TypeError: # python 2\n",
" json_file.write(json.dumps(json_file_format).decode(ENCODING))\n",
"\n",
" json_file.close()\n",
" Print.info(\"(%.1fs)\" % (time() - stime))\n",
"\n",
" def load_model(self, path=None):\n",
" \"\"\"\n",
" Load model from disk.\n",
"\n",
" if a ``path`` is not present, the default will be used (\"./\"),\n",
" However, if a ``path`` is given, it will not only used to load\n",
" the model but also will overwrite the default path calling the\n",
" ``SS3``'s ``set_model_path(path)`` method (see ``set_model_path``\n",
" method documentation for more detail).\n",
"\n",
" :param path: the path to load the model from\n",
" :type path: str\n",
"\n",
" :raises: OSError\n",
" \"\"\"\n",
" if path:\n",
" self.set_model_path(path)\n",
"\n",
" stime = time()\n",
" Print.info(\"loading '%s' model from disk...\" % self.__name__)\n",
"\n",
" json_file = open(\n",
" \"%s/%s.%s\" % (\n",
" self.__models_folder__,\n",
" self.__name__,\n",
" STR_MODEL_EXT\n",
" ), \"r\", encoding=ENCODING\n",
" )\n",
" jmodel = json.loads(json_file.read(), object_hook=key_as_int)\n",
" json_file.close()\n",
"\n",
" self.__max_fr__ = jmodel[\"__max_fr__\"]\n",
" self.__max_gv__ = jmodel[\"__max_gv__\"]\n",
" self.__l__ = jmodel[\"__l__\"]\n",
" self.__p__ = jmodel[\"__p__\"]\n",
" self.__s__ = jmodel[\"__s__\"]\n",
" self.__a__ = jmodel[\"__a__\"]\n",
" self.__categories__ = jmodel[\"__categories__\"]\n",
" self.__categories_index__ = jmodel[\"__categories_index__\"]\n",
" self.__index_to_word__ = jmodel[\"__index_to_word__\"]\n",
" self.__word_to_index__ = jmodel[\"__word_to_index__\"]\n",
" self.__cv_mode__ = jmodel[\"__cv_mode__\"]\n",
" self.__multilabel__ = jmodel[\"__multilabel__\"] if \"__multilabel__\" in jmodel else False\n",
" self.__sg_mode__ = (jmodel[\"__sg_mode__\"]\n",
" if \"__sg_mode__\" in jmodel\n",
" else jmodel[\"__sn_mode__\"])\n",
"\n",
" self.__zero_cv__ = (0,) * len(self.__categories__)\n",
" self.__s_update__ = self.__s__\n",
" self.__l_update__ = self.__l__\n",
" self.__p_update__ = self.__p__\n",
"\n",
" Print.info(\"(%.1fs)\" % (time() - stime))\n",
"\n",
" def save_cat_vocab(self, cat, path=\"./\", n_grams=-1):\n",
" \"\"\"\n",
" Save category vocabulary to disk.\n",
"\n",
" :param cat: the category name\n",
" :type cat: str\n",
" :param path: the path in which to store the vocabulary\n",
" :type path: str\n",
" :param n_grams: indicates the n-grams to be stored (e.g. only 1-grams,\n",
" 2-grams, 3-grams, etc.). Default -1 stores all\n",
" learned n-grams (1-grams, 2-grams, 3-grams, etc.)\n",
" :type n_grams: int\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" if self.get_category_index(cat) == IDX_UNKNOWN_CATEGORY:\n",
" raise InvalidCategoryError\n",
"\n",
" self.__save_cat_vocab__(self.get_category_index(cat), path, n_grams)\n",
"\n",
" def save_vocab(self, path=\"./\", n_grams=-1):\n",
" \"\"\"\n",
" Save learned vocabularies to disk.\n",
"\n",
" :param path: the path in which to store the vocabularies\n",
" :type path: str\n",
" :param n_grams: indicates the n-grams to be stored (e.g. only 1-grams,\n",
" 2-grams, 3-grams, etc.). Default -1 stores all\n",
" learned n-grams (1-grams, 2-grams, 3-grams, etc.)\n",
" :type n_grams: int\n",
" \"\"\"\n",
" for icat in xrange(len(self.__categories__)):\n",
" self.__save_cat_vocab__(icat, path, n_grams)\n",
"\n",
" def update_values(self, force=False):\n",
" \"\"\"\n",
" Update model values (cv, gv, lv, etc.).\n",
"\n",
" :param force: force update (even if hyperparameters haven't changed)\n",
" :type force: bool\n",
" \"\"\"\n",
" update = 0\n",
" if force or self.__s_update__ != self.__s__:\n",
" update = 3\n",
" elif self.__l_update__ != self.__l__:\n",
" update = 2\n",
" elif self.__p_update__ != self.__p__:\n",
" update = 1\n",
"\n",
" if update == 0:\n",
" Print.info(\"nothing to update...\", offset=1)\n",
" return\n",
"\n",
" category_len = len(self.__categories__)\n",
" categories = xrange(category_len)\n",
" category_names = [self.get_category_name(ic) for ic in categories]\n",
" stime = time()\n",
" Print.info(\"about to start updating values...\", offset=1)\n",
" if update == 3: # only if s has changed\n",
" Print.info(\"caching lv values\", offset=1)\n",
" for icat in categories:\n",
" Print.info(\n",
" \"lv values for %d (%s)\" % (icat, category_names[icat]),\n",
" offset=4\n",
" )\n",
" self.__cache_lvs__(icat, self.__categories__[icat][VOCAB], [])\n",
"\n",
" if update >= 2: # only if s or l have changed\n",
" Print.info(\"caching sg values\", offset=1)\n",
" for icat in categories:\n",
" Print.info(\n",
" \"sg values for %d (%s)\" % (icat, category_names[icat]),\n",
" offset=4\n",
" )\n",
" self.__cache_sg__(icat, self.__categories__[icat][VOCAB], [])\n",
"\n",
" Print.info(\"caching gv values\")\n",
" for icat in categories:\n",
" Print.info(\n",
" \"gv values for %d (%s)\" % (icat, category_names[icat]),\n",
" offset=4\n",
" )\n",
" self.__cache_gvs__(icat, self.__categories__[icat][VOCAB], [])\n",
"\n",
" if self.__cv_mode__ != STR_GV:\n",
" Print.info(\"updating max gv values\", offset=1)\n",
" for icat in categories:\n",
" Print.info(\n",
" \"max gv values for %d (%s)\" % (icat, category_names[icat]),\n",
" offset=4\n",
" )\n",
" self.__max_gv__[icat] = list(\n",
" map(lambda _: 0, self.__max_gv__[icat])\n",
" )\n",
" self.__update_max_gvs__(\n",
" icat, self.__categories__[icat][VOCAB], []\n",
" )\n",
"\n",
" Print.info(\"max gv values have been updated\", offset=1)\n",
"\n",
" Print.info(\"caching confidence values (cvs)\", offset=1)\n",
" for icat in categories:\n",
" Print.info(\n",
" \"cvs for %d (%s)\" % (icat, category_names[icat]),\n",
" offset=4\n",
" )\n",
" self.__cache_cvs__(icat, self.__categories__[icat][VOCAB], [])\n",
" Print.info(\"finished --time: %.1fs\" % (time() - stime), offset=1)\n",
"\n",
" self.__s_update__ = self.__s__\n",
" self.__l_update__ = self.__l__\n",
" self.__p_update__ = self.__p__\n",
"\n",
" if self.__cv_cache__ is not None:\n",
" self.__update_cv_cache__()\n",
"\n",
" def print_model_info(self):\n",
" \"\"\"Print information regarding the model.\"\"\"\n",
" print()\n",
" print(\" %s: %s\\n\" % (\n",
" Print.style.green(Print.style.ubold(\"NAME\")),\n",
" Print.style.warning(self.get_name())\n",
" ))\n",
"\n",
" def print_hyperparameters_info(self):\n",
" \"\"\"Print information about hyperparameters.\"\"\"\n",
" print()\n",
" print(\n",
" \" %s:\\n\" % Print.style.green(Print.style.ubold(\"HYPERPARAMETERS\"))\n",
" )\n",
" print(\"\\tSmoothness(s):\", Print.style.warning(self.__s__))\n",
" print(\"\\tSignificance(l):\", Print.style.warning(self.__l__))\n",
" print(\"\\tSanction(p):\", Print.style.warning(self.__p__))\n",
" print(\"\")\n",
" print(\"\\tAlpha(a):\", Print.style.warning(self.__a__))\n",
"\n",
" def print_categories_info(self):\n",
" \"\"\"Print information about learned categories.\"\"\"\n",
" if not self.__categories__:\n",
" print(\n",
" \"\\n %s: None\\n\"\n",
" % Print.style.green(Print.style.ubold(\"CATEGORIES\"))\n",
" )\n",
" return\n",
"\n",
" cat_len = max([\n",
" len(self.get_category_name(ic))\n",
" for ic in xrange(len(self.__categories__))\n",
" ])\n",
" cat_len = max(cat_len, 8)\n",
" row_template = Print.style.warning(\"\\t{:^%d} \" % cat_len)\n",
" row_template += \"| {:^5} | {:^10} | {:^11} | {:^13} | {:^6} |\"\n",
" print()\n",
" print(\"\\n %s:\\n\" % Print.style.green(Print.style.ubold(\"CATEGORIES\")))\n",
" print(\n",
" row_template\n",
" .format(\n",
" \"Category\", \"Index\", \"Length\",\n",
" \"Vocab. Size\", \"Word Max. Fr.\", \"N-gram\"\n",
" )\n",
" )\n",
" print(\n",
" (\n",
" \"\\t{:-<%d}-|-{:-<5}-|-{:-<10}-|-{:-<11}-|-{:-<13}-|-{:-<6}-|\"\n",
" % cat_len\n",
" )\n",
" .format('', '', '', '', '', '')\n",
" )\n",
"\n",
" mpci = self.__get_most_probable_category__()\n",
" mpc_size = 0\n",
" mpc_total = 0\n",
" for icat, category in enumerate(self.__categories__):\n",
" icat_size = self.__get_category_length__(icat)\n",
" \"\"\"print(\n",
" row_template\n",
" .format(\n",
" category[NAME],\n",
" icat, icat_size,\n",
" len(category[VOCAB]),\n",
" self.__max_fr__[icat][0],\n",
" len(self.__max_fr__[icat])\n",
" )\n",
" )\"\"\"\n",
" print(category[NAME], len(category[VOCAB]))\n",
"\n",
" mpc_total += icat_size\n",
" if icat == mpci:\n",
" mpc_size = icat_size\n",
"\n",
" print(\n",
" \"\\n\\t%s: %s %s\"\n",
" %\n",
" (\n",
" Print.style.ubold(\"Most Probable Category\"),\n",
" Print.style.warning(self.get_category_name(mpci)),\n",
" Print.style.blue(\"(%.2f%%)\" % (100.0 * mpc_size / mpc_total))\n",
" )\n",
" )\n",
" print()\n",
"\n",
" def print_ngram_info(self, ngram):\n",
" \"\"\"\n",
" Print debugging information about a given n-gram.\n",
"\n",
" Namely, print the n-gram frequency (fr), local value (lv),\n",
" global value (gv), confidence value (cv), sanction (sn) weight,\n",
" significance (sg) weight.\n",
"\n",
" :param ngram: the n-gram (e.g. \"machine\", \"machine learning\", etc.)\n",
" :type ngram: str\n",
" \"\"\"\n",
" if not self.__categories__:\n",
" return\n",
"\n",
" word_index = self.get_word_index\n",
" n_gram_str = ngram\n",
" ngram = [word_index(w)\n",
" for w in re.split(self.__word_delimiter__, ngram)\n",
" if w]\n",
"\n",
" print()\n",
" print(\n",
" \" %s: %s (%s)\" % (\n",
" Print.style.green(\n",
" \"%d-GRAM\" % len(ngram) if len(ngram) > 1 else \"WORD\"\n",
" ),\n",
" Print.style.warning(n_gram_str),\n",
" \"is unknown\"\n",
" if IDX_UNKNOWN_WORD in ngram\n",
" else \"index: \" + str(ngram if len(ngram) > 1 else ngram[0])\n",
" )\n",
" )\n",
"\n",
" if IDX_UNKNOWN_WORD in ngram:\n",
" print()\n",
" return\n",
"\n",
" cat_len = max([\n",
" len(self.get_category_name(ic))\n",
" for ic in xrange(len(self.__categories__))\n",
" ])\n",
" cat_len = max(cat_len, 8)\n",
" header_template = Print.style.bold(\n",
" \" {:<%d} | fr | lv | sg | sn | gv | cv |\"\n",
" % cat_len\n",
" )\n",
" print()\n",
" print(header_template.format(\"Category\"))\n",
" header_template = (\n",
" \" {:-<%d}-|----------|-------|-------|-------|-------|-------|\"\n",
" % cat_len\n",
" )\n",
" print(header_template.format(''))\n",
" row_template = (\n",
" \" %s | {:^8} | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {:.3f} |\"\n",
" % (Print.style.warning(\"{:<%d}\" % cat_len))\n",
" )\n",
" for icat in xrange(len(self.__categories__)):\n",
" n_gram_tip = self.__trie_node__(ngram, icat)\n",
" if n_gram_tip:\n",
" print(\n",
" row_template\n",
" .format(\n",
" self.get_category_name(icat)[:16],\n",
" n_gram_tip[FR],\n",
" self.__lv__(ngram, icat),\n",
" self.__sg__(ngram, icat),\n",
" self.__sn__(ngram, icat),\n",
" self.__gv__(ngram, icat),\n",
" self.__cv__(ngram, icat),\n",
" )\n",
" )\n",
" print()\n",
"\n",
" def plot_value_distribution(self, cat):\n",
" \"\"\"\n",
" Plot the category's global and local value distribution.\n",
"\n",
" :param cat: the category name\n",
" :type cat: str\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" if self.get_category_index(cat) == IDX_UNKNOWN_CATEGORY:\n",
" raise InvalidCategoryError\n",
"\n",
" import matplotlib.pyplot as plt\n",
"\n",
" icat = self.get_category_index(cat)\n",
" vocab_metrics = self.__get_category_vocab__(icat)\n",
"\n",
" x = []\n",
" y_lv = []\n",
" y_gv = []\n",
" vocab_metrics_len = len(vocab_metrics)\n",
"\n",
" for i in xrange(vocab_metrics_len):\n",
" metric = vocab_metrics[i]\n",
" x.append(i + 1)\n",
" y_lv.append(metric[2])\n",
" y_gv.append(metric[3])\n",
"\n",
" plt.figure(figsize=(20, 10))\n",
" plt.title(\n",
" \"Word Value Distribution (%s)\" % self.get_category_name(icat)\n",
" )\n",
"\n",
" plt.xlabel(\"Word Rank\")\n",
" plt.ylabel(\"Value\")\n",
" plt.xlim(right=max(x))\n",
"\n",
" plt.plot(\n",
" x, y_lv, \"-\", label=\"local value ($lv$)\",\n",
" linewidth=2, color=\"#7f7d7e\"\n",
" )\n",
" plt.plot(\n",
" x, y_gv, \"g-\", label=\"global value ($gv$)\",\n",
" linewidth=4, color=\"#2ca02c\")\n",
" plt.legend()\n",
"\n",
" plt.show()\n",
"\n",
" def extract_insight(\n",
" self, doc, cat='auto', level='word', window_size=3, min_cv=0.01, sort=True\n",
" ):\n",
" \"\"\"\n",
" Get the list of text blocks involved in the classification decision.\n",
"\n",
" Given a document, return the pieces of text that were involved in the\n",
" classification decision, along with the confidence values associated\n",
" with them. If a category is given, perform the process as if the\n",
" given category were the one assigned by the classifier.\n",
"\n",
" :param doc: the content of the document\n",
" :type doc: str\n",
" :param cat: the category in relation to which text blocks are obtained.\n",
" If not present, it will automatically use the category assigned\n",
" by SS3 after classification.\n",
" Options are 'auto' or a given category name. (default: 'auto')\n",
" :type cat: str\n",
" :param level: the level at which text blocks are going to be extracted.\n",
" Options are 'word', 'sentence' or 'paragraph'. (default: 'word')\n",
" :type level: str\n",
" :param window_size: the number of words, before and after each identified word,\n",
" to be also included along with the identified word. For instance,\n",
" ``window_size=0`` means return only individual words,\n",
" ``window_size=1`` means also include the word that was\n",
" before and the one that was after them. If multiple selected\n",
" words are close enough for their word windows to be overlapping,\n",
" then those word windows will be merged into a longer and single one.\n",
" This argument is ignored when ``level`` is not equal to 'word'.\n",
" (default: 3)\n",
" :type window_size: int\n",
" :param min_cv: the minimum confidence value each text block must have to be\n",
" included in the output. (default 0.01)\n",
" :type min_cv: float\n",
" :param sort: whether to return the text blocks ordered by their confidence value\n",
" or not. If ``sort=False`` then blocks will be returned\n",
" following the order they had in the input document. (default: True)\n",
" :type sort: bool\n",
" :returns: a list of pairs (text, confidence value) containing the text (blocks) involved,\n",
" and to what degree (*), in the classification decision.\n",
" (*) given by the confidence value\n",
" :rtype: list\n",
" :raises: InvalidCategoryError, ValueError\n",
" \"\"\"\n",
" r = self.classify(doc, json=True)\n",
" word_regex = self.__word_regex__\n",
"\n",
" if cat == 'auto':\n",
" c_i = r[\"cvns\"][0][0]\n",
" else:\n",
" c_i = self.get_category_index(cat)\n",
" if c_i == IDX_UNKNOWN_CATEGORY:\n",
" Print.error(\n",
" \"The excepted values for the `cat` argument are 'auto' \"\n",
" \"or a valid category name, found '%s' instead\" % str(cat),\n",
" raises=InvalidCategoryError\n",
" )\n",
"\n",
" if level == 'paragraph':\n",
" insights = [\n",
" (\n",
" \"\".join([word[\"lexeme\"]\n",
" for s in p[\"sents\"]\n",
" for word in s[\"words\"]]),\n",
" p[\"cv\"][c_i]\n",
" )\n",
" for p in r[\"pars\"]\n",
" if p[\"cv\"][c_i] > min_cv\n",
" ]\n",
" elif level == 'sentence':\n",
" insights = [\n",
" (\n",
" \"\".join([word[\"lexeme\"]\n",
" for word in s[\"words\"]]),\n",
" s[\"cv\"][c_i]\n",
" )\n",
" for p in r[\"pars\"] for s in p[\"sents\"]\n",
" if s[\"cv\"][c_i] > min_cv\n",
" ]\n",
" elif level == 'word':\n",
" ww_size = window_size\n",
" insights = []\n",
" for p in r[\"pars\"]:\n",
" words = [w for s in p[\"sents\"] for w in s[\"words\"]]\n",
" w_i = 0\n",
" while w_i < len(words):\n",
" w = words[w_i]\n",
" if w[\"cv\"][c_i] > min_cv:\n",
" ww = []\n",
" ww_cv = 0\n",
" ww_left = min(w_i, ww_size) + 1\n",
" w_i -= ww_left - 1\n",
" while ww_left > 0 and w_i < len(words):\n",
"\n",
" ww.append(words[w_i][\"lexeme\"])\n",
" ww_cv += words[w_i][\"cv\"][c_i]\n",
"\n",
" if words[w_i][\"cv\"][c_i] > min_cv:\n",
" ww_left += min(ww_size, (len(words) - 1) - w_i)\n",
"\n",
" if re.search(word_regex, words[w_i][\"lexeme\"]):\n",
" ww_left -= 1\n",
"\n",
" w_i += 1\n",
"\n",
" insights.append((\"\".join(ww), ww_cv))\n",
" else:\n",
" w_i += 1\n",
" else:\n",
" raise ValueError(\n",
" \"expected values for the `level` argument are \"\n",
" \"'word', 'sentence', or 'paragraph', found '%s' instead.\"\n",
" % str(level)\n",
" )\n",
"\n",
" if sort:\n",
" insights.sort(key=lambda b_cv: -b_cv[1])\n",
" return insights\n",
"\n",
" def learn(self, doc, cat, n_grams=1, prep=True, update=True):\n",
" \"\"\"\n",
" Learn a new document for a given category.\n",
"\n",
" :param doc: the content of the document\n",
" :type doc: str\n",
" :param cat: the category name\n",
" :type cat: str\n",
" :param n_grams: indicates the maximum ``n``-grams to be learned\n",
" (e.g. a value of ``1`` means only 1-grams (words),\n",
" ``2`` means 1-grams and 2-grams,\n",
" ``3``, 1-grams, 2-grams and 3-grams, and so on.\n",
" :type n_grams: int\n",
" :param prep: enables the default input preprocessing (default: True)\n",
" :type prep: bool\n",
" :param update: enables model auto-update after learning (default: True)\n",
" :type update: bool\n",
" \"\"\"\n",
" self.__cv_cache__ = None\n",
"\n",
" if not doc or cat is None:\n",
" return\n",
"\n",
" try:\n",
" doc = doc.decode(ENCODING)\n",
" except UnicodeEncodeError: # for python 2 compatibility\n",
" doc = doc.encode(ENCODING).decode(ENCODING)\n",
" except AttributeError:\n",
" pass\n",
"\n",
" icat = self.__get_category__(cat)\n",
" cat = self.__categories__[icat]\n",
" word_to_index = self.__word_to_index__\n",
" word_regex = self.__word_regex__\n",
"\n",
" if prep:\n",
" Print.info(\"preprocessing document...\", offset=1)\n",
" stime = time()\n",
" doc = Pp.clean_and_ready(doc)\n",
" Print.info(\"finished --time: %.1fs\" % (time() - stime), offset=1)\n",
" doc = re.findall(\"%s|[^%s]+\" % (word_regex, self.__word_delimiter__), doc)\n",
"\n",
" text_len = len(doc)\n",
" Print.info(\n",
" \"about to learn new document (%d terms)\" % text_len, offset=1\n",
" )\n",
"\n",
" vocab = cat[VOCAB] # getting cat vocab\n",
"\n",
" index_to_word = self.__index_to_word__\n",
" max_frs = self.__max_fr__[icat]\n",
" max_gvs = self.__max_gv__[icat]\n",
"\n",
" stime = time()\n",
" Print.info(\"learning...\", offset=1)\n",
" tips = []\n",
" for word in doc:\n",
" if re.match(word_regex, word):\n",
" self.__prun_counter__ += 1\n",
" # if word doesn't exist yet, then...\n",
" try:\n",
" word = word_to_index[word]\n",
" except KeyError:\n",
" new_index = len(word_to_index)\n",
" word_to_index[word] = new_index\n",
" index_to_word[new_index] = word\n",
" word = new_index\n",
"\n",
" tips.append(vocab)\n",
"\n",
" if len(tips) > n_grams:\n",
" del tips[0]\n",
"\n",
" tips_length = len(tips)\n",
"\n",
" for i in xrange(tips_length):\n",
" tips_i = tips[i]\n",
"\n",
" try:\n",
" max_frs[i]\n",
" except IndexError:\n",
" max_frs.append(1)\n",
" max_gvs.append(0)\n",
"\n",
" try:\n",
" word_info = tips_i[word]\n",
" word_info[FR] += 1\n",
"\n",
" if word_info[FR] > max_frs[(tips_length - 1) - i]:\n",
" max_frs[(tips_length - 1) - i] = word_info[FR]\n",
" except KeyError:\n",
" tips_i[word] = [\n",
" {}, # NEXT/VOCAB\n",
" 1, # FR\n",
" 0, # CV\n",
" 0, # SG\n",
" 0, # GV\n",
" 0 # LV\n",
" ]\n",
" word_info = tips_i[word]\n",
"\n",
" # print i, index_to_word[ word ], tips_i[word][FR]\n",
" tips[i] = word_info[NEXT]\n",
" else:\n",
" tips[:] = []\n",
" if self.__prun_counter__ >= self.__prun_trigger__:\n",
" # trie data-structures pruning\n",
" self.__prune_tries__()\n",
"\n",
" Print.info(\"finished --time: %.1fs\" % (time() - stime), offset=1)\n",
" # updating values\n",
" if update:\n",
" self.update_values(force=True)\n",
"\n",
" def classify(self, doc, prep=True, sort=True, json=False, prep_func=None):\n",
" \"\"\"\n",
" Classify a given document.\n",
"\n",
" :param doc: the content of the document\n",
" :type doc: str\n",
" :param prep: enables the default input preprocessing (default: True)\n",
" :type prep: bool\n",
" :param sort: sort the classification result (from best to worst)\n",
" :type sort: bool\n",
" :param json: return a debugging version of the result in JSON format.\n",
" :type json: bool\n",
" :param prep_func: the custom preprocessing function to be applied to\n",
" the given document before classifying it.\n",
" If not given, the default preprocessing function will\n",
" be used (as long as ``prep=True``)\n",
" :type prep_func: function\n",
" :returns: the document confidence vector if ``sort`` is False.\n",
" If ``sort`` is True, a list of pairs\n",
" (category index, confidence value) ordered by confidence value.\n",
" :rtype: list\n",
" :raises: EmptyModelError\n",
" \"\"\"\n",
" if not self.__categories__:\n",
" raise EmptyModelError\n",
"\n",
" if self.__update_needed__():\n",
" self.update_values()\n",
"\n",
" doc = doc or ''\n",
" try:\n",
" doc = doc.decode(ENCODING)\n",
" except UnicodeEncodeError: # for python 2 compatibility\n",
" doc = doc.encode(ENCODING).decode(ENCODING)\n",
" except BaseException:\n",
" pass\n",
"\n",
" if not json:\n",
" paragraphs_cvs = [\n",
" self.__classify_paragraph__(parag, prep=prep, prep_func=prep_func)\n",
" for parag in re.split(self.__parag_delimiter__, doc)\n",
" if parag\n",
" ]\n",
" if paragraphs_cvs:\n",
" cv = self.summary_op_paragraphs(paragraphs_cvs)\n",
" else:\n",
" cv = self.__zero_cv__\n",
" if sort:\n",
" return sorted(\n",
" [\n",
" (i, cv[i])\n",
" for i in xrange(len(cv))\n",
" ],\n",
" key=lambda e: -e[1]\n",
" )\n",
" return cv\n",
" else:\n",
" info = [\n",
" self.__classify_paragraph__(parag, prep=prep, prep_func=prep_func, json=True)\n",
" for parag in re_split_keep(self.__parag_delimiter__, doc)\n",
" if parag\n",
" ]\n",
"\n",
" nbr_cats = len(self.__categories__)\n",
" cv = self.summary_op_paragraphs([v[\"cv\"] for v in info])\n",
" max_v = max(cv)\n",
"\n",
" if max_v > 1:\n",
" norm_cv = map(lambda x: x / max_v, cv)\n",
" else:\n",
" norm_cv = cv\n",
"\n",
" norm_cv_sorted = sorted(\n",
" [(i, nv, cv[i]) for i, nv in enumerate(norm_cv)],\n",
" key=lambda e: -e[1]\n",
" )\n",
"\n",
" return {\n",
" \"pars\": info,\n",
" \"cv\": cv,\n",
" \"wmv\": reduce(vmax, [v[\"wmv\"] for v in info]),\n",
" \"cvns\": norm_cv_sorted,\n",
" \"ci\": [self.get_category_name(ic) for ic in xrange(nbr_cats)]\n",
" }\n",
"\n",
" def classify_label(self, doc, def_cat=STR_MOST_PROBABLE, labels=True, prep=True):\n",
" \"\"\"\n",
" Classify a given document returning the category label.\n",
"\n",
" :param doc: the content of the document\n",
" :type doc: str\n",
" :param def_cat: default category to be assigned when SS3 is not\n",
" able to classify a document. Options are\n",
" \"most-probable\", \"unknown\" or a given category name.\n",
" (default: \"most-probable\")\n",
" :type def_cat: str\n",
" :param labels: whether to return the category label or just the\n",
" category index (default: True)\n",
" :type labels: bool\n",
" :param prep: enables the default input preprocessing process (default: True)\n",
" :type prep: bool\n",
" :returns: the category label or the category index.\n",
" :rtype: str or int\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" r = self.classify(doc, sort=True, prep=prep)\n",
"\n",
" if not r or not r[0][1]:\n",
" if not def_cat or def_cat == STR_UNKNOWN:\n",
" cat = STR_UNKNOWN_CATEGORY\n",
" elif def_cat == STR_MOST_PROBABLE:\n",
" cat = self.get_most_probable_category()\n",
" else:\n",
" if self.get_category_index(def_cat) == IDX_UNKNOWN_CATEGORY:\n",
" raise InvalidCategoryError\n",
" cat = def_cat\n",
" else:\n",
" cat = self.get_category_name(r[0][0])\n",
"\n",
" return cat if labels else self.get_category_index(cat)\n",
"\n",
" def classify_multilabel(self, doc, def_cat=STR_UNKNOWN, labels=True, prep=True):\n",
" \"\"\"\n",
" Classify a given document returning multiple category labels.\n",
"\n",
" This method could be used to perform multi-label classification. Internally, it\n",
" uses k-mean clustering on the confidence vector to select the proper group of\n",
" labels.\n",
"\n",
" :param doc: the content of the document\n",
" :type doc: str\n",
" :param def_cat: default category to be assigned when SS3 is not\n",
" able to classify a document. Options are\n",
" \"most-probable\", \"unknown\" or a given category name.\n",
" (default: \"unknown\")\n",
" :type def_cat: str\n",
" :param labels: whether to return the category labels or just the\n",
" category indexes (default: True)\n",
" :type labels: bool\n",
" :param prep: enables the default input preprocessing (default: True)\n",
" :type prep: bool\n",
" :returns: the list of category labels (or indexes).\n",
" :rtype: list (of str or int)\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" r = self.classify(doc, sort=True, prep=prep)\n",
"\n",
" if not r or not r[0][1]:\n",
" if not def_cat or def_cat == STR_UNKNOWN:\n",
" return []\n",
" elif def_cat == STR_MOST_PROBABLE:\n",
" cat = self.get_most_probable_category()\n",
" else:\n",
" if self.get_category_index(def_cat) == IDX_UNKNOWN_CATEGORY:\n",
" raise InvalidCategoryError\n",
" cat = def_cat\n",
" if cat != STR_OTHERS_CATEGORY:\n",
" return [cat] if labels else [self.get_category_index(cat)]\n",
" else:\n",
" return []\n",
" else:\n",
" __other_idx__ = self.get_category_index(STR_OTHERS_CATEGORY)\n",
" if labels:\n",
" result = [self.get_category_name(cat_i)\n",
" for cat_i, _ in r[:kmean_multilabel_size(r)]]\n",
" # removing \"hidden\" special category (\"[other]\")\n",
" if __other_idx__ != IDX_UNKNOWN_CATEGORY and STR_OTHERS_CATEGORY in result:\n",
" result.remove(STR_OTHERS_CATEGORY)\n",
" else:\n",
" result = [cat_i for cat_i, _ in r[:kmean_multilabel_size(r)]]\n",
" # removing \"hidden\" special category (\"[other]\")\n",
" if __other_idx__ != IDX_UNKNOWN_CATEGORY and __other_idx__ in result:\n",
" result.remove(__other_idx__)\n",
" return result\n",
"\n",
" def fit(self, x_train, y_train, n_grams=1, prep=True, leave_pbar=True):\n",
" \"\"\"\n",
" Train the model given a list of documents and category labels.\n",
"\n",
" :param x_train: the list of documents\n",
" :type x_train: list (of str)\n",
" :param y_train: the list of document labels\n",
" :type y_train: list of str for singlelabel classification;\n",
" list of list of str for multilabel classification.\n",
" :param n_grams: indicates the maximum ``n``-grams to be learned\n",
" (e.g. a value of ``1`` means only 1-grams (words),\n",
" ``2`` means 1-grams and 2-grams,\n",
" ``3``, 1-grams, 2-grams and 3-grams, and so on.\n",
" :type n_grams: int\n",
" :param prep: enables the default input preprocessing (default: True)\n",
" :type prep: bool\n",
" :param leave_pbar: controls whether to leave the progress bar or\n",
" remove it after finishing.\n",
" :type leave_pbar: bool\n",
" :raises: ValueError\n",
" \"\"\"\n",
" stime = time()\n",
" x_train, y_train = list(x_train), list(y_train)\n",
"\n",
" if len(x_train) != len(y_train):\n",
" raise ValueError(\"`x_train` and `y_train` must have the same length\")\n",
"\n",
" if len(y_train) == 0:\n",
" raise ValueError(\"`x_train` and `y_train` are empty\")\n",
"\n",
" # if it's a multi-label classification problem\n",
" if is_a_collection(y_train[0]):\n",
" # flattening y_train\n",
" labels = [l for y in y_train for l in y]\n",
" self.__multilabel__ = True\n",
" else:\n",
" labels = y_train\n",
"\n",
" cats = sorted(list(set(labels)))\n",
"\n",
" # if it's a single-label classification problem\n",
" if not is_a_collection(y_train[0]):\n",
" x_train = [\n",
" \"\".join([\n",
" x_train[i]\n",
" if x_train[i] and x_train[i][-1] == '\\n'\n",
" else\n",
" x_train[i] + '\\n'\n",
" for i in xrange(len(x_train))\n",
" if y_train[i] == cat\n",
" ])\n",
" for cat in cats\n",
" ]\n",
" y_train = list(cats)\n",
"\n",
" Print.info(\"about to start training\", offset=1)\n",
" Print.verbosity_region_begin(VERBOSITY.NORMAL)\n",
" progress_bar = tqdm(total=len(x_train), desc=\"Training\",\n",
" leave=leave_pbar, disable=Print.is_quiet())\n",
"\n",
" # if it's a multi-label classification problem\n",
" if is_a_collection(y_train[0]):\n",
" __others__ = [STR_OTHERS_CATEGORY]\n",
" for i in range(len(x_train)):\n",
" for label in (y_train[i] if y_train[i] else __others__):\n",
" self.learn(\n",
" x_train[i], label,\n",
" n_grams=n_grams, prep=prep, update=False\n",
" )\n",
" progress_bar.update(1)\n",
" else:\n",
" for i in range(len(x_train)):\n",
" progress_bar.set_description_str(\"Training on '%s'\" % str(y_train[i]))\n",
" self.learn(\n",
" x_train[i], y_train[i],\n",
" n_grams=n_grams, prep=prep, update=False\n",
" )\n",
" progress_bar.update(1)\n",
" progress_bar.close()\n",
" self.__prune_tries__()\n",
" Print.verbosity_region_end()\n",
" Print.info(\"finished --time: %.1fs\" % (time() - stime), offset=1)\n",
" self.update_values(force=True)\n",
"\n",
" def predict_proba(self, x_test, prep=True, leave_pbar=True):\n",
" \"\"\"\n",
" Classify a list of documents returning a list of confidence vectors.\n",
"\n",
" :param x_test: the list of documents to be classified\n",
" :type x_test: list (of str)\n",
" :param prep: enables the default input preprocessing (default: True)\n",
" :type prep: bool\n",
" :param leave_pbar: controls whether to leave the progress bar after\n",
" finishing or remove it.\n",
" :type leave_pbar: bool\n",
" :returns: the list of confidence vectors\n",
" :rtype: list (of list of float)\n",
" :raises: EmptyModelError\n",
" \"\"\"\n",
" if not self.__categories__:\n",
" raise EmptyModelError\n",
"\n",
" if self.get_ngrams_length() == 1 and self.__summary_ops_are_pristine__():\n",
" return self.__predict_fast__(x_test, prep=prep,\n",
" leave_pbar=leave_pbar, proba=True)\n",
"\n",
" x_test = list(x_test)\n",
" classify = self.classify\n",
" return [\n",
" classify(x, sort=False)\n",
" for x in tqdm(x_test, desc=\"Classification\", disable=Print.is_quiet())\n",
" ]\n",
"\n",
" def predict(\n",
" self, x_test, def_cat=None,\n",
" labels=True, multilabel=False, prep=True, leave_pbar=True\n",
" ):\n",
" \"\"\"\n",
" Classify a list of documents.\n",
"\n",
" :param x_test: the list of documents to be classified\n",
" :type x_test: list (of str)\n",
" :param def_cat: default category to be assigned when SS3 is not\n",
" able to classify a document. Options are\n",
" \"most-probable\", \"unknown\" or a given category name.\n",
" (default: \"most-probable\", or \"unknown\" for\n",
" multi-label classification)\n",
" :type def_cat: str\n",
" :param labels: whether to return the list of category names or just\n",
" category indexes\n",
" :type labels: bool\n",
" :param multilabel: whether to perform multi-label classification or not.\n",
" if enabled, for each document returns a ``list`` of labels\n",
" instead of a single label (``str``).\n",
" If the model was trained using multilabeled data, then this\n",
" argument will be ignored and set to True.\n",
" :type multilabel: bool\n",
" :param prep: enables the default input preprocessing (default: True)\n",
" :type prep: bool\n",
" :param leave_pbar: controls whether to leave the progress bar or\n",
" remove it after finishing.\n",
" :type leave_pbar: bool\n",
" :returns: if ``labels`` is True, the list of category names,\n",
" otherwise, the list of category indexes.\n",
" :rtype: list (of int or str)\n",
" :raises: EmptyModelError, InvalidCategoryError\n",
" \"\"\"\n",
" if not self.__categories__:\n",
" raise EmptyModelError\n",
"\n",
" multilabel = multilabel or self.__multilabel__\n",
"\n",
" if def_cat is None:\n",
" def_cat = STR_UNKNOWN if multilabel else STR_MOST_PROBABLE\n",
"\n",
" if not def_cat or def_cat == STR_UNKNOWN:\n",
" if not multilabel:\n",
" Print.info(\n",
" \"default category was set to 'unknown' (its index will be -1)\",\n",
" offset=1\n",
" )\n",
" else:\n",
" if def_cat == STR_MOST_PROBABLE:\n",
" Print.info(\n",
" \"default category was automatically set to '%s' \"\n",
" \"(the most probable one)\" % self.get_most_probable_category(),\n",
" offset=1\n",
" )\n",
" else:\n",
" Print.info(\"default category was set to '%s'\" % def_cat, offset=1)\n",
" if self.get_category_index(def_cat) == IDX_UNKNOWN_CATEGORY:\n",
" raise InvalidCategoryError\n",
"\n",
" if self.get_ngrams_length() == 1 and self.__summary_ops_are_pristine__():\n",
" return self.__predict_fast__(x_test, def_cat=def_cat, labels=labels,\n",
" multilabel=multilabel, prep=prep,\n",
" leave_pbar=leave_pbar)\n",
"\n",
" stime = time()\n",
" Print.info(\"about to start classifying test documents\", offset=1)\n",
" classify = self.classify_label if not multilabel else self.classify_multilabel\n",
" x_test = list(x_test)\n",
" y_pred = [\n",
" classify(doc, def_cat=def_cat, labels=labels, prep=prep)\n",
" for doc in tqdm(x_test, desc=\"Classification\",\n",
" leave=leave_pbar, disable=Print.is_quiet())\n",
" ]\n",
"\n",
" Print.info(\"finished --time: %.1fs\" % (time() - stime), offset=1)\n",
" return y_pred\n",
"\n",
" def cv(self, ngram, cat):\n",
" \"\"\"\n",
" Return the \"confidence value\" of a given word n-gram for the given category.\n",
"\n",
" This value is obtained applying a final transformation on the global value\n",
" of the given word n-gram using the gv function [*].\n",
"\n",
" These transformation are given when creating a new SS3 instance (see the\n",
" SS3 class constructor's ``cv_m`` argument for more information).\n",
"\n",
" [*] the gv function is defined in Section 3.2.2 of the original paper:\n",
" https://arxiv.org/pdf/1905.08772.pdf\n",
"\n",
" Examples:\n",
"\n",
" >>> clf.cv(\"chicken\", \"food\")\n",
" >>> clf.cv(\"roast chicken\", \"food\")\n",
" >>> clf.cv(\"chicken\", \"sports\")\n",
"\n",
" :param ngram: the word or word n-gram\n",
" :type ngram: str\n",
" :param cat: the category label\n",
" :type cat: str\n",
" :returns: the confidence value\n",
" :rtype: float\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" return self.__apply_fn__(self.__cv__, ngram, cat)\n",
"\n",
" def gv(self, ngram, cat):\n",
" \"\"\"\n",
" Return the \"global value\" of a given word n-gram for the given category.\n",
"\n",
" (gv function is defined in Section 3.2.2 of the original paper:\n",
" https://arxiv.org/pdf/1905.08772.pdf)\n",
"\n",
" Examples:\n",
"\n",
" >>> clf.gv(\"chicken\", \"food\")\n",
" >>> clf.gv(\"roast chicken\", \"food\")\n",
" >>> clf.gv(\"chicken\", \"sports\")\n",
"\n",
" :param ngram: the word or word n-gram\n",
" :type ngram: str\n",
" :param cat: the category label\n",
" :type cat: str\n",
" :returns: the global value\n",
" :rtype: float\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" return self.__apply_fn__(self.__gv__, ngram, cat)\n",
"\n",
" def lv(self, ngram, cat):\n",
" \"\"\"\n",
" Return the \"local value\" of a given word n-gram for the given category.\n",
"\n",
" (lv function is defined in Section 3.2.2 of the original paper:\n",
" https://arxiv.org/pdf/1905.08772.pdf)\n",
"\n",
" Examples:\n",
"\n",
" >>> clf.lv(\"chicken\", \"food\")\n",
" >>> clf.lv(\"roast chicken\", \"food\")\n",
" >>> clf.lv(\"chicken\", \"sports\")\n",
"\n",
" :param ngram: the word or word n-gram\n",
" :type ngram: str\n",
" :param cat: the category label\n",
" :type cat: str\n",
" :returns: the local value\n",
" :rtype: float\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" return self.__apply_fn__(self.__lv__, ngram, cat)\n",
"\n",
" def sg(self, ngram, cat):\n",
" \"\"\"\n",
" Return the \"significance factor\" of a given word n-gram for the given category.\n",
"\n",
" (sg function is defined in Section 3.2.2 of the original paper:\n",
" https://arxiv.org/pdf/1905.08772.pdf)\n",
"\n",
" Examples:\n",
"\n",
" >>> clf.sg(\"chicken\", \"food\")\n",
" >>> clf.sg(\"roast chicken\", \"food\")\n",
" >>> clf.sg(\"chicken\", \"sports\")\n",
"\n",
" :param ngram: the word or word n-gram\n",
" :type ngram: str\n",
" :param cat: the category label\n",
" :type cat: str\n",
" :returns: the significance factor\n",
" :rtype: float\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" return self.__apply_fn__(self.__sg__, ngram, cat)\n",
"\n",
" def sn(self, ngram, cat):\n",
" \"\"\"\n",
" Return the \"sanction factor\" of a given word n-gram for the given category.\n",
"\n",
" (sn function is defined in Section 3.2.2 of the original paper:\n",
" https://arxiv.org/pdf/1905.08772.pdf)\n",
"\n",
" Examples:\n",
"\n",
" >>> clf.sn(\"chicken\", \"food\")\n",
" >>> clf.sn(\"roast chicken\", \"food\")\n",
" >>> clf.sn(\"chicken\", \"sports\")\n",
"\n",
" :param ngram: the word or word n-gram\n",
" :type ngram: str\n",
" :param cat: the category label\n",
" :type cat: str\n",
" :returns: the sanction factor\n",
" :rtype: float\n",
" :raises: InvalidCategoryError\n",
" \"\"\"\n",
" return self.__apply_fn__(self.__sn__, ngram, cat)\n",
"\n",
"\n",
"class EmptyModelError(Exception):\n",
" \"\"\"Exception to be thrown when the model is empty.\"\"\"\n",
"\n",
" def __init__(self, msg=''):\n",
" \"\"\"Class constructor.\"\"\"\n",
" Exception.__init__(\n",
" self,\n",
" \"The model is empty (it hasn't been trained yet).\"\n",
" )\n",
"\n",
"\n",
"class InvalidCategoryError(Exception):\n",
" \"\"\"Exception to be thrown when a category is not valid.\"\"\"\n",
"\n",
" def __init__(self, msg=''):\n",
" \"\"\"Class constructor.\"\"\"\n",
" Exception.__init__(\n",
" self,\n",
" \"The given category is not valid\"\n",
" )\n",
"\n",
"\n",
"def kmean_multilabel_size(res):\n",
" \"\"\"\n",
" Use k-means to tell where to split the ``SS3.classify'''s output.\n",
"\n",
" Given a ``SS3.classify``'s output (``res``), tell where to partition it\n",
" into 2 clusters so that one of the cluster holds the category labels that\n",
" the classifier should output when performing multi-label classification.\n",
" To achieve this, implement k-means (i.e. 2-means) clustering over the\n",
" category confidence values in ``res``.\n",
"\n",
" :param res: the classification output of ``SS3.classify``\n",
" :type res: list (of sorted pairs (category, confidence value))\n",
" :returns: a positive integer indicating where to split ``res``\n",
" :rtype: int\n",
" \"\"\"\n",
" cent = {\"neg\": -1, \"pos\": -1} # centroids (2 clusters: \"pos\" and \"neg\")\n",
" clust = {\"neg\": [], \"pos\": []} # clusters (2 clusters: \"pos\" and \"neg\")\n",
" new_cent_neg = res[-1][1]\n",
" new_cent_pos = res[0][1]\n",
"\n",
" if new_cent_neg == new_cent_pos:\n",
" return 0\n",
"\n",
" while (cent[\"pos\"] != new_cent_pos) or (cent[\"neg\"] != new_cent_neg):\n",
" cent[\"neg\"], cent[\"pos\"] = new_cent_neg, new_cent_pos\n",
" clust[\"neg\"], clust[\"pos\"] = [], []\n",
" for _, cat_cv in res:\n",
" if abs(cent[\"neg\"] - cat_cv) < abs(cent[\"pos\"] - cat_cv):\n",
" clust[\"neg\"].append(cat_cv)\n",
" else:\n",
" clust[\"pos\"].append(cat_cv)\n",
" if len(clust[\"neg\"]) > 0:\n",
" new_cent_neg = sum(clust[\"neg\"]) / len(clust[\"neg\"])\n",
" if len(clust[\"pos\"]) > 0:\n",
" new_cent_pos = sum(clust[\"pos\"]) / len(clust[\"pos\"])\n",
" return len(clust[\"pos\"])\n",
"\n",
"\n",
"def sigmoid(v, l):\n",
" \"\"\"A sigmoid function.\"\"\"\n",
" try:\n",
" return .5 * tanh((3. / l) * v - 3) + .5\n",
" except ZeroDivisionError:\n",
" return 0\n",
"\n",
"\n",
"def mad(values, n):\n",
" \"\"\"Median absolute deviation mean.\"\"\"\n",
" if len(values) < n:\n",
" values += [0] * int(n - len(values))\n",
" values.sort()\n",
" if n == 2:\n",
" return (values[0], values[0])\n",
" values_m = n // 2 if n % 2 else n // 2 - 1\n",
" m = values[values_m] # Median\n",
" sd = sum([abs(m - lv) for lv in values]) / float(n) # sd Mean\n",
" return m, sd\n",
"\n",
"\n",
"def key_as_int(dct):\n",
" \"\"\"Cast the given dictionary (numerical) keys to int.\"\"\"\n",
" keys = list(dct)\n",
" if len(keys) and keys[0].isdigit():\n",
" new_dct = {}\n",
" for key in dct:\n",
" new_dct[int(key)] = dct[key]\n",
" return new_dct\n",
" return dct\n",
"\n",
"\n",
"def re_split_keep(regex, string):\n",
" \"\"\"\n",
" Force the inclusion of unmatched items by re.split.\n",
"\n",
" This allows keeping the original content after splitting the input\n",
" document for later use (e.g. for using it from the Live Test)\n",
" \"\"\"\n",
" if not re.match(r\"\\(.*\\)\", regex):\n",
" regex = \"(%s)\" % regex\n",
" return re.split(regex, string)\n",
"\n",
"\n",
"def list_hash(str_list):\n",
" \"\"\"\n",
" Return a hash value for a given list of string.\n",
"\n",
" :param str_list: a list of strings (e.g. x_test)\n",
" :type str_list: list (of str)\n",
" :returns: an MD5 hash value\n",
" :rtype: str\n",
" \"\"\"\n",
" import hashlib\n",
" m = hashlib.md5()\n",
" for doc in str_list:\n",
" try:\n",
" m.update(doc)\n",
" except (TypeError, UnicodeEncodeError):\n",
" m.update(doc.encode('ascii', 'ignore'))\n",
" return m.hexdigest()\n",
"\n",
"\n",
"def vsum(v0, v1):\n",
" \"\"\"Vectorial version of sum.\"\"\"\n",
" return [v0[i] + v1[i] for i in xrange(len(v0))]\n",
"\n",
"\n",
"def vmax(v0, v1):\n",
" \"\"\"Vectorial version of max.\"\"\"\n",
" return [max(v0[i], v1[i]) for i in xrange(len(v0))]\n",
"\n",
"\n",
"def vdiv(v0, v1):\n",
" \"\"\"Vectorial version of division.\"\"\"\n",
" return [v0[i] / v1[i] if v1[i] else 0 for i in xrange(len(v0))]\n",
"\n",
"\n",
"def set_verbosity(level):\n",
" \"\"\"\n",
" Set the verbosity level.\n",
"\n",
" - ``0`` (quiet): do not output any message (only error messages)\n",
" - ``1`` (normal): default behavior, display only warning messages and progress bars\n",
" - ``2`` (verbose): display also the informative non-essential messages\n",
"\n",
" The following built-in constants can also be used to refer to these 3 values:\n",
" ``VERBOSITY.QUIET``, ``VERBOSITY.NORMAL``, and ``VERBOSITY.VERBOSE``, respectively.\n",
"\n",
" For example, if you want PySS3 to hide everything, even progress bars, you could simply do:\n",
"\n",
" >>> import pyss3\n",
" ...\n",
" >>> pyss3.set_verbosity(0)\n",
" ...\n",
" >>> # here's the rest of your code :D\n",
"\n",
" or, equivalently:\n",
"\n",
" >>> import pyss3\n",
" >>> from pyss3 import VERBOSITY\n",
" ...\n",
" >>> pyss3.set_verbosity(VERBOSITY.QUIET)\n",
" ...\n",
" >>> # here's the rest of your code :D\n",
"\n",
" :param level: the verbosity level\n",
" :type level: int\n",
" \"\"\"\n",
" Print.set_verbosity(level)\n",
"\n",
"\n",
"# user-friendly aliases\n",
"SS3.set_smoothness = SS3.set_s\n",
"SS3.get_smoothness = SS3.get_s\n",
"SS3.set_significance = SS3.set_l\n",
"SS3.get_significance = SS3.get_l\n",
"SS3.set_sanction = SS3.set_p\n",
"SS3.get_sanction = SS3.get_p\n",
"SS3.set_alpha = SS3.set_a\n",
"SS3.get_alpha = SS3.get_a\n",
"SS3.get_alpha = SS3.get_a\n",
"SS3.train = SS3.fit\n",
"SS3.save = SS3.save_model\n",
"SS3.load = SS3.load_model\n",
"SS3.update = SS3.update_values\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ik6zWFeBe2kf",
"outputId": "e8c25721-d46f-4e00-bd18-372096cdb551"
},
"source": [
"!curl https://raw.githubusercontent.com/sergioburdisso/pyss3/master/examples/datasets/topic.zip --output topic.zip"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 6485k 100 6485k 0 0 10.5M 0 --:--:-- --:--:-- --:--:-- 10.5M\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZW4LvzfxfOJG",
"outputId": "a8857b93-f027-4739-e426-41222b653b6e"
},
"source": [
"!unzip -u topic.zip"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"Archive: topic.zip\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zD9KDwgyh_SC"
},
"source": [
"# Train with whole dataset at once"
]
},
{
"cell_type": "code",
"metadata": {
"id": "1G1UvpL2ffhz"
},
"source": [
"clf = SS3()"
],
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GWJY9sJNfqMw",
"outputId": "9dbf5e06-28d8-4ed2-9962-ae1ee84a2cff"
},
"source": [
"x_train, y_train = Dataset.load_from_files(\"topic/train\", folder_label=False)\n",
"x_test, y_test = Dataset.load_from_files(\"topic/test\", folder_label=False)"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"Loading 'music' documents: 100%|██████████| 8/8 [00:00<00:00, 54.60it/s]\n",
"Loading 'music' documents: 100%|██████████| 8/8 [00:00<00:00, 492.72it/s]\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Bj7VIXFnfwFp",
"outputId": "dd64402f-060e-47d9-9fab-7d1d07a160e0"
},
"source": [
"clf.train(x_train, y_train)\n",
"clf.print_categories_info()"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
"Training on 'sports': 100%|██████████| 8/8 [00:26<00:00, 3.30s/it]\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"\n",
"\n",
" \u001b[92m\u001b[4m\u001b[1mCATEGORIES\u001b[0m\u001b[0m\u001b[0m:\n",
"\n",
"\u001b[93m\t Category \u001b[0m| Index | Length | Vocab. Size | Word Max. Fr. | N-gram |\n",
"\t-------------------|-------|------------|-------------|---------------|--------|\n",
"art&photography 35597\n",
"beauty&fashion 29072\n",
"business&finance 19851\n",
"food 35993\n",
"health 25677\n",
"music 25298\n",
"science&technology 27361\n",
"sports 20019\n",
"\n",
"\t\u001b[4m\u001b[1mMost Probable Category\u001b[0m\u001b[0m: \u001b[93mhealth\u001b[0m \u001b[94m(15.96%)\u001b[0m\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 920
},
"id": "uHjkVHcuhgtz",
"outputId": "da11443d-4508-4010-c9d4-d2ae79acb953"
},
"source": [
"Evaluation.test(clf, x_test, y_test)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Classification: 100%|██████████| 800/800 [00:00<00:00, 33122.19it/s]\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"\n",
" precision recall f1-score support\n",
"\n",
" art&photography 0.63 0.88 0.73 100\n",
" beauty&fashion 0.77 0.68 0.72 100\n",
" business&finance 0.76 0.71 0.73 100\n",
" food 0.76 0.25 0.38 100\n",
" health 0.53 0.80 0.64 100\n",
" music 0.85 0.75 0.80 100\n",
"science&technology 0.60 0.75 0.67 100\n",
" sports 0.99 0.81 0.89 100\n",
"\n",
" accuracy 0.70 800\n",
" macro avg 0.74 0.70 0.69 800\n",
" weighted avg 0.74 0.70 0.69 800\n",
"\n",
"\n",
" \u001b[1mAccuracy\u001b[0m: 0.704\n",
"\n",
"\u001b[94m[ updating evaluations cache ]\u001b[0m\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 576x576 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.70375"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KMxtRq0KiFBy"
},
"source": [
"# Train incrementally, one fraction of the dataset at a time"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qGMuzKfofzQU",
"outputId": "6041ce1d-2b15-446b-daf0-466b8403914e"
},
"source": [
"clf2 = SS3()\n",
"x_train1, x_train2 = x_train[:len(x_train)//2], x_train[len(x_train)//2:]\n",
"y_train1, y_train2 = y_train[:len(y_train)//2], y_train[len(y_train)//2:]\n",
"clf2.train(x_train1, y_train1)\n",
"clf2.train(x_train2, y_train2)\n",
"clf2.print_categories_info()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Training on 'sports': 100%|██████████| 5/5 [00:13<00:00, 2.71s/it]\n",
"Training on 'science&technology': 100%|██████████| 4/4 [00:12<00:00, 3.18s/it]\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"\n",
"\n",
" \u001b[92m\u001b[4m\u001b[1mCATEGORIES\u001b[0m\u001b[0m\u001b[0m:\n",
"\n",
"\u001b[93m\t Category \u001b[0m| Index | Length | Vocab. Size | Word Max. Fr. | N-gram |\n",
"\t-------------------|-------|------------|-------------|---------------|--------|\n",
"\u001b[93m\t beauty&fashion \u001b[0m| 0 | 574151 | 29072 | 27599 | 1 |\n",
"\u001b[93m\t business&finance \u001b[0m| 1 | 424383 | 19851 | 20855 | 1 |\n",
"\u001b[93m\t food \u001b[0m| 2 | 725110 | 35993 | 40588 | 1 |\n",
"\u001b[93m\t health \u001b[0m| 3 | 727389 | 25677 | 45729 | 1 |\n",
"\u001b[93m\t sports \u001b[0m| 4 | 498989 | 20019 | 33277 | 1 |\n",
"\u001b[93m\t art&photography \u001b[0m| 5 | 580241 | 35597 | 20913 | 1 |\n",
"\u001b[93m\t music \u001b[0m| 6 | 437098 | 25298 | 27204 | 1 |\n",
"\u001b[93m\tscience&technology \u001b[0m| 7 | 589144 | 27361 | 21620 | 1 |\n",
"\n",
"\t\u001b[4m\u001b[1mMost Probable Category\u001b[0m\u001b[0m: \u001b[93mhealth\u001b[0m \u001b[94m(15.96%)\u001b[0m\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 885
},
"id": "O18cJOoFhoso",
"outputId": "7242105c-4c6d-4339-e021-a6e33c99228d"
},
"source": [
"Evaluation.test(clf2, x_test, y_test)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"\n",
" precision recall f1-score support\n",
"\n",
" art&photography 0.63 0.88 0.73 100\n",
" beauty&fashion 0.77 0.68 0.72 100\n",
" business&finance 0.76 0.71 0.73 100\n",
" food 0.76 0.25 0.38 100\n",
" health 0.53 0.80 0.64 100\n",
" music 0.85 0.75 0.80 100\n",
"science&technology 0.60 0.75 0.67 100\n",
" sports 0.99 0.81 0.89 100\n",
"\n",
" accuracy 0.70 800\n",
" macro avg 0.74 0.70 0.69 800\n",
" weighted avg 0.74 0.70 0.69 800\n",
"\n",
"\n",
" \u001b[1mAccuracy\u001b[0m: 0.704\n",
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 576x576 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.70375"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment