bnagy/collatinus.ipynb Secret

## collatinus.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Before any of this will work you need Collatinus installed,\n",
    "# running, and with the Server mode activated. You can set your\n",
    "# language in the app, and it will be respected by the server.\n",
    "\n",
    "import socket\n",
    "\n",
    "class Client(object):\n",
    "    def __init__(self, host=\"127.0.0.1\", port=5555):\n",
    "        self._host = host\n",
    "        self._port = port\n",
    "        keys=['æ','Æ','œ','Œ','̀','́','à','á','è','é','ì','í','ò','ó','ù','ú','À','Á','È','É','Ì','Í','Ò','Ó','Ù','Ú',\n",
    "            '̄','̆','ā','ă','ē','ĕ','ī','ĭ','ō','ŏ','ū','ŭ','Ā','Ă','Ē','Ĕ','Ī','Ĭ','Ō','Ŏ','Ū','Ŭ']\n",
    "        vals = ['ae','Ae','oe','Oe','','','a','a','e','e','i','i','o','o','u','u','A','A','E','E','I','I','O','O','U','U',\n",
    "            '','','a','a','e','e','i','i','o','o','u','u','A','A','E','E','I','I','O','O','U','U']\n",
    "        d = dict(zip(keys,vals))\n",
    "        self._accent_trans = str.maketrans(d)\n",
    "\n",
    "    def _run_remote_command(self, cmd):\n",
    "        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:\n",
    "            sock.settimeout(1)\n",
    "            sock.connect((self._host, self._port))\n",
    "            sock.sendall((cmd).encode('utf8'))\n",
    "            chunks = []\n",
    "            while True:\n",
    "                try:\n",
    "                    chunk = sock.recv(4096)\n",
    "                    chunks.append(chunk)\n",
    "                except socket.timeout:\n",
    "                    # the only time this should happen is if the server sends\n",
    "                    # precisely 4096 bytes of data. We'd then try to recv again\n",
    "                    # and block forever.\n",
    "                    break\n",
    "                if len(chunk) < 4096:\n",
    "                    break\n",
    "\n",
    "            out = b''.join(chunks).decode('utf-8')\n",
    "            return out\n",
    "\n",
    "    def tag_best(self, latin_string, strip_accents=True):\n",
    "        out = self._run_remote_command(('-P2 \"{}\"').format(latin_string))\n",
    "        if strip_accents:\n",
    "            out = out.translate(self._accent_trans)\n",
    "        return [l.split('\\t') for l in out.split('\\n')]\n",
    "\n",
    "\n",
    "    def tag(self, latin_string, strip_accents=True):\n",
    "        out = self._run_remote_command(('-P3 \"{}\"').format(latin_string))\n",
    "        if strip_accents:\n",
    "            out = out.translate(self._accent_trans)\n",
    "        return [l.split('\\t') for l in out.split('\\n')]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['1',\n",
       "  '1',\n",
       "  '1',\n",
       "  'cadam',\n",
       "  'v1 ',\n",
       "  'cado',\n",
       "  'cado, is, ere, cecidi, casum',\n",
       "  '687',\n",
       "  'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
       "  'cadam future indicative active 1st singular']]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tag_best gives us just the best guess from the engine. For\n",
    "# docs (such as they are) on the result format see the various\n",
    "# files in https://github.com/biblissima/collatinus/\n",
    "# Summary: the first numbers are word and phrase indices. We have\n",
    "# the word being tagged, a lemma, the number of hits in the LASLA\n",
    "# corpus (I think), and the definition and morphology.\n",
    "\n",
    "c = Client()\n",
    "c.tag_best(\"cadam\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['1',\n",
       "  '1',\n",
       "  '1',\n",
       "  'cadam',\n",
       "  'v1 ',\n",
       "  'cado',\n",
       "  'cado, is, ere, cecidi, casum',\n",
       "  '687',\n",
       "  'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
       "  'cadam future indicative active 1st singular'],\n",
       " ['1',\n",
       "  '1',\n",
       "  '1',\n",
       "  'cadam',\n",
       "  'v1 ',\n",
       "  'cado',\n",
       "  'cado, is, ere, cecidi, casum',\n",
       "  '687',\n",
       "  'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
       "  'cadam future indicative active 1st singular (v1  : 0.901807)'],\n",
       " ['1',\n",
       "  '1',\n",
       "  '1',\n",
       "  'cadam',\n",
       "  'v1 ',\n",
       "  'cado',\n",
       "  'cado, is, ere, cecidi, casum',\n",
       "  '687',\n",
       "  'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
       "  'cadam present subjunctive active 1st singular (v21 : 0.0981932)']]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tag gives us the best statistical match first, followed by all of\n",
    "# the alternatives with their scores. E.g. here the future indicative\n",
    "# active (0.90) scores much higher than the present subjunctive active\n",
    "# (0.10), which is still listed as an alternative.\n",
    "\n",
    "c.tag(\"cadam\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['1', '1', '1', 'Sausages', '', 'unknown'],\n",
       " ['2',\n",
       "  '2',\n",
       "  '1',\n",
       "  'SaRcIna',\n",
       "  'n11 (n61)',\n",
       "  'sarcina',\n",
       "  'sarcina, ae, f.',\n",
       "  '81',\n",
       "  \"pack, bundle, soldier's kit; baggage (pl.), belongings, chattels; load, burden;\",\n",
       "  'sarcina nominative singular']]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The tagger is pretty robust, which is nice.\n",
    "c.tag_best(\"Sausages! &%$SaRcIna?<:\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"Omnis homines, qui sese student praestare ceteris animalibus, \n",
    "summa ope niti decet, ne vitam silentio transeant veluti pecora, \n",
    "quae natura prona atque ventri oboedientia finxit. Sed nostra omnis \n",
    "vis in animo et corpore sita est: animi imperio, corporis servitio \n",
    "magis utimur; alterum nobis cum dis, alterum cum beluis commune est. \n",
    "Quo mihi rectius videtur ingeni quam virium opibus gloriam quaerere \n",
    "et, quoniam vita ipsa, qua fruimur, brevis est, memoriam nostri quam \n",
    "maxume longam efficere. Nam divitiarum et formae gloria fluxa atque \n",
    "fragilis est, virtus clara aeternaque habetur. Sed diu magnum inter \n",
    "mortalis certamen fuit, vine corporis an virtute animi res militaris \n",
    "magis procederet. Nam et, prius quam incipias, consulto et, ubi \n",
    "consulueris, mature facto opus est. Ita utrumque per se indigens \n",
    "alterum alterius auxilio eget.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.75 ms, sys: 1.34 ms, total: 4.08 ms\n",
      "Wall time: 83.3 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# it's also very fast.\n",
    "\n",
    "results = c.tag_best(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['omnis', 'homo', 'qui', 'se', 'studeo', 'praesto', 'ceterum', 'animal', 'summa', 'ops', 'nitor', 'decet', 'ne', 'vita', 'silentium', 'transeo', 'veluti', 'pecus', 'qui', 'natura', 'pronus', 'atque', 'venter', 'oboedientia', 'fingo', 'sed', 'noster', 'omnis', 'volo', 'in, indu']\n"
     ]
    }
   ],
   "source": [
    "print([r[5] for r in results][:30])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ling",
   "language": "python",
   "name": "ling"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Before any of this will work you need Collatinus installed,\n",
	"# running, and with the Server mode activated. You can set your\n",
	"# language in the app, and it will be respected by the server.\n",
	"\n",
	"import socket\n",
	"\n",
	"class Client(object):\n",
	" def __init__(self, host=\"127.0.0.1\", port=5555):\n",
	" self._host = host\n",
	" self._port = port\n",
	" keys=['æ','Æ','œ','Œ','̀','́','à','á','è','é','ì','í','ò','ó','ù','ú','À','Á','È','É','Ì','Í','Ò','Ó','Ù','Ú',\n",
	" '̄','̆','ā','ă','ē','ĕ','ī','ĭ','ō','ŏ','ū','ŭ','Ā','Ă','Ē','Ĕ','Ī','Ĭ','Ō','Ŏ','Ū','Ŭ']\n",
	" vals = ['ae','Ae','oe','Oe','','','a','a','e','e','i','i','o','o','u','u','A','A','E','E','I','I','O','O','U','U',\n",
	" '','','a','a','e','e','i','i','o','o','u','u','A','A','E','E','I','I','O','O','U','U']\n",
	" d = dict(zip(keys,vals))\n",
	" self._accent_trans = str.maketrans(d)\n",
	"\n",
	" def _run_remote_command(self, cmd):\n",
	" with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:\n",
	" sock.settimeout(1)\n",
	" sock.connect((self._host, self._port))\n",
	" sock.sendall((cmd).encode('utf8'))\n",
	" chunks = []\n",
	" while True:\n",
	" try:\n",
	" chunk = sock.recv(4096)\n",
	" chunks.append(chunk)\n",
	" except socket.timeout:\n",
	" # the only time this should happen is if the server sends\n",
	" # precisely 4096 bytes of data. We'd then try to recv again\n",
	" # and block forever.\n",
	" break\n",
	" if len(chunk) < 4096:\n",
	" break\n",
	"\n",
	" out = b''.join(chunks).decode('utf-8')\n",
	" return out\n",
	"\n",
	" def tag_best(self, latin_string, strip_accents=True):\n",
	" out = self._run_remote_command(('-P2 \"{}\"').format(latin_string))\n",
	" if strip_accents:\n",
	" out = out.translate(self._accent_trans)\n",
	" return [l.split('\\t') for l in out.split('\\n')]\n",
	"\n",
	"\n",
	" def tag(self, latin_string, strip_accents=True):\n",
	" out = self._run_remote_command(('-P3 \"{}\"').format(latin_string))\n",
	" if strip_accents:\n",
	" out = out.translate(self._accent_trans)\n",
	" return [l.split('\\t') for l in out.split('\\n')]\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[['1',\n",
	" '1',\n",
	" '1',\n",
	" 'cadam',\n",
	" 'v1 ',\n",
	" 'cado',\n",
	" 'cado, is, ere, cecidi, casum',\n",
	" '687',\n",
	" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
	" 'cadam future indicative active 1st singular']]"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# tag_best gives us just the best guess from the engine. For\n",
	"# docs (such as they are) on the result format see the various\n",
	"# files in https://github.com/biblissima/collatinus/\n",
	"# Summary: the first numbers are word and phrase indices. We have\n",
	"# the word being tagged, a lemma, the number of hits in the LASLA\n",
	"# corpus (I think), and the definition and morphology.\n",
	"\n",
	"c = Client()\n",
	"c.tag_best(\"cadam\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[['1',\n",
	" '1',\n",
	" '1',\n",
	" 'cadam',\n",
	" 'v1 ',\n",
	" 'cado',\n",
	" 'cado, is, ere, cecidi, casum',\n",
	" '687',\n",
	" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
	" 'cadam future indicative active 1st singular'],\n",
	" ['1',\n",
	" '1',\n",
	" '1',\n",
	" 'cadam',\n",
	" 'v1 ',\n",
	" 'cado',\n",
	" 'cado, is, ere, cecidi, casum',\n",
	" '687',\n",
	" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
	" 'cadam future indicative active 1st singular (v1 : 0.901807)'],\n",
	" ['1',\n",
	" '1',\n",
	" '1',\n",
	" 'cadam',\n",
	" 'v1 ',\n",
	" 'cado',\n",
	" 'cado, is, ere, cecidi, casum',\n",
	" '687',\n",
	" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
	" 'cadam present subjunctive active 1st singular (v21 : 0.0981932)']]"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# tag gives us the best statistical match first, followed by all of\n",
	"# the alternatives with their scores. E.g. here the future indicative\n",
	"# active (0.90) scores much higher than the present subjunctive active\n",
	"# (0.10), which is still listed as an alternative.\n",
	"\n",
	"c.tag(\"cadam\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[['1', '1', '1', 'Sausages', '', 'unknown'],\n",
	" ['2',\n",
	" '2',\n",
	" '1',\n",
	" 'SaRcIna',\n",
	" 'n11 (n61)',\n",
	" 'sarcina',\n",
	" 'sarcina, ae, f.',\n",
	" '81',\n",
	" \"pack, bundle, soldier's kit; baggage (pl.), belongings, chattels; load, burden;\",\n",
	" 'sarcina nominative singular']]"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# The tagger is pretty robust, which is nice.\n",
	"c.tag_best(\"Sausages! &%$SaRcIna?<:\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"text = \"\"\"Omnis homines, qui sese student praestare ceteris animalibus, \n",
	"summa ope niti decet, ne vitam silentio transeant veluti pecora, \n",
	"quae natura prona atque ventri oboedientia finxit. Sed nostra omnis \n",
	"vis in animo et corpore sita est: animi imperio, corporis servitio \n",
	"magis utimur; alterum nobis cum dis, alterum cum beluis commune est. \n",
	"Quo mihi rectius videtur ingeni quam virium opibus gloriam quaerere \n",
	"et, quoniam vita ipsa, qua fruimur, brevis est, memoriam nostri quam \n",
	"maxume longam efficere. Nam divitiarum et formae gloria fluxa atque \n",
	"fragilis est, virtus clara aeternaque habetur. Sed diu magnum inter \n",
	"mortalis certamen fuit, vine corporis an virtute animi res militaris \n",
	"magis procederet. Nam et, prius quam incipias, consulto et, ubi \n",
	"consulueris, mature facto opus est. Ita utrumque per se indigens \n",
	"alterum alterius auxilio eget.\n",
	"\"\"\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 2.75 ms, sys: 1.34 ms, total: 4.08 ms\n",
	"Wall time: 83.3 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"\n",
	"# it's also very fast.\n",
	"\n",
	"results = c.tag_best(text)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['omnis', 'homo', 'qui', 'se', 'studeo', 'praesto', 'ceterum', 'animal', 'summa', 'ops', 'nitor', 'decet', 'ne', 'vita', 'silentium', 'transeo', 'veluti', 'pecus', 'qui', 'natura', 'pronus', 'atque', 'venter', 'oboedientia', 'fingo', 'sed', 'noster', 'omnis', 'volo', 'in, indu']\n"
	]
	}
	],
	"source": [
	"print([r[5] for r in results][:30])"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "ling",
	"language": "python",
	"name": "ling"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}