Skip to content

Instantly share code, notes, and snippets.

@bnagy bnagy/collatinus.ipynb Secret
Created Sep 25, 2018

Embed
What would you like to do?
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Before any of this will work you need Collatinus installed,\n",
"# running, and with the Server mode activated. You can set your\n",
"# language in the app, and it will be respected by the server.\n",
"\n",
"import socket\n",
"\n",
"class Client:\n",
"    \"\"\"Minimal TCP client for the tagger of a running Collatinus server.\n",
"\n",
"    Every request opens a fresh connection, sends one command and reads\n",
"    back the reply.\n",
"    \"\"\"\n",
"\n",
"    # Number of bytes requested from the socket per recv() call.\n",
"    _RECV_SIZE = 4096\n",
"\n",
"    def __init__(self, host=\"127.0.0.1\", port=5555, timeout=1):\n",
"        \"\"\"Store the server address and build the accent-stripping table.\n",
"\n",
"        host, port -- where the Collatinus server is listening.\n",
"        timeout -- seconds allowed for each blocking socket operation.\n",
"        \"\"\"\n",
"        self._host = host\n",
"        self._port = port\n",
"        self._timeout = timeout\n",
"        # Ligatures become two plain letters, vowels carrying a grave, acute,\n",
"        # macron or breve become the bare vowel, and the matching combining\n",
"        # marks are deleted.\n",
"        mapping = {'æ': 'ae', 'Æ': 'Ae', 'œ': 'oe', 'Œ': 'Oe'}\n",
"        accented = {\n",
"            'a': 'àáāă', 'e': 'èéēĕ', 'i': 'ìíīĭ', 'o': 'òóōŏ', 'u': 'ùúūŭ',\n",
"            'A': 'ÀÁĀĂ', 'E': 'ÈÉĒĔ', 'I': 'ÌÍĪĬ', 'O': 'ÒÓŌŎ', 'U': 'ÙÚŪŬ',\n",
"        }\n",
"        for plain, variants in accented.items():\n",
"            for variant in variants:\n",
"                mapping[variant] = plain\n",
"        # Combining grave, acute, macron and breve.\n",
"        for mark in '\\u0300\\u0301\\u0304\\u0306':\n",
"            mapping[mark] = ''\n",
"        self._accent_trans = str.maketrans(mapping)\n",
"\n",
"    def _run_remote_command(self, cmd):\n",
"        \"\"\"Send cmd to the server and return its reply decoded as UTF-8.\n",
"\n",
"        No length prefix or terminator is used here, so the end of the\n",
"        reply is guessed: a chunk shorter than the buffer, a closed\n",
"        connection or a timeout all stop the read. Returns '' when nothing\n",
"        arrives before the timeout. Connection errors are not caught.\n",
"        \"\"\"\n",
"        chunks = []\n",
"        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:\n",
"            sock.settimeout(self._timeout)\n",
"            sock.connect((self._host, self._port))\n",
"            sock.sendall(cmd.encode('utf8'))\n",
"            while True:\n",
"                try:\n",
"                    chunk = sock.recv(self._RECV_SIZE)\n",
"                except socket.timeout:\n",
"                    # Expected when the reply length is an exact multiple of\n",
"                    # the buffer size: the extra recv() finds nothing and\n",
"                    # gives up after the timeout instead of blocking forever.\n",
"                    break\n",
"                chunks.append(chunk)\n",
"                # NOTE: a short chunk is taken as the end of the reply. TCP\n",
"                # does not guarantee that, so a reply delivered in pieces\n",
"                # could be cut short here.\n",
"                if len(chunk) < self._RECV_SIZE:\n",
"                    break\n",
"        # Decode once at the end so a multi-byte character split across two\n",
"        # chunks is still decoded correctly.\n",
"        return b''.join(chunks).decode('utf-8')\n",
"\n",
"    def _tag(self, option, latin_string, strip_accents):\n",
"        \"\"\"Run one tagger command and split the reply into rows of fields.\"\"\"\n",
"        out = self._run_remote_command('{} \"{}\"'.format(option, latin_string))\n",
"        if strip_accents:\n",
"            out = out.translate(self._accent_trans)\n",
"        # Skip empty lines so that an empty reply gives [] rather than a\n",
"        # bogus [''] row that breaks field indexing in the caller.\n",
"        return [line.split('\\t') for line in out.split('\\n') if line]\n",
"\n",
"    def tag_best(self, latin_string, strip_accents=True):\n",
"        \"\"\"Tag latin_string with the -P2 command (best guess only).\n",
"\n",
"        Returns a list of rows, each a list of tab-separated fields.\n",
"        With strip_accents the reply is passed through the accent table.\n",
"        \"\"\"\n",
"        return self._tag('-P2', latin_string, strip_accents)\n",
"\n",
"    def tag(self, latin_string, strip_accents=True):\n",
"        \"\"\"Tag latin_string with the -P3 command (best guess plus alternatives).\n",
"\n",
"        Returns a list of rows, each a list of tab-separated fields.\n",
"        With strip_accents the reply is passed through the accent table.\n",
"        \"\"\"\n",
"        return self._tag('-P3', latin_string, strip_accents)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['1',\n",
" '1',\n",
" '1',\n",
" 'cadam',\n",
" 'v1 ',\n",
" 'cado',\n",
" 'cado, is, ere, cecidi, casum',\n",
" '687',\n",
" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
" 'cadam future indicative active 1st singular']]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tag_best gives us just the best guess from the engine. For\n",
"# docs (such as they are) on the result format see the various\n",
"# files in https://github.com/biblissima/collatinus/\n",
"# Summary: each row starts with three index fields (word and phrase\n",
"# positions), then the word being tagged, a tag code, the lemma, its\n",
"# dictionary entry, the number of hits in the LASLA corpus (I think),\n",
"# the definition, and the morphology.\n",
"\n",
"c = Client()\n",
"c.tag_best(\"cadam\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['1',\n",
" '1',\n",
" '1',\n",
" 'cadam',\n",
" 'v1 ',\n",
" 'cado',\n",
" 'cado, is, ere, cecidi, casum',\n",
" '687',\n",
" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
" 'cadam future indicative active 1st singular'],\n",
" ['1',\n",
" '1',\n",
" '1',\n",
" 'cadam',\n",
" 'v1 ',\n",
" 'cado',\n",
" 'cado, is, ere, cecidi, casum',\n",
" '687',\n",
" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
" 'cadam future indicative active 1st singular (v1 : 0.901807)'],\n",
" ['1',\n",
" '1',\n",
" '1',\n",
" 'cadam',\n",
" 'v1 ',\n",
" 'cado',\n",
" 'cado, is, ere, cecidi, casum',\n",
" '687',\n",
" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
" 'cadam present subjunctive active 1st singular (v21 : 0.0981932)']]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tag gives us the best statistical match first, followed by all of\n",
"# the candidates with their scores. E.g. here the future indicative\n",
"# is much more likely than the present subjunctive, but the\n",
"# subjunctive is still listed as an alternative.\n",
"\n",
"c.tag(\"cadam\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['1', '1', '1', 'Sausages', '', 'unknown'],\n",
" ['2',\n",
" '2',\n",
" '1',\n",
" 'SaRcIna',\n",
" 'n11 (n61)',\n",
" 'sarcina',\n",
" 'sarcina, ae, f.',\n",
" '81',\n",
" \"pack, bundle, soldier's kit; baggage (pl.), belongings, chattels; load, burden;\",\n",
" 'sarcina nominative singular']]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The tagger is pretty robust, which is nice.\n",
"c.tag_best(\"Sausages! &%$SaRcIna?<:\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"text = \"\"\"Omnis homines, qui sese student praestare ceteris animalibus, \n",
"summa ope niti decet, ne vitam silentio transeant veluti pecora, \n",
"quae natura prona atque ventri oboedientia finxit. Sed nostra omnis \n",
"vis in animo et corpore sita est: animi imperio, corporis servitio \n",
"magis utimur; alterum nobis cum dis, alterum cum beluis commune est. \n",
"Quo mihi rectius videtur ingeni quam virium opibus gloriam quaerere \n",
"et, quoniam vita ipsa, qua fruimur, brevis est, memoriam nostri quam \n",
"maxume longam efficere. Nam divitiarum et formae gloria fluxa atque \n",
"fragilis est, virtus clara aeternaque habetur. Sed diu magnum inter \n",
"mortalis certamen fuit, vine corporis an virtute animi res militaris \n",
"magis procederet. Nam et, prius quam incipias, consulto et, ubi \n",
"consulueris, mature facto opus est. Ita utrumque per se indigens \n",
"alterum alterius auxilio eget.\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.75 ms, sys: 1.34 ms, total: 4.08 ms\n",
"Wall time: 83.3 ms\n"
]
}
],
"source": [
"%%time\n",
"\n",
"# it's also very fast.\n",
"\n",
"results = c.tag_best(text)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['omnis', 'homo', 'qui', 'se', 'studeo', 'praesto', 'ceterum', 'animal', 'summa', 'ops', 'nitor', 'decet', 'ne', 'vita', 'silentium', 'transeo', 'veluti', 'pecus', 'qui', 'natura', 'pronus', 'atque', 'venter', 'oboedientia', 'fingo', 'sed', 'noster', 'omnis', 'volo', 'in, indu']\n"
]
}
],
"source": [
"print([r[5] for r in results][:30])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ling",
"language": "python",
"name": "ling"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@kylepjohnson

This comment has been minimized.

Copy link

commented Sep 25, 2018

@bnagy FYI here's our Collatinus installation:

Do you see the online version offering benefits beyond what ours does? Eg, will it be kept up to date as ours might not?

@kylepjohnson

This comment has been minimized.

Copy link

commented Sep 25, 2018

If you think this belongs in the CLTK, I'd accept your PR

@bnagy

This comment has been minimized.

Copy link
Owner Author

commented Sep 26, 2018

@kylepjohnson thanks for the feedback! Right now this is talking to a local installation of Collatinus, so I needed to download and install the OSX version from their website. The Tagger is, afaict, quite new, so there probably is some benefit in at least regularly updating. I don't use their data or data structures directly as you do, I simply call their "API" and let their code do the work. I don't think this would be a great addition as it stands, because Collatinus is about a 700MB download. It might be possible to fork some of the code and rewrite, but then imho the pycollatinus team already know how to do this, so it should involve them. :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.