Skip to content

Instantly share code, notes, and snippets.

@bnagy
Created September 25, 2018 12:21
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bnagy/93edda849f46525ad4c67c96dd2d9b6a to your computer and use it in GitHub Desktop.
Save bnagy/93edda849f46525ad4c67c96dd2d9b6a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Before any of this will work you need Collatinus installed,\n",
"# running, and with the Server mode activated. You can set your\n",
"# language in the app, and it will be respected by the server.\n",
"\n",
"import socket\n",
"\n",
"class Client(object):\n",
" def __init__(self, host=\"127.0.0.1\", port=5555):\n",
" self._host = host\n",
" self._port = port\n",
" keys=['æ','Æ','œ','Œ','̀','́','à','á','è','é','ì','í','ò','ó','ù','ú','À','Á','È','É','Ì','Í','Ò','Ó','Ù','Ú',\n",
" '̄','̆','ā','ă','ē','ĕ','ī','ĭ','ō','ŏ','ū','ŭ','Ā','Ă','Ē','Ĕ','Ī','Ĭ','Ō','Ŏ','Ū','Ŭ']\n",
" vals = ['ae','Ae','oe','Oe','','','a','a','e','e','i','i','o','o','u','u','A','A','E','E','I','I','O','O','U','U',\n",
" '','','a','a','e','e','i','i','o','o','u','u','A','A','E','E','I','I','O','O','U','U']\n",
" d = dict(zip(keys,vals))\n",
" self._accent_trans = str.maketrans(d)\n",
"\n",
" def _run_remote_command(self, cmd):\n",
" with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:\n",
" sock.settimeout(1)\n",
" sock.connect((self._host, self._port))\n",
" sock.sendall((cmd).encode('utf8'))\n",
" chunks = []\n",
" while True:\n",
" try:\n",
" chunk = sock.recv(4096)\n",
" chunks.append(chunk)\n",
" except socket.timeout:\n",
" # the only time this should happen is if the server sends\n",
" # precisely 4096 bytes of data. We'd then try to recv again\n",
" # and block forever.\n",
" break\n",
" if len(chunk) < 4096:\n",
" break\n",
"\n",
" out = b''.join(chunks).decode('utf-8')\n",
" return out\n",
"\n",
" def tag_best(self, latin_string, strip_accents=True):\n",
" out = self._run_remote_command(('-P2 \"{}\"').format(latin_string))\n",
" if strip_accents:\n",
" out = out.translate(self._accent_trans)\n",
" return [l.split('\\t') for l in out.split('\\n')]\n",
"\n",
"\n",
" def tag(self, latin_string, strip_accents=True):\n",
" out = self._run_remote_command(('-P3 \"{}\"').format(latin_string))\n",
" if strip_accents:\n",
" out = out.translate(self._accent_trans)\n",
" return [l.split('\\t') for l in out.split('\\n')]\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['1',\n",
" '1',\n",
" '1',\n",
" 'cadam',\n",
" 'v1 ',\n",
" 'cado',\n",
" 'cado, is, ere, cecidi, casum',\n",
" '687',\n",
" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
" 'cadam future indicative active 1st singular']]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tag_best gives us just the best guess from the engine. For\n",
"# docs (such as they are) on the result format see the various\n",
"# files in https://github.com/biblissima/collatinus/\n",
"# Summary: the first numbers are word and phrase indices. We have\n",
"# the word being tagged, a lemma, the number of hits in the LASLA\n",
"# corpus (I think), and the definition and morphology.\n",
"\n",
"c = Client()\n",
"c.tag_best(\"cadam\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['1',\n",
" '1',\n",
" '1',\n",
" 'cadam',\n",
" 'v1 ',\n",
" 'cado',\n",
" 'cado, is, ere, cecidi, casum',\n",
" '687',\n",
" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
" 'cadam future indicative active 1st singular'],\n",
" ['1',\n",
" '1',\n",
" '1',\n",
" 'cadam',\n",
" 'v1 ',\n",
" 'cado',\n",
" 'cado, is, ere, cecidi, casum',\n",
" '687',\n",
" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
" 'cadam future indicative active 1st singular (v1 : 0.901807)'],\n",
" ['1',\n",
" '1',\n",
" '1',\n",
" 'cadam',\n",
" 'v1 ',\n",
" 'cado',\n",
" 'cado, is, ere, cecidi, casum',\n",
" '687',\n",
" 'to fall, sink, drop, plummet, topple; to be slain, die; to end, cease, abate; to decay;',\n",
" 'cadam present subjunctive active 1st singular (v21 : 0.0981932)']]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tag gives us the best statistical match first, along with all of \n",
"# the alternatives. Eg here future active is much more common than\n",
"# subjunctive active, but it's an alternative.\n",
"\n",
"c.tag(\"cadam\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['1', '1', '1', 'Sausages', '', 'unknown'],\n",
" ['2',\n",
" '2',\n",
" '1',\n",
" 'SaRcIna',\n",
" 'n11 (n61)',\n",
" 'sarcina',\n",
" 'sarcina, ae, f.',\n",
" '81',\n",
" \"pack, bundle, soldier's kit; baggage (pl.), belongings, chattels; load, burden;\",\n",
" 'sarcina nominative singular']]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The tagger is pretty robust, which is nice.\n",
"c.tag_best(\"Sausages! &%$SaRcIna?<:\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"text = \"\"\"Omnis homines, qui sese student praestare ceteris animalibus, \n",
"summa ope niti decet, ne vitam silentio transeant veluti pecora, \n",
"quae natura prona atque ventri oboedientia finxit. Sed nostra omnis \n",
"vis in animo et corpore sita est: animi imperio, corporis servitio \n",
"magis utimur; alterum nobis cum dis, alterum cum beluis commune est. \n",
"Quo mihi rectius videtur ingeni quam virium opibus gloriam quaerere \n",
"et, quoniam vita ipsa, qua fruimur, brevis est, memoriam nostri quam \n",
"maxume longam efficere. Nam divitiarum et formae gloria fluxa atque \n",
"fragilis est, virtus clara aeternaque habetur. Sed diu magnum inter \n",
"mortalis certamen fuit, vine corporis an virtute animi res militaris \n",
"magis procederet. Nam et, prius quam incipias, consulto et, ubi \n",
"consulueris, mature facto opus est. Ita utrumque per se indigens \n",
"alterum alterius auxilio eget.\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.75 ms, sys: 1.34 ms, total: 4.08 ms\n",
"Wall time: 83.3 ms\n"
]
}
],
"source": [
"%%time\n",
"\n",
"# it's also very fast.\n",
"\n",
"results = c.tag_best(text)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['omnis', 'homo', 'qui', 'se', 'studeo', 'praesto', 'ceterum', 'animal', 'summa', 'ops', 'nitor', 'decet', 'ne', 'vita', 'silentium', 'transeo', 'veluti', 'pecus', 'qui', 'natura', 'pronus', 'atque', 'venter', 'oboedientia', 'fingo', 'sed', 'noster', 'omnis', 'volo', 'in, indu']\n"
]
}
],
"source": [
"print([r[5] for r in results][:30])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ling",
"language": "python",
"name": "ling"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@kylepjohnson
Copy link

@bnagy FYI here's our Collatinus installation:

Do you see the online version offering benefits beyond what ours does? Eg, will it be kept up to date as ours might not?

@kylepjohnson
Copy link

If you think this belongs in the CLTK, I'd accept your PR

@bnagy
Copy link
Author

bnagy commented Sep 26, 2018

@kylepjohnson thanks for the feedback! Right now this is talking to a local installation of Collatinus, so I needed to download and install the OSX version from their website. The Tagger is, afaict, quite new, so there probably is some benefit in at least regularly updating. I don't use their data or data structures directly as you do; I simply call their "API" and let their code do the work. I don't think this would be a great addition as it stands, because Collatinus is about a 700MB download. It might be possible to fork some of the code and rewrite, but then imho the pycollatinus team already know how to do this, so it should involve them. :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment