Skip to content

Instantly share code, notes, and snippets.

@bnagy
Created June 26, 2018 04:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bnagy/aa6d5310cd206c8601d2513e7752a224 to your computer and use it in GitHub Desktop.
Save bnagy/aa6d5310cd206c8601d2513e7752a224 to your computer and use it in GitHub Desktop.
Teaching a computer to teach me how to scan latin poetry
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# In this notebook, we attempt to scan Catullus 33, a moderately filthy hendecasyllable.\n",
"text = \"\"\"O FVRVM optime balneariorum\n",
"Vibenni pater et cinaede fili\n",
"(nam dextra pater inquinatiore,\n",
"culo filius est uoraciore),\n",
"cur non exilium malasque in oras\n",
"itis? quandoquidem patris rapinae\n",
"notae sunt populo, et natis pilosas,\n",
"fili, non potes asse uenditare.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'O FVRVM optime balneariorum\\nVibenni pater et cinaede fili\\n(nam dextra pater inquinatiore,\\nculo filius est uoraciore),\\ncur non exilium malasque in oras\\nitis? quandoquidem patris rapinae\\nnotae sunt populo, et natis pilosas,\\nfili, non potes asse uenditare.'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# raw text with \\n (linebreaks), punctuation and garbage.\n",
"text"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# a text translator that will replace anything in the string.punctuation list with None\n",
"import string\n",
"t = text.maketrans('','',string.punctuation)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'o fvrvm optime balneariorum\\nvibenni pater et cinaede fili\\nnam dextra pater inquinatiore\\nculo filius est uoraciore\\ncur non exilium malasque in oras\\nitis quandoquidem patris rapinae\\nnotae sunt populo et natis pilosas\\nfili non potes asse uenditare'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean = text.translate(t).lower()\n",
"clean"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['o fvrvm optime balneariorum',\n",
" 'vibenni pater et cinaede fili',\n",
" 'nam dextra pater inquinatiore',\n",
" 'culo filius est uoraciore',\n",
" 'cur non exilium malasque in oras',\n",
" 'itis quandoquidem patris rapinae',\n",
" 'notae sunt populo et natis pilosas',\n",
" 'fili non potes asse uenditare']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_lines = clean.splitlines()\n",
"clean_lines"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# The scansion modules work better on text with macrons. In fact, we'll see that \n",
"# the macronisation is fairly important if we want a good scan.\n",
"\n",
"from cltk.prosody.latin.macronizer import Macronizer\n",
"\n",
"macronizer = Macronizer('tag_ngram_123_backoff') # following the CLTK docs"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ō fvrvm optimē balneariorum',\n",
" 'vibenni pater et cinaede fili',\n",
" 'nam dextrā pater inquinatiore',\n",
" 'culo fīlius est uoraciore',\n",
" 'cūr nōn exilium malasque in ōrās',\n",
" 'itis quandoquidem patris rāpīnae',\n",
" 'notae sunt populō et nātīs pilosas',\n",
" 'fili nōn potes asse uenditare']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m_lines = [macronizer.macronize_text(l) for l in clean_lines]\n",
"m_lines"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# This is a Hendecasyllable, so we select the appropriate scanner.\n",
"\n",
"from cltk.prosody.latin.HendecasyllableScanner import HendecasyllableScanner"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"hen_scan = HendecasyllableScanner()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"- - - - - U- U- U \n",
"ō fvrvm optimē balneariorum\n"
]
}
],
"source": [
"v = hen_scan.scan(m_lines[0])\n",
"print(v.scansion)\n",
"print(v.original)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Ew. See the gap in the scansion? It had no idea what to do with fvrvm, we need to normalize first"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from cltk.stem.latin.j_v import JVReplacer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"j = JVReplacer() # convert 'jam' to 'iam' etc"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"clean_lines = [j.replace(l) for l in clean_lines]\n",
"m_lines = [macronizer.macronize_text(l) for l in clean_lines] # macronize again"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"- U - U - - U- U- U \n",
"ō furum optimē balneariorum\n"
]
}
],
"source": [
"v = hen_scan.scan(m_lines[0])\n",
"print(v.scansion)\n",
"print(v.original)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"uibenni pater et cinaede fili\n"
]
},
{
"data": {
"text/plain": [
"Verse(original='uibenni pater et cinaede fili', scansion='', meter='hendecasyllable', valid=False, syllable_count=12, accented='', scansion_notes=['Invalid hendecasyllables; more than eleven syllables detected'], syllables = ['u', 'i', 'bēn', 'ni', 'pa', 'ter', 'ēt', 'ci', 'nae', 'de', 'fi', 'li'])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hooray - but it's still not right. Let's move on to line two.\n",
"v = hen_scan.scan(m_lines[1])\n",
"print(v.scansion)\n",
"print(v.original)\n",
"v"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Our proper noun Vibenni has turned into a four syllable word :( \n",
"# This means the Hendecasyllable scanner won't have any truck with it."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"clean_lines[1] = 'vibenni pater et cinaede fili'\n",
"m_lines = [macronizer.macronize_text(l) for l in clean_lines] # macronize yet again"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" U - - U U - U - U - U\n",
"vibenni pater et cinaede fili\n"
]
},
{
"data": {
"text/plain": [
"Verse(original='vibenni pater et cinaede fili', scansion=' U - - U U - U - U - U', meter='hendecasyllable', valid=True, syllable_count=11, accented='vibēnnī pater ēt cinaede fīli', scansion_notes=['Corrected invalid start.'], syllables = ['vi', 'bēn', 'ni', 'pa', 'ter', 'ēt', 'ci', 'nae', 'de', 'fi', 'li'])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now at least it scans!\n",
"v = hen_scan.scan(m_lines[1])\n",
"print(v.scansion)\n",
"print(v.original)\n",
"v"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0] 123: ō furum optimē balneariorum\n",
"[0] TNT: ō fūrum optimē balneāriōrum\n",
"[0] CRF: ō fūrum optimē balneāriōrum\n",
"[1] 123: vibenni pater et cinaede fili\n",
"[1] TNT: vibenni pater et cinaede fīlī\n",
"[1] CRF: vibenni pater et cinaede fīlī\n",
"[2] 123: nam dextrā pater inquinatiore\n",
"[2] TNT: nam dextrā pater inquinātiōre\n",
"[2] CRF: nam dextra pater inquinātiōre\n",
"[3] 123: culo fīlius est uoraciore\n",
"[3] TNT: cūlō fīlius est vorāciōre\n",
"[3] CRF: cūlō fīlius est vorāciōre\n",
"[4] 123: cūr nōn exilium malasque in ōrās\n",
"[4] TNT: cūr nōn exilium malasque in ōrās\n",
"[4] CRF: cūr nōn exilium malasque in ōrās\n",
"[5] 123: itis quandoquidem patris rāpīnae\n",
"[5] TNT: ītis quandōquidem patris rāpīnae\n",
"[5] CRF: itis quandōquidem patris rāpīnae\n",
"[6] 123: notae sunt populō et nātīs pilosas\n",
"[6] TNT: nōtae sunt populō et nātīs pilōsās\n",
"[6] CRF: nōtae sunt populō et natīs pilōsās\n",
"[7] 123: fili nōn potes asse uenditare\n",
"[7] TNT: fīlī nōn potes asse vēnditāre\n",
"[7] CRF: fīlī nōn potes asse vēnditāre\n"
]
}
],
"source": [
"# That line is still not quite right, though, probably because fili should be macronized as fīlī.\n",
"# Let's test out the other Macronizer models\n",
"m123 = Macronizer('tag_ngram_123_backoff')\n",
"mtnt = Macronizer('tag_tnt')\n",
"mcrf = Macronizer('tag_crf')\n",
"for i, l in enumerate(clean_lines):\n",
" print('[%d] 123: ' % i, m123.macronize_text(l))\n",
" print('[%d] TNT: ' % i, mtnt.macronize_text(l))\n",
" print('[%d] CRF: ' % i, mcrf.macronize_text(l))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# TNT and CRF are both better on this text. We choose TNT, narrowly, because\n",
"# we like ītis to start long in line 5 (I think it should actually be ītīs, tbh)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"- - - U - - U- U- U \n",
"ō fūrum optimē balneāriōrum\n",
" U - - U U - U - U - -\n",
"vibenni pater et cinaede fīlī\n",
" - - - U U - U - U- U\n",
"nam dextrā pater inquinātiōre\n",
" - - - UU - U - U- U\n",
"cūlō fīlius est vorāciōre\n",
" - - - U U- U - U - - \n",
"cūr nōn exilium malasque in ōrās\n",
"- - - - U - U - U - -\n",
"ītis quandōquidem patris rāpīnae\n",
" - - - U U - U - U - - \n",
"nōtae sunt populō et nātīs pilōsās\n",
" - - - U U - U - U - U\n",
"fīlī nōn potes asse vēnditāre\n"
]
}
],
"source": [
"# One more try:\n",
"\n",
"for l in clean_lines:\n",
" v = hen_scan.scan(mtnt.macronize_text(l))\n",
" print(v.scansion)\n",
" print(v.original)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Now we finally have something that is close to a scansion produced by a human!\n",
"# For comparison: http://rudy.negenborn.net/catullus/text2/sc33.htm"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment