Skip to content

Instantly share code, notes, and snippets.

@willismonroe
Created January 22, 2019 20:19
Show Gist options
  • Save willismonroe/539fcbcab36d3428817d1fafde509cb8 to your computer and use it in GitHub Desktop.
Save willismonroe/539fcbcab36d3428817d1fafde509cb8 to your computer and use it in GitHub Desktop.
M388 suffixes and prefixes.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import re\nfrom nltk.util import bigrams\nfrom collections import Counter\nfrom beautifultable import BeautifulTable",
"execution_count": 99,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "with open('protoElamTranslits20111120.txt') as f:\n data = f.read().split('\\n')",
"execution_count": 100,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "data[:15]",
"execution_count": 101,
"outputs": [
{
"data": {
"text/plain": "['&P009331 = MDP 26S, 5233',\n '@obverse',\n '@column 1',\n '1. M010# , 2(N14)',\n '@reverse',\n '$ broken',\n '',\n '&P009342 = MDP 31, 003',\n '@obverse',\n '@column 1',\n '1. [...] ,',\n '2. x , 8(N14) 4(N01)',\n '3. M096~d , 4(N01) 2(N39B) 1(N24)#',\n '4. x , 5(N01)#',\n '@reverse']"
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "line_re = re.compile(r\"\\d+\\.\")\ndamage_re = re.compile(r\"( x |\\[\\.\\.\\.\\])\")",
"execution_count": 102,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# grab lines of text\nraw_lines = [line for line in data if line_re.match(line)]\n# reject lines with damage\ncomplete_lines = [line for line in raw_lines if not damage_re.search(line)]\n# remove line numerals\nlines = [line.split(' ', maxsplit=1)[1] for line in complete_lines if len(line.split(' ', maxsplit=1)) > 1]",
"execution_count": 103,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "lines[:5]",
"execution_count": 104,
"outputs": [
{
"data": {
"text/plain": "['M010# , 2(N14)',\n 'M096~d , 4(N01) 2(N39B) 1(N24)#',\n 'M388 , 9(N01)',\n 'M046 M254~b# , 1(N01)',\n 'M417~h , 3(N01)#']"
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# make one big line of signs\ncorpus = ' '.join(lines)\n# tokenize the line by spaces (naive)\nwords = corpus.split(' ')",
"execution_count": 105,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "m388_re = re.compile(r\"M388\")\nm388_suffixes = Counter([word[1] for word in bigrams(words) if m388_re.search(word[0])])\nm388_prefixes = Counter([word[0] for word in bigrams(words) if m388_re.search(word[1])])",
"execution_count": 106,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "t_m388 = BeautifulTable()\nt_m388.column_headers = [\"suffix\", \"count\", \"prefix\", \"count\"]\nfor row in zip(m388_suffixes.most_common(5), m388_prefixes.most_common(5)): \n t_m388.append_row([item for duo in row for item in duo])\nt_m388.insert_column(2, ' ', [' ']*5)\nprint(t_m388)",
"execution_count": 107,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "+--------+-------+---+--------+-------+\n| suffix | count | | prefix | count |\n+--------+-------+---+--------+-------+\n| , | 93 | | 1(N01) | 35 |\n+--------+-------+---+--------+-------+\n| M218 | 13 | | , | 19 |\n+--------+-------+---+--------+-------+\n| M066 | 11 | | M305 | 12 |\n+--------+-------+---+--------+-------+\n| M387 | 7 | | 2(N01) | 10 |\n+--------+-------+---+--------+-------+\n| M347 | 7 | | M054 | 10 |\n+--------+-------+---+--------+-------+\n"
}
]
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.7",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "M388 suffixes and prefixes.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment