Skip to content

Instantly share code, notes, and snippets.

@willismonroe
Last active June 3, 2018 05:44
Show Gist options
  • Save willismonroe/e3dbc9ba0ee834befae82fb641535783 to your computer and use it in GitHub Desktop.
Save willismonroe/e3dbc9ba0ee834befae82fb641535783 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import json, pprint
class ORACC_text_reader:
    """Read a single ORACC corpus text (JSON) and render it as
    transliteration, normalization, sense glosses, or Unicode cuneiform.

    Parameters
    ----------
    json_string : str
        Raw JSON document for one ORACC text.
    DEBUG : bool
        When True, print diagnostics for nodes that cannot be parsed.
    """

    def __init__(self, json_string, DEBUG=False):
        self.DEBUG = DEBUG
        self.data = json.loads(json_string)
        # Initialize to an empty token list so the output_* methods return
        # gracefully (instead of raising AttributeError) on documents where
        # the expected nesting is absent.
        self.text = []
        try:
            # The token stream lives two 'cdl' levels below the first
            # top-level 'cdl' child that itself carries a 'cdl' key.
            # NOTE(review): if several children carry 'cdl', the last one
            # wins — preserved from the original; confirm this is intended.
            for node in self.data['cdl'][0]['cdl']:
                if 'cdl' in node:
                    self.text = node['cdl'][0]['cdl']
        except (KeyError, IndexError, TypeError):
            # Narrowed from a bare ``except:``; dump the document to help
            # diagnose unexpected shapes.
            if self.DEBUG:
                pprint.pprint(self.data)

    def _render(self, token_of, with_line_headers):
        """Walk ``self.text`` once, building one output string per line.

        ``token_of(node)`` maps an 'l' (lemma) node to its textual token, or
        returns None to skip the node.  A 'd' node carrying a 'label' closes
        the current line and starts a new one.
        """
        output = []
        # 'o' marks the obverse before the first explicit line label
        # appears — presumably ORACC convention; TODO confirm.
        line = 'o' if with_line_headers else ''
        for node in self.text:
            if node['node'] == 'd' and 'label' in node:
                output.append(line)
                line = node['label'] if with_line_headers else ''
            elif node['node'] == 'l':
                token = token_of(node)
                if token is not None:
                    line += ' ' + token
        output.append(line)
        return output

    def output_translit(self, with_line_headers=True):
        """Return the text as a list of transliterated lines."""
        return self._render(lambda node: node['frag'], with_line_headers)

    def output_norm(self, with_line_headers=True):
        """Return normalized words per line, falling back to the raw form."""
        def token(node):
            f = node['f']
            return f['norm'] if 'norm' in f else f['form']
        return self._render(token, with_line_headers)

    def output_sense(self, with_line_headers=True):
        """Return sense glosses per line, falling back to the raw form."""
        def token(node):
            f = node['f']
            return f['sense'] if 'sense' in f else f['form']
        return self._render(token, with_line_headers)

    def _cuneiform_sign(self, node):
        """Assemble the UTF-8 cuneiform for one lemma node.

        Returns the concatenated sign string, or None when the node's
        'gdl' list is empty and nothing can be rendered.
        """
        translit = node['frag']
        gdl = node['f']['gdl']
        if len(gdl) == 1:
            # Single grapheme: either a 'group' of sub-elements or one sign.
            sign = ''
            if 'group' in gdl[0]:
                for el in gdl[0]['group']:
                    if 'gdl_utf8' in el:
                        sign += el['gdl_utf8']
                    else:
                        sign += el['seq'][0]['gdl_utf8']
            else:
                sign = gdl[0]['gdl_utf8']
            if self.DEBUG:
                print("Single gdl ✓ {} = {}".format(translit, sign))
            return sign
        if len(gdl) > 1:
            sign = ''
            for el in gdl:
                if 'gdl_utf8' in el:
                    sign += el['gdl_utf8']
                elif 'seq' in el:
                    for seq in el['seq']:
                        sign += seq['gdl_utf8']
                elif 'group' in el:
                    for el2 in el['group']:
                        sign += el2['gdl_utf8']
                else:
                    # Previously pprint ran unconditionally while the print
                    # was DEBUG-gated; both are now gated for consistency.
                    if self.DEBUG:
                        print("Error with multi-gdl ✗ {} = {}".format(translit, sign))
                        pprint.pprint(gdl)
            if self.DEBUG:
                print("Multi-gdl ✓ {} = {}".format(translit, sign))
            return sign
        # Empty 'gdl': nothing we can render for this node.
        if self.DEBUG:
            print("Can't process ✗")
            pprint.pprint(node)
            print()
        return None

    def output_cuneiform(self, with_line_headers=True):
        """Return the text rendered as Unicode cuneiform signs per line."""
        return self._render(self._cuneiform_sign, with_line_headers)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import pprint\n",
"import collections\n",
"import tabulate"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from oracc_reader import ORACC_text_reader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"big_text = ''\n",
"for filename in os.listdir('corpusjson/'):\n",
" if filename.endswith(\".json\"):\n",
" oracc_reader = ORACC_text_reader(open('corpusjson/' + filename).read(), DEBUG=True)\n",
" output = oracc_reader.output_norm(with_line_headers=False)\n",
" big_text += ''.join(output)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"words_follow_moon = []\n",
"big_text_words = big_text.split()\n",
"for i, w in enumerate(big_text_words):\n",
" if w == \"Sin\":\n",
" words_follow_moon.append(' '.join(big_text_words[i+1:i+3]))\n",
"\n",
"words_follow_sun = []\n",
"for i, w in enumerate(big_text_words):\n",
" if w == \"Šamaš\":\n",
" words_follow_sun.append(' '.join(big_text_words[i+1:i+3]))\n",
" \n",
"words_follow_planets = []\n",
"for i, w in enumerate(big_text_words):\n",
" if w in [\"Sagmegar\", \"Dilbat\", \"Kayyamanu\", \"Šihṭu\", \"Ṣalbatanu\"]:\n",
" words_follow_planets.append(' '.join(big_text_words[i+1:i+3]))\n",
" \n",
"c_moon = collections.Counter(words_follow_moon)\n",
"c_sun = collections.Counter(words_follow_sun)\n",
"c_planets = collections.Counter(words_follow_planets)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Most common words following the moon:\n",
"Word Count\n",
"-------------- -------\n",
"u Šamaš 207\n",
"ina tāmartīšu 105\n",
"tarbāṣu ilmīma 84\n",
"ūm 01-KAM₂ 44\n",
"itti Šamaš 19\n",
"ina Simani 18\n",
"...\n",
"Total: 933\n",
"\n",
"Most common words following the sun:\n",
"Word Count\n",
"------------ -------\n",
"itti ahāmeš 89\n",
"šutātû šar 50\n",
"šitqulū māti 22\n",
"innammarma 1 14\n",
"lā uqqīma 13\n",
"šitqulū atmû 10\n",
"...\n",
"Total: 422\n",
"\n",
"Most common words following any of the planets:\n",
"Word Count\n",
"-------------- -------\n",
"ina libbi 26\n",
"ina libbīšu 21\n",
"x x 20\n",
"ina harrāni 10\n",
"ina erēb-šamši 8\n",
"ina Nisanni 7\n",
"...\n",
"Total: 315\n"
]
}
],
"source": [
"print(\"Most common words following the moon:\")\n",
"print(tabulate.tabulate(c_moon.most_common()[:6] +\n",
" [('...','')] + [(\"Total:\", str(len(words_follow_moon)))],\n",
" headers=[\"Word\", \"Count\"]))\n",
"print()\n",
"print(\"Most common words following the sun:\")\n",
"print(tabulate.tabulate(c_sun.most_common()[:6] +\n",
" [('...','')] + [(\"Total:\", str(len(words_follow_sun)))],\n",
" headers=[\"Word\", \"Count\"]))\n",
"print()\n",
"print(\"Most common words following any of the planets:\")\n",
"print(tabulate.tabulate(c_planets.most_common()[:6] +\n",
" [('...','')] + [(\"Total:\", str(len(words_follow_planets)))],\n",
" headers=[\"Word\", \"Count\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@willismonroe
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment