{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"from Importer.file_importer import FileImport\n",
"from Importer.cdli_corpus import CDLICorpus\n",
"from ATFConverter.tokenizer import Tokenizer\n",
"from ATFConverter.atf_converter import ATFConverter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Frequency analysis on the ARM1 corpus\n",
"These two cells show how to load and analyze a CDLI corpus file for most common words using the ARM1 corpus as an example. The stopwords can be customized to match the text as the researcher sees fit."
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"fi = FileImport('texts/ARM1Akkadian.txt')\n",
"fi.read_file()\n",
"cc = CDLICorpus()\n",
"cc.ingest_text_file(fi.file_lines)\n",
"tk = Tokenizer()\n",
"atf = ATFConverter()"
]
},
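{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the ingested corpus. This is a minimal sketch; it assumes only that `cc.texts` is a list of dicts whose `'transliteration'` entry holds the lines, exactly as it is used in the analysis cell below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: assumes cc.texts is a list of dicts with a\n",
"# 'transliteration' key, as used in the frequency-analysis cell below.\n",
"print(len(cc.texts), 'texts ingested')\n",
"# Peek at the first five transliterated lines of the first text.\n",
"print(cc.texts[0]['transliteration'][0][:5])"
]
},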
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('a-wi-lum', 133),\n",
" ('lu', 108),\n",
" ('be-el', 78),\n",
" ('_e2_', 76),\n",
" ('a-wi-lim', 65),\n",
" ('i-na-ad-di-in', 50),\n",
" ('šu-u₂', 45),\n",
" ('_a-ša3_', 44),\n",
" ('_ku₃-babbar_', 43),\n",
" ('ku₃-babbar_', 40)]"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stopwords = ['a-na', 'u3', 'sza', '[...]', 'i-na', '=',\n",
" 'ARM', '01,', 'lang', 'akk', 'um-ma', 'la',\n",
" 'u2-ul', 'mesz_', 'asz-szum', '0.1', 'broken',\n",
" 'isz-tu', '_lu2_', 'ki-a-am', '1(disz)', 'ki-ma',\n",
" 'x', 'sza-a-ti', 'the', '_lu2', '...]', 'lu-u2',\n",
" 'sza#', 'a-na#', '_u4', 'beginning', 'of', '2(disz)',\n",
" '[a-na', 'szum-ma', 'hi-a_', 'ana', 'a-di']\n",
"bag_of_words = []\n",
"for lines in [text['transliteration'][0] for text in cc.texts]:\n",
" for line in lines:\n",
" for word in tk.word_tokenizer(line):\n",
" if word[0] not in stopwords:\n",
" bag_of_words.append('-'.join(atf.process(word[0].split('-'))))\n",
"Counter(bag_of_words).most_common(10)"
]
},
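{
"cell_type": "markdown",
"metadata": {},
"source": [
"The raw counts above depend on the size of the corpus. A minimal sketch of turning them into relative frequencies, using only the `bag_of_words` list and `Counter` already built above (no additional library calls are assumed):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Express the top counts as a share of all counted (non-stopword) tokens.\n",
"total = len(bag_of_words)\n",
"freqs = Counter(bag_of_words)\n",
"[(word, round(count / total, 4)) for word, count in freqs.most_common(10)]"
]
},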
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Frequency analysis on the law code of H\n",
"These two cells show how to load and analyze a CDLI corpus file for most common words using the ARM1 corpus as an example. The stopwords can be customized to match the text as the researcher sees fit."
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"fi = FileImport('texts/Akkadian.txt')\n",
"fi.read_file()\n",
"cc = CDLICorpus()\n",
"cc.ingest_text_file(fi.file_lines)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('a-wi-lum', 133),\n",
" ('be-el', 78),\n",
" ('_e2_', 76),\n",
" ('a-wi-lim', 65),\n",
" ('i-na-ad-di-in', 50),\n",
" ('_a-ša3_', 44),\n",
" ('_ku₃-babbar_', 43),\n",
" ('ku₃-babbar_', 40),\n",
" ('šu-a-ti', 39),\n",
" ('_dumu-meš_', 36)]"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stopwords = ['a-na', 'szum-ma', 'i-na', 'u3', 'sza', 'la',\n",
" 'lu', 'u2-ul', 'szu-u2']\n",
"bag_of_words = []\n",
"for lines in [text['transliteration'][0] for text in cc.texts]:\n",
" for line in lines:\n",
" for word in tk.word_tokenizer(line):\n",
" if word[0] not in stopwords:\n",
" bag_of_words.append('-'.join(atf.process(word[0].split('-'))))\n",
"Counter(bag_of_words).most_common(10)"
]
},
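{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to customize the stopword list, as suggested above, is to start from the unfiltered counts: build the bag of words without any filtering, inspect the most common items, and move whatever the researcher judges to be function words or metadata tokens into `stopwords` before re-running the analysis cell. This sketch reuses `tk`, `atf`, and `cc` exactly as they are called above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Unfiltered counts: candidates for the stopword list sit at the top.\n",
"unfiltered = []\n",
"for lines in [text['transliteration'][0] for text in cc.texts]:\n",
"    for line in lines:\n",
"        for word in tk.word_tokenizer(line):\n",
"            unfiltered.append('-'.join(atf.process(word[0].split('-'))))\n",
"Counter(unfiltered).most_common(20)"
]
},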
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}