{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"from Importer.file_importer import FileImport\n",
"from Importer.cdli_corpus import CDLICorpus\n",
"from ATFConverter.tokenizer import Tokenizer\n",
"from ATFConverter.atf_converter import ATFConverter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Frequency analysis on the ARM1 corpus\n",
"These two cells show how to load and analyze a CDLI corpus file for most common words using the ARM1 corpus as an example. The stopwords can be customized to match the text as the researcher sees fit."
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"fi = FileImport('texts/ARM1Akkadian.txt')\n",
"fi.read_file()\n",
"cc = CDLICorpus()\n",
"cc.ingest_text_file(fi.file_lines)\n",
"tk = Tokenizer()\n",
"atf = ATFConverter()"
]
},
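{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the ingested corpus. This is a minimal sketch; it assumes only that `cc.texts` is a list of dicts whose `'transliteration'` entry holds the lines, exactly as it is used in the analysis cell below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: assumes cc.texts is a list of dicts with a\n",
"# 'transliteration' key, as used in the frequency-analysis cell below.\n",
"print(len(cc.texts), 'texts ingested')\n",
"# Peek at the first five transliterated lines of the first text.\n",
"print(cc.texts[0]['transliteration'][0][:5])"
]
},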
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('a-wi-lum', 133),\n",
" ('lu', 108),\n",
" ('be-el', 78),\n",
" ('_e2_', 76),\n",
" ('a-wi-lim', 65),\n",
" ('i-na-ad-di-in', 50),\n",
" ('šu-u₂', 45),\n",
" ('_a-ša3_', 44),\n",
" ('_ku₃-babbar_', 43),\n",
" ('ku₃-babbar_', 40)]"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stopwords = ['a-na', 'u3', 'sza', '[...]', 'i-na', '=',\n",
" 'ARM', '01,', 'lang', 'akk', 'um-ma', 'la',\n",
" 'u2-ul', 'mesz_', 'asz-szum', '0.1', 'broken',\n",
" 'isz-tu', '_lu2_', 'ki-a-am', '1(disz)', 'ki-ma',\n",
" 'x', 'sza-a-ti', 'the', '_lu2', '...]', 'lu-u2',\n",
" 'sza#', 'a-na#', '_u4', 'beginning', 'of', '2(disz)',\n",
" '[a-na', 'szum-ma', 'hi-a_', 'ana', 'a-di']\n",
"bag_of_words = []\n",
"for lines in [text['transliteration'][0] for text in cc.texts]:\n",
" for line in lines:\n",
" for word in tk.word_tokenizer(line):\n",
" if word[0] not in stopwords:\n",
" bag_of_words.append('-'.join(atf.process(word[0].split('-'))))\n",
"Counter(bag_of_words).most_common(10)"
]
},
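{
"cell_type": "markdown",
"metadata": {},
"source": [
"The raw counts above depend on the size of the corpus. A minimal sketch of turning them into relative frequencies, using only the `bag_of_words` list and `Counter` already built above (no additional library calls are assumed):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Express the top counts as a share of all counted (non-stopword) tokens.\n",
"total = len(bag_of_words)\n",
"freqs = Counter(bag_of_words)\n",
"[(word, round(count / total, 4)) for word, count in freqs.most_common(10)]"
]
},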
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Frequency analysis on the law code of H\n",
"These two cells show how to load and analyze a CDLI corpus file for most common words using the ARM1 corpus as an example. The stopwords can be customized to match the text as the researcher sees fit."
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"fi = FileImport('texts/Akkadian.txt')\n",
"fi.read_file()\n",
"cc = CDLICorpus()\n",
"cc.ingest_text_file(fi.file_lines)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('a-wi-lum', 133),\n",
" ('be-el', 78),\n",
" ('_e2_', 76),\n",
" ('a-wi-lim', 65),\n",
" ('i-na-ad-di-in', 50),\n",
" ('_a-ša3_', 44),\n",
" ('_ku₃-babbar_', 43),\n",
" ('ku₃-babbar_', 40),\n",
" ('šu-a-ti', 39),\n",
" ('_dumu-meš_', 36)]"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stopwords = ['a-na', 'szum-ma', 'i-na', 'u3', 'sza', 'la',\n",
" 'lu', 'u2-ul', 'szu-u2']\n",
"bag_of_words = []\n",
"for lines in [text['transliteration'][0] for text in cc.texts]:\n",
" for line in lines:\n",
" for word in tk.word_tokenizer(line):\n",
" if word[0] not in stopwords:\n",
" bag_of_words.append('-'.join(atf.process(word[0].split('-'))))\n",
"Counter(bag_of_words).most_common(10)"
]
},
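{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to customize the stopword list, as suggested above, is to start from the unfiltered counts: build the bag of words without any filtering, inspect the most common items, and move whatever the researcher judges to be function words or metadata tokens into `stopwords` before re-running the analysis cell. This sketch reuses `tk`, `atf`, and `cc` exactly as they are called above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Unfiltered counts: candidates for the stopword list sit at the top.\n",
"unfiltered = []\n",
"for lines in [text['transliteration'][0] for text in cc.texts]:\n",
"    for line in lines:\n",
"        for word in tk.word_tokenizer(line):\n",
"            unfiltered.append('-'.join(atf.process(word[0].split('-'))))\n",
"Counter(unfiltered).most_common(20)"
]
},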
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}