Skip to content

Instantly share code, notes, and snippets.

@D-K-E
Created April 27, 2017 21:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save D-K-E/97f3eef627f27190606517ae07ac5e82 to your computer and use it in GitHub Desktop.
Save D-K-E/97f3eef627f27190606517ae07ac5e82 to your computer and use it in GitHub Desktop.
cAtfParser.py
# Packages -------------------
__author__ = "Doğu Kaan Eraslan, <kaaneraslan@gmail.com>"
import re
import itertools
# ---------------------------------
"""
Objects and their relations to each other.
Text
Line
Word
Sign
Al Occurance
Part
Text CONTAINS Line, Word, Sign, AL Occurance, Part
Part CONTAINS Line, Word, Sign, AL Occurance.
Line CONTAINS Word, Sign, CAN CONTAIN AL Occurance.
AL Occurance CONTAINS Word, Sign CAN CONTAIN Line, Part ?
Word CONTAINS Sign
Text Attributes:
Id,
Language,
objectType,
parts
lines
words,
signs,
al_occurances
Part Attributes:
Name/Id,
lines,
words,
signs,
al_occurances
Line attributes:
Comment_Line
Structure_line
Text_line
words,
signs,
AL_Occurance attributes:
Language,
Words,
Signs,
lines,
parts, ?
Word attributes:
Signs
Sign attributes:
Damaged
Unkown reading
API use cases:
find sign X, determines if the X is inside the text, gives the first
user specified level location, first word, first line, first part, etc.
findall sign X, determines if the X is inside the text, gives all the user specified level locations, word level, line level, part level etc.
Findall or find verbose check, gives the full feature dictionary with the wanted occurance.
Find X with attribute Y, determines if there is an x with attribute y, then gives the first user specified level location.
Findall X with attribute Y, same thing with find X @Y but with all the occurances.
Find if there is anything with the attribute Y.
Findall all those with the attribute Y.
Give Word Count
Give Sign Count
Give Part Count
"""
# random test text -----------------
with open("Archival view of P462811.txt","r",encoding="utf-8") as cAtfFile:
test_file = cAtfFile.read()
# ---------------------------------
# Block level Functions -----------
# Object part Functions ---------------
def get_object_parts(atf_section):
"""
params: atf_section, str.
return: object_part_list, []
"""
#
object_part_split = atf_section.split("\n@")
object_part_id_part = object_part_split[0]
object_part_parts = object_part_split[1:]
object_parts_at = ["@" + part for part in object_part_parts]
object_parts_at.insert(0,object_part_id_part)
#
return object_parts_at
def char_convert(text):
"""
Convert CDLI C-ATF characters
to unicode
"""
#
text_sz = text.replace("sz","\u0161") # sz -> š
text_SZ = text_sz.replace("SZ", "\u0160") # SZ -> Š
text_sPo = text_SZ.replace("s,", "\u1e63") # s, -> ṣ
text_SPo = text_sPo.replace("S,", "\u1e62") # S, -> Ṣ
text_tch = text_SPo.replace("t,", "\u1e6d") # t, -> ṭ
text_TCH = text_tch.replace("T,", "\u1e6c") # T, -> Ṭ
text_s = text_TCH.replace("s'", "\u015b") # s' -> ś
text_S = text_s.replace("S'","\u015a") # S' -> Ś
text_ayn = text_S.replace("'", "\u02be") # ' -> ʾ
text_sub0 = text_ayn.replace("0","\u2080")# Subscript numbers
text_sub1 = text_sub0.replace("1","\u2081")
text_sub2 = text_sub1.replace("2","\u2082")
text_sub3 = text_sub2.replace("3","\u2083")
text_sub4 = text_sub3.replace("4","\u2084")
text_sub5 = text_sub4.replace("5","\u2085")
text_sub6 = text_sub5.replace("6","\u2086")
text_sub7 = text_sub6.replace("7","\u2087")
text_sub8 = text_sub7.replace("8","\u2088")
text_sub9 = text_sub8.replace("9","\u2089")
text_subx = text_sub9.replace("x²","\u208a") # subscript x
text_subX = text_subx.replace("X²","\u208a")
#
return text_subX
def get_words(atf_line):
"""
params: atf_line, str.
return: line_words
"""
#
line_words = re.findall(" .*? ", atf_line)
#
return line_words
def get_signs(atf_word):
"""
params: atf_word, str.
return: word_signs
"""
#
if "}" in atf_word:
atf_word = atf_word.replace("}","}-")
#
#
word_signs = atf_word.split("-")
#
return word_signs
# Line Level Tests ------------------------
class cAtfLineTester(object):
"""
a class for testing lines of c-atf texts
"""
def __init__(self, atf_line):
#
self.cAtf_line = atf_line
#
#
def test_id_line(self):
"""
params: atf_line, str.
return: boolean
Tests if the line starts with &, the id marker.
"""
#
find_id_line = re.search("(^&P\d+)", self.cAtf_line)
#
if find_id_line is None:
return False
else:
return True
#
#
def test_language_line(self):
"""
params: atf_line, str.
return: boolean
tests if the line gives the language of the text
"""
#
find_lang_line = re.search("atf: lang", self.cAtf_line)
#
if find_lang_line is None:
return False
else:
return True
#
#
def test_line_content(self):
"""
params: atf_line, str.
return: boolean
Tests if the line is commentary about the line content
"""
#
find_line_content_comment = re.search("^#.*", self.cAtf_line)
#
if find_line_content_comment is None:
return False
else:
return True
#
#
def test_object_type_object_part(self):
"""
params: atf_line, str.
return: boolean
Tests if the line indicates the object type or object part
"""
#
find_object_type_part = re.search("^@.*", self.cAtf_line)
#
if find_object_type_part is None:
return False
else:
return True
#
#
def test_text_structure(self):
"""
params: atf_line, str.
return: boolean
Tests if the line belongs to a commentary on the text structure
"""
#
find_text_structure = re.search("^\$.*", self.cAtf_line)
#
if find_text_structure is None:
return False
else:
return True
#
#
def test_text_line(self):
"""
params: atf_line, str.
return: boolean
Tests if the line belongs to a translitteration of a text
"""
#
find_text_line = re.search("^\d.*", self.cAtf_line)
#
if find_text_line is None:
return False
else:
return True
#
#
def test_lineHas_anotherLanguage(self):
"""
params: atf_line, str.
return: boolean
"""
#
find_logogram = re.search("_.*?_", self.cAtf_line)
#
if find_logogram is None:
return False
else:
return True
class cAtfALTester(object):
"""
class for handling another languages in the lines
"""
#
def __init__(self):
#
self.cAtf_line = ""
self.al_oc = ""
self.al_word = ""
#
#
#
def testALHasPreSign(self, al_oc):
"""
params: atf_line, str.
return: boolean
Test to see if the another language occurence
has a preeceding sign
"""
#
if "-_" in self.cAtf_line or "-_" in al_oc:
return True
else:
return False
#
#
def test_ALHasFolSign(self, al_oc):
"""
params: atf_line, str.
return: boolean
Test to see if the another language occurence
has a preeceding sign
"""
#
if "_-" in self.cAtf_line or "_-" in al_oc:
return True
else:
return False
#
#
def test_ALSwitch(self, cAtf_alWord):
"""
params: atf_logogram, str.
return: boolean
Tests if the occurence have a
language switch
"""
#
if "%" in cAtf_alWord:
return True
else:
return False
def test_wordHasAnotherLanguage(atf_word):
"""
params: atf_word, str.
return: boolean
"""
#
find_logogram = re.search("_.*?_", atf_word)
#
if find_logogram is None:
return False
else:
return True
"""
Logogram bölgesinden
dil değiştiriciyi al
ondan sonra onun içindeki
işaretleri aldığın dilde
kodla
"""
"""
If logogram has more than one space
divide from the space and check if there is
more than two signs -
if the logograme has space
see if there is a phonetic complement of the logogramme after the _
see if the logogramme has a sign following
or preeceding it.
see
uppercase = unknown reading.
"""
# Word level Tests
# Sign Level Tests -------------------------------
class cAtfWordTester(object):
"""
Class for testing the signs in a word
"""
#
def __init__(self, catf_word):
#
self.cAtf_word = catf_word
#
#
@staticmethod
def test_String(string1,string2):
"""
Returns true if string2
contains string1
"""
#
if string1 in string2:
return True
else:
return False
#
def test_damaged_sign(self):
"""
params: atf_word, str.
return: boolean
"""
#
find_damage_sign = re.search("\w+#", self.cAtf_word)
#
if find_damage_sign is None:
return False
else:
return True
#
#
def test_determinative_sign(self):
"""
params: atf_word, str.
return: boolean
"""
#
find_determinative_sign = re.search("\{\w.*?\}",self.cAtf_word)
#
if find_determinative_sign is None:
return False
else:
return True
#
#
def test_isNumber(self):
"""
params: atf_word, str.
return: boolean
"""
#
number_form_1 = re.search("\d+\(\w+.*?\)", self.cAtf_word)
number_form_2 = re.search("n\(\w+.*?\)", self.cAtf_word)
number_form_3 = re.search("n\+\d+\(\w+.*?\)", self.cAtf_word)
#
if number_form_1 is None and number_form_2 is None and number_form_3 is None:
return False
else:
return True
#
# Punctuation Tests -----------
#
def test_isColon(self):
"""
returns true if the word
has :
"""
#
if ":" == self.cAtf_word:
return True
else:
return False
#
#
def test_isDColon(self):
"""
returns true if the word
is ::
"""
#
if "::" == self.cAtf_word:
return True
else:
return False
#
def test_isColonRQ(self):
"""
returns true if the word
is :'
"""
#
if ":'" == self.cAtf_word:
return True
else:
return False
#
def test_isColonDQ(self):
"""
returns true if the word
is :"
"""
#
if ':"' == self.cAtf_word:
return True
else:
return False
#
def test_isDoubleColon(self):
"""
returns true if the word
is ::
"""
if "::" == self.cAtf_word:
return True
else:
return False
#
#
def test_isColonPoint(self):
"""
returns true if the word
is :.
"""
#
if ":." == self.cAtf_word:
return True
else:
return False
#
def test_isWordDivider(self):
"""
returns true if the word
is /
"""
if "/" == self.cAtf_word:
return True
else:
return False
#
def test_isWordDivider_Specified(self):
"""
returns true if the word has
/(
"""
#
return self.test_String("/(", self.cAtf_word)
#
# Individual Sign Tests ------------
#
def test_has_complement(self):
"""
Returns true if the sign has
+
"""
#
if "+" in self.cAtf_word:
return True
else:
return False
#
#
def test_has_unknownReading(self):
"""
Returns true if the sign
is uppercase
"""
#
if self.cAtf_word.isupper() is True:
return True
else:
return False
#
#
def test_has_composite(self):
"""
Returns true if the sign
has |
"""
#
return self.test_String("|", self.cAtf_word)
#
def test_has_specification(self):
"""
Returns true if the sign
has (
"""
#
return self.test_String("(", self.cAtf_word)
#
def test_has_query(self):
"""
Returns true if the sign
has ?
"""
#
return self.test_String("?", self.cAtf_word)
#
def test_has_collation(self):
"""
returns true if the sign
has *
"""
#
return self.test_String("*", self.cAtf_word)
#
def test_has_correction(self):
"""
returns true if the sign
has !
"""
#
return self.test_String("!", self.cAtf_word)
#
def test_hasCurved(self):
"""
returns true if the sign
has @c
"""
#
return self.test_String("@c", self.cAtf_word)
#
def test_hasFlat(self):
"""
returns true if the sign
has @f
"""
#
return self.test_String("@f", self.cAtf_word)
#
def test_hasGunu(self):
"""
returns true if the sign has
@g
"""
return self.test_String("@g", self.cAtf_word)
#
def test_hasSheshig(self):
"""
returns true if the sign has
@s
"""
#
return self.test_String("@s", self.cAtf_word)
#
def test_hasTenu(self):
"""
returns true if the sign has
@t
"""
#
return self.test_String("@t", self.cAtf_word)
#
def test_hasNutillu(self):
"""
returns true if the sign has
@n
"""
#
return self.test_String("@n", self.cAtf_word)
#
def test_hasZidatenu(self):
"""
returns true if the sign has
@z
"""
#
return self.test_String("@z", self.cAtf_word)
#
def test_hasKabatenu(self):
"""
returns true if the sign
has @k
"""
#
return self.test_String("@k", self.cAtf_word)
#
def test_hasVertReflected(self):
"""
returns true if the sign
has @r
"""
#
return self.test_String("@r", self.cAtf_word)
#
def test_hasHorReflected(self):
"""
returns true if the sign
has @h
"""
#
return self.test_String("@h", self.cAtf_word)
#
def test_hasVariant(self):
"""
returns true if the sign
has @v
"""
#
return self.test_String("@v", self.cAtf_word)
#
def test_hasRotated(self):
"""
returns true if the
sign has @\d+
"""
#
if re.search("@\d+",self.cAtf_word) is not None:
return True
else:
return False
#
# Compound Sign Tests ------------------
#
def test_hasBeside(self):
"""
returns true if the
sign has .
"""
#
return self.test_String(".", self.cAtf_word)
#
def test_hasJoining(self):
"""
returns true if
the sign has +
"""
#
return self.test_String("+", self.cAtf_word)
#
def test_hasAbove(self):
"""
returns true if the sign
has &
"""
#
return self.test_String("&", self.cAtf_word)
#
def test_hasCrossing(self):
"""
returns true if the sign
has %
"""
#
return self.test_String("%", self.cAtf_word)
#
def test_hasAllograph(self):
"""
returns true if the sign
has ~
"""
#
return self.test_String("~", self.cAtf_word)
#
def test_hasSpecialAllograph(self):
"""
returns true if the sign
has ~v
"""
#
return self.test_String("~v", self.cAtf_word)
#
def test_hasFormVariant(self):
"""
returns true if the sign
has \
"""
#
return self.test_String("\\", self.cAtf_word)
#
def test_hasContaining(self):
"""
returns true if the sign
has x
"""
#
return self.test_String("x", self.cAtf_word)
#
def test_hasContaining_Group(self):
"""
returns true if the sign
has x(
"""
#
return self.test_String("x(", self.cAtf_word)
#
# TODO Take the signs for specified punctuations
# TODO Take the sign from numbers
# ----------------------------------------------------
class cAtfSignTester(object):
"""
Class for testing signs in order to buildinga sign dict afterwards
"""
#
def __init__(self, cAtf_Sign):
#
self.catf_sign = cAtf_Sign
#
#
@staticmethod
def test_String(string1,string2):
"""
Returns true if string2
contains string1
"""
#
if string1 in string2:
return True
else:
return False
#
def test_isDamaged(self):
"""
Returns true if the self.catf_sign
has #
"""
#
return self.test_String("#", self.catf_sign)
#
#
def test_isComplement(self):
"""
Returns true if the self.catf_sign has
+
"""
#
if self.test_String("+", self.catf_sign) and self.test_isComposite():
return True
else:
return False
#
def test_isUnknownReading(self):
"""
Returns true if the self.catf_sign
is uppercase
"""
#
if self.catf_sign.isupper() is True:
return True
else:
return False
#
#
def test_isComposite(self):
"""
Returns true if the self.catf_sign
has |
"""
#
return self.test_String("|", self.catf_sign)
#
def test_isSpecification(self):
"""
Returns true if the self.catf_sign
has (
"""
#
return self.test_String("(", self.catf_sign)
#
def test_is_query(self):
"""
Returns true if the self.catf_sign
has ?
"""
#
return self.test_String("?", self.catf_sign)
#
def test_is_collation(self):
"""
returns true if the self.catf_sign
has *
"""
#
return self.test_String("*", self.catf_sign)
#
def test_is_correction(self):
"""
returns true if the self.catf_sign
has !
"""
#
return self.test_String("!", self.catf_sign)
#
# Modifier Tests ------------------------
#
def test_isCurved(self):
"""
returns true if the self.catf_sign
has @c
"""
#
return self.test_String("@c", self.catf_sign)
#
def test_isFlat(self):
"""
returns true if the self.catf_sign
has @f
"""
#
return self.test_String("@f", self.catf_sign)
#
def test_isGunu(self):
"""
returns true if the self.catf_sign has
@g
"""
return self.test_String("@g", self.catf_sign)
#
def test_isSheshig(self):
"""
returns true if the self.catf_sign has
@s
"""
#
return self.test_String("@s", self.catf_sign)
#
def test_isTenu(self):
"""
returns true if the self.catf_sign has
@t
"""
#
return self.test_String("@t", self.catf_sign)
#
def test_isNutillu(self):
"""
returns true if the self.catf_sign has
@n
"""
#
return self.test_String("@n", self.catf_sign)
#
def test_isZidatenu(self):
"""
returns true if the self.catf_sign has
@z
"""
#
return self.test_String("@z", self.catf_sign)
#
def test_isKabatenu(self):
"""
returns true if the self.catf_sign
has @k
"""
#
return self.test_String("@k", self.catf_sign)
#
def test_isVertReflected(self):
"""
returns true if the self.catf_sign
has @r
"""
#
return self.test_String("@r", self.catf_sign)
#
def test_isHorReflected(self):
"""
returns true if the self.catf_sign
has @h
"""
#
return self.test_String("@h", self.catf_sign)
#
def test_isVariant(self):
"""
returns true if the self.catf_sign
has @v
"""
#
return self.test_String("@v", self.catf_sign)
#
def test_isRotated(self):
"""
returns true if the
self.catf_sign has @\d+
"""
#
if re.search("@\d+",self.catf_sign) is not None:
return True
else:
return False
#
def test_isModifier(self):
"""
returns true
if the self.catf_sign passes all
the tests related to modifiers
"""
#
if self.test_isRotated(self.catf_sign) is True or self.test_isVariant(self.catf_sign) is True or self.test_isHorReflected(self.catf_sign) is True or self.test_isCurved(self.catf_sign) is True or self.test_isFlat(self.catf_sign) is True or self.test_isGunu(self.catf_sign) is True or self.test_isSheshig(self.catf_sign) is True or self.test_isTenu(self.catf_sign) is True or self.test_isNutillu(self.catf_sign) is True or self.test_isZidatenu(self.catf_sign) is True or self.test_isKabatenu(self.catf_sign) is True or self.test_isVertReflected(self.catf_sign) is True:
return True
else:
return False
# Compound Self.Catf_Sign Tests ------------------
#
@staticmethod
def test_isBinaryScope(operator):
"""
Tests if the operator has
binary scope
the x and the @ will be
handled individually
"""
#
if operator == "&" or operator == "%":
return True
else:
return False
#
#
def test_hasBeside(self):
"""
returns true if the
self.catf_sign has .
"""
#
return self.test_String(".", self.catf_sign)
#
def test_hasJoining(self):
"""
returns true if
the self.catf_sign has +
"""
#
return self.test_String("+", self.catf_sign)
#
def test_hasContaining(self):
"""
returns true if the self.catf_sign
has x
"""
#
return self.test_String("x", self.catf_sign)
#
def test_hasContaining_Group(self):
"""
returns true if the self.catf_sign
has x(
"""
#
return self.test_String("x(", self.catf_sign)
#
def test_hasAbove(self):
"""
returns true if the self.catf_sign
has &
"""
#
return self.test_String("&", self.catf_sign)
#
def test_hasCrossing(self):
"""
returns true if the self.catf_sign
has %
"""
#
return self.test_String("%", self.catf_sign)
#
def test_hasOpposing(self):
"""
returns true if the seperated strings
are in uppercase
"""
#
test_list = []
if self.test_String("@",self.catf_sign) is True:
rep_string = self.catf_sign.replace("@", " ")
no_number = re.sub("\d+","", rep_string)
no_whiteSpace = no_number.replace(" ","")
if no_whiteSpace.isupper() is True:
return True
else:
return False
else:
return False
#
def test_hasAllograph(self):
"""
returns true if the self.catf_sign
has ~
"""
#
return self.test_String("~", self.catf_sign)
#
def test_hasSpecialAllograph(self):
"""
returns true if the self.catf_sign
has ~v
"""
#
return self.test_String("~v", self.catf_sign)
#
def test_hasFormVariant(self):
"""
returns true if the self.catf_sign
has \
"""
#
return self.test_String("\\", self.catf_sign)
#
def test_hasRepeated(self):
"""
returns true if the first
seperated character is digit
"""
#
if self.test_String("x", self.catf_sign) is True:
str_split = self.catf_sign.split("x")
if str_split[0].isdigit():
return True
else:
return False
else:
return False
#
# -------------------------------------------
class cAtfLineGetter(cAtfLineTester):
"""
a class for getting text lines
according to tests
"""
#
def __init__(self, atf_line):
super().__init__(atf_line)
self.cAtf_line = atf_line
self.text_id = ""
self.text_id_alternatives = []
self.text_lang = ""
self.content_comment_line = ""
self.objectSurface_title = ""
self.structure_comment = ""
self.text_line = ""
self.lineNumber = int()
self.lineWordCount = int()
self.lineWords = []
self.lineText = ""
#
#
def get_id_line(self):
"""
checks the line for
conforming the id no syntax,
then gets it.
"""
#
atf_line = self.cAtf_line
if self.test_id_line() == True:
text_id_search = re.search("&P\d+\s", atf_line)
text_id_brut = text_id_search.group(0)
text_id = text_id_brut[:-1] # Cleans the last space
self.text_id = text_id[1:] # Cleans the &
else:
pass
#
return self.text_id
#
#
def get_id_alternatives(self):
"""
Checks the line for id syntax.
Gets the id alternatives
separated with the "=".
"""
#
atf_line = self.cAtf_line
#
if self.test_id_line() == True:
text_id_alternative_split = atf_line.split("=")
text_id_alternative_brut = text_id_alternative_split[1:]
text_id_alternative = [alternative.strip() for alternative in text_id_alternative_brut]
self.text_id_alternatives = text_id_alternative
else:
pass
#
return self.text_id_alternatives
#
#
def get_language_line(self):
"""
Checks the line for
language protocol syntax
Gets the indicated language
"""
#
atf_line = self.cAtf_line
#
if self.test_language_line() == True:
text_lang_search = re.search("atf: lang.*", atf_line)
text_lang_brut = text_lang_search.group(0)
text_lang = text_lang_brut[len("atf: lang "):].strip()
self.text_lang = text_lang
else:
pass
#
return self.text_lang
#
#
def get_content_comment(self):
"""
Checks the line for
content comment syntax
ie #.
Gets the content comment line
"""
#
atf_line = self.cAtf_line
#
if self.test_line_content() == True:
content_comment_search = re.search("^#.*", atf_line)
content_comment = content_comment_search.group(0)
self.content_comment_line = content_comment
else:
pass
#
return self.content_comment_line
#
#
def get_object_part_title(self):
"""
Checks if the line starts with @.
Gets the line if it does.
"""
#
atf_line = self.cAtf_line
#
if self.test_object_type_object_part() == True:
object_title_search = re.search("^@.*", atf_line)
object_surface_title = object_title_search.group(0)
self.objectSurface_title = object_surface_title
#
else:
pass
#
return self.objectSurface_title
#
#
def get_structure_comment(self):
"""
Checks if the line starts with $
Gets the line if it does.
"""
#
atf_line = self.cAtf_line
#
if self.test_text_structure() == True:
structure_comment_search = re.search("^\$.*", atf_line)
structure_comment = structure_comment_search.group(0)
self.structure_comment = structure_comment
#
else:
pass
#
return self.structure_comment
#
#
def get_text_line(self):
"""
Checks if the line starts with a \d+.
Gets the line if it does.
"""
#
atf_line = self.cAtf_line
#
if self.test_text_line() == True:
text_line_search = re.search("^\d+\.\s.*", atf_line)
text_line = text_line_search.group(0)
self.text_line = text_line
#
#
else:
pass
#
return self.text_line
#
#
def get_line_text(self):
"""
Gets the line text
excluding the line number.
"""
#
if self.test_text_line() == True:
#
# Getting rid of the line number
#
line_no_search = re.search("^\d+\.\s", self.cAtf_line)
line_no_brut = line_no_search.group(0)
text_line = self.cAtf_line[len(line_no_brut):]
self.lineText = text_line
else:
pass
return self.lineText
#
#
def get_line_number(self):
"""
return: self.lineNumber, int.
Checks if the line is text line
gets the line number if it is.
"""
#
if self.test_text_line() == True:
line_no_search = re.search("^\d+\.\s", self.cAtf_line)
line_no_brut = line_no_search.group(0)
line_no_str = line_no_brut[:-2] # Cleans the white space and the dot.
line_no = int(line_no_str)
self.lineNumber = line_no
#
else:
self.lineNumber = None
#
return self.lineNumber
#
#
def get_line_word_count(self):
"""
gets the number of words in text line
assuming that they are
seperated by whitespace
"""
#
text_line_no_number = self.get_line_text()
text_line_split = text_line_no_number.split(" ")
#
# See if there is anything empty
#
for text_line in text_line_split:
if len(text_line) == 0:
text_line_split.remove(text_line)
#
#
#
word_count = len(text_line_split)
self.lineWordCount = word_count
#
#
return self.lineWordCount
#
#
def get_line_words(self):
"""
params: lineText, str.
return: lineWords, []
Gets the whitespace delimited
words in line
"""
#
text_line_no_number = self.get_line_text()
text_line_split = text_line_no_number.split(" ")
#
# See if there is anything empty
#
for text_line in text_line_split:
if len(text_line) == 0:
text_line_split.remove(text_line)
#
#
#
line_words = text_line_split
self.lineWords = line_words
#
#
return self.lineWords
#
class cAtfLineDictBuilder(cAtfLineGetter):
"""
class for building the line_dict,
dictionary.
"""
#
def __init__(self, atf_line):
#
super().__init__(atf_line)
#
self.cAtf_line = atf_line
self.isLineStructure = False
self.isLineComment = False
self.lineDict = {}
#
#
def isLineStruc(self):
"""
Test if the line is a
structure comment
"""
#
if self.test_text_structure() == True:
self.isLineStructure = True
#
else:
self.isLineStructure = False
#
return self.isLineStructure
#
def isLineCom(self):
"""
test if the line is a
comment about the content
"""
#
if self.test_line_content() == True:
self.isLineComment = True
else:
self.isLineComment = False
#
return self.isLineComment
#
#
def lineDictBuild(self):
"""
builds the line dict
based on preeceding
methods
"""
#
self.lineDict["isLineStructure"] = self.isLineStruc()
self.lineDict["isLineContent"] = self.isLineCom()
self.lineDict["lineNumber"] = self.get_line_number()
self.lineDict["lineWordCount"] = self.get_line_word_count()
self.lineDict["lineText"] = self.get_line_text()
self.lineDict["lineWords"] = list(set(self.get_line_words()))
# Removed duplicates, for efficiency.
self.lineDict["lineWordPos"] = list(enumerate(self.get_line_words()))
if len(self.lineDict["lineWords"]) == 0 and self.lineDict["lineNumber"] is None:
return None
else:
pass
#
return self.lineDict
#
class cAtfALHandler(cAtfALTester):
"""
Handle Another Language occurances.
"""
#
def __init__(self, cAtf_part):
super().__init__()
#
self.cAtf_part = cAtf_part
self.lineDict_list = []
self.cAtf_part_lines = []
self.alRef_list = []
self.alGroup_list = []
self.mulAlOc_group_list = []
self.singAlOc_group_list = []
self.mulAlOc_line_list = []
self.mulAlOc_lineDict_list = []
self.mulAlOcS = []
self.singAlOcS = []
self.alOc_list = []
self.AlOcS = []
self.alLanguage = ""
self.textLang = ""
#
#
#
def set_ALOC_lang(self, lang):
"""
Sets the value of self.alLanguage
"""
#
self.alLanguage = lang
#
return self.alLanguage
#
def set_textLang(self, lang):
"""
Sets the value of self.textLang
"""
#
self.textLang = lang
#
return self.textLang
#
def splitPartLines(self):
"""
params: self.cAtf_part, str.
return: self.cAtf_part_lines, []
splits the part into lines
"""
#
self.cAtf_part_lines = self.cAtf_part.splitlines()
#
return self.cAtf_part_lines
#
@staticmethod
def lineDictBuild(cAtf_line):
"""
Uses the lineDictBuilder class
method
"""
#
line_class = cAtfLineDictBuilder(cAtf_line)
line_dict = line_class.lineDictBuild()
#
return line_dict
#
#
def get_lineDict_list(self):
"""
params: self.cAtf_part_lines, []
return: self.lineDict_list, []
gets the lines in dict form
"""
#
for cAtf_line in self.cAtf_part_lines:
lineDict = self.lineDictBuild(cAtf_line)
if lineDict is not None:
self.lineDict_list.append(lineDict)
#
return self.lineDict_list
#
@staticmethod
def test_twoTimesUnScore(lineWord):
"""
Tests if a word has the underscore
two times or not.
"""
#
unscoCount = lineWord.count("_")
if unscoCount == 2:
return True
elif unscoCount == 1:
return False
elif unscoCount < 1:
return None
else:
pass
#
return None
#
def get_ALRefs_lineLevel(self):
"""
Searches whether words of a line
contain a another language switch
If the word contains the underscore 2 times
it is added 2 times for facilitating grouping
after.
"""
#
lineDict_list_sorted = sorted(self.lineDict_list, key=lambda lineDict:lineDict["lineNumber"])
for lineDict in lineDict_list_sorted:
lw_list = list(lineDict["lineWordPos"])
line_word_list_sorted = sorted(lw_list, key=lambda wpTuple:wpTuple[0])
for WordP, lineWord in line_word_list_sorted:
if self.test_twoTimesUnScore(lineWord) is True:
self.alRef_list.append((WordP, lineWord, lineDict["lineNumber"]))
self.alRef_list.append((WordP, lineWord, lineDict["lineNumber"]))
elif self.test_twoTimesUnScore(lineWord) is False:
# (1, WORD, lineNO)
self.alRef_list.append((WordP, lineWord, lineDict["lineNumber"]))
#
#
@staticmethod
def grouper(iterable, n, fillvalue=None):
"Collect data into fixed-length chunks or blocks"
# grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
#
return itertools.zip_longest(*args, fillvalue=fillvalue)
#
def group_ALRefs(self):
"""
Groups the AL references for
marking the AL occurances
"""
#
al_ref_groups = self.grouper(self.alRef_list, 2)
# There should be no need for a fill value, but ...
# I am hesitating...
self.alGroup_list = list(al_ref_groups)
#
return self.alGroup_list
#
@staticmethod
def test_multilineALGroup(ALGroup):
"""
params: ALGroup, ((()),(()))
return: boolean
Tests if the AL references
stocked in the al group
points to a AL occurance
that spreads into multiple
lines
"""
#((11, '_re-e2-um', 1), (11, 're-e2-um_', 2))
start_point = ALGroup[0]
end_point = ALGroup[1]
# (1, WORD, lineNO), (1, WORD, lineNO)
#
if start_point[2] != end_point[2]:
return True
else:
return False
#
#
@staticmethod
def test_singALGroup(ALGroup):
"""
params: ALGroup, ((()),(()))
return: boolean
Tests if the AL references
stocked in the al group
points to a AL occurance
that is confined to 1 line
"""
#
# ((11, '_re-e2-um_', 1), (11, '_re-e2-um_', 1))
start_point = ALGroup[0]
end_point = ALGroup[1]
# (1, WORD, lineNO), (1, WORD, lineNO)
#
if start_point[2] == end_point[2]:
return True
else:
return False
#
def populate_mulALOC_refs(self):
"""
Populates the multiline
AL occurance reference list.
"""
#
self.mulAlOc_group_list = []
for alGroup in self.alGroup_list:
if self.test_multilineALGroup(alGroup) is True:
self.mulAlOc_group_list.append(alGroup)
#
else:
pass
#
return self.mulAlOc_group_list
#
#
def populate_singALOC_refs(self):
"""
Populates the single line
AL occurance reference list.
"""
#
self.singAlOc_group_list = []
for alGroup in self.alGroup_list:
if self.test_singALGroup(alGroup) is True:
self.singAlOc_group_list.append(alGroup)
#
#
return self.singAlOc_group_list
#
#
@staticmethod
def get_mulAlOc_lines(alGroup, lineDictList):
"""
Gets the related lines from the lineDictList, by
using the alGroup elements as point of reference.
"""
#
start_point = alGroup[0]
end_point = alGroup[1]
#
mulAlOc_line_range = range(start_point[2],end_point[2]+1)
# (1, WORD, lineNO), (1, WORD, lineNO)
# +1 compensates the function's exclusion of the final element
mulAlOc_group_line_dict_list = []
#
for lineDict in lineDictList:
if lineDict["lineNumber"] in mulAlOc_line_range:
mulAlOc_group_line_dict_list.append(lineDict)
#
#
return mulAlOc_group_line_dict_list
#
def get_mulAlOc_lineDict_list(self):
"""
Gets the related lineDicts for
AL occurances that spread to multiple lines
"""
#
self.mulAlOc_lineDict_list = []
#
for mulAlOc in self.mulAlOc_group_list:
line_list = self.get_mulAlOc_lines(mulAlOc,self.lineDict_list)
self.mulAlOc_lineDict_list.append(line_list)
#
#
return self.mulAlOc_lineDict_list
#
@staticmethod
def get_FW_mulAlOc(mulAlOc_group):
"""
Gets the First Word and its position of the
AL Occurance that spreads to multiple
lines.
"""
#
first_item_dict = {}
mulAlOc_group_sort = sorted(mulAlOc_group, key=lambda lineDict:lineDict["lineNumber"])
#
mulAlOc_first_lineDict = mulAlOc_group_sort[0]
fLineDict_words = mulAlOc_first_lineDict["lineWordPos"]
#
for wordPos, flineWord in fLineDict_words:
if "_" in flineWord:
first_item_dict[flineWord] = wordPos
#
first_item_sort = sorted(tuple(first_item_dict.items()), key=lambda wordWP:wordWP[1])
first_item = (first_item_sort[-1],mulAlOc_first_lineDict["lineNumber"])
#
return first_item
#
@staticmethod
def get_LW_mulAlOc(mulAlOc_group):
"""
Gets the Last Word and its position of
the AL Occurance that spreads to multiple
lines
"""
#
last_item_dict = {}
mulAlOc_group_sort = sorted(mulAlOc_group, key=lambda lineDict:lineDict["lineNumber"])
mulAlOc_last_lineDict = mulAlOc_group_sort[-1]
#
laLineDict_words = mulAlOc_last_lineDict["lineWords"]
#
for lalineWord in laLineDict_words:
if "_" in lalineWord:
last_item_dict[lalineWord] = laLineDict_words.index(lalineWord)
#
last_item_sort = sorted(tuple(last_item_dict.items()), key=lambda wordWP:wordWP[1])
last_item = (last_item_sort[0],mulAlOc_last_lineDict["lineNumber"])
#
return last_item
#
def get_ALOC_lang(self,alOc):
"""
Gets the AL occurance language
if it has one specified with
%,
if not, we get the specified AL language
in the constructor
"""
alWord = alOc[0]
#
if self.test_ALSwitch(alWord) is True:
alword_find = re.search("%\w+",alWord)
alword_get = alword_find.group(0)
else:
alword_get = self.alLanguage
#
return alword_get
#
def mk_mulAlOc(self, first_item, last_item, mulAlOc_group):
"""
params:
first_item, ()
last_item, ()
mulAlOc_group, [{},{}, ... ]
Creates multiline AL Occurance from the parameters.
alWord_word, str. Another Language word in AL_occurance
alWord_LineNumber, int. The line number for the al_word
alWord_AlOc_Position, dict. Relative position of the alWord inside the AL_occurance.
alWord_AlOc, str. Al_occurance in which the al_word is observed
alWord_AlOc_LineNumber, list. Line number(s) in which the al_oc is observed
alWord_LinePosition, dict. Relative position of the alWord inside the Line.
"""
#
alWord_dict_list = []
#
alOc_words = []
#
for lineDict in mulAlOc_group:
lineNo = lineDict["lineNumber"]
lineWordPos = lineDict["lineWordPos"]
lineWCount = lineDict["lineWordCount"]
#
for wordPos, lineWord in lineWordPos:
#
if lineNo == first_item[1] and wordPos >= first_item[0][1]:
alOc_words.append((lineWord,wordPos,lineNo,lineWCount))
elif first_item[1] < lineNo < last_item[1]:
alOc_words.append((lineWord,wordPos,lineNo, lineWCount))
elif lineNo == last_item[1] and wordPos <= last_item[0][1]:
alOc_words.append((lineWord,wordPos,lineNo, lineWCount))
else:
pass
#
#
alOc_words_sorted = sorted(alOc_words, key=lambda al:(al[2],al[1]))
alOc_word_list = [al[0] for al in alOc_words_sorted]
alOc_line_list = [al[2] for al in alOc_words_sorted]
alOc_text = " ".join(alOc_word_list)
alOc_wordPos = enumerate(alOc_words_sorted)
#
for wordP, alOc_tuple in alOc_wordPos:
alWord_dict = {}
alWord_dict["alWord_word"] = alOc_tuple[0]
alWord_dict["alWord_LineNumber"] = alOc_tuple[2]
alWord_dict["alWord_AlOc"] = alOc_text
alWord_dict["alWord_language"] = self.get_ALOC_lang(alOc_word_list)
alWord_dict["alWord_textLanguage"] = self.textLang
alWord_dict["alWord_alOc_LineNumber"] = alOc_line_list
alOc_pos_dict = {}
alOc_pos_dict["totalWords_AlOc"] = len(alOc_word_list)
alOc_pos_dict["alWord_Position"] = wordP
alWord_dict["alWord_AlOc_Position"] = alOc_pos_dict
alOc_line_dict = {}
alOc_line_dict["totalWords_Line"] = alOc_tuple[3]
alOc_line_dict["alWord_Position"] = alOc_tuple[1]
alWord_dict["alWord_LinePosition"] = alOc_line_dict
alWord_dict_list.append(alWord_dict)
#
return alWord_dict_list
#
#
def get_mulAlOcS(self):
"""
Gets the AL Occurances that spread into multiple lines
as lists of another language word dictionary
"""
#
self.mulAlOcS = []
#
for mulAlOc_group in self.get_mulAlOc_lineDict_list():
first_point = self.get_FW_mulAlOc(mulAlOc_group)
last_point = self.get_LW_mulAlOc(mulAlOc_group)
mulAlOc = self.mk_mulAlOc(first_point, last_point, mulAlOc_group)
self.mulAlOcS.append(mulAlOc)
#
#
return self.mulAlOcS
#
@staticmethod
def get_AlRefs_WordLevel(lineWP):
"""
params: lineWP, ()
Gets the starting point and
end point of the AL Occurance observed
in a single line
"""
# lineWP == (WordPOS, WORD, LineNumber )
#
alRef_WP_list = []
#
if "_" in lineWP[1]:
alRef_WP_list.append((lineWP[0], lineWP[1]))
# (WORDPOS, WORD)
#
return alRef_WP_list
#
def group_ALRef_sing_Wordlevel(self, alRef_WP_list):
"""
groups the AL occurance references
observed in a single line
"""
#
alRef_WP_groups = self.grouper(alRef_WP_list,2)
#
return alRef_WP_groups
#
def mk_singAlOc(self, lineDict_list,alRef_WP_group):
"""
params: lineDict, {}
alRef_WP_group, ()
Creates the AL occurance from the lineDict,
by using the values in the alRef_WP_groups
"""
#
alWord_dict_list = []
#
alRef_WP_group_sort = sorted(alRef_WP_group, key=lambda alRef:alRef[0])
alRef_WP_range = range(alRef_WP_group_sort[0][0], alRef_WP_group_sort[1][0]+1)
#
lineDict = list(filter(lambda Ldicts: Ldicts.get("lineNumber") == alRef_WP_group[0][2], lineDict_list))[0]
# Gets the lineDict from the lineDict list for the relative
# al occurance
#
alOc_words = []
#
lineWordPos = lineDict["lineWordPos"]
#
for WP, word in lineWordPos:
if WP in alRef_WP_range:
alOc_words.append((WP, word))
#
#
alOc_words_sorted = sorted(alOc_words, key=lambda alWords:alWords[0])
alOc_word_list = [al[1] for al in alOc_words_sorted]
alOc_text = " ".join(alOc_word_list)
alOc_wordPos = enumerate(alOc_words_sorted)
#
for WP, alWordTuple in alOc_wordPos:
alWord_dict = {}
alWord_dict["alWord_word"] = alWordTuple[1]
alWord_dict["alWord_textLanguage"] = self.textLang
alWord_dict["alWord_language"] = self.get_ALOC_lang(alOc_word_list)
alWord_dict["alWord_LineNumber"] = lineDict["lineNumber"]
alWord_dict["alWord_AlOc"] = alOc_text
alWord_dict["alWord_alOc_LineNumber"] = lineDict["lineNumber"]
alOc_pos_dict = {}
alOc_pos_dict["totalWords_AlOc"] = len(alOc_word_list)
alOc_pos_dict["alWord_Position"] = WP
alWord_dict["alWord_AlOc_Position"] = alOc_pos_dict
alOc_line_dict = {}
alOc_line_dict["totalWords_Line"] = lineDict["lineWordCount"]
alOc_line_dict["alWord_Position"] = alWordTuple[0]
alWord_dict["alWord_LinePosition"] = alOc_line_dict
alWord_dict_list.append(alWord_dict)
#
return alWord_dict_list
#
def get_singALOcS(self):
"""
Gets AL Occurances confined to a single
line as list of AL word dictionary.
"""
#
self.singAlOcS = []
#
for singAlOc_group in self.singAlOc_group_list:
# singAlOc_group == ((10, '_kur_', 62), (10, '_kur_', 62))
# (WORDPOS, WORD)
singAlOc = self.mk_singAlOc(self.lineDict_list,singAlOc_group)
self.singAlOcS.append(singAlOc)
#
return self.singAlOcS
#
def get_ALOcS(self):
"""
General Method for regrouping
The methods above.
"""
#
self.splitPartLines()
self.get_lineDict_list()
self.get_ALRefs_lineLevel()
self.group_ALRefs()
self.populate_mulALOC_refs()
self.populate_singALOC_refs()
self.get_mulAlOc_lineDict_list()
self.get_mulAlOcS()
self.get_singALOcS()
#
self.alOc_list = self.mulAlOcS + self.singAlOcS
flatten_alOc_list = list(itertools.chain.from_iterable(self.alOc_list))
sort_aloc_list = sorted(flatten_alOc_list, key=lambda alword_dict:(alword_dict["alWord_LineNumber"],alword_dict["alWord_LinePosition"]["alWord_Position"]))
self.AlOcS = []
for key, group in itertools.groupby(sort_aloc_list, key=lambda alWord_dict:alWord_dict["alWord_AlOc"]):
self.AlOcS.append(list(group))
#
#
return self.AlOcS
class cAtfWordDictBuilder(cAtfWordTester):
"""
Class for building Word dictionaries
of a normal text line
"""
#
def __init__(self,cAtf_Word):
super().__init__(cAtf_Word)
self.wordPos_list = []
self.word = cAtf_Word
self.lineDict_list = []
self.det_signList = []
self.detMarkList = []
self.detRef_general_list = []
self.detRef_Group_list = []
self.signList = []
self.signList_pos = []
self.textLang = ""
self.wordLang = ""
self.detLang = ""
self.clean_word = ""
self.detDict_list = []
self.wordDict = {}
#
#
def set_textLang(self, lang):
"""
Text language attribute
"""
#
self.textLang = lang
#
return self.textLang
#
def set_wordLang(self, value):
"""
Word Language property
"""
#
self.wordLang = value
#
return self.wordLang
#
def set_detLang(self,value):
"""
Set Determinative Language
"""
#
self.detLang = value
#
return self.detLang
#
@staticmethod
def set_sign_seperator_curvR(cAtf_Word):
"""
Sets the sign seperator -
to the entities with
parantheses
"""
#
if "}" in cAtf_Word and "}-" in cAtf_Word and "}#" in cAtf_Word:
rep_string = cAtf_Word.replace("}#","#}")
rep_word = rep_string.replace("}-","}")
curv_par_sep = rep_word.split("}")
curv_par = "}-".join(curv_par_sep)
elif "}" in cAtf_Word and "}-" not in cAtf_Word and "}#" in cAtf_Word:
rep_word = cAtf_Word.replace("}#","#}")
curv_par = rep_word.replace("}","}-")
elif "}#" in cAtf_Word:
curv_par = cAtf_Word.replace("}#","#}")
else:
curv_par = cAtf_Word
#
return curv_par
#
@staticmethod
def set_sign_seperator_curvL(cAtf_Word):
"""
Sets the sign seperator -
to the entities with
parantheses
"""
#
if "{" in cAtf_Word and "-{" in cAtf_Word:
rep_word = cAtf_Word.replace("-{","{")
curv_par_sep = rep_word.split("{")
curv_par = "-{".join(curv_par_sep)
elif "{" in cAtf_Word and "-{" not in cAtf_Word:
curv_par = cAtf_Word.replace("{","-{")
else:
curv_par = cAtf_Word
return curv_par
#
@staticmethod
def set_sign_seperator_corBL(cAtf_Word):
"""
Sets the sign seperator -
to the entities with
parantheses
"""
#
if "[" in cAtf_Word and "-[" in cAtf_Word:
rep_word = cAtf_Word.replace("-[","[")
corn_par_sep = rep_word.split("[")
corn_par = "-[".join(corn_par_sep)
elif "[" in cAtf_Word and "-[" not in cAtf_Word:
corn_par = cAtf_Word.replace("[","-[")
else:
corn_par = cAtf_Word
#
return corn_par
#
@staticmethod
def set_sign_seperator_corBR(cAtf_Word):
"""
Sets the sign seperator -
to the entities with
parantheses
"""
#
if "]" in cAtf_Word and "]-" in cAtf_Word:
rep_word = cAtf_Word.replace("]-","]")
corn_par_sep = rep_word.split("]")
corn_par = "]-".join(corn_par_sep)
elif "]" in cAtf_Word and "]-" not in cAtf_Word:
corn_par = cAtf_Word.replace("]","]-")
else:
corn_par = cAtf_Word
#
return corn_par
#
@staticmethod
def cleanWord(cWord):
"""
Cleans the excessive
sign seperators that might
have been generated by the
set_sign_seperators method
"""
#
first_el = cWord[0]
last_el = cWord[-1]
#
if "-" == first_el:
cWord = cWord[1:]
elif "-" == last_el:
cWord = cWord[:-1]
else:
pass
#
return cWord
#
#
def set_sign_seperators(self):
"""
Uses the previous sign
seperator methods to add
sign seperator - to right
places
"""
#
cvl_word = self.set_sign_seperator_curvL(self.cAtf_word)
cvr_word = self.set_sign_seperator_curvR(cvl_word)
crl_word = self.set_sign_seperator_corBL(cvr_word)
crr_word = self.set_sign_seperator_corBR(crl_word)
self.clean_word = self.cleanWord(crr_word)
#
return self.clean_word
#
@staticmethod
def seperate_signs(clean_word):
"""
Seperates the signs and assigns
them an index number.
"""
#
sign_list_brut = clean_word.split("-")
sign_list = [sign.strip() for sign in sign_list_brut if sign.strip()]
sign_list = sign_list
#
return sign_list
#
def get_detRefs(self):
"""
Gets the starting point and the end point
of determinatives
"""
#
signList_unsort = self.seperate_signs(self.clean_word)
self.signList = signList_unsort
signList_pos = list(enumerate(signList_unsort))
self.signList_pos = sorted(signList_pos, key=lambda signPos:signPos[0])
# (0,'lu'),(1, 'mesz'), etc.
#
self.detRef_general_list = []
#
for signPos, sign in self.signList_pos:
if "{" in sign and "}" in sign:
self.detRef_general_list.append((signPos,sign))
self.detRef_general_list.append((signPos,sign))
elif "{" in sign or "}" in sign:
self.detRef_general_list.append((signPos,sign))
#
else:
pass
#
#
return self.detRef_general_list
#
@staticmethod
def grouper(iterable, n, fillvalue=None):
"Collect data into fixed-length chunks or blocks"
# grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
#
return itertools.zip_longest(*args, fillvalue=fillvalue)
#
def group_detRefs(self):
"""
Groups the AL references for
marking the AL occurances
"""
#
det_ref_groups = self.grouper(self.detRef_general_list, 2)
#
self.detRef_Group_list = list(det_ref_groups)
# (signPos,sign), (signPos,sign)
#
return self.detRef_Group_list
#
@staticmethod
def detRanger(detRef_Group):
"""
Gives the range of sign positions
included in the determinative
"""
#
first_sign = detRef_Group[0]
last_sign = detRef_Group[1]
#
detRange = range(first_sign[0], last_sign[0]+1)
#
return detRange
#
def get_detSigns(self):
"""
gets the signs of the determinatives
"""
#
for detRef_group in self.detRef_Group_list:
detSign_list = []
detRange = self.detRanger(detRef_group)
for SP, sign in self.signList_pos:
#(SignPos, Sign),(SignPos, Sign), etc.
if SP in detRange:
detSign_list.append((SP, sign))
self.det_signList.append(tuple(detSign_list))
#
#
#self.det_signList.append(tuple(detSign_list))
#
return self.det_signList
#
def uniqDetSigns(self):
"""
Filter duplicates from det_signList
"""
#
detSy = set()
det_list = []
#
for detl in self.det_signList:
if detl not in detSy:
detSy.add(detl)
det_list.append(detl)
#
self.det_signList = det_list
#
return self.det_signList
#
def mrk_dets(self):# detSignlist element of self.det_signList
"""
params: detSignlist, [(signPos, sign),(), ...]
Marks the determinatives as
prepos, postpos, inpos
"""
#
mark_set = set()
#
signList = sorted(self.signList_pos, key=lambda x:x[0])
#
for detSignlist in self.det_signList:
detList = sorted(detSignlist, key=lambda x:x[0])
# detSignlist == [(signPos, sign),(), ...]
# sort according to sign position
# sort according to sign position
if detList[0][0] > signList[0][0] and detList[-1][0] < signList[-1][0]:
detList.append("inpos")
mark_set.add(tuple(detList))
elif detList[0][0] == 0:
detList.append("prepos")
mark_set.add(tuple(detList))
elif detList[-1][0] == signList[-1][0]:
detList.append("postpos")
mark_set.add(tuple(detList))
#
self.detMarkList = list(mark_set)
#
return self.detMarkList
#
@staticmethod
def mk_detDict(detMark, sign_list):
"""
params: detMark, ((),(),(), ...,"")
Constructs the determinatives dictionary.
"""
#
# detMark == [(signPos, sign),(signPos, sign),MARK]
det_signList = [detm for detm in detMark if isinstance(detm, tuple)]
detList_sort = sorted(det_signList, key=lambda x:x[0])
det_mark_str = detMark[-1]
totalSigns = len(sign_list)
detSigns = [det[1] for det in detList_sort]
detText = "-".join(detSigns)
detSignPos = list(enumerate(detList_sort))
detPos_list = [det[0] for det in detList_sort]
detPos = (detPos_list[0],detPos_list[-1])
detLength = len(detSigns)
#
detEntity_list = []
#
for detSign in detSignPos:
# detSign == (0,(3,an)),(1,(4,mesz)), etc
detSign_dict = {}
detSign_dict["detSign_det"] = detText
detSign_dict["detSign_det_WordPos"] = detPos
detSign_dict["detSign_detMark"] = det_mark_str
detSign_dict["detSign_detSign"] = detSign[1][1]
detSign_word_pos = {}
detSign_word_pos["totalSigns_word"] = totalSigns
detSign_word_pos["detSign_position"] = detSign[1][0]
detSign_dict["detSign_WordPosition"] = detSign_word_pos
detSign_sign_pos = {}
detSign_sign_pos["totalSigns_determinative"] = detLength
detSign_sign_pos["detSign_position"] = detSign[0]
detSign_dict["detSign_DetPosition"] = detSign_sign_pos
detEntity_list.append(detSign_dict)
#
detEntity_tuple = tuple(detEntity_list)
#
return detEntity_tuple
#
def get_detDictS(self):
"""
Populates the determinative list
in the form of list of list of dicts.
Dicts represent a sign of a determinative
list of dicts represent the determinative
list of list of dicts represent the
determinatives of the word.
"""
#
self.get_detRefs()
self.group_detRefs()
signlist = self.signList_pos
#
self.get_detSigns()
self.uniqDetSigns()
#detSignlist == [[(signPos, sign),(), ...], [(signPos, sign),(), ...] ]
# detsign == [(signPos, sign),(), ...]
detMarkList = self.mrk_dets()
#
for detMark in detMarkList:
# detMark == [(signPos, sign),(signPos, sign),MARK]
detDicts = self.mk_detDict(detMark, signlist)
self.detDict_list.append(detDicts)
#
return self.detDict_list
#
def wordDictBuild(self):
"""
Builds the wordDict
"""
#
self.set_sign_seperators()
self.get_detRefs()
self.detDict_list = []
self.get_detDictS()
#
self.wordDict = {}
self.wordDict["word_wordSignCount"] = len(self.signList_pos)
self.wordDict["word_word"] = self.cAtf_word
self.wordDict["word_determinatives"] = self.detDict_list
self.wordDict["word_wordSignsPos"] = self.signList_pos
self.wordDict["word_Signs"] = list(set(self.signList))
# Removed duplicates for efficiency
self.wordDict["word_hasDamage"] = self.test_damaged_sign()
self.wordDict["word_wordLang"] = self.wordLang
self.wordDict["word_isNumber"] = self.test_isNumber()
self.wordDict["word_hasComplement"] = self.test_has_complement()
self.wordDict["word_hasUnknownReading"] = self.test_has_unknownReading()
self.wordDict["word_hasComposite"] = self.test_has_composite()
self.wordDict["word_hasSpecification"] = self.test_has_specification()
self.wordDict["word_hasQuery"] = self.test_has_query()
self.wordDict["word_hasCollation"] = self.test_has_collation()
self.wordDict["word_hasCorrection"] = self.test_has_correction()
self.wordDict["word_isColon"] = self.test_isColon()
self.wordDict["word_isDColon"] = self.test_isDColon()
self.wordDict["word_isColonRQ"] = self.test_isColonRQ()
self.wordDict["word_isColonDQ"] = self.test_isColonDQ()
self.wordDict["word_isWordDivider"] = self.test_isWordDivider()
self.wordDict["word_isSpecifiedWordDivider"] = self.test_isWordDivider_Specified()
self.wordDict["word_hasComplement"] = self.test_has_complement()
self.wordDict["word_hasUnknownReading"] = self.test_has_unknownReading()
self.wordDict["word_hasCurved"] = self.test_hasCurved()
self.wordDict["word_hasFlat"] = self.test_hasFlat()
self.wordDict["word_hasGunu"] = self.test_hasGunu()
self.wordDict["word_hasSheshig"] = self.test_hasSheshig()
self.wordDict["word_hasTenu"] = self.test_hasTenu()
self.wordDict["word_hasNutillu"] = self.test_hasNutillu()
self.wordDict["word_hasZidatenu"] = self.test_hasZidatenu()
self.wordDict["word_hasKabatenu"] = self.test_hasKabatenu()
self.wordDict["word_hasVertReflected"] = self.test_hasVertReflected()
self.wordDict["word_hasHorReflected"] = self.test_hasHorReflected()
self.wordDict["word_hasVariant"] = self.test_hasVariant()
self.wordDict["word_hasRotated"] = self.test_hasRotated()
self.wordDict["word_hasBeside"] = self.test_hasBeside()
self.wordDict["word_hasJoining"] = self.test_hasJoining()
self.wordDict["word_hasAbove"] = self.test_hasAbove()
self.wordDict["word_hasCrossing"] = self.test_hasCrossing()
self.wordDict["word_hasAllograph"] = self.test_hasAllograph()
self.wordDict["word_hasSpecialAllograph"] = self.test_hasSpecialAllograph()
self.wordDict["word_hasFormVariant"] = self.test_hasFormVariant()
self.wordDict["word_hasContaining"] = self.test_hasContaining()
self.wordDict["word_hasContainingGroup"] = self.test_hasContaining_Group()
#
return self.wordDict
# ----------------------------------
class cAtfSignDictBuilder(cAtfSignTester):
"""
Class regrouping methods for building a signDict
"""
#
# Operator types for Compound Signs ----------------------
operator_dict = {
"beside":".",
"joining":"+",
"containing":"x", # This is also used for indicating repetitions.
# Thus needs to be handled DONE # Binary scope
"above":"&", # Binary scope
"crossing":"%", # Binary scope
"opposing":"@", # This needs to be handled, it is also used in
# modifiers and part titles. TODO modifiers DONE
# binary scope
}
modifier_dict = {
"curved":"@c",
"flat":"@f",
"gunu":"@g", # 4 extra wedges
"sheshig":"@s", # added sze sign
"tenu":"@t", # slanting
"nutillu":"@n", # unfinished
"zidatenu":"@z", # slanting right
"kabatenu":"@k", # slanting left
"verticallyReflected":"@r",
"horizontallyReflected":"@h",
"variant":"@v"
# Rotations need to be handled seperately DONE
}
#
def __init__(self, catf_sign):
super().__init__(catf_sign)
self.catf_sign = catf_sign
self.signDict = {}
self.compositeSign = ""
self.prnthsPosition_list = []
self.sign_dict_list = []
self.signRelation_dict_list = []
#
#
#
"""
TODO
Composed signs should have
nesting level indicators
for signs like |ANx(AN.AN)| etc.
And the relations should be specified
in the feature dict.
TODO Specifications are treated as
words when they are delimited by space
signs when they are delimited by -
Sayılarla ilgili bir karar vermem lazım.
Karmaşık işaretlerden de oluşuyor olabilirler.
"""
#
def get_compositeSign(self):
"""
Gets the composite sign.
"""
#
if self.test_isComposite() is True:
composite_sign_search = re.search("\|.*?\|", self.catf_sign)
self.signDict["sign_isDamaged"] = self.test_isDamaged()
# This test is done here because
# C-ATF treates compound signs as atoms
# If one would like to extend this extractor to
# O-ATF then this has to moved to elsewhere.
composite_sign = composite_sign_search.group(0)
self.compositeSign = composite_sign[1:-1]
# 1 - -1 for getting rid of | on both sides
else:
pass
#
return self.compositeSign
#
@staticmethod
def get_nestElements(nestedString):
"""
Generates the paranthese content
with its associated level
if the composite sign is nested.
Code adapted from SO:
author: Gareth Rees
date Published: 2010-11-26-12-32
date Retrieved: 2017-04-23-19-54
url: http://stackoverflow.com/questions/4284991/parsing-nested-parentheses-in-python-grab-content-by-level
"""
#
paren_stack = []
for i, char in enumerate(nestedString):
# Ex. CompositeSign == |AN.(ANxAN)&((AN.AN)%AN)|
if char == "(":
paren_stack.append(i)
# Adds the position of (
elif char == ")" and paren_stack:
# Comes the next )
start = paren_stack.pop()
# Gives the last added ( position
# The logic is that the last added ( would correspond to
# the first ) and by using pop we ensure
# that the second ) doesn't mismatch with the ( of
# the previous right paranthese.
yield (len(paren_stack),list(range(start, i+1)), nestedString[start+1:i])
# the last expression inside the [] excludes the i and
# adds one to the position of the ( so that we have the
# content.
# **WARNING** Range values includes parantheses
#
def get_OpPositions(self, compoundSign):
"""
gets the operator positions from the
compound sign.
"""
#
opPosition_list = []
#
for charPos, char in enumerate(compoundSign):
if char in self.operator_dict.values():
opPosition_list.append((charPos, char))
#
#
return opPosition_list
#
@staticmethod
def get_nestLevelDict(nestList):
"""
Maps the output of the generator
expression to a dictionary
for facilitating later use.
"""
#
nestLevel_dict_list = []
#
for nestL in nestList:
nestLDict = {}
nestLDict["nest_level"] = nestL[0]
nestLDict["nest_range"] = nestL[1]
nestLDict["nest_content"] = nestL[2]
nestLevel_dict_list.append(nestLDict)
#
return nestLevel_dict_list
#
@staticmethod
def get_nestDict(nestList):
"""
Creates a dictionary based on nest levels.
"""
#
nestDict = {}
#
sort_nestList = sorted(nestList, key=lambda x:x[0]) #
#
for nestEl in sort_nestList:
nestDict.setdefault(nestEl[0], []).append(nestEl[1:])
#
return nestDict
#
@staticmethod
def nestDict_LevelRangeCreator(nestDict):
"""
Regroups the range list of nest elements
for each level and appends it to the end
of the value associated with the nest level
"""
#
nestDict_Ranges = {}
for key, nestEl in nestDict.items():
nestLevel_range_list = []
for nestTuple in nestEl:
nestLevel_range_list.extend(nestTuple[0])
# nestTuple[0] should correspond to list of char positions
#
nestDict_Ranges[key] = nestEl
nestDict_Ranges[key].append(nestLevel_range_list)
#
return nestDict_Ranges
#
# "|(AN.((IR2%IR3).((AN&AN)+(IR3xAN))).((AN.IR3)xNITA))|" Test sign
#
@staticmethod
def get_OpDict_list(nestDict_Ranges, opPosition_list):
"""
Gets the operator levels plus one position before and after the
operator position. Maps all of this to a dictionary.
Appends the dictionary to a list
"""
#
opDictList = []
#
for opPosition in opPosition_list:
for level, nestEl in nestDict_Ranges.items():
nestRangeList = nestEl[-1]
if opPosition[0] in nestRangeList:
posPlace = nestRangeList.index(opPosition[0])
posDict = {}
posDict["operatorPosition_nestlevel"] = level
posDict["operatorPosition_after"] = nestRangeList[posPlace+1:posPlace+4]
# This for checking modifier types afterwards
# Especially the rotation.
posDict["operatorPosition_before"] = nestRangeList[posPlace-1]
# Might come in handy for checking 'repeated' operator
posDict["operatorPosition_position"] = opPosition[0]
posDict["operatorPosition_operator"] = opPosition[1]
opDictList.append(posDict)
#
#
return opDictList
#
@staticmethod
def get_OpLevelPosition(opDictList):
"""
Eliminates the duplicate occurances
for the operators. Only the
highest level in which the
operator occured is retained.
Function groups the operators
according to their positions
then makes a list with the highest levels
within the group.
"""
#
opdictsSorted = sorted(opDictList, key=lambda opDict:opDict["operatorPosition_position"]) # Sort list according to operator positions
opDictsGrouped = [list(group) for key, group in itertools.groupby(opdictsSorted, key=lambda x:x["operatorPosition_position"])]
# Group elements according to operator positions
opDictGroupsSort = [sorted(groupList, key=lambda opDict:opDict["operatorPosition_nestlevel"]) for groupList in opDictsGrouped]
# Sort group list according to the nest level
operatorPos_level_list = [sorted_group[-1] for sorted_group in opDictGroupsSort]
#
return operatorPos_level_list
#
def get_SignRelationBS(self,
operatorPos_level_list,
nestLevel_dict_list,
compositeSign):
"""
Gets the sign or sign groups that
are associated with each other through
a binary scoped operator
"""
#
signRelation_dict_list = []
#
for operatorPos_level in operatorPos_level_list:
operatorNestLevel = operatorPos_level["operatorPosition_nestlevel"]
operatorPos = operatorPos_level["operatorPosition_position"]
operator = operatorPos_level["operatorPosition_operator"]
for nestLevel_dict in nestLevel_dict_list:
nestRange = nestLevel_dict["nest_range"]
nestLevel = nestLevel_dict["nest_level"]
nestContent = nestLevel_dict["nest_content"]
if self.test_isBinaryScope(operator) is True:
# x and @ will be handled individually
# we test only for % and &
if operatorPos in nestRange and operatorNestLevel == nestLevel:
opPosinRange = nestRange.index(operatorPos)
opPrecedents = nestRange[1:opPosinRange]
# 1 for excluding the (
opFollowers = nestRange[opPosinRange+1:-1]
# -1 for excluding )
opPrecLength = len(opPrecedents)
opPrecChars = nestContent[:opPrecLength]
opFolChars = nestContent[opPrecLength+1:]
# +1 for excluding the operator
signRelation_dict = {}
signRelation_dict["SR_operator"] = operator
signRelation_dict["SR_operator_antec"] = opPrecChars
signRelation_dict["SR_operator_subsq"] = opFolChars
signRelation_dict["SR_nest_level"] = nestLevel
signRelation_dict["SR_nest_content"] = nestContent
signRelation_dict["SR_compositeSign"] = compositeSign
signRelation_dict["SR_nest_range"] = nestRange
if "(" in opPrecChars and ")" in opPrecChars and ")" in opFolChars and "(" in opFolChars:
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Group", "operator_subsequent":"Group"}
elif "(" in opPrecChars and ")" in opPrecChars and ")" not in opFolChars and not "(" in opFolChars:
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Group", "operator_subsequent":"Sign"}
elif "(" not in opPrecChars and ")" not in opPrecChars and ")" in opFolChars and "(" in opFolChars:
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Group"}
elif "(" not in opPrecChars and ")" not in opPrecChars and ")" not in opFolChars and "(" not in opFolChars:
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Sign"}
signRelation_dict["SR_operator_position"] = operatorPos
if operator == "%":
signRelation_dict["SR_operator_type"] = "crossing"
elif operator == "&":
signRelation_dict["SR_operator_type"] = "above"
signRelation_dict["SR_operator_antec_range"] = opPrecedents
signRelation_dict["SR_operator_subseq_range"] = opFollowers
self.signRelation_dict_list.append(signRelation_dict)
#
return self.signRelation_dict_list
#
def get_SignRelationSpeCases(self,operatorPos_level_list, nestLevel_dict_list, compositeSign):
"""
Gets the sign or sign groups that
are associated with each other through
x and @ operators
"""
#
signRelation_dict_list = []
#
for operatorPos_level in operatorPos_level_list:
operatorNestLevel = operatorPos_level["operatorPosition_nestlevel"]
operatorPos = operatorPos_level["operatorPosition_position"]
operator = operatorPos_level["operatorPosition_operator"]
for nestLevel_dict in nestLevel_dict_list:
nestRange = nestLevel_dict["nest_range"]
nestLevel = nestLevel_dict["nest_level"]
nestContent = nestLevel_dict["nest_content"]
if operatorPos in nestRange and operatorNestLevel == nestLevel:
opPosinRange = nestRange.index(operatorPos)
opPrecedents = nestRange[1:opPosinRange]
# 1 for excluding the (
opFollowers = nestRange[opPosinRange+1:-1]
# -1 for excluding )
opPrecLength = len(opPrecedents)
opPrecChars = nestContent[:opPrecLength]
opFolChars = nestContent[opPrecLength+1:]
# +1 for excluding the operator
signRelation_dict = {}
signRelation_dict["SR_operator"] = operator
signRelation_dict["SR_operator_antec"] = opPrecChars
signRelation_dict["SR_operator_subsq"] = opFolChars
signRelation_dict["SR_nest_level"] = nestLevel
signRelation_dict["SR_nest_content"] = nestContent
signRelation_dict["SR_compositeSign"] = compositeSign
signRelation_dict["SR_nest_range"] = nestRange
if "(" in opPrecChars and ")" in opPrecChars and ")" in opFolChars and "(" in opFolChars:
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Group", "operator_subsequent":"Group"}
elif "(" in opPrecChars and ")" in opPrecChars and ")" not in opFolChars and not "(" in opFolChars:
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Group", "operator_subsequent":"Sign"}
elif "(" not in opPrecChars and ")" not in opPrecChars and ")" in opFolChars and "(" in opFolChars:
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Group"}
elif "(" not in opPrecChars and ")" not in opPrecChars and ")" not in opFolChars and "(" not in opFolChars:
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Sign"}
signRelation_dict["SR_operator_position"] = operatorPos
signRelation_dict["SR_operator_antec_range"] = opPrecedents
signRelation_dict["SR_operator_subseq_range"] = opFollowers
if operator == ".":
signRelation_dict["SR_operator_type"] = "beside"
elif operator == "+":
signRelation_dict["SR_operator_type"] = "joining"
elif operator == "x" and opPrecChars.isdigit():
signRelation_dict["SR_operator_type"] = "repeated"
elif operator == "x" and not opPrecChars.isdigit():
signRelation_dict["SR_operator_type"] = "containing"
elif operator == "@":
if re.search("^\d+", opFolChars) is not None:
# This means that the @ sign is
# a modifier here so we restart looping
continue
else:
opFolCharsOper = nestContent[opPrecLength:opPrecLength+3]
# Includes the operator @
if "@c" in opFolCharsOper or "@f" in opFolCharsOper or "@g" in opFolCharsOper or "@s" in opFolCharsOper or "@s" in opFolCharsOper or "@t" in opFolCharsOper or "@n" in opFolCharsOper or "@z" in opFolCharsOper or "@k" in opFolCharsOper or "@r" in opFolCharsOper or "@h" in opFolCharsOper or "@v" in opFolCharsOper:
# This means that @ sign is
# a modifier so we restart looping
continue
else:
signRelation_dict["SR_operator_type"] = "opposing"
#
self.signRelation_dict_list.append(signRelation_dict)
#
#
return self.signRelation_dict_list
#
@staticmethod
def get_unNestedCompSigns(compositeSign, opPosition_list):
"""
gets the signs of composite sign
that is not nested.
"""
#
signRelation_dict_list = []
#
for opPos in opPosition_list:
opP = opPos[0]
opChar = opPos[1]
opAnte = compositeSign[:opP]
opSubseq = compositeSign[opP:]
signRelation_dict = {}
signRelation_dict["SR_operator"] = opChar
signRelation_dict["SR_operator_antec"] = opAnte
signRelation_dict["SR_operator_subsq"] = opSubseq[1:]
# 1 for excluding the operator in mapping
signRelation_dict["SR_compositeSign"] = compositeSign
signRelation_dict["SR_nest_level"] = 0
signRelation_dict["SR_nest_content"] = compositeSign
signRelation_dict["SR_nest_range"] = list(range(0,len(compositeSign)))
signRelation_dict["SR_operator_position"] = opP
signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Sign"}
if opChar == "%":
signRelation_dict["SR_operator_type"] = "crossing"
elif opChar == "&":
signRelation_dict["SR_operator_type"] = "above"
elif opChar == ".":
signRelation_dict["SR_operator_type"] = "beside"
elif opChar == "+":
signRelation_dict["SR_operator_type"] = "joining"
elif opChar == "x" and opAnte.isdigit():
signRelation_dict["SR_operator_type"] = "repeated"
elif opChar == "x" and not opAnte.isdigit():
signRelation_dict["SR_operator_type"] = "containing"
elif operator == "@":
if re.search("^\d+",opSubseq[1:]) is not None:
# Starts from 1 because opSubseq[0]== operator
# This means that the @ sign is
# a modifier here so we restart looping
continue
else:
opFolCharsOper = opSubseq[0:2]
# Includes the operator @
if "@c" in opFolCharsOper or "@f" in opFolCharsOper or "@g" in opFolCharsOper or "@s" in opFolCharsOper or "@s" in opFolCharsOper or "@t" in opFolCharsOper or "@n" in opFolCharsOper or "@z" in opFolCharsOper or "@k" in opFolCharsOper or "@r" in opFolCharsOper or "@h" in opFolCharsOper or "@v" in opFolCharsOper:
# This means that @ sign is
# a modifier so we restart looping
continue
else:
signRelation_dict["SR_operator_type"] = "opposing"
signRelation_dict["SR_operator_antec_range"] = list(range(0, opP))
signRelation_dict["SR_operator_subseq_range"] = list(range(opP, len(compositeSign)))
#
self.signRelation_dict_list.append(signRelation_dict)
#
return self.signRelation_dict_list
#
@staticmethod
def get_signsSR(signRelDict):
"""
Gets signs from the sign dict.
"""
#
compoundSign_signList = []
if signRelDict["SR_relation_type"]["operator_antecedent"] == "Sign" and signRelDict["SR_relation_type"]["operator_subsequent"] == "Sign":
compoundSign_signList.append(signRelDict["SR_operator_antec"])
compoundSign_signList.append(signRelDict["SR_operator_subsq"])
compoundSign_signList.append(signRelDict)
#
elif signRelDict["SR_relation_type"]["operator_antecedent"] == "Group" and signRelDict["SR_relation_type"]["operator_subsequent"] == "Sign":
compoundSign_signList.append(signRelDict["SR_operator_subsq"])
compoundSign_signList.append(signRelDict)
#
elif signRelDict["SR_relation_type"]["operator_antecedent"] == "Sign" and signRelDict["SR_relation_type"]["operator_subsequent"] == "Group":
compoundSign_signList.append(signRelDict)
#
return compoundSign_signList
#
def get_signComplement(self):
"""
Gets the signs from a sign
that has a complement
"""
#
complement_sign_list = []
if self.test_isComplement(self.catf_sign) is True:
compSplit = self.catf_sign.split("+")
complement_sign = compSplit[1]
complement_sign_list.append(complement_sign)
#
return complement_sign_list
#
@staticmethod
def char_convert(sign):
"""
Convert CDLI C-ATF characters
to unicode
"""
#
text_sz = sign.replace("sz","\u0161") # sz -> š
text_SZ = text_sz.replace("SZ", "\u0160") # SZ -> Š
text_sPo = text_SZ.replace("s,", "\u1e63") # s, -> ṣ
text_SPo = text_sPo.replace("S,", "\u1e62") # S, -> Ṣ
text_tch = text_SPo.replace("t,", "\u1e6d") # t, -> ṭ
text_TCH = text_tch.replace("T,", "\u1e6c") # T, -> Ṭ
text_s = text_TCH.replace("s'", "\u015b") # s' -> ś
text_S = text_s.replace("S'","\u015a") # S' -> Ś
text_ayn = text_S.replace("'", "\u02be") # ' -> ʾ
text_sub0 = text_ayn.replace("0","\u2080")# Subscript numbers
text_sub1 = text_sub0.replace("1","\u2081")
text_sub2 = text_sub1.replace("2","\u2082")
text_sub3 = text_sub2.replace("3","\u2083")
text_sub4 = text_sub3.replace("4","\u2084")
text_sub5 = text_sub4.replace("5","\u2085")
text_sub6 = text_sub5.replace("6","\u2086")
text_sub7 = text_sub6.replace("7","\u2087")
text_sub8 = text_sub7.replace("8","\u2088")
text_sub9 = text_sub8.replace("9","\u2089")
text_subx = text_sub9.replace("x²","\u208a") # subscript x
text_subX = text_subx.replace("X²","\u208a")
text_h = text_subX.replace("h,", "\u1e2b") # h, -> ḫ
text_H = text_h.replace("H,", "\u1e2a") # H, -> Ḫ
text_j = text_H.replace("j","\u014b") # j -> ŋ
text_J = text_j.replace("J","\u014a") # J -> Ŋ
#
return text_J
#
@staticmethod
def signDictBuild(sign):
"""
params:
sign, str.
C(ompound/complement) S(ign), boolean
Returns the sign dict
with all the features.
"""
#
signDict = {}
tester_class = cAtfSignTester(sign)
signDict["sign_sign"] = sign
signDict["sign_isComplement"] = tester_class.test_isComplement()
signDict["sign_isQuery"] = tester_class.test_is_query()
signDict["sign_isCorrection"] = tester_class.test_is_correction()
signDict["sign_isCollation"] = tester_class.test_is_collation()
signDict["sign_isCurved"] = tester_class.test_isCurved()
signDict["sign_isFlat"] = tester_class.test_isFlat()
signDict["sign_isGunu"] = tester_class.test_isGunu()
signDict["sign_isSheshig"] = tester_class.test_isSheshig()
signDict["sign_isTenu"] = tester_class.test_isTenu()
signDict["sign_isNutillu"] = tester_class.test_isNutillu()
signDict["sign_isZidatenu"] = tester_class.test_isZidatenu()
signDict["sign_isKabatenu"] = tester_class.test_isKabatenu()
signDict["sign_isVertReflected"] = tester_class.test_isVertReflected()
signDict["sign_hasAllograph"] = tester_class.test_hasAllograph()
signDict["sign_hasSpecialAllograph"] = tester_class.test_hasSpecialAllograph()
signDict["sign_isHorReflected"] = tester_class.test_isHorReflected()
signDict["sign_isVariant"] = tester_class.test_isVariant()
signDict["sign_isRotated"] = tester_class.test_isRotated()
#signDict["sign_isPartOfComposite"] = test_isComposite()
#signDict["sign_nestLevel"] = 0 Composite değilse
#signDict["sign_isUnknownReading"] = test_isUnknownReading() # Composite değilse
#signDict["sign_relatedSigns"] = {} # Buraya composite
# işaretleri oluşturan liste eklenecek
return signDict
#
def buildSignDict(self):
"""
Wraps the methods defined throughout the class.
"""
#
sign_dict_list = []
#
if self.test_isComposite() is True and self.test_isSpecification() is True:
# Basically it is a nested composite sign
compositeSign = self.get_compositeSign()
nestedElements = self.get_nestElements(compositeSign)
opPositonList = self.get_OpPositions(compositeSign)
nestList = list(nestedElements)
nestLevelDictList = self.get_nestLevelDict(nestList)
nest_dict = self.get_nestDict(nestList)
nest_dict_levelRange = self.nestDict_LevelRangeCreator(nest_dict)
opDict_list = self.get_OpDict_list(
nest_dict_levelRange,
opPositonList
)
opLvlPosition = self.get_OpLevelPosition(opDict_list)
SR_dictList_BS = self.get_SignRelationBS(
opLvlPosition,
nestLevelDictList,
compositeSign
)
SR_dictList_SCases = self.get_SignRelationSpeCases(
opLvlPosition,
nestLevelDictList,
compositeSign
)
SR_dictList = SR_dictList_SCases + SR_dictList_BS
compoundSign_SR_lists_brut = [self.get_signsSR(SignDict) for SignDict in SR_dictList]
# There are empty list in the brut file
# Created by the group - group associations
compoundSign_SR_lists = list(filter(None, compoundSign_SR_lists_brut))
# They are filtered now.
for compoundSignList in compoundSign_SR_lists:
SR_dict = compoundSignList[-1]
for signElement in compoundSignList:
if not isinstance(signElement, dict):
self.signDict = self.signDictBuild(signElement)
self.signDict["sign_isPartOfComposite"] = True
self.signDict["sign_isUnknownReading"] = False
self.signDict["sign_relatedSigns"] = SR_dict
self.signDict["sign_nestLevel"] = SR_dict["SR_nest_level"]
self.signDict["sign_compositeSign"] = SR_dict["SR_compositeSign"]
sign_dict_list.append(self.signDict)
# Compound Nested DONE
#
elif self.test_isComposite() is True and self.test_isSpecification() is False:
# Compound Not Nested
compositeSign = self.get_compositeSign()
opPositonList = self.get_OpPositions(compositeSign)
unNestedList = self.get_unNestedCompSigns(
compositeSign, opPositonList
)
compoundSign_SR_lists = [self.get_signsSR(SignDict) for SignDict in unNestedList]
for compoundSignList in compoundSign_SR_lists:
SR_dict = compoundSignList.pop()
for signElement in compoundSignList:
self.signDict = self.signDictBuild(signElement)
self.signDict["sign_isPartOfComposite"] = True
self.signDict["sign_isUnknownReading"] = False
self.signDict["sign_relatedSigns"] = SR_dict
self.signDict["sign_nestLevel"] = SR_dict["SR_nest_level"]
self.signDict["sign_compositeSign"] = SR_dict["SR_compositeSign"]
sign_dict_list.append(self.signDict)
#
# Compound not Nested DONE
#
elif self.test_isComposite() is False and self.test_isComplement() is True:
# Not a Compound Sign but is a complement
complementSignList = self.get_signComplement(sign)
for complementSign in complementSignList:
self.signDict = self.signDictBuild(complementSign)
self.signDict["sign_isPartOfComposite"] = False
self.signDict["sign_isUnknownReading"] = self.test_isUnknownReading(sign)
self.signDict["sign_relatedSigns"] = {} # TODO get Related Sign for Complement Signs
self.signDict["sign_nestLevel"] = 0
self.signDict["sign_compositeSign"] = ""
sign_dict_list.append(self.signDict)
# Complement sign DONE
#
elif self.test_isComplement() is False and self.test_isComposite() is False:
self.signDict = self.signDictBuild(self.catf_sign)
self.signDict["sign_isPartOfComposite"] = False
self.signDict["sign_isUnknownReading"] = self.test_isUnknownReading()
self.signDict["sign_isDamaged"] = self.test_isDamaged()
self.signDict["sign_relatedSigns"] = {} # TODO get Related Sign
self.signDict["sign_nestLevel"] = 0
self.signDict["sign_compositeSign"] = ""
sign_dict_list.append(self.signDict)
#
return sign_dict_list
#
# Algorithm DONE
# Tests! DONE
class cAtfTextBuilder(object):
"""
Builds the brut text as a feature
dictionary, by calling the methods
from the classes above.
"""
#
def __init__(self, text):
#
self.text_brut = text
self.atf_section = ""
self.object_parts_list = []
self.objectIdPart = []
self.catf_text_dict = {}
self.objectPartLines_list = []
self.objectTextParts = []
self.textPart_dict_list = []
#
# Section Methods
#
def get_atf_section(self):
"""
params: atf_file, str.
return: atf_section, str.
Takes a text given as the text output
of the cdli splits the atf section
for later use.
"""
#
find_atf_section = re.search("&P\d+.*", self.text_brut, re.DOTALL)
#
self.atf_section = find_atf_section.group(0)
#
return self.atf_section
#
def get_object_parts(self):
"""
params: atf_section, str.
return: object_part_list, []
"""
#
try:
if "\n" not in self.atf_section:
raise ValueError("Newline character doesn't match to expected unix input type")
else:
pass
except ValueError as newlineError:
print(newlineError)
print("\n\n check if you have indeed specified \\n as \n the newline character while opening the text.")
return
else:
pass
object_part_split = self.atf_section.split("\n@")
object_part_id_part = object_part_split[0]
object_part_parts = object_part_split[1:]
self.object_parts_list = ["@" + part for part in object_part_parts]
self.object_parts_list.insert(0,object_part_id_part)
#
return self.object_parts_list
#
def splitLinesOParts(self):
"""
Splits the object part
into lines
"""
#
self.objectPartLines_list = [objectPart.splitlines() for objectPart in self.object_parts_list]
#
return self.objectPartLines_list
#
def get_ObjetIdPart(self):
"""
Gets the part in which
the id of the text occurs
# In objectPartLines_list:
# [0] is the id part, [1] is the type part
# [2] is the text parts
"""
#
self.objectIdPart = self.objectPartLines_list[0]
#
return self.objectIdPart
#
def get_text_id(self):
"""
Gets the text id from the
object id part
"""
#
for line in self.objectIdPart:
c_atf_line = cAtfLineGetter(line)
if len(c_atf_line.get_id_line()) != 0:
self.catf_text_dict["text_id"] = c_atf_line.get_id_line()
elif len(c_atf_line.get_id_alternatives()) != 0:
self.catf_text_dict["text_id_alternatives"] = c_atf_line.get_id_alternatives()
elif len(c_atf_line.get_language_line()) != 0:
self.catf_text_dict["text_language"] = c_atf_line.get_language_line()
#
return self.catf_text_dict
#
def get_objectTypePart(self):
"""
Gets the parts of
the text indicated by @
# In objectPartLines_list:
# [0] is the id part, [1] is the type part
# [2] is the text parts
"""
#
self.objectTypePart = self.objectPartLines_list[1][0].strip()
# [1] corresponds to the list which contains only the type string
# Hence [0].strip()
#
return self.objectTypePart
#
def get_textParts(self):
"""
Gets the list of text parts
from the object part list
This should correspond to [2:]
"""
#
self.objectTextParts = self.objectPartLines_list[2:]
#
return self.objectTextParts
#
def set_text_PartInfo(self):
"""
Sets what we have so far
to the text dictionary
"""
#
self.catf_text_dict["text_objectType"] = self.objectTypePart
self.catf_text_dict["text_textPartCount"] = len(self.objectTextParts)
#[2:] because [0] is the id part and [1] is the type part
#
return self.catf_text_dict
#
@staticmethod
def textPartString(textPart):
"""
params: textPart, []
return: textPart_str, ''
Regroups the lines
belonging to the part in
string form for handling
Another Language Occurances
"""
#
partLines = textPart[1:]
# Since [0] is the part title indicated with @
# the rest should be text lines, comments, etc.
textPart_str = "\n".join(partLines)
#
return textPart_str
#
@staticmethod
def get_ALs(textPart_str):
"""
Passes the textPart_str to AL
handler for getting Another Language
occurances
"""
#
alClass = cAtfALHandler(textPart_str)
alOcS = alClass.get_ALOcS()
#
return alOcS
#
@staticmethod
def lineDicts(textPartLine):
"""
Converts the text part line
to a line dict
"""
#
lineClass = cAtfLineDictBuilder(textPartLine)
lineDict = lineClass.lineDictBuild()
#
return lineDict
#
@staticmethod
def worDictBuilder(lineWord):
"""
Converts the words inside
a line dict to a
wordDict by using cAtfWordDictBuilder
"""
#
wordClass = cAtfWordDictBuilder(lineWord)
#
word_dict = wordClass.wordDictBuild()
#
return word_dict
#
@staticmethod
def signDictBuilder(WordSign):
"""
Converts the signs inside
a word dict to
a signDict by using
cAtfSignDictBuilder
"""
#
signClass = cAtfSignDictBuilder(WordSign)
sign_dict = signClass.buildSignDict()
#
return sign_dict
#
def get_SignDicts(self, wordDict):
"""
Builds sign dicts for the signs
in a word dict.
"""
signs = wordDict["word_Signs"]
signDict_list = [self.signDictBuilder(sign) for sign in signs]
wordDict["word_Signs"] = signDict_list
#
return wordDict
#
def get_WordDicts(self, lineDict):
"""
Builds word dicts for words
in a line dict
"""
#
words = lineDict["lineWords"]
wordDict_list = [self.worDictBuilder(word) for word in words]
lineDict["lineWords"] = wordDict_list
#
return lineDict
#
def set_partDict(self, textPart):
"""
Creates the part dictionary
from the textpart which is an
element of the objectpart list
"""
#
part_dict = {}
#
part_dict["part_partTitle"] = textPart[0].strip()
part_string = self.textPartString(textPart)
part_dict["part_partString"] = part_string
partlines = textPart[1:]
alOccurances = self.get_ALs(part_string)
# pass text language to al occurances TODO
part_dict["part_AL_occurances"] = alOccurances
# and the Adventure of Iteration starts ...
partLine_dict_list = []
for line in partlines:
line_dict = self.lineDicts(line)
lineWord_dict = self.get_WordDicts(line_dict)
# text language can be passed to lines here TODO
lineWordDict_list = lineWord_dict["lineWords"]
linewordsign_dict_list = []
for lineWordDict in lineWordDict_list:
wordSigndict = self.get_SignDicts(lineWordDict)
linewordsign_dict_list.append(wordSigndict)
lineWord_dict["lineWords"] = linewordsign_dict_list
partLine_dict_list.append(lineWord_dict)
#
# TODO partlines aynı kalıyor
#
#
part_dict["part_parLines"] = partLine_dict_list
#
return part_dict
#
def buildTextDict(self):
"""
Wraps the methods above for
building the text dictionary
"""
#
self.get_atf_section()
self.get_object_parts()
# Text is splited into parts
self.splitLinesOParts()
# Each object part is splited into lines
self.get_ObjetIdPart()
# The part in which one observes the object id
# is seperated
self.get_text_id()
# From the object id part
# the text id is taken
self.get_objectTypePart()
# From the object part list
# object type part is taken
self.get_textParts()
# from the object parts that
# has been divided into lines
# textparts are taken
self.set_text_PartInfo()
# The type information
# and partCount is added to
# text dictionary
self.textPart_dict_list = [self.set_partDict(textpart) for textpart in self.objectTextParts]
# part dict is created for each text part.
self.catf_text_dict["text_textParts"] = self.textPart_dict_list
#
return self.catf_text_dict
# TODO Part Level daki işleri yazmaya devam et
# AL ile ilgili işler bu seviyede unutma
# Satır seviyesine geldiğinde sözcük ve işaret
# metodlarını unutma Allah kerim
"""
c_atf_text_dict = {
'textId':str,
'textWordCount':int
'textPartCount':int
'textLineCount':int
'textSignCount':int
'textParts':[{part_dict},{part_dict}, ... ]
}
part_dict = {
'partTitle':str,
'partWordCount':int,
'partTextLineCount':int,
'partSignCount': int,
'partLineStructures':[str, str,] # Lines starting with $
'partLineContents':[str, str,] # Lines starting with #
'partTextLines':[str,str,]
'partLines:[{line_dict}, {line_dict}, ...]
}
line_dict = {
'isLineStructure':boolean
'isLineContent':boolean
'lineNumber':int
'lineWordCount:int
'lineText':str.
'lineWords':[{word_dict}, {word_dict}, ... ]
}
word_dict = {
'word':str,
'hasDeterminative':boolean
'determinatives':[(prepos, LÚ),(postpos, MESZ), ...]
'hasDamaged':boolean
'relativePositionInLine': int.
'relativePositionInObjectPart':int
'absolutePositionInText':int
'isAnotherLanguage':boolean
'wordSignCount':int
'wordSigns':[{sign_dict},{sign_dict}, ... ]
}
sign_dict = {
'sign':str.
'isDamaged':boolean
'isDeterminative':boolean
'relativePositionInWord':int
'relativePositionInObjectPart':int
'relativePositionInLine':int
'absolutePositionInText':int
'isComplement':boolean
'isUnknownReading':boolean
'isDifferentLanguage':boolean
'language':str.
}
"""
test_get_file = get_atf_section(test_file)
test_object_parts = get_object_parts(test_get_file)
test_lines = [x.splitlines() for x in test_object_parts]
testLine = test_lines[2][3]
test_get_words = get_words(testLine)
testWord = test_get_words[3].strip()
test_get_signs = get_signs(testWord)
# Take ids ----------------------------------------------
# DONE: Idler alınabiliyor
c_atf_text = {}
c_atf_text_liste = []
for test_line in test_lines[0]:
c_atf_line = cAtfLineGetter(test_line)
if len(c_atf_line.get_id_line()) != 0:
c_atf_text["text_id"] = c_atf_line.get_id_line()
elif len(c_atf_line.get_id_alternatives()) != 0:
c_atf_text["text_id_alternatives"] = c_atf_line.get_id_alternatives()
elif len(c_atf_line.get_language_line()) != 0:
c_atf_text["text_language"] = c_atf_line.get_language_line()
# ------------------------------------------------------
# TODO: Improvement, get related lines for the comments about content
# DONE: Take part names, comment lines, and text lines in a part dictionary.
object_parts = test_lines[1:]
part_dict_list = []
for part in object_parts:
part_dict = {}
part_dict["partLineContents"] = []
part_dict["partLineStructures"] = []
part_dict["partLines"] = []
part_dict["part_no"] = object_parts.index(part)
part_dict["part_title"] = part[0].strip()
for line in part:
part_class = cAtfLineGetter(line)
part_dict["partLineContents"].append(part_class.get_content_comment())
part_dict["partLineStructures"].append(part_class.get_structure_comment())
part_dict["partLines"].append(part_class.get_text_line())
#
part_dict_list.append(part_dict)
# ---------------------------------------------
test_al_class = cAtfALHandler(test_object_parts[2])
test_part_lines = test_al_class.splitPartLines() # DONE
test_get_line_dict_list = test_al_class.get_lineDict_list() # DONE
test_alrefs_line = test_al_class.get_ALRefs_lineLevel() # DONE
test_group_alrefs = list(test_al_class.group_ALRefs()) # DONE
# 734 - 1215
test_populate_mul = test_al_class.populate_mulALOC_refs() # DONE
test_populate_sing = test_al_class.populate_singALOC_refs() # DONE
test_multiline = test_al_class.get_mulAlOc_lineDict_list() # DONE
test_mulAl = test_al_class.get_mulAlOcS() # DONE
test_singAL = test_al_class.get_singALOcS() # DONE
test_al_ocs = test_al_class.get_ALOcS() # DONE
# Word class Tests -----------------------------
test_word = "{gesz-gesz-gesz}{gesz-an-il}bu-ut,-ni{gesz-mesz-gesz}e2-gal-za3#-di-nu#-tuku-a{gesz}#"
test_Wclass = cAtfWordDictBuilder(test_word)
seps = test_Wclass.set_sign_seperators() # DONE
detRfs = test_Wclass.get_detRefs() # DONE
# 1262 1628
g_derfs = test_Wclass.group_detRefs() # DONE
dSign = test_Wclass.get_detDictS() # DONE
wordDict = test_Wclass.wordDictBuild() # DONE
# Sign Class
# Test Signs
# Nested Compound Sign |(AN.((IR2%IR3).((AN@t~a&AN)+(IR3~txAN))).((AN.IR3)xNITA@r))|#
test_sign = "|(AN.((IR2%IR3).((AN@t~a&AN)+(IR3~txAN))).((AN.IR3)xNITA@r))|#"
# 2446 - 3076
testSignClass = cAtfSignDictBuilder(test_sign)
getComp = testSignClass.get_compositeSign() # DONE
getNestedSt = testSignClass.get_nestElements(getComp) # DONE
getOpPos = testSignClass.get_OpPositions(getComp) # DONE
a = list(getNestedSt)
getNestlevelDList = testSignClass.get_nestLevelDict(a) # DONE
getNestdict = testSignClass.get_nestDict(a) # DONE
diRang = testSignClass.nestDict_LevelRangeCreator(getNestdict) # DONE
opdictList = testSignClass.get_OpDict_list(diRang, getOpPos) # DONE
opLevPos = testSignClass.get_OpLevelPosition(opdictList) # DONE
SR_BS = testSignClass.get_SignRelationBS(
opLevPos,
getNestlevelDList,
getComp
) # DONE
SR_SpeCas = testSignClass.get_SignRelationSpeCases(
opLevPos,
getNestlevelDList,
getComp
) # DONE
SR_list = SR_BS + SR_SpeCas
signsSR = [testSignClass.get_signsSR(signDict) for signDict in SR_list]
# DONE
sign = test_sign
signDict = testSignClass.buildSignDict() # DONE
# Compound Nested Signs are DONE
# Unnested Compound Sign |AN.AN.AN+AN.AN+AN+AN|#
test_sign = "|AN.IR3.BAR+TAM2.MESZ+AN+AN|#"
# 2446 - 3076
testSignClass = cAtfSignDictBuilder(test_sign)
getComp = testSignClass.get_compositeSign() # DONE
getOpPos = testSignClass.get_OpPositions(getComp) # DONE
getUnNested = testSignClass.get_unNestedCompSigns(getComp, getOpPos) # DONE
signsSR = [testSignClass.get_signsSR(signDict) for signDict in getUnNested]
# DONE
signDict = testSignClass.buildSignDict() # DONE
# Determinative Part {an
test_sign = "{frk}#"
testSignClass = cAtfSignDictBuilder(test_sign)
mk = testSignClass.signDictBuild(test_sign)
signDict = testSignClass.buildSignDict()
# Damaged Simple Sign nu# # DONE
test_sign = "nu#"
testSignClass = cAtfSignDictBuilder(test_sign)
mk = testSignClass.signDictBuild(test_sign)
signDict = testSignClass.buildSignDict()
# Modified Damaged Sign AN@t# # DONE
test_sign = "AN@t#"
testSignClass = cAtfSignDictBuilder(test_sign)
mk = testSignClass.signDictBuild(test_sign)
signDict = testSignClass.buildSignDict()
# Rotated sign AN@123 # DONE
test_sign = "AN@123#"
testSignClass = cAtfSignDictBuilder(test_sign)
mk = testSignClass.signDictBuild(test_sign)
signDict = testSignClass.buildSignDict()
# 3061 - 3373
test_textClass = cAtfTextBuilder(test_file)
test_text = test_textClass.buildTextDict()
with open("ParserOutput.txt","w", encoding="utf-8", newline="\n") as f:
f.write(str(test_text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment