Skip to content

Instantly share code, notes, and snippets.

@andjc
Last active August 8, 2023 07:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andjc/760e2e711716076a7799794b40fce33c to your computer and use it in GitHub Desktop.
Save andjc/760e2e711716076a7799794b40fce33c to your computer and use it in GitHub Desktop.
For a specific text (string), identify and count occurrences of (character- or grapheme-based) ngraphs in the text
from collections import Counter
import regex
class ngraphs:
    """Count the ngraphs (runs of characters or graphemes) found in a string.

    The frequency table is rebuilt from the current settings every time
    ``data`` is read, so changing ``size``, ``filter``, ``count`` or
    ``graphemes`` on an existing instance takes effect on the next access.

    Attributes
    ----------
    text: str
        A plain text string to be analysed. Read-only: assigning to it
        raises, create a new instance to analyse different text.
    size: int
        Size of ngraph. 1 = single unit, 2 = digraph, 3 = trigraph, etc.
        Must be at least 1. Defaults to 2.
    filter: bool
        When true, drop every ngraph that contains a character from the
        Unicode Punctuation (P) or Separator (Z) general categories.
        Defaults to False.
    count: int
        Number of entries returned by ``most_common()``. Defaults to 10.
    graphemes: bool
        Whether ngraphs are built from extended grapheme clusters (True)
        or from individual characters (False). Defaults to False.
    grapheme: bool
        Read-only alias of ``graphemes``; assigning to it raises.
    data: dict
        Mapping of ngraph to number of occurrences, most frequent first.

    Methods
    -------
    most_common(value=None)
        Dictionary of the ``count`` most frequent ngraphs and their counts.
    to_list()
        List of the distinct ngraphs, most frequent first.
    to_tuples()
        List of ``(ngraph, occurrences)`` pairs, most frequent first.
    ngraph_length()
        Number of distinct ngraphs.
    text_length()
        Number of characters in ``text``.
    total()
        Sum of the occurrences of all ngraphs in ``data``.
    """

    def __init__(self, text, size=2, filter=False, count=10, graphemes=False):
        self._text = text
        self.size = size
        self.filter = filter
        self.count = count
        self.graphemes = graphemes
        # Build the table eagerly so problems with the input surface here.
        self._data = self._frequency()

    @property
    def data(self):
        # Always recomputed: size/filter/graphemes may have changed since
        # the previous read.
        self._data = self._frequency()
        return self._data

    @property
    def text(self):
        return self._text

    @text.setter
    def text(self, value):
        raise Exception("Cannot set text. Require new instance of ngraphs.")

    @property
    def size(self):
        return self._size

    @size.setter
    def size(self, value):
        if value < 1:
            raise ValueError(f"size must be at least 1, got {value!r}")
        self._size = value

    @property
    def filter(self):
        return self._filter

    @filter.setter
    def filter(self, value):
        self._filter = value

    @property
    def grapheme(self):
        # Alias kept for callers using the singular spelling.
        return self.graphemes

    @grapheme.setter
    def grapheme(self, value):
        raise Exception("Cannot set grapheme. Require new instance of ngraphs.")

    @property
    def count(self):
        return self._count

    @count.setter
    def count(self, value):
        self._count = value

    def __str__(self):
        return f"size: {self.size} , filter: {self.filter} , count: {self.count}"

    def _frequency(self):
        """Return ``{ngraph: occurrences}`` sorted by descending occurrences.

        Ngraphs with equal counts keep the order of their first appearance
        in the text (the sort is stable).
        """
        size = self.size
        if self.graphemes:
            # \X matches one extended grapheme cluster, so a base letter and
            # its combining marks stay together as a single unit.
            units = regex.findall(r'\X', self.text)
        else:
            units = self.text
        # A sequence of n units holds n - size + 1 full windows; when the
        # text is shorter than size the range is empty.
        starts = range(len(units) - size + 1)
        counts = Counter("".join(units[i:i + size]) for i in starts)
        if self.filter:
            # Reject an ngraph if ANY character is punctuation or a separator.
            # Searching the whole ngraph also covers grapheme mode, where an
            # ngraph can be longer than `size` code points.
            unwanted = regex.compile(r'[\p{P}\p{Z}]')
            counts = {
                ngraph: seen
                for ngraph, seen in counts.items()
                if unwanted.search(ngraph) is None
            }
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked)

    def most_common(self, value=None):
        """Return the ``count`` most frequent ngraphs as a dictionary.

        A truthy ``value`` replaces ``count`` first, and that new count
        stays in effect for later calls.
        """
        if value and value != self.count:
            self.count = value
        return dict(list(self.data.items())[:self.count])

    def to_list(self):
        """Return the distinct ngraphs as a list, most frequent first."""
        return list(self.data)

    def to_tuples(self):
        """Return ``(ngraph, occurrences)`` pairs, most frequent first."""
        return list(self.data.items())

    def ngraph_length(self):
        """Return the number of distinct ngraphs in ``data``."""
        return len(self.data)

    def text_length(self):
        """Return the length (number of characters) of ``text``."""
        return len(self.text)

    def total(self):
        """Return the summed occurrences of all ngraphs in ``data``."""
        return sum(self.data.values())
# Ethiopic-script sample text. Words are joined by the '፡' mark, which the
# filter treats as punctuation (compare the filtered and unfiltered outputs
# shown below).
s = "አንቀጽ፡፩፤ የሰው፡ልጅ፡ሁሉ፡ሲወለድ፡ነጻና፡በክብርና፡በመብትም፡እኩልነት፡ያለው፡ነው።፡የተፈጥሮ፡ማስተዋልና፡ሕሊና፡ስላለው፡አንዱ፡ሌላውን፡በወንድማማችነት፡መንፈስ፡መመልከት፡ይገባዋል። አንቀጽ፡፪፤ እያንዳንዱ፡ሰው፡የዘር፡የቀለም፡የጾታ፡የቋንቋ፡የሃይማኖት፡የፖለቲካ፡ወይም፡የሌላ፡ዓይነት፡አስተሳሰብ፡የብሔራዊ፡ወይም፡የኀብረተሰብ፡ታሪክ፡የሀብት፡የትውልድ፡ወይም፡የሌላ፡ደረጃ፡ልዩነት፡ሳይኖሩ፡በዚሁ፡ውሳኔ፡የተዘረዘሩት፡መብቶችንና፡ነጻነቶች፡ሁሉ፡እንዲከበሩለት፡ይገባል። ከዚህም፡በተቀረ፡አንድ፡ሰው፡ከሚኖርበት፡አገር፡ወይም፡ግዛት፡የፖለቲካ፡የአገዛዝ፡ወይም፡የኢንተርናሽናል፡አቋም፡የተነሳ፡አገሩ፡ነጻም፡ሆነ፡በሞግዚትነት፡አስተዳደር፡ወይም፡እራሱን፡ችሎ፡የማይተዳደር፡አገር፡ተወላጅ፡ቢሆንም፡በማንኛውም፡ዓይነት፡ገደብ፡ያለው፡አገዛዝ፡ሥር፡ቢሆንም፡ልዩነት፡አይፈጸምበትም።"
# Default size is 2, so this instance starts out counting digraphs; filter=True
# drops digraphs containing punctuation or separators.
z = ngraphs(text=s, filter=True)
# 20 most frequent digraphs
common_digraphs = z.most_common(value=20)
print(common_digraphs)
# {'ነት': 7, 'ወይ': 6, 'ይም': 6, 'አገ': 5, 'አን': 4, 'ሰው': 3, 'ነጻ': 3, 'ለው': 3, 'የተ': 3, 'ስተ': 3, 'ሌላ': 3, 'ንቀ': 2, 'ቀጽ': 2, 'ሁሉ': 2, 'ርና': 2, 'መብ': 2, 'ብት': 2, 'ትም': 2, 'ያለ': 2, 'ዋል': 2}
#
# Reuse the same instance for trigraphs, this time keeping the ones that
# contain whitespace or punctuation.
#
z.count = 15
z.filter = False
z.size = 3
# 15 most frequent trigraphs
common_trigraphs = z.most_common()
print(common_trigraphs)
# {'ነት፡': 7, 'ም፡የ': 6, '፡ወይ': 6, 'ወይም': 6, 'ይም፡': 6, 'ት፡አ': 4, '፡አገ': 4, 'ሰው፡': 3, '፡ነጻ': 3, 'ለው፡': 3, '፡የተ': 3, 'ት፡የ': 3, 'አንቀ': 2, 'ንቀጽ': 2, 'ቀጽ፡': 2}
#
# Turn the filter back on: trigraphs with whitespace or punctuation are excluded.
#
z.filter = True
# 15 most frequent trigraphs
common_trigraphs = z.most_common()
print(common_trigraphs)
# {'ወይም': 6, 'አንቀ': 2, 'ንቀጽ': 2, 'ያለው': 2, 'ይገባ': 2, 'የፖለ': 2, 'ፖለቲ': 2, 'ለቲካ': 2, 'የሌላ': 2, 'ዓይነ': 2, 'ይነት': 2, 'አስተ': 2, 'ልዩነ': 2, 'ዩነት': 2, 'አገር': 2}
# The full (untruncated) frequency table for the current settings:
print(z.data)
# {'ወይም': 6, 'አንቀ': 2, 'ንቀጽ': 2, 'ያለው': 2, 'ይገባ': 2, 'የፖለ': 2, 'ፖለቲ': 2, 'ለቲካ': 2, 'የሌላ': 2, 'ዓይነ': 2, 'ይነት': 2, 'አስተ': 2, 'ልዩነ': 2, 'ዩነት': 2, 'አገር': 2, 'አገዛ': 2, 'ገዛዝ': 2, 'ተዳደ': 2, 'ዳደር': 2, 'ቢሆን': 2, 'ሆንም': 2, 'የሰው': 1, 'ሲወለ': 1, 'ወለድ': 1, 'ነጻና': 1, 'በክብ': 1, 'ክብር': 1, 'ብርና': 1, 'በመብ': 1, 'መብት': 1, 'ብትም': 1, 'እኩል': 1, 'ኩልነ': 1, 'ልነት': 1, 'የተፈ': 1, 'ተፈጥ': 1, 'ፈጥሮ': 1, 'ማስተ': 1, 'ስተዋ': 1, 'ተዋል': 1, 'ዋልና': 1, 'ሕሊና': 1, 'ስላለ': 1, 'ላለው': 1, 'አንዱ': 1, 'ሌላው': 1, 'ላውን': 1, 'በወን': 1, 'ወንድ': 1, 'ንድማ': 1, 'ድማማ': 1, 'ማማች': 1, 'ማችነ': 1, 'ችነት': 1, 'መንፈ': 1, 'ንፈስ': 1, 'መመል': 1, 'መልከ': 1, 'ልከት': 1, 'ገባዋ': 1, 'ባዋል': 1, 'እያን': 1, 'ያንዳ': 1, 'ንዳን': 1, 'ዳንዱ': 1, 'የዘር': 1, 'የቀለ': 1, 'ቀለም': 1, 'የጾታ': 1, 'የቋን': 1, 'ቋንቋ': 1, 'የሃይ': 1, 'ሃይማ': 1, 'ይማኖ': 1, 'ማኖት': 1, 'ስተሳ': 1, 'ተሳሰ': 1, 'ሳሰብ': 1, 'የብሔ': 1, 'ብሔራ': 1, 'ሔራዊ': 1, 'የኀብ': 1, 'ኀብረ': 1, 'ብረተ': 1, 'ረተሰ': 1, 'ተሰብ': 1, 'ታሪክ': 1, 'የሀብ': 1, 'ሀብት': 1, 'የትው': 1, 'ትውል': 1, 'ውልድ': 1, 'ደረጃ': 1, 'ሳይኖ': 1, 'ይኖሩ': 1, 'በዚሁ': 1, 'ውሳኔ': 1, 'የተዘ': 1, 'ተዘረ': 1, 'ዘረዘ': 1, 'ረዘሩ': 1, 'ዘሩት': 1, 'መብቶ': 1, 'ብቶች': 1, 'ቶችን': 1, 'ችንና': 1, 'ነጻነ': 1, 'ጻነቶ': 1, 'ነቶች': 1, 'እንዲ': 1, 'ንዲከ': 1, 'ዲከበ': 1, 'ከበሩ': 1, 'በሩለ': 1, 'ሩለት': 1, 'ገባል': 1, 'ከዚህ': 1, 'ዚህም': 1, 'በተቀ': 1, 'ተቀረ': 1, 'አንድ': 1, 'ከሚኖ': 1, 'ሚኖር': 1, 'ኖርበ': 1, 'ርበት': 1, 'ግዛት': 1, 'የአገ': 1, 'የኢን': 1, 'ኢንተ': 1, 'ንተር': 1, 'ተርና': 1, 'ርናሽ': 1, 'ናሽና': 1, 'ሽናል': 1, 'አቋም': 1, 'የተነ': 1, 'ተነሳ': 1, 'አገሩ': 1, 'ነጻም': 1, 'በሞግ': 1, 'ሞግዚ': 1, 'ግዚት': 1, 'ዚትነ': 1, 'ትነት': 1, 'ስተዳ': 1, 'እራሱ': 1, 'ራሱን': 1, 'የማይ': 1, 'ማይተ': 1, 'ይተዳ': 1, 'ተወላ': 1, 'ወላጅ': 1, 'በማን': 1, 'ማንኛ': 1, 'ንኛው': 1, 'ኛውም': 1, 'ገደብ': 1, 'አይፈ': 1, 'ይፈጸ': 1, 'ፈጸም': 1, 'ጸምበ': 1, 'ምበት': 1, 'በትም': 1}
# Print the list of ngraphs (most frequent first); a bare expression only
# displays its value in an interactive session, not when run as a script.
print(z.to_list())
# ['ወይም', 'አንቀ', 'ንቀጽ', 'ያለው', 'ይገባ', 'የፖለ', 'ፖለቲ', 'ለቲካ', 'የሌላ', 'ዓይነ', 'ይነት', 'አስተ', 'ልዩነ', 'ዩነት', 'አገር', 'አገዛ', 'ገዛዝ', 'ተዳደ', 'ዳደር', 'ቢሆን', 'ሆንም', 'የሰው', 'ሲወለ', 'ወለድ', 'ነጻና', 'በክብ', 'ክብር', 'ብርና', 'በመብ', 'መብት', 'ብትም', 'እኩል', 'ኩልነ', 'ልነት', 'የተፈ', 'ተፈጥ', 'ፈጥሮ', 'ማስተ', 'ስተዋ', 'ተዋል', 'ዋልና', 'ሕሊና', 'ስላለ', 'ላለው', 'አንዱ', 'ሌላው', 'ላውን', 'በወን', 'ወንድ', 'ንድማ', 'ድማማ', 'ማማች', 'ማችነ', 'ችነት', 'መንፈ', 'ንፈስ', 'መመል', 'መልከ', 'ልከት', 'ገባዋ', 'ባዋል', 'እያን', 'ያንዳ', 'ንዳን', 'ዳንዱ', 'የዘር', 'የቀለ', 'ቀለም', 'የጾታ', 'የቋን', 'ቋንቋ', 'የሃይ', 'ሃይማ', 'ይማኖ', 'ማኖት', 'ስተሳ', 'ተሳሰ', 'ሳሰብ', 'የብሔ', 'ብሔራ', 'ሔራዊ', 'የኀብ', 'ኀብረ', 'ብረተ', 'ረተሰ', 'ተሰብ', 'ታሪክ', 'የሀብ', 'ሀብት', 'የትው', 'ትውል', 'ውልድ', 'ደረጃ', 'ሳይኖ', 'ይኖሩ', 'በዚሁ', 'ውሳኔ', 'የተዘ', 'ተዘረ', 'ዘረዘ', 'ረዘሩ', 'ዘሩት', 'መብቶ', 'ብቶች', 'ቶችን', 'ችንና', 'ነጻነ', 'ጻነቶ', 'ነቶች', 'እንዲ', 'ንዲከ', 'ዲከበ', 'ከበሩ', 'በሩለ', 'ሩለት', 'ገባል', 'ከዚህ', 'ዚህም', 'በተቀ', 'ተቀረ', 'አንድ', 'ከሚኖ', 'ሚኖር', 'ኖርበ', 'ርበት', 'ግዛት', 'የአገ', 'የኢን', 'ኢንተ', 'ንተር', 'ተርና', 'ርናሽ', 'ናሽና', 'ሽናል', 'አቋም', 'የተነ', 'ተነሳ', 'አገሩ', 'ነጻም', 'በሞግ', 'ሞግዚ', 'ግዚት', 'ዚትነ', 'ትነት', 'ስተዳ', 'እራሱ', 'ራሱን', 'የማይ', 'ማይተ', 'ይተዳ', 'ተወላ', 'ወላጅ', 'በማን', 'ማንኛ', 'ንኛው', 'ኛውም', 'ገደብ', 'አይፈ', 'ይፈጸ', 'ፈጸም', 'ጸምበ', 'ምበት', 'በትም']
# Text in African languages may mix decomposed sequences (base letter plus
# combining mark) with precomposed characters, so character-based ngraphs can
# split a letter from its combining diacritic.
t = "dɛ̈tëicëkäŋ akɔ̈ɔ̈n"
# Character-based digraphs: note the entries that begin with a stray diacritic.
tc = ngraphs(text=t)
print(tc.data)
# {'ɔ̈': 2, 'dɛ': 1, 'ɛ̈': 1, '̈t': 1, 'të': 1, 'ëi': 1, 'ic': 1, 'cë': 1, 'ëk': 1, 'kä': 1,
# 'äŋ': 1, 'ŋ ': 1, ' a': 1, 'ak': 1, 'kɔ': 1, '̈ɔ': 1, '̈n': 1}
# Grapheme-based digraphs keep each letter together with its diacritics, which
# sidesteps the unpaired-diacritic problem.
tg = ngraphs(text=t, graphemes=True)
print(tg.data)
# {'dɛ̈': 1, 'ɛ̈t': 1, 'të': 1, 'ëi': 1, 'ic': 1, 'cë': 1, 'ëk': 1, 'kä': 1, 'äŋ': 1, 'ŋ ': 1,
# ' a': 1, 'ak': 1, 'kɔ̈': 1, 'ɔ̈ɔ̈': 1, 'ɔ̈n': 1}
@andjc
Copy link
Author

andjc commented Aug 7, 2023

Grapheme support and complete documentation to be added.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment