Skip to content

Instantly share code, notes, and snippets.

@avi-perl
Created October 26, 2021 03:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avi-perl/55ce41d39164c8d0390810163a53c62c to your computer and use it in GitHub Desktop.
Save avi-perl/55ce41d39164c8d0390810163a53c62c to your computer and use it in GitHub Desktop.
My attempt to make a HebrewString object: a string representing Hebrew text that you can slice and dice letter by letter, despite the extra Unicode characters (such as vowel points and cantillation marks) attached to each letter.
from rich import print
# Sample pointed Hebrew text: base letters interleaved with combining vowel points and
# cantillation marks. It is the input used by the demo at the bottom of the file.
e = "וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְה֑וֹם וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמָּֽיִם׃"
class HebrewString(str):
    """A ``str`` of Hebrew text that is indexed and sliced by letter.

    Pointed Hebrew stores each base letter followed by zero or more extra code
    points (vowel points, cantillation marks, punctuation).  Plain ``str``
    indexing counts every code point; this class groups each base letter with
    the characters that follow it so ``text[i]`` and ``text[a:b]`` operate on
    whole letters.

    Only ``[]`` access and ``split`` are letter/word aware.  Every other
    ``str`` operation (``len``, iteration, ...) keeps its code-point behavior.
    """

    # Base consonants, including the final forms.  Any character NOT in this
    # list is attached to the letter that precedes it.
    HEBREW_LETTERS = ["א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "כ", "ך", "ל", "מ", "ם", "נ", "ן", "ס", "ע",
                      "פ", "ף", "צ", "ץ", "ק", "ר", "ש", "ת"]

    def __init__(self, hebrew_string):
        """Record the raw input and its whitespace-separated words.

        The string value itself is set by ``str.__new__``; this only stores the
        extra bookkeeping attributes.

        Args:
            hebrew_string: The text to wrap.
        """
        super().__init__()
        self._raw_input = hebrew_string
        self.word_list = self._raw_input.split()

    @property
    def character_split(self) -> list:
        """Return the text as a list of letter groups.

        Each item is one Hebrew letter together with the non-letter characters
        that follow it.  Words are separated by a single ``" "`` item; runs of
        whitespace in the input are collapsed because the words come from
        ``split()``.  A word's leading character always starts a group, even
        when it is not a Hebrew letter.

        Returns:
            A list of ``str`` pieces.  No separator is emitted after the last
            word, so ``"".join(...)`` has no trailing space and negative
            indices address real letters.
        """
        # Build the lookup set once per call; reading it from ``self`` keeps
        # subclass overrides of HEBREW_LETTERS working.
        letters = frozenset(self.HEBREW_LETTERS)
        pieces = []
        for word_index, word in enumerate(self.word_list):
            if word_index:
                # Separator goes BETWEEN words only (never after the last one).
                pieces.append(" ")
            group_start = 0
            for position, unicode_char in enumerate(word):
                # A letter that is not the first character closes the current group.
                if position and unicode_char in letters:
                    pieces.append(word[group_start:position])
                    group_start = position
            pieces.append(word[group_start:])
        return pieces

    def split(self, *args, **kwargs) -> list:
        """Split like ``str.split`` but return ``HebrewString`` items.

        Args:
            *args: Positional arguments forwarded to ``str.split``.
            **kwargs: Keyword arguments forwarded to ``str.split``.

        Returns:
            A list of ``HebrewString`` objects, one per piece.
        """
        return [HebrewString(x) for x in super().split(*args, **kwargs)]

    def __getitem__(self, items) -> str:
        """Index or slice by letter group instead of by code point.

        Args:
            items: An ``int`` index or a ``slice`` over ``character_split``.

        Returns:
            The selected letter group(s) joined into a plain ``str``.

        Raises:
            IndexError: If an integer index is out of range.
        """
        selected = self.character_split[items]
        if isinstance(selected, str):
            # Integer index: a single letter group, already a string.
            return selected
        return "".join(selected)


if __name__ == '__main__':
    h = HebrewString(e)
    print(h)
    first_word = h.split()[0]
    # The last three letters of the first word, with their marks attached.
    shoresh = first_word[-3:]
    print(shoresh)
@avi-perl
Copy link
Author

avi-perl commented Oct 3, 2022

Python Library

This object is now available in my hebrew Python library!

pip install hebrew

Check it out: https://hebrew.aviperl.me/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment