Skip to content

Instantly share code, notes, and snippets.

@elazarg
Last active March 24, 2022 11:00
Show Gist options
  • Save elazarg/2290b6e7e6f9eb8e8e0b90f1925eba60 to your computer and use it in GitHub Desktop.
Save elazarg/2290b6e7e6f9eb8e8e0b90f1925eba60 to your computer and use it in GitHub Desktop.
Fetching diacritization from Dicta's API
from typing import Literal
import re
from functools import wraps
import requests
# Public API of this module: only the Dicta fetcher is exported.
__all__ = ['fetch_dicta']
# U+05BD (Hebrew point meteg); fetch_dicta strips it from the words it returns.
METEG = '\u05BD'
def split_by_length(characters, maxlen: int):
    """Split *characters* into chunks shorter than *maxlen*, breaking after spaces.

    Items are accumulated in a buffer. As soon as the buffer holds
    ``maxlen - 1`` items, everything up to and including the last space seen
    in the buffer is yielded and the remainder is carried into the next
    chunk. When the buffer contains no space, the whole buffer is yielded.

    Args:
        characters: Iterable of single characters (for example a ``str``).
        maxlen: Exclusive upper bound on the chunk length; must be > 1.

    Yields:
        Lists of characters holding at most ``maxlen - 1`` items each.
        Concatenating all chunks reproduces the input sequence.
    """
    assert maxlen > 1
    # Sentinel meaning "no space in the buffer": slicing at maxlen + 1 takes
    # the entire buffer, which never grows beyond maxlen - 1 items.
    no_space = maxlen
    out = []
    space = no_space
    for c in characters:
        if c == ' ':
            space = len(out)
        out.append(c)
        if len(out) == maxlen - 1:
            yield out[:space + 1]
            out = out[space + 1:]
            # The carried-over tail starts after the last space, so it holds
            # no space. Reset the index; a stale value from the previous
            # chunk would cut the next space-free chunk at an arbitrary spot.
            space = no_space
    if out:
        yield out
def split_string_by_length(text: str, maxlen):
    """Return *text* as a list of whitespace-stripped string chunks.

    The chunks come from ``split_by_length(text, maxlen)``; each one is
    joined back into a string and stripped of leading/trailing whitespace.
    """
    pieces = []
    for chars in split_by_length(text, maxlen):
        pieces.append(''.join(chars).strip())
    return pieces
def longest_undotted_length(text: str) -> int:
    """Return the length of the longest run of consecutive Hebrew letters.

    Every character outside U+0590-U+05F4 other than ``#`` is dropped first,
    so whitespace and punctuation do not interrupt a run. Maximal runs of
    characters in U+05D0-U+05EA are then measured. Returns 0 when the text
    contains no such character.
    """
    hebrew_only = re.sub(r'[^\u0590-\u05f4#]', '', text)
    letter_runs = re.findall(r'[\u05d0-\u05ea]+', hebrew_only)
    return max(map(len, letter_runs), default=0)
def piecewise(maxlen):
    """Decorator factory that makes a text function operate chunk by chunk.

    The decorated function's first argument is split with
    ``split_string_by_length(text, maxlen)``; the function is called once per
    chunk and the results are joined with a single space.

    Args:
        maxlen: Chunk size limit handed to ``split_string_by_length``.

    Returns:
        A decorator. The wrapper it produces takes ``text`` plus any extra
        positional and keyword arguments, which are forwarded unchanged to
        every per-chunk call.
    """
    def inner(fetch):
        @wraps(fetch)
        def fetcher(text, *args, **kwargs):
            # Forward the extra arguments: a wrapper that accepted only
            # ``text`` would make every option of the wrapped function
            # unreachable (passing one would raise TypeError).
            chunks = split_string_by_length(text, maxlen)
            return ' '.join(fetch(chunk, *args, **kwargs) for chunk in chunks)
        return fetcher
    return inner
@piecewise(10000)
def fetch_dicta(text: str,
                genre: Literal["rabbinic", "modern", "poetry"] = "modern",
                keepmetagim=False,
                keepqq=False,
                matchpartial=True,
                nodageshdefmem=False,
                patachma=False,
                addmorph=True) -> str:
    """Send *text* to the Dicta Nakdan endpoint and return the processed text.

    Lines starting with ``https`` or ``#`` are removed and the remainder is
    stripped before the request is made. Every entry of the JSON response
    contributes either its first option (with ``|`` and METEG removed) or,
    when it has no options, its original ``word``; the pieces are
    concatenated without a separator.

    Args:
        text: Input text. Through the ``piecewise(10000)`` decorator the
            caller's text is split into chunks and each chunk arrives here.
        genre: Sent as the ``genre`` field of the request payload.
        keepmetagim, keepqq, matchpartial, nodageshdefmem, patachma, addmorph:
            Sent verbatim as the payload fields of the same name.

    Returns:
        The concatenated words extracted from the response.

    Raises:
        requests.HTTPError: If the response status signals an error.
        RuntimeError: If the result contains a run of more than 40
            consecutive Hebrew letters (see ``longest_undotted_length``).
    """
    kept_lines = [
        line for line in text.split('\n')
        if not line.startswith(('https', '#'))
    ]
    text = '\n'.join(kept_lines).strip()

    def pick_word(entry):
        # Use the first option's first element when options exist,
        # otherwise fall back to the entry's original word.
        options = entry['options']
        if not options:
            return entry['word']
        return options[0][0].replace('|', '').replace(METEG, '')

    # NOTE(review): no timeout is passed, so this call can block
    # indefinitely if the server never answers - confirm that is intended.
    response = requests.post(
        'https://nakdan-2-0.loadbalancer.dicta.org.il/api',
        json={
            "data": text,
            "task": "nakdan",
            "genre": genre,
            "keepmetagim": keepmetagim,
            "keepqq": keepqq,
            "matchpartial": matchpartial,
            "nodageshdefmem": nodageshdefmem,
            "patachma": patachma,
            "addmorph": addmorph,
        },
        headers={'content-type': 'text/plain;charset=UTF-8'},
    )
    response.raise_for_status()

    result = ''.join(map(pick_word, response.json()))
    if longest_undotted_length(result) > 40:
        raise RuntimeError('Failed to dot')
    return result
def test():
    """Smoke test: run a short Hebrew phrase through fetch_dicta and print it."""
    dotted = fetch_dicta("בוקר טוב")
    print(dotted)
# Run the smoke test only when this file is executed as a script.
if __name__ == '__main__':
    test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment