Last active
March 24, 2022 11:00
-
-
Save elazarg/2290b6e7e6f9eb8e8e0b90f1925eba60 to your computer and use it in GitHub Desktop.
Fetching diacritization from Dicta's API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Literal | |
import re | |
from functools import wraps | |
import requests | |
# Public API: the only name exported by a star-import of this module.
__all__ = ["fetch_dicta"]

# HEBREW POINT METEG (U+05BD).
METEG = "\u05BD"
def split_by_length(characters, maxlen: int):
    """Yield successive chunks of *characters*, each shorter than *maxlen*.

    Characters are buffered until the buffer holds ``maxlen - 1`` items.  The
    buffer is then cut just after the last space it contains, so a chunk ends
    on a word boundary whenever possible; when the buffer contains no space
    the whole buffer is emitted as a single chunk.

    Args:
        characters: Iterable of single-character strings (for example a str).
        maxlen: Exclusive upper bound on the chunk length; must exceed 1.
            The check runs when iteration starts, because this is a generator.

    Yields:
        Non-empty lists of characters.  Concatenating every chunk, in order,
        reproduces the input sequence.
    """
    assert maxlen > 1
    buffer = []
    # ``maxlen`` doubles as a sentinel for "no space in the buffer": slicing
    # up to ``maxlen + 1`` then takes the entire buffer.
    last_space = maxlen
    for char in characters:
        if char == ' ':
            last_space = len(buffer)
        buffer.append(char)
        if len(buffer) == maxlen - 1:
            cut = last_space + 1
            yield buffer[:cut]
            buffer = buffer[cut:]
            # The remainder holds no space (the cut was made after the last
            # one), so forget the old index.  Keeping it would cut a later
            # space-free buffer at this stale position, in mid-word.
            last_space = maxlen
    if buffer:
        yield buffer
def split_string_by_length(text: str, maxlen):
    """Split *text* into a list of whitespace-trimmed string chunks.

    The chunk boundaries come from ``split_by_length(text, maxlen)``; each
    chunk of characters is joined back into a string and stripped of leading
    and trailing whitespace.
    """
    pieces = []
    for chars in split_by_length(text, maxlen):
        piece = ''.join(chars)
        pieces.append(piece.strip())
    return pieces
def longest_undotted_length(text: str) -> int:
    """Return the length of the longest run of consecutive Hebrew letters.

    Every character outside U+0590..U+05F4, other than ``#``, is removed
    first, so letters separated only by such characters (spaces, digits,
    Latin text) count as adjacent.  Characters that survive the filter but
    are not in U+05D0..U+05EA end a run.  Returns 0 when no letter is found.
    """
    hebrew_only = re.sub(r'[^\u0590-\u05f4#]', '', text)
    letter_runs = re.findall(r'[\u05d0-\u05ea]+', hebrew_only)
    return max(map(len, letter_runs), default=0)
def piecewise(maxlen):
    """Build a decorator that applies a text-processing function per chunk.

    The decorated function is called once for every chunk returned by
    ``split_string_by_length(text, maxlen)`` and the individual results are
    joined with single spaces.

    Args:
        maxlen: Chunk size limit forwarded to ``split_string_by_length``.

    Returns:
        A decorator for callables that take the text as their first argument
        and return a string.
    """
    def inner(fetch):
        @wraps(fetch)
        def fetcher(text, *args, **kwargs):
            # Forward extra arguments to every per-chunk call.  A wrapper
            # that accepted only ``text`` would make the wrapped function's
            # remaining parameters unreachable through the decorator.
            return ' '.join(
                fetch(chunk, *args, **kwargs)
                for chunk in split_string_by_length(text, maxlen)
            )
        return fetcher
    return inner
@piecewise(10000)
def fetch_dicta(text: str,
                genre: Literal["rabbinic", "modern", "poetry"] = "modern",
                keepmetagim=False,
                keepqq=False,
                matchpartial=True,
                nodageshdefmem=False,
                patachma=False,
                addmorph=True) -> str:
    """Send *text* to Dicta's Nakdan API and return the text it answers with.

    Lines that start with ``https`` or ``#`` are dropped before sending.  For
    each item of the JSON response the first option is used, with ``|`` and
    meteg marks removed; an item without options contributes its ``word``
    unchanged.

    Args:
        text: The text to process.
        genre: Sent as the ``genre`` field of the request.
        keepmetagim, keepqq, matchpartial, nodageshdefmem, patachma, addmorph:
            Sent unchanged as the request fields of the same names.

    Returns:
        The concatenation of the extracted words, or ``''`` when nothing is
        left to send after filtering.

    Raises:
        requests.HTTPError: If the service responds with an error status.
        requests.Timeout: If the service does not respond within 60 seconds.
        RuntimeError: If ``longest_undotted_length`` of the result exceeds 40.
    """
    text = '\n'.join(
        line for line in text.split('\n')
        if not line.startswith(('https', '#'))).strip()
    if not text:
        # Nothing left to process; skip the network round trip.
        return ''

    def extract_word(k):
        """Return the first option of a response item, or its raw word."""
        if k['options']:
            # NOTE(review): ``[0][0]`` assumes each option is a sequence whose
            # first element is the word text -- confirm that this also holds
            # when ``addmorph`` is False.
            res = k['options'][0][0]
            res = res.replace('|', '')
            res = res.replace(METEG, '')
            return res
        return k['word']

    url = 'https://nakdan-2-0.loadbalancer.dicta.org.il/api'
    payload = {
        "data": text,
        "task": "nakdan",
        "genre": genre,
        "keepmetagim": keepmetagim,
        "keepqq": keepqq,
        "matchpartial": matchpartial,
        "nodageshdefmem": nodageshdefmem,
        "patachma": patachma,
        "addmorph": addmorph,
    }
    headers = {
        'content-type': 'text/plain;charset=UTF-8'
    }
    # Bound the wait so that an unresponsive server cannot hang the caller
    # forever; without ``timeout`` requests waits indefinitely.
    r = requests.post(url, json=payload, headers=headers, timeout=60)
    r.raise_for_status()
    result = ''.join(extract_word(k) for k in r.json())
    # A very long run of bare letters means the text came back unprocessed.
    if longest_undotted_length(result) > 40:
        raise RuntimeError('Failed to dot')
    return result
def test():
    """Smoke test: run a short Hebrew phrase through ``fetch_dicta``."""
    sample = "בוקר טוב"
    print(fetch_dicta(sample))
# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    test()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment