# elazarg/nakdan.py

## nakdan.py
from typing import Literal
import re
from functools import wraps

import requests

# Public API of this module: only `fetch_dicta` is listed for export.
__all__ = ['fetch_dicta']


# U+05BD (HEBREW POINT METEG). The `extract_word` helper inside `fetch_dicta`
# removes this character from the option it selects.
METEG = '\u05BD'


def split_by_length(characters, maxlen: int):
    """Yield consecutive chunks of ``characters``, each shorter than ``maxlen``.

    Characters are buffered until the buffer holds ``maxlen - 1`` items. The
    buffer is then cut just after its last space (the space stays at the end
    of the emitted chunk); when the buffer holds no space it is emitted
    whole. Concatenating all chunks reproduces the input exactly.

    Args:
        characters: Iterable of single characters (for example a ``str``).
        maxlen: Exclusive upper bound on the chunk length; must be > 1.

    Yields:
        Lists of characters, each of length at most ``maxlen - 1``.
    """
    assert maxlen > 1
    # Sentinel meaning "no space in the buffer": slicing at ``maxlen + 1``
    # takes the whole buffer, since it never grows past ``maxlen - 1``.
    no_space = maxlen
    out = []
    space = no_space
    for c in characters:
        if c == ' ':
            space = len(out)
        out.append(c)
        if len(out) == maxlen - 1:
            yield out[:space + 1]
            out = out[space + 1:]
            # The carried-over tail lies after the last space, so it holds
            # none. Forget the old index; otherwise the next cut would reuse
            # it and break a word at an arbitrary position.
            space = no_space
    if out:
        yield out


def split_string_by_length(text: str, maxlen):
    """Split ``text`` with ``split_by_length`` and return the chunks as strings.

    Each chunk of characters is joined back into a string and has its leading
    and trailing whitespace stripped.
    """
    pieces = []
    for chars in split_by_length(text, maxlen):
        piece = ''.join(chars)
        pieces.append(piece.strip())
    return pieces


def longest_undotted_length(text: str) -> int:
    """Return the length of the longest run of consecutive Hebrew letters.

    Every character outside U+0590..U+05F4 other than ``#`` is removed first,
    so letters separated only by such characters (spaces included) count as
    one run. A run is a maximal sequence of characters in U+05D0..U+05EA.
    Returns 0 when there is no such character.
    """
    hebrew_only = re.sub(r'[^\u0590-\u05f4#]', '', text)
    letter_runs = re.findall(r'[\u05d0-\u05ea]+', hebrew_only)
    return max((len(run) for run in letter_runs), default=0)


def piecewise(maxlen):
    """Build a decorator that applies a text function chunk by chunk.

    The decorated function is called once per chunk returned by
    ``split_string_by_length(text, maxlen)``, and the per-chunk results are
    joined with single spaces.

    Args:
        maxlen: Chunk-size limit handed to ``split_string_by_length``.

    Returns:
        A decorator. The wrapper it produces takes the text as its first
        argument and forwards any further positional and keyword arguments
        unchanged to every per-chunk call.
    """
    def inner(fetch):
        @wraps(fetch)
        def fetcher(text, *args, **kwargs):
            # Forward the extra arguments: a wrapper that accepts only
            # ``text`` makes every option of the wrapped function unreachable
            # (passing one raises TypeError).
            return ' '.join(
                fetch(chunk, *args, **kwargs)
                for chunk in split_string_by_length(text, maxlen)
            )

        return fetcher

    return inner


@piecewise(10000)
def fetch_dicta(text: str,
                genre: Literal["rabbinic", "modern", "poetry"] = "modern",
                keepmetagim=False,
                keepqq=False,
                matchpartial=True,
                nodageshdefmem=False,
                patachma=False,
                addmorph=True,
                timeout: float = 60.0) -> str:
    """Send ``text`` to the Dicta Nakdan API and return the assembled reply.

    Lines starting with ``https`` or ``#`` are dropped before the request.
    Each item of the JSON reply contributes the first element of its first
    option (with ``|`` and ``METEG`` removed), or its ``word`` field when it
    has no options; the contributions are concatenated.

    The ``piecewise(10000)`` decorator splits the input into chunks, calls
    this function once per chunk and joins the results with spaces.

    Args:
        text: Text to process.
        genre: Sent as the ``genre`` field of the request payload.
        keepmetagim, keepqq, matchpartial, nodageshdefmem, patachma, addmorph:
            Sent verbatim as the payload fields of the same names.
            NOTE(review): ``METEG`` is stripped from selected options
            regardless of ``keepmetagim`` - confirm this is intended.
        timeout: Seconds to wait for the server, passed to ``requests.post``
            so that an unresponsive server cannot block the caller forever.

    Returns:
        The concatenated words extracted from the response.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: No answer arrived within ``timeout`` seconds.
        RuntimeError: The result contains a run of more than 40 consecutive
            Hebrew letters (see ``longest_undotted_length``); judging by the
            message, this is treated as a sign the text was not dotted.
    """
    text = '\n'.join(
        line for line in text.split('\n')
        if not line.startswith(('https', '#'))).strip()

    def extract_word(k):
        # Prefer the first option's first element; fall back to the raw word.
        if k['options']:
            res = k['options'][0][0]
            res = res.replace('|', '')
            res = res.replace(METEG, '')
            return res
        return k['word']

    url = 'https://nakdan-2-0.loadbalancer.dicta.org.il/api'

    payload = {
        "data": text,
        "task": "nakdan",
        "genre": genre,
        "keepmetagim": keepmetagim,
        "keepqq": keepqq,
        "matchpartial": matchpartial,
        "nodageshdefmem": nodageshdefmem,
        "patachma": patachma,
        "addmorph": addmorph,
    }
    # The body is JSON (``json=payload``) but the content type is explicitly
    # overridden to text/plain; kept exactly as the original request.
    headers = {
        'content-type': 'text/plain;charset=UTF-8'
    }

    r = requests.post(url, json=payload, headers=headers, timeout=timeout)
    r.raise_for_status()
    result = ''.join(extract_word(k) for k in r.json())
    if longest_undotted_length(result) > 40:
        raise RuntimeError('Failed to dot')
    return result


def test():
    """Smoke check: send a short Hebrew phrase to ``fetch_dicta`` and print the reply."""
    dotted = fetch_dicta("בוקר טוב")
    print(dotted)


# Run the smoke test only when this file is executed as a script.
if __name__ == '__main__':
    test()
	from typing import Literal
	import re
	from functools import wraps

	import requests

	# Public API of this module: only `fetch_dicta` is listed for export.
	__all__ = ['fetch_dicta']


	# U+05BD (HEBREW POINT METEG). The `extract_word` helper inside
	# `fetch_dicta` removes this character from the option it selects.
	METEG = '\u05BD'


	def split_by_length(characters, maxlen: int):
	    """Yield consecutive chunks of ``characters``, each shorter than ``maxlen``.

	    Characters are buffered until the buffer holds ``maxlen - 1`` items.
	    The buffer is then cut just after its last space (the space stays at
	    the end of the emitted chunk); when the buffer holds no space it is
	    emitted whole. Concatenating all chunks reproduces the input exactly.

	    Args:
	        characters: Iterable of single characters (for example a ``str``).
	        maxlen: Exclusive upper bound on the chunk length; must be > 1.

	    Yields:
	        Lists of characters, each of length at most ``maxlen - 1``.
	    """
	    assert maxlen > 1
	    # Sentinel meaning "no space in the buffer": slicing at ``maxlen + 1``
	    # takes the whole buffer, since it never grows past ``maxlen - 1``.
	    no_space = maxlen
	    out = []
	    space = no_space
	    for c in characters:
	        if c == ' ':
	            space = len(out)
	        out.append(c)
	        if len(out) == maxlen - 1:
	            yield out[:space + 1]
	            out = out[space + 1:]
	            # The carried-over tail lies after the last space, so it holds
	            # none. Forget the old index; otherwise the next cut would
	            # reuse it and break a word at an arbitrary position.
	            space = no_space
	    if out:
	        yield out


	def split_string_by_length(text: str, maxlen):
	    """Split ``text`` with ``split_by_length`` and return the chunks as strings.

	    Each chunk of characters is joined back into a string and has its
	    leading and trailing whitespace stripped.
	    """
	    return [''.join(s).strip() for s in split_by_length(text, maxlen)]


	def longest_undotted_length(text: str) -> int:
	    """Return the length of the longest run of consecutive Hebrew letters.

	    Every character outside U+0590..U+05F4 other than ``#`` is removed
	    first, so letters separated only by such characters (spaces included)
	    count as one run. A run is a maximal sequence of characters in
	    U+05D0..U+05EA. Returns 0 when there is no such character.
	    """
	    text = re.sub(r'[^\u0590-\u05f4#]', '', text)
	    # The leading '' makes max() well-defined when nothing matches.
	    return len(max([''] + re.findall(r'[\u05d0-\u05ea]+', text), key=len))


	def piecewise(maxlen):
	    """Build a decorator that applies a text function chunk by chunk.

	    The decorated function is called once per chunk returned by
	    ``split_string_by_length(text, maxlen)``, and the per-chunk results
	    are joined with single spaces.

	    Args:
	        maxlen: Chunk-size limit handed to ``split_string_by_length``.

	    Returns:
	        A decorator. The wrapper it produces takes the text as its first
	        argument and forwards any further positional and keyword
	        arguments unchanged to every per-chunk call.
	    """
	    def inner(fetch):
	        @wraps(fetch)
	        def fetcher(text, *args, **kwargs):
	            # Forward the extra arguments: a wrapper that accepts only
	            # ``text`` makes every option of the wrapped function
	            # unreachable (passing one raises TypeError).
	            return ' '.join(
	                fetch(chunk, *args, **kwargs)
	                for chunk in split_string_by_length(text, maxlen)
	            )

	        return fetcher

	    return inner


	@piecewise(10000)
	def fetch_dicta(text: str,
	                genre: Literal["rabbinic", "modern", "poetry"] = "modern",
	                keepmetagim=False,
	                keepqq=False,
	                matchpartial=True,
	                nodageshdefmem=False,
	                patachma=False,
	                addmorph=True,
	                timeout: float = 60.0) -> str:
	    """Send ``text`` to the Dicta Nakdan API and return the assembled reply.

	    Lines starting with ``https`` or ``#`` are dropped before the request.
	    Each item of the JSON reply contributes the first element of its first
	    option (with ``|`` and ``METEG`` removed), or its ``word`` field when
	    it has no options; the contributions are concatenated.

	    The ``piecewise(10000)`` decorator splits the input into chunks, calls
	    this function once per chunk and joins the results with spaces.

	    Args:
	        text: Text to process.
	        genre: Sent as the ``genre`` field of the request payload.
	        keepmetagim, keepqq, matchpartial, nodageshdefmem, patachma,
	        addmorph: Sent verbatim as the payload fields of the same names.
	        timeout: Seconds to wait for the server, passed to
	            ``requests.post`` so an unresponsive server cannot block the
	            caller forever.

	    Returns:
	        The concatenated words extracted from the response.

	    Raises:
	        requests.HTTPError: The server answered with an error status.
	        requests.Timeout: No answer arrived within ``timeout`` seconds.
	        RuntimeError: The result contains a run of more than 40
	            consecutive Hebrew letters (see ``longest_undotted_length``).
	    """
	    text = '\n'.join(
	        line for line in text.split('\n')
	        if not line.startswith(('https', '#'))).strip()

	    def extract_word(k):
	        # Prefer the first option's first element; fall back to the raw word.
	        if k['options']:
	            res = k['options'][0][0]
	            # Plain '|': the escaped form '\|' is a two-character string
	            # (backslash + pipe) and would leave the separators in place.
	            res = res.replace('|', '')
	            res = res.replace(METEG, '')
	            return res
	        return k['word']

	    url = 'https://nakdan-2-0.loadbalancer.dicta.org.il/api'

	    payload = {
	        "data": text,
	        "task": "nakdan",
	        "genre": genre,
	        "keepmetagim": keepmetagim,
	        "keepqq": keepqq,
	        "matchpartial": matchpartial,
	        "nodageshdefmem": nodageshdefmem,
	        "patachma": patachma,
	        "addmorph": addmorph,
	    }
	    # The body is JSON (``json=payload``) but the content type is
	    # explicitly overridden to text/plain; kept as in the original request.
	    headers = {
	        'content-type': 'text/plain;charset=UTF-8'
	    }

	    r = requests.post(url, json=payload, headers=headers, timeout=timeout)
	    r.raise_for_status()
	    result = ''.join(extract_word(k) for k in r.json())
	    if longest_undotted_length(result) > 40:
	        raise RuntimeError('Failed to dot')
	    return result


	def test():
	    """Smoke check: send a short Hebrew phrase to ``fetch_dicta`` and print the reply."""
	    print(fetch_dicta("בוקר טוב"))


	# Run the smoke test only when this file is executed as a script.
	if __name__ == '__main__':
	    test()