Skip to content

Instantly share code, notes, and snippets.

@mishari
Created July 3, 2021 03:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mishari/fd8bd67c4db28ebd0c4c8ebb10c2d51c to your computer and use it in GitHub Desktop.
Save mishari/fd8bd67c4db28ebd0c4c8ebb10c2d51c to your computer and use it in GitHub Desktop.
Text processor for the Thai language.
from pythainlp.tokenize import sent_tokenize, word_tokenize
from pythainlp.util import normalize
from pythainlp.util import num_to_thaiword
from pythainlp.spell import NorvigSpellChecker
import json_lines
import json
import re
import string
import sys
import fileinput
from multiprocessing import Pool
# Validation parameters ported from
# https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/languages/th.js
MIN_LENGTH = 6    # minimum sentence length, in characters
MAX_LENGTH = 100  # maximum sentence length, in characters

# Each rule is a regex that, when it matches a sentence, invalidates it.
# All patterns are raw strings. This fixes two porting bugs from the JS
# original: (1) the plain string '(.)\1{6}' turned '\1' into chr(1), so the
# repeated-character rule could never fire; (2) '\\#' collapsed to '\#' and
# dropped the literal backslash from the forbidden-symbols class. The `re`
# module resolves \uXXXX / \UXXXXXXXX escapes inside raw patterns itself.
INVALIDATION = [{
    "regex": r'[0-9๐-๙]',
    "error": 'Sentence should not contain numbers',
}, {
    # Forbidden symbols; \\ keeps the literal backslash in the class.
    "regex": r'[<>+*\\#@^[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]',
    "error": 'Sentence should not contain symbols, including Paiyannoi and Maiyamok',
}, {
    "regex": r'[A-Za-z]',
    "error": 'Sentence should not contain latin alphabet characters',
}, {
    "regex": r'[ก-ฮ]\.[ก-ฮ]+\.',
    "error": 'Sentence should not contain abbreviations',
}, {
    "regex": r'(^|\s)[\u0E30\u0E32\u0E33\u0E45\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E]',
    "error": 'Word should not start with unexpected characters, like follow vowel and tone mark',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44](\s|$)',
    "error": 'Word should not end with leading vowels',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44]{2}',
    "error": 'Sentence should not contain repeating lead vowels',
}, {
    "regex": r'[\u0E32\u0E33\u0E45]{2}',
    "error": 'Sentence should not contain repeating follow vowels',
}, {
    "regex": r'\u0E30{2}',
    "error": 'Sentence should not contain repeating Sara A',
}, {
    "regex": r'\u0E3A{2}|\u0E4C{2}|\u0E4D{2}|\u0E4E{2}',
    "error": 'Sentence should not contain repeating Phinthu / Thanthakhat / Nikhahit / Yamakkan',
}, {
    "regex": r'[\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47]{2}',
    "error": 'Sentence should not contain repeating above vowels',
}, {
    "regex": r'[\u0E38\u0E39]{2}',
    "error": 'Sentence should not contain repeating below vowels',
}, {
    "regex": r'[\u0E48\u0E49\u0E4A\u0E4B]{2}',
    "error": 'Sentence should not contain repeating tone marks',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44\u0E30\u0E32\u0E33\u0E45][\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E]',
    "error": 'Sentence should not contain invalid symbols after lead/follow vowels',
}, {
    "regex": r'[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E][\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]',
    "error": 'Sentence should not contain invalid symbols before above/below vowels',
}, {
    "regex": r'[\u0E33\u0E45][\u0E30]',
    "error": 'Sentence should not contain Sara A after Sara Am or Lakkhangyao',
}, {
    "regex": r'[\u0E30][\u0E32\u0E33\u0E45]',
    "error": 'Sentence should not contain Sara Aa, Sara Am or Lakkhangyao after Sara A',
}, {
    "regex": r'[\u200b\u200c\u2063\u0E01-\u0E4E]{71}',
    "error": 'Sentence should not contain more than 70 consonants and vowels running without a space',
}, {
    "regex": r"""[\u200b\u200c\u2063\u0E01-\u0E4E.,\-"'“”‘’\u0060?!:;]{81}""",
    "error": 'Sentence should not contain more than 80 characters running without a space',
}, {
    "regex": r'[\u200b\u200c\u2063ก-ฮ]{31}',
    "error": 'Sentence should not contain more than 30 consonants running without a space',
}, {
    # Raw string is essential: '\1' in a plain string is chr(1), not a backref.
    "regex": r'(.)\1{6}',
    "error": 'Sentence should not contain more than 7 of the same character in a row',
}, {
    # The JS original matched emoji via UTF-16 surrogate pairs
    # (\ud83c[\ud000-\udfff] etc.); Python strings are not UTF-16, so lone
    # surrogates can never match real text. Replaced with the equivalent
    # astral-plane range U+1F000-U+1FFFF.
    "regex": r'(\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|[\U0001F000-\U0001FFFF]|[\ue000-\uf8ff])',
    "error": 'Sentence should not contain emojis or other special Unicode symbols',
}]
def strip_punctuation(old_string):
    """Return *old_string* with all ASCII punctuation characters removed.

    Only characters in string.punctuation (ASCII) are stripped; Thai-specific
    punctuation such as Paiyannoi/Maiyamok is handled by INVALIDATION instead.
    """
    # str.translate does a single C-level pass, replacing the original
    # quadratic character-by-character string concatenation.
    return old_string.translate(str.maketrans('', '', string.punctuation))
def strip_whitespace(a_string):
    """Trim *a_string* and collapse every internal whitespace run to one space."""
    # str.split() with no arguments already discards leading/trailing
    # whitespace, so split-then-join normalises everything in one step.
    return " ".join(a_string.split())
def is_length_valid(s):
    """Return True when len(s) lies within [MIN_LENGTH, MAX_LENGTH] inclusive."""
    return MIN_LENGTH <= len(s) <= MAX_LENGTH
def remove_wrong_length(sentences):
    """Deduplicate *sentences*, keeping only those whose length is valid.

    Returns a set, as the original did via set(new_s).
    """
    return {candidate for candidate in sentences if is_length_valid(candidate)}
def is_sentence_valid(s):
    """Return True when *s* passes the length, regex, and spelling checks."""
    # Guard clauses: reject as early as possible.
    if not is_length_valid(s):
        return False
    for rule in INVALIDATION:
        if re.search(rule["regex"], s):
            return False
    # Last (and most expensive) gate: every token must be a known word.
    return sentence_spelling_correct(s)
def check_sentences(sentences):
    """Return the subset of *sentences* that pass is_sentence_valid, as a list.

    Bug fix: the original signature took `sentence` (singular) but the body
    iterated the module-level `sentences` accumulator, so the argument was
    silently ignored. The parameter is now actually used.
    """
    return [s for s in sentences if is_sentence_valid(s)]
def split_sentences(text):
    """Tokenize *text* into a list of validated sentences.

    Sentences that fail validation as a whole are re-split on whitespace and
    any individually valid fragments are kept instead.
    """
    output = []
    for sentence in (strip_whitespace(raw) for raw in sent_tokenize(text)):
        if is_sentence_valid(sentence):
            output.append(sentence)
            continue
        # Fallback: salvage valid fragments from an invalid sentence.
        for fragment in sent_tokenize(sentence, engine="whitespace"):
            if is_sentence_valid(fragment):
                output.append(fragment)
    return output
def number_to_word(text):
    """Spell out every space-separated all-digit token as a Thai numeral word.

    Non-numeric tokens pass through unchanged; tokens are rejoined with
    single spaces.
    """
    converted = [
        num_to_thaiword(int(token)) if token.isdigit() else token
        for token in text.split(" ")
    ]
    return ' '.join(converted)
def sentence_spelling_correct(sentence):
    """Return True when every non-space token of *sentence* is a known word.

    Performance fix: the original built a fresh NorvigSpellChecker on every
    call; construction loads its word-frequency corpus, which is expensive in
    a per-sentence hot path. The checker is now created once, lazily, and
    cached on the function object (no import-time side effect).
    """
    checker = getattr(sentence_spelling_correct, "_checker", None)
    if checker is None:
        checker = NorvigSpellChecker()
        sentence_spelling_correct._checker = checker
    words = [w for w in word_tokenize(sentence) if w != ' ']
    # known() returns the subset of recognised words; any shrinkage means at
    # least one token was unknown.
    return len(checker.known(words)) == len(words)
def normalize_mai_yamok(text):
    # NOTE(review): unfinished stub — it splits on spaces but never performs
    # any Mai Yamok (U+0E46) normalization and returns None implicitly. It is
    # not called anywhere in this file. TODO: implement or remove.
    sentences = text.split(" ")
# Global accumulator of unique validated sentences; filled by the
# multiprocessing loop under __main__ and flushed to output.txt at the end.
sentences = set()
# NOTE(review): dead code below — an earlier json-lines pipeline that read
# 'headline'/'article' fields from a file given in sys.argv[1]; superseded by
# the stdin/fileinput pipeline in __main__. Kept commented by the author.
# with open(sys.argv[1], 'rb') as f:
# for item in json_lines.reader(f):
# headline = number_to_word(normalize(strip_punctuation(
# item['headline'])))
# article = number_to_word(normalize(strip_punctuation(
# item['article'])))
# sentences.update(split_sentences(headline))
# sentences.update(split_sentences(article))
def process_text(text):
    """Clean one chunk of raw text and return its list of valid sentences.

    Pipeline: strip ASCII punctuation -> normalize Thai text -> spell out
    digit tokens -> split and validate sentences.
    """
    cleaned = strip_punctuation(text)
    cleaned = normalize(cleaned)
    cleaned = number_to_word(cleaned)
    return split_sentences(cleaned)
if __name__ == '__main__':
    # Parallelise per-line cleaning across CPU cores. imap_unordered is fine
    # because results are accumulated into an (unordered) set anyway.
    # Fix: the original never closed/joined the Pool; the context manager
    # terminates worker processes deterministically.
    with Pool() as pool:
        for batch in pool.imap_unordered(process_text, fileinput.input()):
            sentences.update(batch)
    # Write one sentence per line; set iteration order is arbitrary.
    with open("output.txt", "w") as f:
        for s in sentences:
            f.write(s)
            f.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment