Created
March 4, 2022 05:05
-
-
Save colinjroberts/6e48bfb4ae664a3eb1527d03c5bcad87 to your computer and use it in GitHub Desktop.
From korean-typing-practice-part1: A Flask App that generates random English words and deduplicates Korean letters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from string import ascii_letters, ascii_lowercase | |
from flask import Flask, request | |
from random import randint | |
app = Flask(__name__) | |
@app.route('/')
def hello():
    """Return a plain-text usage hint for the ``/text`` route.

    The hint lists all three query arguments that ``/text`` reads
    (``letters``, ``count`` and ``lang``) and gives a working example URL.
    """
    # Implicit string concatenation keeps source indentation out of the
    # response text (a backslash continuation inside the literal would
    # embed it).
    return ("Use the route /text with the following arguments: letters to "
            "use in the words, number of words to generate, and language "
            "(EN or KO). "
            "e.g. localhost:5000/text?letters=abcde&count=5&lang=EN")
@app.route('/text', methods=['GET'])
def get_text():
    """Validate the query string and dispatch to the language handler.

    Query arguments:
        count: number of words to generate; must parse as an int > 0.
        letters: the letters to build words from; must be non-empty.
        lang: one of ``ACCEPTED_LANGUAGES`` ('EN' or 'KO').

    Returns:
        The comma-separated string produced by ``handle_EN`` or
        ``handle_KO``.

    Raises:
        ValueError: if an argument is missing, ``count`` is not a positive
            integer, or ``lang`` is not an accepted language.
    """
    MAX_WORD_LENGTH = 6
    ACCEPTED_LANGUAGES = ('EN', 'KO')

    # Get arguments and return helpful errors if missing
    args = request.args.to_dict()
    raw_count = args.get('count')
    letters_provided = args.get('letters')
    language = args.get('lang')

    # Parse count defensively: int(None) raises TypeError and int("abc")
    # raises a bare ValueError, both of which would bypass the descriptive
    # message below.
    try:
        count_of_words_to_return = int(raw_count)
    except (TypeError, ValueError):
        count_of_words_to_return = None

    if count_of_words_to_return is None or not letters_provided or not language:
        raise ValueError(f"int count expected, {raw_count} was "
                         f"provided; string letters expected, "
                         f"{letters_provided} was provided, "
                         f"string lang was expected, {language} was provided.")
    if count_of_words_to_return <= 0:
        raise ValueError(f"int count must be greater than 0: "
                         f"{count_of_words_to_return} was provided")
    if language not in ACCEPTED_LANGUAGES:
        raise ValueError(f"Language must be one of {ACCEPTED_LANGUAGES}: "
                         f"{language} was provided")

    # language is guaranteed to be 'EN' or 'KO' at this point.
    if language == 'EN':
        return handle_EN(count_of_words_to_return, letters_provided, MAX_WORD_LENGTH)
    return handle_KO(count_of_words_to_return, letters_provided, MAX_WORD_LENGTH)
def handle_EN(count_of_words_to_return, letters_provided, max_word_length):
    """Build random English "words" from the ASCII letters in the input.

    Args:
        count_of_words_to_return: how many words to generate.
        letters_provided: raw user input; non-ASCII-letter characters are
            dropped and the rest is lowercased and deduplicated.
        max_word_length: upper bound (inclusive) on each word's length.

    Returns:
        The generated words joined with ", ".

    Raises:
        ValueError: if the input contains no ASCII letters.
    """
    # Deduplicate list of letters
    set_of_letters = deduplicate_letters_EN(letters_provided)

    # Report the user's original input: the filtered set is always empty
    # here, so printing it would tell the caller nothing.
    if not set_of_letters:
        raise ValueError("string letters expected at least 1 ascii letter, "
                         f"{letters_provided} was provided")

    # Create and return the number of requested words
    output = create_many_words_EN(set_of_letters, count_of_words_to_return, max_word_length)
    return ", ".join(output)
def deduplicate_letters_EN(list_of_letters):
    """Return the set of distinct ASCII letters in the input, lowercased.

    Every element of ``list_of_letters`` that is found in
    ``string.ascii_letters`` is kept (as lowercase); everything else is
    ignored.
    """
    return {
        candidate.lower()
        for candidate in list_of_letters
        if candidate in ascii_letters
    }
def create_one_word_EN(list_of_letters, word_length):
    """Return a string of ``word_length`` letters drawn at random.

    Each position is filled independently by picking a random index into
    ``list_of_letters`` with ``randint``, so letters may repeat.
    """
    picks = (
        list_of_letters[randint(0, len(list_of_letters) - 1)]
        for _ in range(word_length)
    )
    return "".join(picks)
def create_many_words_EN(set_of_letters, number_of_words, max_word_length):
    """Generate ``number_of_words`` random words from ``set_of_letters``.

    Args:
        set_of_letters: the letters available for building words.
        number_of_words: how many words to produce.
        max_word_length: each word gets a random length between 1 and this
            value, inclusive.

    Returns:
        A list of the generated words.
    """
    # Convert once: the letter pool does not change between words, so
    # rebuilding the list on every iteration is wasted work.
    letters = list(set_of_letters)
    output = []
    for _ in range(number_of_words):
        random_word_length = randint(1, max_word_length)
        output.append(create_one_word_EN(letters, random_word_length))
    return output
def handle_KO(count_of_words_to_return, letters_provided, max_word_length):
    """Return the distinct Korean letters found in the input, joined by ", ".

    The input is sorted into Hangul categories by ``filter_input_KO`` and
    then reduced to a set of letters by ``deduplicate_letters_KO``.
    ``count_of_words_to_return`` and ``max_word_length`` are accepted but
    not used by this function.

    Raises:
        ValueError: if no Korean character is present in the input.
    """
    # Keep only the Korean characters, grouped by category
    grouped_input = filter_input_KO(letters_provided)

    # At least one category must have collected something
    found_korean = False
    for characters in grouped_input.values():
        if len(characters) > 0:
            found_korean = True
            break
    if not found_korean:
        raise ValueError(f"string letters expected at least 1 korean letter, "
                         f"{letters_provided} was provided.")

    # Reduce to unique letters and format the response
    unique_letters = deduplicate_letters_KO(grouped_input)
    return ", ".join(unique_letters)
def deduplicate_letters_KO(dict_of_filtered_input):
    """Collapse categorised Korean input into a set of letters.

    Characters under "compatibility_jamo" are taken as-is, each "jamo"
    entry is converted through ``lookup_jamo_KO``, and each "syllable"
    entry is broken apart by ``decompose_words_KO``.
    """
    letters = list(dict_of_filtered_input["compatibility_jamo"])
    letters += [
        letter
        for jamo in dict_of_filtered_input["jamo"]
        for letter in lookup_jamo_KO(jamo)
    ]
    letters += [
        letter
        for syllable in dict_of_filtered_input["syllable"]
        for letter in decompose_words_KO(syllable)
    ]
    # A set drops the duplicates
    return set(letters)
def filter_input_KO(letter_input):
    """Sort the characters of the input into Hangul Unicode categories.

    Returns a dict with three lists, in input order:
      * "jamo": characters in U+1100-U+11FF
      * "syllable": characters in U+AC00-U+D7A3
      * "compatibility_jamo": characters in U+3130-U+318F
    Characters outside all three ranges are dropped.
    """
    unicode_blocks = (
        ("jamo", 0x1100, 0x11FF),
        ("syllable", 0xAC00, 0xD7A3),
        ("compatibility_jamo", 0x3130, 0x318F),
    )
    grouped = {name: [] for name, _, _ in unicode_blocks}
    for character in letter_input:
        for name, first, last in unicode_blocks:
            if first <= ord(character) <= last:
                grouped[name].append(character)
    return grouped
def decompose_words_KO(list_of_filtered_input):
    """Takes an iterable of precomposed Hangul syllables and returns a list
    of all their letters, in order, as compatibility jamo.

    Each syllable in U+AC00-U+D7A3 is split arithmetically into its
    initial, medial and (optional) final conjoining jamo, and each of those
    is converted with ``lookup_jamo_KO``. Characters outside the syllable
    range are skipped.
    """
    # Layout of the precomposed syllable block: 19 initials x 21 medials x
    # 28 finals (final index 0 meaning "no final consonant").
    syllable_base = 0xAC00
    syllable_count = 11172
    finals_per_medial = 28
    syllables_per_initial = 588  # 21 medials * 28 finals

    # Code points of the first conjoining jamo in each position.
    initial_base = 0x1100
    medial_base = 0x1161
    final_base = 0x11A7  # one before U+11A8, because final index 0 is "none"

    output = []
    for item in list_of_filtered_input:
        offset = ord(item) - syllable_base
        if not 0 <= offset < syllable_count:
            # Not a precomposed syllable; the arithmetic below is undefined.
            continue

        # The initial index is offset // 588 exactly. Adding 1 to the offset
        # before dividing assigns the last syllable of every block to the
        # next initial consonant.
        initial_index, remainder = divmod(offset, syllables_per_initial)
        medial_index, final_index = divmod(remainder, finals_per_medial)

        jamo = [chr(initial_base + initial_index), chr(medial_base + medial_index)]
        if final_index:
            jamo.append(chr(final_base + final_index))

        # Convert each conjoining jamo to compatibility jamo
        for j in jamo:
            output.extend(lookup_jamo_KO(j))
    return output
# Conjoining Hangul Jamo (U+1100-U+11FF) -> list of Hangul Compatibility
# Jamo. Built once at import time instead of on every lookup.
_JAMO_TO_COMPATIBILITY_KO = {
    # Initial consonants
    'ᄀ': ["ㄱ"],
    'ᄁ': ["ㄲ"],
    'ᄂ': ["ㄴ"],
    'ᄃ': ["ㄷ"],
    'ᄄ': ["ㄸ"],
    'ᄅ': ["ㄹ"],
    'ᄆ': ["ㅁ"],
    'ᄇ': ["ㅂ"],
    'ᄈ': ["ㅃ"],
    'ᄉ': ["ㅅ"],
    'ᄊ': ["ㅆ"],
    'ᄋ': ["ㅇ"],
    'ᄌ': ["ㅈ"],
    'ᄍ': ["ㅉ"],
    'ᄎ': ["ㅊ"],
    'ᄏ': ["ㅋ"],
    'ᄐ': ["ㅌ"],
    'ᄑ': ["ㅍ"],
    'ᄒ': ["ㅎ"],
    # Medial vowels (compound vowels are split into their parts)
    'ᅡ': ["ㅏ"],
    'ᅢ': ["ㅐ"],
    'ᅣ': ["ㅑ"],
    'ᅤ': ["ㅒ"],
    'ᅥ': ["ㅓ"],
    'ᅦ': ["ㅔ"],
    'ᅧ': ["ㅕ"],
    'ᅨ': ["ㅖ"],
    'ᅩ': ["ㅗ"],
    'ᅪ': ["ㅗ", "ㅏ"],
    'ᅫ': ["ㅗ", "ㅐ"],
    'ᅬ': ["ㅗ", "ㅣ"],
    'ᅭ': ["ㅛ"],
    'ᅮ': ["ㅜ"],
    'ᅯ': ["ㅜ", "ㅓ"],
    'ᅰ': ["ㅜ", "ㅔ"],
    'ᅱ': ["ㅜ", "ㅣ"],
    'ᅲ': ["ㅠ"],
    'ᅳ': ["ㅡ"],
    'ᅴ': ["ㅡ", "ㅣ"],
    'ᅵ': ["ㅣ"],
    # Final consonants (clusters are split into their parts)
    'ᆨ': ["ㄱ"],
    'ᆩ': ["ㄲ"],
    'ᆪ': ["ㄱ", "ㅅ"],
    'ᆫ': ["ㄴ"],
    'ᆬ': ["ㄴ", "ㅈ"],
    'ᆭ': ["ㄴ", "ㅎ"],
    'ᆮ': ["ㄷ"],
    'ᆯ': ["ㄹ"],
    'ᆰ': ["ㄹ", "ㄱ"],
    'ᆱ': ["ㄹ", "ㅁ"],
    'ᆲ': ["ㄹ", "ㅂ"],
    'ᆳ': ["ㄹ", "ㅅ"],
    'ᆴ': ["ㄹ", "ㅌ"],
    'ᆵ': ["ㄹ", "ㅍ"],
    'ᆶ': ["ㄹ", "ㅎ"],
    'ᆷ': ["ㅁ"],
    'ᆸ': ["ㅂ"],
    'ᆹ': ["ㅂ", "ㅅ"],
    'ᆺ': ["ㅅ"],
    'ᆻ': ["ㅆ"],
    'ᆼ': ["ㅇ"],
    'ᆽ': ["ㅈ"],
    'ᆾ': ["ㅊ"],
    'ᆿ': ["ㅋ"],
    'ᇀ': ["ㅌ"],
    'ᇁ': ["ㅍ"],
    'ᇂ': ["ㅎ"],
}


def lookup_jamo_KO(letter_block):
    """Convert one conjoining Hangul Jamo into compatibility jamo.

    Lookup keys are Unicode Hangul Jamo (U+1100-U+11FF). Their values are
    lists of Unicode Hangul Compatibility Jamo, which have only one
    representation per letter. For example, the initial jamo 'ᄀ' (U+1100)
    and the final jamo 'ᆨ' (U+11A8) both become compatibility jamo "ㄱ"
    (U+3131). Compound vowels and consonant clusters expand to several
    letters.

    Returns:
        A new list of compatibility jamo, or an empty list when
        ``letter_block`` is not in the table.
    """
    # Return a copy so callers cannot mutate the shared table entries.
    return list(_JAMO_TO_COMPATIBILITY_KO.get(letter_block, []))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment