Created
March 4, 2022 05:07
-
-
Save colinjroberts/6ff78fdced21f389cf38b3dc58a8feb0 to your computer and use it in GitHub Desktop.
From korean-typing-practice-part1: A Flask app that generates random English and Korean "words"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from string import ascii_letters, ascii_lowercase | |
from flask import Flask, request | |
from random import randint | |
# Application object for this module; the @app.route decorators below
# register their view functions on it.
app = Flask(__name__)
@app.route('/')
def hello():
    """Return a short usage message for the /text endpoint.

    The message lists all three query arguments that /text requires
    (letters, count and lang) together with a working example URL.
    """
    # Adjacent string literals are used instead of backslash continuations
    # so that source indentation never leaks into the response text.
    return (
        "Use the route /text with the following arguments: letters to use "
        "in the words, count of words to generate, and lang (EN or KO). "
        "e.g. localhost:5000/text?letters=abcde&count=5&lang=EN"
    )
@app.route('/text', methods=['GET'])
def get_text():
    """Generate random "words" from the letters given in the query string.

    Query arguments:
        count: number of words to generate; must parse as an int > 0.
        letters: the characters the words may be built from.
        lang: one of ACCEPTED_LANGUAGES ('EN' or 'KO').

    Returns:
        The string produced by handle_EN or handle_KO for the request.

    Raises:
        ValueError: if any argument is missing or empty, if count is not
            an integer greater than 0, or if lang is not accepted.
    """
    MAX_WORD_LENGTH = 6
    ACCEPTED_LANGUAGES = ('EN', 'KO')

    # Read the raw strings first; converting count before checking for
    # presence would fail with an unhelpful error on a missing argument.
    args = request.args.to_dict()
    count_provided = args.get('count')
    letters_provided = args.get('letters')
    language = args.get('lang')

    if not (count_provided and letters_provided and language):
        raise ValueError(f"int count expected, {count_provided} was "
                         f"provided; string letters expected, "
                         f"{letters_provided} was provided, "
                         f"string lang was expected, {language} was provided.")

    try:
        count_of_words_to_return = int(count_provided)
    except ValueError:
        raise ValueError(f"int count expected, {count_provided} was "
                         f"provided") from None

    if count_of_words_to_return <= 0:
        raise ValueError(f"int count must be greater than 0: "
                         f"{count_of_words_to_return} was provided")
    if language not in ACCEPTED_LANGUAGES:
        raise ValueError(f"Language must be one of {ACCEPTED_LANGUAGES}: "
                         f"{language} was provided")

    if language == 'EN':
        return handle_EN(count_of_words_to_return, letters_provided, MAX_WORD_LENGTH)
    # Only 'KO' can remain once the membership check above has passed.
    return handle_KO(count_of_words_to_return, letters_provided, MAX_WORD_LENGTH)
def handle_EN(count_of_words_to_return, letters_provided, max_word_length):
    """Build a comma-separated string of random words from ASCII letters.

    Args:
        count_of_words_to_return: how many words to generate.
        letters_provided: raw input; only its ASCII letters are used.
        max_word_length: upper bound passed on to the word generator.

    Returns:
        The generated words joined with ", ".

    Raises:
        ValueError: if letters_provided contains no ASCII letter.
    """
    # Deduplicate list of letters
    set_of_letters = deduplicate_letters_EN(letters_provided)

    # Report the rejected input itself (as handle_KO does); the filtered
    # set is always empty at this point and tells the caller nothing.
    if not set_of_letters:
        raise ValueError(f"string letters expected at least 1 ascii letter, "
                         f"{letters_provided} was provided.")

    # Create and return the number of requested words
    output = create_many_words_EN(set_of_letters, count_of_words_to_return, max_word_length)
    return ", ".join(output)
def deduplicate_letters_EN(list_of_letters):
    """Return the set of distinct ASCII letters in the input, lower-cased.

    Anything that is not found in string.ascii_letters is dropped.
    """
    return {
        candidate.lower()
        for candidate in list_of_letters
        if candidate in ascii_letters
    }
def create_one_word_EN(list_of_letters, word_length):
    """Return a word of word_length letters drawn at random from the list.

    Each position is filled by an independent randint-based pick, so a
    letter may repeat.
    """
    return "".join(
        list_of_letters[randint(0, len(list_of_letters) - 1)]
        for _ in range(word_length)
    )
def create_many_words_EN(set_of_letters, number_of_words, max_word_length):
    """Return a list of number_of_words random words.

    Each word gets its own random length between 1 and max_word_length
    (inclusive) and is built by create_one_word_EN from set_of_letters.
    """
    # Index-based picking needs a sequence; convert once rather than once
    # per generated word.
    letters = list(set_of_letters)
    return [
        create_one_word_EN(letters, randint(1, max_word_length))
        for _ in range(number_of_words)
    ]
def handle_KO(count_of_words_to_return, letters_provided, max_word_length):
    """Build a comma-separated string of random Korean syllable-block words.

    The input is bucketed by filter_input_KO, reduced to a set of letters
    by deduplicate_letters_KO, and handed to create_many_words_KO.

    Raises:
        ValueError: if every bucket returned by filter_input_KO is empty.
    """
    buckets = filter_input_KO(letters_provided)

    # Nothing usable was supplied when every bucket came back empty.
    if all(len(bucket) == 0 for bucket in buckets.values()):
        raise ValueError(f"string letters expected at least 1 korean letter, "
                         f"{letters_provided} was provided.")

    unique_letters = deduplicate_letters_KO(buckets)
    words = create_many_words_KO(unique_letters, count_of_words_to_return, max_word_length)
    return ", ".join(words)
def deduplicate_letters_KO(dict_of_filtered_input):
    """Collapse the bucketed input into one set of letters.

    Entries under "compatibility_jamo" are taken as they are, entries
    under "jamo" are converted with lookup_jamo_KO, and entries under
    "syllable" are broken apart with decompose_words_KO.
    """
    letters = list(dict_of_filtered_input["compatibility_jamo"])
    letters += [
        letter
        for conjoining in dict_of_filtered_input["jamo"]
        for letter in lookup_jamo_KO(conjoining)
    ]
    letters += [
        letter
        for block in dict_of_filtered_input["syllable"]
        for letter in decompose_words_KO(block)
    ]
    return set(letters)
def filter_input_KO(letter_input):
    """Sort the characters of letter_input into buckets by code-point range.

    Returns a dict with three lists, each keeping input order:
        "jamo": characters in U+1100..U+11FF,
        "syllable": characters in U+AC00..U+D7A3,
        "compatibility_jamo": characters in U+3130..U+318F.
    Characters outside all three ranges are dropped.
    """
    jamo = []
    syllable = []
    compatibility_jamo = []

    for character in letter_input:
        code_point = ord(character)
        # The three ranges do not overlap, so at most one branch applies.
        if 0x1100 <= code_point <= 0x11FF:
            jamo.append(character)
        elif 0xAC00 <= code_point <= 0xD7A3:
            syllable.append(character)
        elif 0x3130 <= code_point <= 0x318F:
            compatibility_jamo.append(character)

    return {
        "jamo": jamo,
        "syllable": syllable,
        "compatibility_jamo": compatibility_jamo,
    }
def decompose_words_KO(list_of_filtered_input):
    """Break precomposed Hangul syllables into their letters, in order.

    Args:
        list_of_filtered_input: an iterable of single characters (a str
            works too). Characters outside the precomposed syllable range
            U+AC00..U+D7A3 are skipped.

    Returns:
        A list of the letters returned by lookup_jamo_KO for each
        syllable's initial, medial and (if present) terminal jamo.
    """
    first_syllable = 0xAC00      # 44032
    last_syllable = 0xD7A3
    initial_base = 0x1100        # first initial jamo
    medial_base = 0x1161         # first medial jamo
    terminal_base = 0x11A7       # one below the first terminal jamo;
                                 # terminal index 0 means "no terminal"
    syllables_per_initial = 588  # 21 medials * 28 terminal slots
    terminal_slots = 28

    output = []
    for item in list_of_filtered_input:
        code_point = ord(item)
        if not first_syllable <= code_point <= last_syllable:
            continue

        # Plain floor division is required here: adding 1 to the offset
        # before dividing misfiles the last syllable of every group of
        # 588 under the next initial consonant.
        initial, remainder = divmod(code_point - first_syllable,
                                    syllables_per_initial)
        medial, terminal = divmod(remainder, terminal_slots)

        output.extend(lookup_jamo_KO(chr(initial_base + initial)))
        output.extend(lookup_jamo_KO(chr(medial_base + medial)))
        if terminal:
            output.extend(lookup_jamo_KO(chr(terminal_base + terminal)))
    return output
# Compatibility-jamo spellings of the conjoining jamo, listed in code-point
# order: entry i of each tuple belongs to the code point (start + i). A
# spelling of two letters marks a compound vowel or consonant cluster.
_INITIAL_JAMO_START = 0x1100
_INITIAL_JAMO_SPELLINGS = (
    "ㄱ", "ㄲ", "ㄴ", "ㄷ", "ㄸ", "ㄹ", "ㅁ", "ㅂ", "ㅃ", "ㅅ",
    "ㅆ", "ㅇ", "ㅈ", "ㅉ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ",
)
_MEDIAL_JAMO_START = 0x1161
_MEDIAL_JAMO_SPELLINGS = (
    "ㅏ", "ㅐ", "ㅑ", "ㅒ", "ㅓ", "ㅔ", "ㅕ", "ㅖ", "ㅗ", "ㅗㅏ",
    "ㅗㅐ", "ㅗㅣ", "ㅛ", "ㅜ", "ㅜㅓ", "ㅜㅔ", "ㅜㅣ", "ㅠ", "ㅡ", "ㅡㅣ",
    "ㅣ",
)
_TERMINAL_JAMO_START = 0x11A8
_TERMINAL_JAMO_SPELLINGS = (
    "ㄱ", "ㄲ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ", "ㄹㅁ",
    "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ", "ㅅ", "ㅆ",
    "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ",
)

# Built once at import time instead of on every lookup.
_JAMO_TO_COMPATIBILITY = {
    chr(start + offset): spelling
    for start, spellings in (
        (_INITIAL_JAMO_START, _INITIAL_JAMO_SPELLINGS),
        (_MEDIAL_JAMO_START, _MEDIAL_JAMO_SPELLINGS),
        (_TERMINAL_JAMO_START, _TERMINAL_JAMO_SPELLINGS),
    )
    for offset, spelling in enumerate(spellings)
}


def lookup_jamo_KO(letter_block):
    """Convert one conjoining Hangul jamo into compatibility jamo.

    Keys are conjoining jamo from the Unicode Hangul Jamo block: initials
    U+1100..U+1112, medials U+1161..U+1175 and terminals U+11A8..U+11C2.
    Initial and terminal forms of the same consonant give the same
    compatibility letter, e.g. both U+1100 and U+11A8 become U+3131.
    Compound vowels and consonant clusters are returned as their two
    component letters.

    Args:
        letter_block: a single character.

    Returns:
        A new list of one or two compatibility jamo, or an empty list
        when letter_block is not one of the keys described above.
    """
    return list(_JAMO_TO_COMPATIBILITY.get(letter_block, ""))
def create_jamo_lists(set_of_compatibility_jamo):
    """Translate compatibility jamo into syllable-formula indices.

    Returns three lists (initial, medial, terminal) of integer indices
    that create_one_word_KO plugs into the syllable code-point formula.
    The terminal list always starts with 0, the "no terminal" option.
    Single letters are added in the iteration order of the input; a
    compound vowel or consonant cluster is added afterwards, in index
    order, whenever both of its component letters are present.
    """
    # In each table the position of an entry is its index; two-letter
    # entries are the compounds and list their component letters.
    initial_order = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
    medial_order = (
        "ㅏ", "ㅐ", "ㅑ", "ㅒ", "ㅓ", "ㅔ", "ㅕ", "ㅖ", "ㅗ", "ㅗㅏ",
        "ㅗㅐ", "ㅗㅣ", "ㅛ", "ㅜ", "ㅜㅓ", "ㅜㅔ", "ㅜㅣ", "ㅠ", "ㅡ", "ㅡㅣ",
        "ㅣ",
    )
    terminal_order = (
        "", "ㄱ", "ㄲ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ",
        "ㄹㅁ", "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ", "ㅅ",
        "ㅆ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ",
    )

    initial_index = {letter: i for i, letter in enumerate(initial_order)}
    medial_index = {s: i for i, s in enumerate(medial_order) if len(s) == 1}
    terminal_index = {s: i for i, s in enumerate(terminal_order) if len(s) == 1}

    initial_jamo_list = []
    medial_jamo_list = []
    terminal_jamo_list = [0]  # 0 = syllable block without a terminal

    # Single letters: a consonant may qualify as both initial and terminal.
    for letter in set_of_compatibility_jamo:
        for index_of, bucket in (
            (initial_index, initial_jamo_list),
            (medial_index, medial_jamo_list),
            (terminal_index, terminal_jamo_list),
        ):
            if letter in index_of:
                bucket.append(index_of[letter])

    # Compounds: medials first, then terminals, each in index order.
    for order, bucket in (
        (medial_order, medial_jamo_list),
        (terminal_order, terminal_jamo_list),
    ):
        for index, spelling in enumerate(order):
            if len(spelling) == 2 and all(
                part in set_of_compatibility_jamo for part in spelling
            ):
                bucket.append(index)

    return initial_jamo_list, medial_jamo_list, terminal_jamo_list
def create_one_word_KO(initial_list, medial_list, terminal_list, number_of_syllables):
    """Return a word of number_of_syllables random Hangul syllable blocks.

    For every block one index is drawn from each of the three lists (in
    the order initial, medial, terminal) and combined into a code point
    with 44032 + initial * 588 + medial * 28 + terminal.
    """
    def pick(options):
        # Uniform choice by random index.
        return options[randint(0, len(options) - 1)]

    blocks = []
    for _ in range(number_of_syllables):
        lead = pick(initial_list)
        vowel = pick(medial_list)
        tail = pick(terminal_list)
        blocks.append(chr((lead * 588) + (vowel * 28) + tail + 44032))
    return "".join(blocks)
def create_many_words_KO(set_of_compatibility_jamo, number_of_words, max_word_length):
    """Return a list of number_of_words random Korean words.

    The letters are first converted to index lists by create_jamo_lists;
    each word then gets a random syllable count between 1 and
    max_word_length (inclusive) and is built by create_one_word_KO.

    Raises:
        ValueError: if the letters yield no initial or no medial index.
    """
    initials, medials, terminals = create_jamo_lists(set_of_compatibility_jamo)

    # A syllable block cannot be formed without both an initial and a vowel.
    if not initials or not medials:
        raise ValueError("There must be at least one initial letter and at least one vowel.")

    return [
        create_one_word_KO(initials, medials, terminals, randint(1, max_word_length))
        for _ in range(number_of_words)
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment