Created
January 23, 2018 06:57
-
-
Save jakelosh/857b5985f79bd61e309ff115cb9db711 to your computer and use it in GitHub Desktop.
A simple regex exercise in R whereby I build pinyin syllables from base consonant and vowel pairs and then pare down to valid syllables.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The purpose of this script is to generate a random Mandarin syllable pairing for use in tone
# practice drills. The random syllable pair is used to populate a grid that does not have tones.
# The tone_table has the tones of the two syllables. The student then tries to say the random
# pairing with the tones they see. The partner then tries to guess which tones were meant.
#
# The script generates the syllables from scratch, starting with all possible vowel and consonant
# pairings. Then it uses a very complicated regex to filter out invalid syllables.
# My goal with this script was to make the tone tables, but it was also meant to learn some
# useful regex, which is why I indulged in it rather than just typing out all possible syllables
# (which would have been much faster at my typing speed than finding all the exclusions I needed).
# Build candidate syllables ----

# Pinyin initials: every English consonant except "v", plus the digraphs
# "sh", "zh" and "ch". The empty string stands for "no initial", so finals
# such as "an" or "er" can appear as syllables on their own.
initials <- c(
  setdiff(letters, c("a", "e", "i", "o", "u", "v")),
  "sh", "zh", "ch", ""
)

# Pinyin finals: the five plain vowels, diphthongs, nasal finals and the
# "ü" series.
finals <- c(
  "a", "e", "i", "o", "u",
  "ai", "ei", "ou", "ao", "an", "en", "eng", "ang", "ong", "er",
  "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong",
  "ua", "uo", "uai", "ui", "uan", "un", "uang", "uen", "ueng",
  "ü", "üe", "üan", "ün"
)

# Every initial/final combination, one row per pair (initials vary fastest).
initials_finals <- expand.grid(initials, finals)

# Glue each row into a single string; paste0() adds no separator, so there
# are no spaces to strip afterwards.
concate_initials_finals <- do.call(paste0, initials_finals)

# Cheat by appending known syllables that the initial/final grid cannot
# produce: they are spelled with "ue" although the final is "üe".
all_possible_syllables <- c(
  concate_initials_finals,
  "xue", "jue", "que", "yue"
)

# Filter out impossible syllables ----

# Each element is one regex alternative describing a family of impossible
# syllables; a candidate is rejected if ANY alternative matches it.
# "\\<" and "\\>" are word boundaries, used to pin an alternative to a whole
# syllable; unanchored alternatives match anywhere inside the syllable.
# "ru?ai" rejects both "rai" and "ruai" (the earlier "r[u]ai" only caught
# "ruai" and let "rai" through).
exclusion_patterns <- c(
  "\\<r[ao]\\>", "ruang", "ruen", "ru?ai",
  "^[uiü]", "^ong",
  "yai", "yen.", "yi[ae]", "wao", "wi",
  "[xjqy]u[ae]ng", "\\<[xjqy]ua[i]\\>", "\\<[xjqy]u[aoi]\\>", "[xjq][aoe]",
  "[szcrw]i.", "[scry]ei", "chei", "[szc]hi.",
  "\\<shong\\>", "\\<[szc]ho\\>", "\\<[tly]en\\>",
  "\\<[dtnlszcy]uang\\>", "[dtnlgkhszcqjxy]uen",
  "\\<[dtnlsczr]ua\\>", "[dtnlscz]ua[i]",
  "\\<[dt]in\\>", "\\<[dtngkhscz]o\\>",
  "^hi", "[fgk]i", "fa[io]", "\\<[bpfxjqw]e\\>", ".er",
  "[bxjqw]o.", "[pmf]ong",
  "\\<[bpmftngkhszcrwy]ia\\>", "[bpftgkhszcrwy]iu",
  "[bpmfdtgkhszcrwy]i[ao]ng", "[nls]iong",
  "[bpmfw]u.+", "[nl]ui",
  "[bpmfdtgkhszcrxhqwyjq]ü", "[nl]üan", "[nl]ün"
)
exclusion_regex <- paste(exclusion_patterns, collapse = "|")

# NOTE: despite its name, `exclusions` is TRUE for syllables that SURVIVE the
# filter (no alternative matched) and FALSE for the ones to drop.
exclusions <- !grepl(exclusion_regex, all_possible_syllables)

# Apply exclusions
allowed_syllables <- all_possible_syllables[exclusions]

# Build the drill tables ----

# Two distinct random syllables, joined into one untoned string.
random_syllables <- paste(sample(allowed_syllables, 2), collapse = "")

# 3 x 3 grid in which every cell shows the same untoned syllable pair.
pinyin_table <- matrix(random_syllables, nrow = 3, ncol = 3)

# 3 x 3 key of two-digit tone codes (tones 1-4), one code per grid cell.
# NOTE(review): replace = FALSE means the two tones within a cell are always
# different, so same-tone pairs such as "11" never appear - confirm that this
# is intended.
tone_table <- matrix(
  replicate(
    9,
    paste(sample.int(4, size = 2, replace = FALSE), collapse = "")
  ),
  ncol = 3,
  byrow = TRUE
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment