Last active
July 26, 2022 11:44
-
-
Save gartenfeld/7541325a78634707d7d1 to your computer and use it in GitHub Desktop.
Frequency distribution of syllables using CMU dictionary and COCA.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
AA ɑ | |
AA0 ɑ | |
AA1 ɑ | |
AA2 ɑ | |
AE æ | |
AE0 æ | |
AE1 æ | |
AE2 æ | |
AH ə | |
AH0 ə | |
AH1 ʌ | |
AH2 ʌ | |
AO ɔ | |
AO0 ɔ | |
AO1 ɔ | |
AO2 ɔ | |
AW aʊ | |
AW0 aʊ | |
AW1 aʊ | |
AW2 aʊ | |
AY aɪ | |
AY0 aɪ | |
AY1 aɪ | |
AY2 aɪ | |
B b | |
CH ʧ | |
D d | |
DH ð | |
EH ɛ | |
EH0 ɛ | |
EH1 ɛ | |
EH2 ɛ | |
ER ər | |
ER0 ər | |
ER1 ər | |
ER2 ər | |
EY eɪ | |
EY0 eɪ | |
EY1 eɪ | |
EY2 eɪ | |
F f | |
G ɡ | |
HH h | |
IH ɪ | |
IH0 ɪ | |
IH1 ɪ | |
IH2 ɪ | |
IY i | |
IY0 i | |
IY1 i | |
IY2 i | |
JH ʤ | |
K k | |
L l | |
M m | |
N n | |
NG ŋ | |
OW oʊ | |
OW0 oʊ | |
OW1 oʊ | |
OW2 oʊ | |
OY ɔɪ | |
OY0 ɔɪ | |
OY1 ɔɪ | |
OY2 ɔɪ | |
P p | |
R r | |
S s | |
SH ʃ | |
T t | |
TH θ | |
UH ʊ | |
UH0 ʊ | |
UH1 ʊ | |
UH2 ʊ | |
UW u | |
UW0 u | |
UW1 u | |
UW2 u | |
V v | |
W w | |
Y j | |
Z z | |
ZH ʒ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re # Regular Expressions | |
import collections # Data Types | |
import sys # File operations | |
import codecs # UniCode support | |
import os | |
import operator | |
def clear_output_file(out_file_path):
    """Truncate the file at ``out_file_path`` to zero length.

    The file is created if it does not exist yet.

    Args:
        out_file_path: Path of the file to empty.
    """
    # Opening in 'w' mode already truncates the file, so nothing needs to be
    # written; the context manager guarantees the handle is always closed.
    with open(out_file_path, 'w'):
        pass
def load_syl(syl_f):
    """Load a tab-separated syllabification table into a dictionary.

    Every line of the UTF-8 file is read as ``KEY<TAB>value[<TAB>value...]``.

    Args:
        syl_f: Path of the file to read.

    Returns:
        A dict mapping the first field of each line to the list of the
        remaining fields on that line. If a key occurs more than once, the
        last occurrence wins.

    Blank lines are skipped. A non-blank line without any tab-separated
    value is reported on stderr and skipped.
    """
    loc_dict = {}
    with open(syl_f, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip both CR and LF so that files with Windows line endings
            # do not leave a stray '\r' glued to the last field.
            fields = line.rstrip("\r\n").split('\t')
            if fields == ['']:
                continue
            # Slicing never raises IndexError, so a missing value has to be
            # detected explicitly.
            if len(fields) < 2:
                sys.stderr.write("Errors at: " + line + '\n')
                continue
            loc_dict[fields[0]] = fields[1:]
    return loc_dict
def load_freq(freq_f):
    """Tally syllable frequencies from a tab-separated word-frequency file.

    Every line of the UTF-8 file ``freq_f`` is read as
    ``FREQ<TAB>FORM[<TAB>...]``. The form is upper-cased and looked up in the
    module-level ``syl_dict``. When it is found, the integer frequency is
    added to the running total of each of its syllables and a
    ``FOUND<TAB>FORM<TAB>FREQ`` record is appended to the file named by the
    module-level ``forms_log``.

    Args:
        freq_f: Path of the frequency file to read.

    Returns:
        A dict mapping each syllable to the sum of the frequencies of the
        forms it occurred in.

    Lines with fewer than two fields, and lines for a known form whose
    frequency is not an integer, are reported on stderr and skipped.
    """
    loc_syl_inv = {}
    with open(freq_f, 'r', encoding='utf-8') as f, \
            open(forms_log, 'a', encoding='utf-8') as f_log:
        for line in f:
            # Strip the full line terminator; stripping only '\r' would leave
            # the '\n' attached to the last field.
            freq_line = line.rstrip('\r\n').split('\t')
            if len(freq_line) < 2:
                sys.stderr.write("Errors at: " + line + '\n')
                continue
            freq = freq_line[0]
            form = freq_line[1].upper()
            if form not in syl_dict:  # No syllabification available
                continue
            try:
                add_count = int(freq)
            except ValueError:
                # E.g. a header row: report it instead of aborting the run.
                sys.stderr.write("Errors at: " + line + '\n')
                continue
            for syllable in syl_dict[form]:
                loc_syl_inv[syllable] = loc_syl_inv.get(syllable, 0) + add_count
            # For tallying percentage of found entries later
            f_log.write("FOUND" + '\t' + form + '\t' + freq + '\n')
    return loc_syl_inv
def load_coding(scheme_f):
    """Load a tab-separated symbol conversion table into a dictionary.

    Every line of the UTF-8 file is read as ``CODE<TAB>value[<TAB>value...]``.

    Args:
        scheme_f: Path of the coding file to read.

    Returns:
        A dict mapping the first field of each line to the list of the
        remaining fields on that line. If a code occurs more than once, the
        last occurrence wins.

    Blank lines are skipped. A non-blank line without any tab-separated
    value is reported on stderr and skipped.
    """
    loc_scheme = {}
    with open(scheme_f, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip both CR and LF so a Windows line ending cannot end up
            # inside the last value of the line.
            code_line = line.rstrip("\r\n").split('\t')
            if code_line == ['']:
                continue
            # Slicing never raises IndexError, so a missing value has to be
            # detected explicitly.
            if len(code_line) < 2:
                sys.stderr.write("Errors at: " + line + '\n')
                continue
            loc_scheme[code_line[0]] = code_line[1:]
    return loc_scheme
def convert_to_ipa():
    """Write the sorted syllable inventory to ``inv_file``, one line per entry.

    Reads three module-level names:

    * ``sorted_inventory`` - sequence of entries whose first item is a
      syllable (phoneme codes separated by single spaces) and whose second
      item is its frequency;
    * ``scheme`` - dict mapping a phoneme code to a list whose first item is
      the replacement symbol;
    * ``inv_file`` - path of the output file, which is overwritten.

    Each output line is ``<stress mark><converted syllable><TAB><frequency>``.
    The stress mark is "ˌ" if the syllable contains "2", otherwise "ˈ" if it
    contains "1", otherwise empty.

    Raises:
        KeyError: If a phoneme code is missing from ``scheme``.
    """
    out_lines = []
    for entry in sorted_inventory:
        syl = str(entry[0])
        s_freq = entry[1]
        stress = ""
        if "1" in syl:
            stress = "ˈ"
        if "2" in syl:  # Checked last, so "2" takes precedence over "1"
            stress = "ˌ"
        ipa_str = stress + "".join(
            str(scheme[phoneme][0]) for phoneme in syl.split(" ")
        )
        out_lines.append(ipa_str + '\t' + str(s_freq) + '\n')
    # Explicit UTF-8: the output contains IPA characters that many locale
    # default encodings cannot represent. Mode 'w' truncates the file, so no
    # separate clearing step is needed.
    with open(inv_file, 'w', encoding='utf-8') as f_inv:
        f_inv.writelines(out_lines)
if __name__ == '__main__':
    # File names, all resolved against the current working directory.
    # forms_log and inv_file are read as globals by load_freq() and
    # convert_to_ipa() respectively.
    syl_file = "syl_cmu.txt"
    freq_file = "forms_freq.txt"
    forms_log = "forms_log.txt"
    inv_file = "inventory.txt"
    coding_file = "arpabet.txt"

    # Step 1: form -> syllabification lookup (read as a global by load_freq()).
    print("Loading syllabification data...")
    syl_dict = load_syl(syl_file)

    # Step 2: count each syllable type; start from an empty forms log because
    # load_freq() appends to it.
    print("Building inventory of syllables...")
    clear_output_file(forms_log)
    syl_inventory = load_freq(freq_file)

    # Step 3: order the (syllable, count) pairs by descending count.
    print("Sorting inventory by frequency...")
    sorted_inventory = list(syl_inventory.items())
    sorted_inventory.sort(key=operator.itemgetter(1), reverse=True)

    # Step 4: convert the syllables with the coding table and write them out.
    print("Converting to IPA...")
    scheme = load_coding(coding_file)
    convert_to_ipa()

    print("Valmis!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment