Skip to content

Instantly share code, notes, and snippets.

@gartenfeld
Last active July 26, 2022 11:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gartenfeld/7541325a78634707d7d1 to your computer and use it in GitHub Desktop.
Save gartenfeld/7541325a78634707d7d1 to your computer and use it in GitHub Desktop.
Frequency distribution of syllables using CMU dictionary and COCA.
AA ɑ
AA0 ɑ
AA1 ɑ
AA2 ɑ
AE æ
AE0 æ
AE1 æ
AE2 æ
AH ə
AH0 ə
AH1 ʌ
AH2 ʌ
AO ɔ
AO0 ɔ
AO1 ɔ
AO2 ɔ
AW aʊ
AW0 aʊ
AW1 aʊ
AW2 aʊ
AY aɪ
AY0 aɪ
AY1 aɪ
AY2 aɪ
B b
CH ʧ
D d
DH ð
EH ɛ
EH0 ɛ
EH1 ɛ
EH2 ɛ
ER ər
ER0 ər
ER1 ər
ER2 ər
EY eɪ
EY0 eɪ
EY1 eɪ
EY2 eɪ
F f
G ɡ
HH h
IH ɪ
IH0 ɪ
IH1 ɪ
IH2 ɪ
IY i
IY0 i
IY1 i
IY2 i
JH ʤ
K k
L l
M m
N n
NG ŋ
OW oʊ
OW0 oʊ
OW1 oʊ
OW2 oʊ
OY ɔɪ
OY0 ɔɪ
OY1 ɔɪ
OY2 ɔɪ
P p
R r
S s
SH ʃ
T t
TH θ
UH ʊ
UH0 ʊ
UH1 ʊ
UH2 ʊ
UW u
UW0 u
UW1 u
UW2 u
V v
W w
Y j
Z z
ZH ʒ
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support
import os
import operator
def clear_output_file(out_file_path):
    """Truncate the file at ``out_file_path`` to zero length.

    The file is created if it does not exist yet.

    Args:
        out_file_path: Path of the file to empty.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Opening in 'w' mode already truncates the file; the context manager
    # guarantees the handle is released even if an error occurs.
    with open(out_file_path, 'w'):
        pass
def load_syl(syl_f):
    """Load a tab-separated syllabification table into a dictionary.

    Every line holds a key in the first column (the word form) followed by
    one or more further columns (its syllables), separated by tabs.

    Args:
        syl_f: Path to the UTF-8 encoded syllabification file.

    Returns:
        A dict mapping the first column of each line to the list of the
        remaining columns. A later line with the same key replaces an
        earlier one.

    Raises:
        OSError: If the file cannot be opened.
    """
    loc_dict = {}
    with open(syl_f, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip both CR and LF so files with Windows line endings do not
            # leave a stray '\r' attached to the last syllable.
            fields = line.rstrip('\r\n').split('\t')
            if fields == ['']:
                continue  # blank line: nothing to record, nothing to report
            if len(fields) < 2:
                # A key without any syllable column is malformed. Slicing
                # never raises IndexError, so this has to be checked
                # explicitly for the report to ever be produced.
                sys.stderr.write("Errors at: " + line + '\n')
                continue
            loc_dict[fields[0]] = fields[1:]
    return loc_dict
def load_freq(freq_f, syl_lookup=None, log_path=None):
    """Accumulate per-syllable frequency totals from a word-frequency list.

    Each line of the frequency file is tab-separated with the count in the
    first column and the word form in the second. The form is upper-cased
    and looked up in the syllabification dictionary; when found, the count
    is added to the running total of every syllable of that form and a
    ``FOUND<TAB>form<TAB>count`` line is appended to the log file.

    Args:
        freq_f: Path to the UTF-8 encoded frequency file.
        syl_lookup: Mapping of upper-case form -> list of syllables. When
            omitted, the module-level ``syl_dict`` is used.
        log_path: Path of the log file to append to. When omitted, the
            module-level ``forms_log`` is used.

    Returns:
        A dict mapping each syllable to its summed integer frequency.

    Raises:
        OSError: If either file cannot be opened.
        NameError: If an argument is omitted and the corresponding
            module-level name has not been defined.
    """
    if syl_lookup is None:
        syl_lookup = syl_dict
    if log_path is None:
        log_path = forms_log

    loc_syl_inv = {}
    with open(freq_f, 'r', encoding='utf-8') as f:
        # The log receives Unicode word forms, so pin its encoding instead of
        # relying on the platform default.
        with open(log_path, 'a', encoding='utf-8') as f_log:
            for line in f:
                # Remove the whole line terminator; stripping only '\r'
                # leaves '\n' glued to the last column and the form would
                # never match the dictionary.
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < 2:
                    sys.stderr.write("Errors at: " + line + '\n')
                    continue
                freq = fields[0]
                form = fields[1].upper()
                if form not in syl_lookup:
                    continue
                try:
                    count = int(freq)
                except ValueError:
                    # e.g. a header row: report it like any malformed line
                    # rather than aborting the whole run.
                    sys.stderr.write("Errors at: " + line + '\n')
                    continue
                for syllable in syl_lookup[form]:
                    loc_syl_inv[syllable] = loc_syl_inv.get(syllable, 0) + count
                # For tallying the percentage of found entries later.
                f_log.write("FOUND" + '\t' + form + '\t' + freq + '\n')
    return loc_syl_inv
def load_coding(scheme_f):
    """Load a tab-separated transcription scheme into a dictionary.

    Every line holds a source symbol in the first column (ARPAbet, going by
    the variable names) followed by one or more replacement columns.

    Args:
        scheme_f: Path to the UTF-8 encoded scheme file.

    Returns:
        A dict mapping the first column of each line to the list of the
        remaining columns. A later line with the same key replaces an
        earlier one.

    Raises:
        OSError: If the file cannot be opened.
    """
    loc_scheme = {}
    with open(scheme_f, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip CR as well as LF so a CRLF file does not leave '\r'
            # inside the replacement symbol.
            fields = line.rstrip('\r\n').split('\t')
            if fields == ['']:
                continue  # blank line
            if len(fields) < 2:
                # A symbol without a replacement would otherwise be stored as
                # an empty list; report it here instead. Slicing never raises
                # IndexError, so an explicit length check is required.
                sys.stderr.write("Errors at: " + line + '\n')
                continue
            loc_scheme[fields[0]] = fields[1:]
    return loc_scheme
def convert_to_ipa(inventory=None, coding=None, out_path=None):
    """Write the syllable inventory to a file, transcribed symbol by symbol.

    For every ``(syllable, frequency)`` entry the syllable is split on single
    spaces, each symbol is replaced by the first item of its entry in the
    coding table, and one ``transcription<TAB>frequency`` line is written.
    A stress mark is prefixed when the syllable text contains ``1``
    (primary) or ``2`` (secondary); ``2`` wins if both occur.

    Any previous content of the output file is discarded.

    Args:
        inventory: Iterable of entries whose item 0 is the space-separated
            syllable and item 1 its frequency. When omitted, the
            module-level ``sorted_inventory`` is used.
        coding: Mapping of symbol -> sequence whose first item is the
            replacement text. When omitted, the module-level ``scheme`` is
            used.
        out_path: Destination file path. When omitted, the module-level
            ``inv_file`` is used.

    Returns:
        None.

    Raises:
        KeyError: If a symbol of a syllable is missing from the coding table.
        IndexError: If a coding entry is empty.
        OSError: If the output file cannot be opened.
        NameError: If an argument is omitted and the corresponding
            module-level name has not been defined.
    """
    if inventory is None:
        inventory = sorted_inventory
    if coding is None:
        coding = scheme
    if out_path is None:
        out_path = inv_file

    # 'w' truncates, which replaces the former clear-then-append sequence.
    # The explicit encoding is needed because the output contains IPA
    # characters that many platform default encodings cannot represent.
    with open(out_path, 'w', encoding='utf-8') as f_inv:
        for entry in inventory:
            syl = str(entry[0])
            s_freq = entry[1]
            stress = ""
            if "1" in syl:
                stress = "ˈ"
            if "2" in syl:
                stress = "ˌ"
            ipa_str = stress + "".join(
                str(coding[phoneme][0]) for phoneme in syl.split(" ")
            )
            f_inv.write(ipa_str + '\t' + str(s_freq) + '\n')
if __name__ == '__main__':
    # File locations, relative to the current working directory.
    # NOTE: forms_log, inv_file, syl_dict, sorted_inventory and scheme are
    # read by the functions in this file as module-level names, so they must
    # stay defined at this level.
    syl_file, freq_file = "syl_cmu.txt", "forms_freq.txt"
    forms_log, inv_file = "forms_log.txt", "inventory.txt"
    coding_file = "arpabet.txt"

    # Step 1: form -> syllables lookup table.
    print("Loading syllabification data...")
    syl_dict = load_syl(syl_file)

    # Step 2: start from an empty log, then tally the frequency of every
    # syllable type.
    print("Building inventory of syllables...")
    clear_output_file(forms_log)
    syl_inventory = load_freq(freq_file)

    # Step 3: order the (syllable, count) pairs, most frequent first.
    print("Sorting inventory by frequency...")
    sorted_inventory = sorted(
        syl_inventory.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Step 4: load the symbol table and write the transcribed inventory.
    print("Converting to IPA...")
    scheme = load_coding(coding_file)
    convert_to_ipa()

    print("Valmis!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment