gartenfeld/arpabet.txt

## arpabet.txt
AA	ɑ
AA0	ɑ
AA1	ɑ
AA2	ɑ
AE	æ
AE0	æ
AE1	æ
AE2	æ
AH	ə
AH0	ə
AH1	ʌ
AH2	ʌ
AO	ɔ
AO0	ɔ
AO1	ɔ
AO2	ɔ
AW	aʊ
AW0	aʊ
AW1	aʊ
AW2	aʊ
AY	aɪ
AY0	aɪ
AY1	aɪ
AY2	aɪ
B	b
CH	ʧ
D	d
DH	ð
EH	ɛ
EH0	ɛ
EH1	ɛ
EH2	ɛ
ER	ər
ER0	ər
ER1	ər
ER2	ər
EY	eɪ
EY0	eɪ
EY1	eɪ
EY2	eɪ
F	f
G	ɡ
HH	h
IH	ɪ
IH0	ɪ
IH1	ɪ
IH2	ɪ
IY	i
IY0	i
IY1	i
IY2	i
JH	ʤ
K	k
L	l
M	m
N	n
NG	ŋ
OW	oʊ
OW0	oʊ
OW1	oʊ
OW2	oʊ
OY	ɔɪ
OY0	ɔɪ
OY1	ɔɪ
OY2	ɔɪ
P	p
R	r
S	s
SH	ʃ
T	t
TH	θ
UH	ʊ
UH0	ʊ
UH1	ʊ
UH2	ʊ
UW	u
UW0	u
UW1	u
UW2	u
V	v
W	w
Y	j
Z	z
ZH	ʒ

## ensyl.py
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support
import os
import operator

def clear_output_file(out_file_path):
	"""Truncate the file at out_file_path to zero length, creating it if absent.

	Args:
		out_file_path: Path of the file to empty.
	"""
	# Opening in 'w' mode already truncates the file; the context manager
	# guarantees the handle is closed on every exit path.
	with open(out_file_path, 'w'):
		pass

def load_syl(syl_f):
	"""Load a tab-separated syllabification table into a dictionary.

	The first column of each line becomes the key and the remaining columns
	become the value (the script later iterates over them as the syllables
	of that form).

	Args:
		syl_f: Path to a UTF-8 encoded, tab-separated text file.

	Returns:
		dict mapping the first column of every well-formed line to the list
		of its remaining columns. Blank lines are ignored; lines with no tab
		are reported on stderr and skipped.
	"""
	loc_dict = {}
	with open(syl_f, 'r', encoding='utf-8') as f:
		for line in f:
			# Strip CR as well as LF so CRLF files do not leave a stray
			# '\r' glued to the last column.
			fields = line.rstrip('\r\n').split('\t')
			if fields == ['']:
				continue
			if len(fields) < 2:
				# str.split never raises IndexError, so a line without any
				# tab has to be detected explicitly.
				sys.stderr.write("Errors at: " + line + '\n')
				continue
			loc_dict[fields[0]] = fields[1:]
	return loc_dict

def load_freq(freq_f, syl_lookup=None, log_path=None):
	"""Tally syllable frequencies from a tab-separated frequency list.

	Each line is read as a frequency count in the first column and a word
	form in the second. The form is upper-cased and looked up in the
	syllabification table; for a found form, the line's frequency is added
	to the running total of each of its syllables and a "FOUND" record is
	appended to the log file.

	Args:
		freq_f: Path to a UTF-8 encoded, tab-separated frequency file.
		syl_lookup: Mapping of upper-cased form to its list of syllables.
			Defaults to the module-level ``syl_dict``.
		log_path: Path of the log file to append to. Defaults to the
			module-level ``forms_log``.

	Returns:
		dict mapping each syllable to its summed frequency (int).
	"""
	if syl_lookup is None:
		syl_lookup = syl_dict
	if log_path is None:
		log_path = forms_log
	loc_syl_inv = {}
	with open(freq_f, 'r', encoding='utf-8') as f, \
			open(log_path, 'a', encoding='utf-8') as f_log:
		for line in f:
			# Strip LF as well as CR; otherwise the form of a two-column
			# line keeps its newline and can never match the lookup table.
			freq_line = line.rstrip('\r\n').split('\t')
			try:
				freq = freq_line[0]
				form = freq_line[1].upper()
				if form not in syl_lookup:
					continue
				add_count = int(freq)
			except (IndexError, ValueError):
				# Too few columns, or a non-numeric count on a known form.
				sys.stderr.write("Errors at: " + line + '\n')
				continue
			for syllable in syl_lookup[form]:
				loc_syl_inv[syllable] = loc_syl_inv.get(syllable, 0) + add_count
			# For tallying percentage of found entries later
			f_log.write("FOUND" + '\t' + form + '\t' + freq + '\n')
	return loc_syl_inv

def load_coding(scheme_f):
	"""Load the tab-separated symbol conversion table into a dictionary.

	The first column of each line (the ARPAbet symbol) becomes the key and
	the remaining columns (the IPA rendering) become the value.

	Args:
		scheme_f: Path to a UTF-8 encoded, tab-separated text file.

	Returns:
		dict mapping the first column of every well-formed line to the list
		of its remaining columns. Blank lines are ignored; lines with no tab
		are reported on stderr and skipped.
	"""
	loc_scheme = {}
	with open(scheme_f, 'r', encoding='utf-8') as f:
		for line in f:
			# Strip CR as well as LF so CRLF files do not leave a stray
			# '\r' at the end of the IPA string.
			arpa, *ipa = line.rstrip('\r\n').split('\t')
			if not ipa:
				# No tab on this line. str.split never raises IndexError,
				# so this must be checked explicitly; blank lines are
				# skipped quietly.
				if arpa:
					sys.stderr.write("Errors at: " + line + '\n')
				continue
			loc_scheme[arpa] = ipa
	return loc_scheme

def convert_to_ipa(inventory=None, coding=None, out_path=None):
	"""Write the syllable inventory to a file, transcribed into IPA.

	Each output line holds the IPA rendering of one syllable, a tab, and the
	syllable's frequency. The rendering is prefixed with "ˈ" when the
	syllable text contains "1" and with "ˌ" when it contains "2"; if both
	digits occur, "ˌ" wins because it is checked last.

	Args:
		inventory: Iterable of (syllable, frequency) entries, where the
			syllable is a whitespace-separated string of symbols. Defaults
			to the module-level ``sorted_inventory``.
		coding: Mapping of symbol to a sequence whose first item is the IPA
			string. Defaults to the module-level ``scheme``.
		out_path: Path of the file to (over)write. Defaults to the
			module-level ``inv_file``.

	Raises:
		KeyError: If a symbol of a syllable is missing from ``coding``.
	"""
	if inventory is None:
		inventory = sorted_inventory
	if coding is None:
		coding = scheme
	if out_path is None:
		out_path = inv_file
	# 'w' truncates the file, so no separate clearing pass is needed. The
	# encoding is explicit because IPA symbols are outside ASCII and the
	# locale default may not be able to encode them.
	with open(out_path, 'w', encoding='utf-8') as f_inv:
		for entry in inventory:
			syl = str(entry[0])
			s_freq = entry[1]
			stress = ""
			if "1" in syl:
				stress = "ˈ"
			if "2" in syl:
				stress = "ˌ"
			# split() without a separator ignores doubled or trailing
			# whitespace, which would otherwise yield empty symbols.
			ipa_str = stress + "".join(str(coding[p][0]) for p in syl.split())
			f_inv.write(ipa_str + '\t' + str(s_freq) + '\n')

if __name__ == '__main__':

	# File locations, relative to the working directory. These names (and
	# syl_dict, sorted_inventory, scheme below) are module-level on purpose:
	# the functions above read them as globals.
	coding_file = "arpabet.txt"
	inv_file = "inventory.txt"
	forms_log = "forms_log.txt"
	freq_file = "forms_freq.txt"
	syl_file = "syl_cmu.txt"

	# Step 1: form -> syllables lookup table.
	print("Loading syllabification data...")
	syl_dict = load_syl(syl_file)

	# Step 2: per-syllable frequency totals; the log is emptied first
	# because load_freq appends to it.
	print("Building inventory of syllables...")
	clear_output_file(forms_log)
	syl_inventory = load_freq(freq_file)

	# Step 3: (syllable, count) pairs, highest count first.
	print("Sorting inventory by frequency...")
	sorted_inventory = sorted(
		syl_inventory.items(),
		key=lambda pair: pair[1],
		reverse=True,
	)

	# Step 4: write the inventory out in IPA.
	print("Converting to IPA...")
	scheme = load_coding(coding_file)
	convert_to_ipa()

	print("Valmis!")
	AA ɑ
	AA0 ɑ
	AA1 ɑ
	AA2 ɑ
	AE æ
	AE0 æ
	AE1 æ
	AE2 æ
	AH ə
	AH0 ə
	AH1 ʌ
	AH2 ʌ
	AO ɔ
	AO0 ɔ
	AO1 ɔ
	AO2 ɔ
	AW aʊ
	AW0 aʊ
	AW1 aʊ
	AW2 aʊ
	AY aɪ
	AY0 aɪ
	AY1 aɪ
	AY2 aɪ
	B b
	CH ʧ
	D d
	DH ð
	EH ɛ
	EH0 ɛ
	EH1 ɛ
	EH2 ɛ
	ER ər
	ER0 ər
	ER1 ər
	ER2 ər
	EY eɪ
	EY0 eɪ
	EY1 eɪ
	EY2 eɪ
	F f
	G ɡ
	HH h
	IH ɪ
	IH0 ɪ
	IH1 ɪ
	IH2 ɪ
	IY i
	IY0 i
	IY1 i
	IY2 i
	JH ʤ
	K k
	L l
	M m
	N n
	NG ŋ
	OW oʊ
	OW0 oʊ
	OW1 oʊ
	OW2 oʊ
	OY ɔɪ
	OY0 ɔɪ
	OY1 ɔɪ
	OY2 ɔɪ
	P p
	R r
	S s
	SH ʃ
	T t
	TH θ
	UH ʊ
	UH0 ʊ
	UH1 ʊ
	UH2 ʊ
	UW u
	UW0 u
	UW1 u
	UW2 u
	V v
	W w
	Y j
	Z z
	ZH ʒ
	import re # Regular Expressions
	import collections # Data Types
	import sys # File operations
	import codecs # UniCode support
	import os
	import operator

	def clear_output_file(out_file_path):
		"""Truncate the file at out_file_path to zero length, creating it if absent.

		Args:
			out_file_path: Path of the file to empty.
		"""
		# Opening in 'w' mode already truncates the file; the context manager
		# guarantees the handle is closed on every exit path.
		with open(out_file_path, 'w'):
			pass

	def load_syl(syl_f):
		"""Load a tab-separated syllabification table into a dictionary.

		The first column of each line becomes the key and the remaining
		columns become the value (the script later iterates over them as the
		syllables of that form).

		Args:
			syl_f: Path to a UTF-8 encoded, tab-separated text file.

		Returns:
			dict mapping the first column of every well-formed line to the
			list of its remaining columns. Blank lines are ignored; lines
			with no tab are reported on stderr and skipped.
		"""
		loc_dict = {}
		with open(syl_f, 'r', encoding='utf-8') as f:
			for line in f:
				# Strip CR as well as LF so CRLF files do not leave a stray
				# '\r' glued to the last column.
				fields = line.rstrip('\r\n').split('\t')
				if fields == ['']:
					continue
				if len(fields) < 2:
					# str.split never raises IndexError, so a line without
					# any tab has to be detected explicitly.
					sys.stderr.write("Errors at: " + line + '\n')
					continue
				loc_dict[fields[0]] = fields[1:]
		return loc_dict

	def load_freq(freq_f, syl_lookup=None, log_path=None):
		"""Tally syllable frequencies from a tab-separated frequency list.

		Each line is read as a frequency count in the first column and a word
		form in the second. The form is upper-cased and looked up in the
		syllabification table; for a found form, the line's frequency is
		added to the running total of each of its syllables and a "FOUND"
		record is appended to the log file.

		Args:
			freq_f: Path to a UTF-8 encoded, tab-separated frequency file.
			syl_lookup: Mapping of upper-cased form to its list of syllables.
				Defaults to the module-level ``syl_dict``.
			log_path: Path of the log file to append to. Defaults to the
				module-level ``forms_log``.

		Returns:
			dict mapping each syllable to its summed frequency (int).
		"""
		if syl_lookup is None:
			syl_lookup = syl_dict
		if log_path is None:
			log_path = forms_log
		loc_syl_inv = {}
		with open(freq_f, 'r', encoding='utf-8') as f, \
				open(log_path, 'a', encoding='utf-8') as f_log:
			for line in f:
				# Strip LF as well as CR; otherwise the form of a two-column
				# line keeps its newline and can never match the lookup table.
				freq_line = line.rstrip('\r\n').split('\t')
				try:
					freq = freq_line[0]
					form = freq_line[1].upper()
					if form not in syl_lookup:
						continue
					add_count = int(freq)
				except (IndexError, ValueError):
					# Too few columns, or a non-numeric count on a known form.
					sys.stderr.write("Errors at: " + line + '\n')
					continue
				for syllable in syl_lookup[form]:
					loc_syl_inv[syllable] = loc_syl_inv.get(syllable, 0) + add_count
				# For tallying percentage of found entries later
				f_log.write("FOUND" + '\t' + form + '\t' + freq + '\n')
		return loc_syl_inv

	def load_coding(scheme_f):
		"""Load the tab-separated symbol conversion table into a dictionary.

		The first column of each line (the ARPAbet symbol) becomes the key and
		the remaining columns (the IPA rendering) become the value.

		Args:
			scheme_f: Path to a UTF-8 encoded, tab-separated text file.

		Returns:
			dict mapping the first column of every well-formed line to the
			list of its remaining columns. Blank lines are ignored; lines
			with no tab are reported on stderr and skipped.
		"""
		loc_scheme = {}
		with open(scheme_f, 'r', encoding='utf-8') as f:
			for line in f:
				# Strip CR as well as LF so CRLF files do not leave a stray
				# '\r' at the end of the IPA string.
				arpa, *ipa = line.rstrip('\r\n').split('\t')
				if not ipa:
					# No tab on this line. str.split never raises IndexError,
					# so this must be checked explicitly; blank lines are
					# skipped quietly.
					if arpa:
						sys.stderr.write("Errors at: " + line + '\n')
					continue
				loc_scheme[arpa] = ipa
		return loc_scheme

	def convert_to_ipa(inventory=None, coding=None, out_path=None):
		"""Write the syllable inventory to a file, transcribed into IPA.

		Each output line holds the IPA rendering of one syllable, a tab, and
		the syllable's frequency. The rendering is prefixed with "ˈ" when the
		syllable text contains "1" and with "ˌ" when it contains "2"; if both
		digits occur, "ˌ" wins because it is checked last.

		Args:
			inventory: Iterable of (syllable, frequency) entries, where the
				syllable is a whitespace-separated string of symbols.
				Defaults to the module-level ``sorted_inventory``.
			coding: Mapping of symbol to a sequence whose first item is the
				IPA string. Defaults to the module-level ``scheme``.
			out_path: Path of the file to (over)write. Defaults to the
				module-level ``inv_file``.

		Raises:
			KeyError: If a symbol of a syllable is missing from ``coding``.
		"""
		if inventory is None:
			inventory = sorted_inventory
		if coding is None:
			coding = scheme
		if out_path is None:
			out_path = inv_file
		# 'w' truncates the file, so no separate clearing pass is needed. The
		# encoding is explicit because IPA symbols are outside ASCII and the
		# locale default may not be able to encode them.
		with open(out_path, 'w', encoding='utf-8') as f_inv:
			for entry in inventory:
				syl = str(entry[0])
				s_freq = entry[1]
				stress = ""
				if "1" in syl:
					stress = "ˈ"
				if "2" in syl:
					stress = "ˌ"
				# split() without a separator ignores doubled or trailing
				# whitespace, which would otherwise yield empty symbols.
				ipa_str = stress + "".join(str(coding[p][0]) for p in syl.split())
				f_inv.write(ipa_str + '\t' + str(s_freq) + '\n')

	if __name__ == '__main__':

		# File locations, relative to the working directory. These names (and
		# syl_dict, sorted_inventory, scheme below) are read as globals by the
		# functions this script calls.
		syl_file = "syl_cmu.txt"
		freq_file = "forms_freq.txt"
		forms_log = "forms_log.txt"
		inv_file = "inventory.txt"
		coding_file = "arpabet.txt"

		print("Loading syllabification data...")
		# For looking up syllabification of a given form
		syl_dict = load_syl(syl_file)

		print("Building inventory of syllables...")
		# The log is emptied first because load_freq appends to it.
		clear_output_file(forms_log)
		# For storing the counts of each syllable type
		syl_inventory = load_freq(freq_file)

		print("Sorting inventory by frequency...")
		# (syllable, count) pairs, highest count first.
		sorted_inventory = sorted(syl_inventory.items(), key=operator.itemgetter(1), reverse=True)

		print("Converting to IPA...")
		scheme = load_coding(coding_file)
		convert_to_ipa()

		print("Valmis!")