# koundinyad/syllable_dict.py

## syllable_dict.py
import json

# Path of the dictionary text file that the script below opens and reads
# line by line.
cmu_dict_file = "./cmudict-0.txt" # CMU dictionary .txt file
# Accumulates one record per dictionary word, keyed by the word itself.
# Each record holds "sc" (number of segments), "sp" (stress-pattern string)
# and "seg" (list of per-syllable dicts); it is dumped to JSON at the end.
word_data = {}

def get_stress(phonemes):
    """Extract a stress marker for every stress-bearing phoneme.

    A phoneme counts as stress-bearing when its last character is a digit
    (for example ``"AH0"`` or ``"EY1"``).  A trailing ``1`` or ``2`` is
    mapped to ``'U'``; any other trailing digit is mapped to ``'I'``.

    Args:
        phonemes: Iterable of phoneme strings.  A plain string is also
            accepted; it is then scanned character by character, so every
            digit character produces one marker.

    Returns:
        List of ``'U'`` / ``'I'`` markers in order of appearance.  Phonemes
        without a trailing digit, and empty strings, contribute nothing.
    """
    # Slicing with [-1:] instead of indexing with [-1] yields "" for an
    # empty phoneme, so empty entries are skipped rather than raising
    # IndexError.
    return [
        'U' if phoneme[-1] in '12' else 'I'
        for phoneme in phonemes
        if phoneme[-1:].isdigit()
    ]

def cmu_syllable_approximation(word, phonetic_transcription):
    """Approximate the written syllables of *word* from its transcription.

    The syllable count ``n`` is the number of stress-marked phonemes found
    in ``phonetic_transcription``.  The spelling is then cut immediately
    after each of the first ``n - 1`` vowel letters (A, E, I, O, U, matched
    case-insensitively); the last segment runs to the end of the word.
    This is a spelling heuristic, not a real syllabification.

    Args:
        word: Spelling of the dictionary entry.
        phonetic_transcription: Whitespace-separated phoneme string.

    Returns:
        List of dicts, one per segment, with the keys ``"syllable"`` (the
        slice of ``word``), ``"position"`` (1-based index) and ``"pattern"``
        (the stress marker paired with that segment).  When the word has
        fewer than ``n - 1`` vowel letters, fewer than ``n`` segments are
        returned and the surplus stress markers are dropped.  An empty list
        is returned when the transcription has no stress-marked phoneme.
    """
    # Split the transcription into phonemes and read their stress markers.
    stress_markers = get_stress(phonetic_transcription.split())
    if not stress_markers:
        # Guard explicitly: with zero markers the slice bound used below
        # would be -1, which silently means "all vowels but the last".
        return []

    # Heuristic: map the vowel letters of the spelling to syllable breaks.
    vowel_indices = [i for i, char in enumerate(word) if char.upper() in "AEIOU"]
    # Only n - 1 cuts are needed to obtain n segments.
    cut_points = [index + 1 for index in vowel_indices[:len(stress_markers) - 1]]

    # Slice the word between consecutive boundaries.
    bounds = [0] + cut_points + [len(word)]
    syllables = [word[start:end] for start, end in zip(bounds, bounds[1:])]

    # zip() stops at the shorter sequence, so segments and markers are
    # paired only as far as both exist.
    return [
        {"syllable": syllable, "position": position, "pattern": marker}
        for position, (syllable, marker)
        in enumerate(zip(syllables, stress_markers), start=1)
    ]

# Read the dictionary file and build one record per entry in word_data,
# then dump the whole mapping to a JSON file.
with open(cmu_dict_file, "r", encoding="ISO-8859-1") as file:
    for line in file:
        parts = line.split()
        # Skip blank lines (parts[0] would raise IndexError) and lines
        # starting with ";;;", which the CMU dictionary uses for comments
        # and which would otherwise be stored as a bogus ";;;" entry.
        if not parts or parts[0].startswith(";;;"):
            continue

        # First field is the word; the remaining fields are its phonemes.
        word = parts[0]
        transcription = ' '.join(parts[1:])

        syllable_dicts = cmu_syllable_approximation(word, transcription)

        # Stress pattern as one string, e.g. "IU".  The phoneme list is
        # passed (not the joined string) so get_stress sees whole phonemes.
        syllables = "".join(get_stress(parts[1:]))

        # shortened key names for smaller file size
        # NOTE: "sc" counts the approximated segments and can be smaller
        # than len("sp") when the spelling has too few vowel letters.
        word_data[word] = {
            "sc": len(syllable_dicts),  # sc for syllable_count
            "sp": syllables,  # sp for syllable_pattern
            "seg": syllable_dicts  # seg for segments
        }

output_json_file = "cmu_en.json"
# json.dump escapes non-ASCII characters by default, so the explicit
# encoding only pins down what is already plain ASCII output.
with open(output_json_file, "w", encoding="utf-8") as json_file:
    json.dump(word_data, json_file)
	import json

	# Path of the dictionary text file that the script below opens and reads
	# line by line.
	cmu_dict_file = "./cmudict-0.txt" # CMU dictionary .txt file
	# Accumulates one record per dictionary word, keyed by the word itself.
	# Each record holds "sc" (number of segments), "sp" (stress-pattern string)
	# and "seg" (list of per-syllable dicts); it is dumped to JSON at the end.
	word_data = {}

	def get_stress(phonemes):
	    """Extract a stress marker for every stress-bearing phoneme.

	    A phoneme counts as stress-bearing when its last character is a digit
	    (for example ``"AH0"`` or ``"EY1"``).  A trailing ``1`` or ``2`` is
	    mapped to ``'U'``; any other trailing digit is mapped to ``'I'``.

	    Args:
	        phonemes: Iterable of phoneme strings.  A plain string is also
	            accepted; it is then scanned character by character, so every
	            digit character produces one marker.

	    Returns:
	        List of ``'U'`` / ``'I'`` markers in order of appearance.  Phonemes
	        without a trailing digit, and empty strings, contribute nothing.
	    """
	    # Slicing with [-1:] instead of indexing with [-1] yields "" for an
	    # empty phoneme, so empty entries are skipped rather than raising
	    # IndexError.
	    return [
	        'U' if phoneme[-1] in '12' else 'I'
	        for phoneme in phonemes
	        if phoneme[-1:].isdigit()
	    ]

	def cmu_syllable_approximation(word, phonetic_transcription):
	    """Approximate the written syllables of *word* from its transcription.

	    The syllable count ``n`` is the number of stress-marked phonemes found
	    in ``phonetic_transcription``.  The spelling is then cut immediately
	    after each of the first ``n - 1`` vowel letters (A, E, I, O, U, matched
	    case-insensitively); the last segment runs to the end of the word.
	    This is a spelling heuristic, not a real syllabification.

	    Args:
	        word: Spelling of the dictionary entry.
	        phonetic_transcription: Whitespace-separated phoneme string.

	    Returns:
	        List of dicts, one per segment, with the keys ``"syllable"`` (the
	        slice of ``word``), ``"position"`` (1-based index) and ``"pattern"``
	        (the stress marker paired with that segment).  When the word has
	        fewer than ``n - 1`` vowel letters, fewer than ``n`` segments are
	        returned and the surplus stress markers are dropped.  An empty list
	        is returned when the transcription has no stress-marked phoneme.
	    """
	    # Split the transcription into phonemes and read their stress markers.
	    stress_markers = get_stress(phonetic_transcription.split())
	    if not stress_markers:
	        # Guard explicitly: with zero markers the slice bound used below
	        # would be -1, which silently means "all vowels but the last".
	        return []

	    # Heuristic: map the vowel letters of the spelling to syllable breaks.
	    vowel_indices = [i for i, char in enumerate(word) if char.upper() in "AEIOU"]
	    # Only n - 1 cuts are needed to obtain n segments.
	    cut_points = [index + 1 for index in vowel_indices[:len(stress_markers) - 1]]

	    # Slice the word between consecutive boundaries.
	    bounds = [0] + cut_points + [len(word)]
	    syllables = [word[start:end] for start, end in zip(bounds, bounds[1:])]

	    # zip() stops at the shorter sequence, so segments and markers are
	    # paired only as far as both exist.
	    return [
	        {"syllable": syllable, "position": position, "pattern": marker}
	        for position, (syllable, marker)
	        in enumerate(zip(syllables, stress_markers), start=1)
	    ]

	# Read the dictionary file and build one record per entry in word_data,
	# then dump the whole mapping to a JSON file.
	with open(cmu_dict_file, "r", encoding="ISO-8859-1") as file:
	    for line in file:
	        parts = line.split()
	        # Skip blank lines (parts[0] would raise IndexError) and lines
	        # starting with ";;;", which the CMU dictionary uses for comments
	        # and which would otherwise be stored as a bogus ";;;" entry.
	        if not parts or parts[0].startswith(";;;"):
	            continue

	        # First field is the word; the remaining fields are its phonemes.
	        word = parts[0]
	        transcription = ' '.join(parts[1:])

	        syllable_dicts = cmu_syllable_approximation(word, transcription)

	        # Stress pattern as one string, e.g. "IU".  The phoneme list is
	        # passed (not the joined string) so get_stress sees whole phonemes.
	        syllables = "".join(get_stress(parts[1:]))

	        # shortened key names for smaller file size
	        # NOTE: "sc" counts the approximated segments and can be smaller
	        # than len("sp") when the spelling has too few vowel letters.
	        word_data[word] = {
	            "sc": len(syllable_dicts),  # sc for syllable_count
	            "sp": syllables,  # sp for syllable_pattern
	            "seg": syllable_dicts  # seg for segments
	        }

	output_json_file = "cmu_en.json"
	# json.dump escapes non-ASCII characters by default, so the explicit
	# encoding only pins down what is already plain ASCII output.
	with open(output_json_file, "w", encoding="utf-8") as json_file:
	    json.dump(word_data, json_file)