Skip to content

Instantly share code, notes, and snippets.

@koundinyad
Created December 12, 2023 21:28
Show Gist options
  • Save koundinyad/8d03d759b6620b980b324d4db0fcf27a to your computer and use it in GitHub Desktop.
Save koundinyad/8d03d759b6620b980b324d4db0fcf27a to your computer and use it in GitHub Desktop.
import json
# Path of the CMU dictionary .txt file that the script reads.
cmu_dict_file: str = "./cmudict-0.txt"
# Filled in by the parsing loop: one record per dictionary word, keyed by
# that word, and finally dumped to JSON.
word_data: dict = {}
def get_stress(phonemes):
    """Collect the stress markers encoded in a sequence of phonemes.

    Every item whose last character is a digit contributes one marker:
    ``'U'`` when that digit is ``1`` or ``2``, ``'I'`` for any other digit.
    Items that do not end in a digit contribute nothing.

    Args:
        phonemes: Iterable of non-empty strings (an empty string raises
            ``IndexError``).

    Returns:
        List of ``'U'`` / ``'I'`` markers in input order.
    """
    return [
        'U' if unit[-1] in '12' else 'I'
        for unit in phonemes
        if unit[-1].isdigit()
    ]
def cmu_syllable_approximation(word, phonetic_transcription):
    """Approximate how the spelling of ``word`` divides into syllables.

    The transcription is split on whitespace into phonemes and reduced to
    its stress markers via ``get_stress``. The spelling is then cut right
    after each written vowel (A, E, I, O, U, case-insensitive), using at
    most one cut fewer than there are stress markers, so the tail of the
    word always stays in the final piece.

    Args:
        word: Spelling of the word.
        phonetic_transcription: Whitespace-separated phoneme string.

    Returns:
        List of dicts with keys ``"syllable"`` (slice of ``word``),
        ``"position"`` (1-based index) and ``"pattern"`` (the matching
        stress marker). The list is as long as the shorter of the pieces
        and the stress markers, so it is empty when there are no markers.
    """
    stresses = get_stress(phonetic_transcription.split())

    # Heuristic: every written vowel is taken to close one syllable.
    vowel_positions = [
        pos for pos, letter in enumerate(word) if letter.upper() in "AEIOU"
    ]

    # Keep one cut fewer than the number of stress markers; the cut lands
    # just after the vowel.
    cut_points = [pos + 1 for pos in vowel_positions[:len(stresses) - 1]]
    boundaries = [0, *cut_points, len(word)]
    pieces = [word[start:end] for start, end in zip(boundaries, boundaries[1:])]

    # zip() stops at the shorter sequence, pairing each piece with a marker.
    return [
        {"syllable": piece, "position": number, "pattern": stress}
        for number, (piece, stress) in enumerate(zip(pieces, stresses), start=1)
    ]
# Parse the dictionary file: each entry line is a word followed by its
# whitespace-separated phonemes. ISO-8859-1 decodes every byte, so reading
# never fails on non-ASCII entries.
with open(cmu_dict_file, "r", encoding="ISO-8859-1") as file:
    for line in file:
        parts = line.split()
        # Skip blank lines (indexing parts[0] would raise IndexError) and the
        # ";;;" comment lines of the CMUdict header, which are not entries.
        # Only the exact ";;;" prefix is tested because punctuation entries
        # such as ";SEMI-COLON" legitimately start with a single ";".
        if not parts or parts[0].startswith(";;;"):
            continue
        word = parts[0]
        phonemes = parts[1:]
        transcription = ' '.join(phonemes)
        syllable_dicts = cmu_syllable_approximation(word, transcription)
        # Use the phoneme list rather than the raw string, so the overall
        # pattern is derived exactly like the per-segment patterns (only a
        # digit that ends a phoneme counts as a stress marker).
        syllables = "".join(get_stress(phonemes))
        # shortened key names for smaller file size
        word_data[word] = {
            "sc": len(syllable_dicts),  # sc for syllable_count (number of segments)
            "sp": syllables,  # sp for syllable_pattern
            "seg": syllable_dicts,  # seg for segments
        }

# Write the collected records as a single JSON object keyed by word.
output_json_file = "cmu_en.json"
with open(output_json_file, "w") as json_file:
    json.dump(word_data, json_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment