Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save SharedAcademia/5e73a7a375c0409f47c5cb3d380d4910 to your computer and use it in GitHub Desktop.

Select an option

Save SharedAcademia/5e73a7a375c0409f47c5cb3d380d4910 to your computer and use it in GitHub Desktop.
Text-to-speech (TTS) script in Python that converts a .docx document to audio using Kokoro.
# tts_from_docx.py
# Before running, install the dependencies (quote the version spec so the shell doesn't treat '>' as redirection): pip install "kokoro>=0.9.2" soundfile python-docx
from kokoro import KPipeline
import soundfile as sf
from docx import Document
import numpy as np
import os
# CONFIGURATION
##########################################
DOCX_FILE = "abc.docx" # Path to your input file
OUTPUT_DIR = "audio_parts" # Folder for per-chunk WAV files (created if missing, deleted at the end)
FINAL_AUDIO = "audio.wav" # Final combined output file
LANG_CODE = 'a' # Kokoro language code — NOTE(review): per Kokoro's docs 'a' means American English, not "auto"; confirm
VOICE = 'en_heart' # Kokoro voice name — NOTE(review): Kokoro's bundled voices use names like 'af_heart'; verify 'en_heart' is valid
CHUNK_SIZE = 500 # Max characters of text per TTS request
SAMPLE_RATE = 24000 # Sample rate (Hz) assumed for all generated audio
# 1. Load DOCX
##########################################
def extract_text_from_docx(docx_path):
    """Return the text of every non-blank paragraph in the document, newline-joined."""
    document = Document(docx_path)
    kept = [para.text for para in document.paragraphs if para.text.strip()]
    return "\n".join(kept)
# Read the source document once; all later steps work on this single string.
print(f"Reading DOCX: {DOCX_FILE}")
full_text = extract_text_from_docx(DOCX_FILE)
print(f"Total text length: {len(full_text)} characters")
# 2. Split text into chunks
##########################################
def split_text(text, max_chars=None):
    """Split *text* into chunks of at most ``max_chars`` characters.

    Newline-separated paragraphs are packed greedily into chunks. A single
    paragraph longer than ``max_chars`` is hard-split so no chunk ever
    exceeds the limit (the original code emitted oversized chunks here,
    and appended an empty-string chunk when the very first paragraph was
    already over the limit).

    Args:
        text: The full input text; paragraphs separated by "\n".
        max_chars: Maximum chunk length; defaults to CHUNK_SIZE
            (resolved at call time for backward compatibility).

    Returns:
        List of non-empty, stripped text chunks.
    """
    if max_chars is None:
        max_chars = CHUNK_SIZE
    paragraphs = text.split("\n")
    chunks = []
    chunk = ""
    for p in paragraphs:
        # Hard-split any paragraph that alone exceeds the limit.
        while len(p) > max_chars:
            if chunk.strip():
                chunks.append(chunk.strip())
                chunk = ""
            chunks.append(p[:max_chars])
            p = p[max_chars:]
        if len(chunk) + len(p) < max_chars:
            chunk += p + "\n"
        else:
            # Flush the current chunk, but never emit an empty one.
            if chunk.strip():
                chunks.append(chunk.strip())
            chunk = p + "\n"
    if chunk.strip():
        chunks.append(chunk.strip())
    return chunks
# Chunking keeps each TTS request small (~CHUNK_SIZE characters).
chunks = split_text(full_text)
print(f"Split into {len(chunks)} chunks of ~{CHUNK_SIZE} characters")
# 3. Create output folder
##########################################
# Create the folder for per-chunk WAVs; exist_ok avoids the
# check-then-create race of the original exists()/makedirs() pair.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# 4. Initialize Kokoro TTS
##########################################
# Build the Kokoro TTS pipeline for the configured language code.
print("Loading Kokoro pipeline...")
pipeline = KPipeline(lang_code=LANG_CODE)
# 5. Generate audio for each chunk
##########################################
# Synthesize each text chunk; a chunk may yield several audio segments,
# each saved as its own WAV and recorded in order for later concatenation.
all_audio_files = []
for i, chunk in enumerate(chunks):
    print(f"Generating audio for chunk {i+1}/{len(chunks)}")
    generator = pipeline(chunk, voice=VOICE)
    for j, (gs, ps, audio) in enumerate(generator):
        filename = os.path.join(OUTPUT_DIR, f"part_{i}_{j}.wav")
        sf.write(filename, audio, SAMPLE_RATE)
        all_audio_files.append(filename)
        # Fix: the pasted source printed the literal text "(unknown)";
        # restore the intended filename placeholder.
        print(f"Saved {filename}")
# 6. Combine all chunks into single WAV
##########################################
print("Combining audio files into one audiobook...")
# Read every part back in order and concatenate into one waveform.
# np.concatenate raises ValueError if no parts were produced.
combined_audio = []
for filename in all_audio_files:
    audio_data, sr = sf.read(filename)
    if sr != SAMPLE_RATE:
        # Fix: restore the filename placeholder lost in the paste
        # (the message previously printed the literal "(unknown)").
        raise ValueError(f"Sample rate mismatch in {filename}")
    combined_audio.append(audio_data)
final_audio_data = np.concatenate(combined_audio)
sf.write(FINAL_AUDIO, final_audio_data, SAMPLE_RATE)
print(f"🎧 Audiobook created: {FINAL_AUDIO}")
# 7. Delete individual chunk files
##########################################
print("🧹 Cleaning up individual audio chunks...")
# Best-effort removal: a failed delete is reported, never fatal.
for filename in all_audio_files:
    try:
        os.remove(filename)
        # Fix: restore the filename placeholder lost in the paste.
        print(f"🗑️ Deleted {filename}")
    except OSError as e:
        # Narrowed from `except Exception`: os.remove failures are OSError;
        # anything else would indicate a real bug and should propagate.
        print(f"⚠️ Could not delete {filename}: {e}")
print("✅ All done! Created by SharedAcademia.org")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment