Skip to content

Instantly share code, notes, and snippets.

@rbreaves
Forked from szarroug3/split_audiobook_chapters.py
Last active November 24, 2022 07:17
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rbreaves/9fcd0b7f6c9c9f116ad5cb3a2f94b650 to your computer and use it in GitHub Desktop.
Save rbreaves/9fcd0b7f6c9c9f116ad5cb3a2f94b650 to your computer and use it in GitHub Desktop.
Split audiobook into chapters
#!/usr/bin/env python
"""
Script to split audiobook chapters into separate files using metadata
"""
from __future__ import print_function
import os
import re
import string
import codecs
import subprocess
from xml.etree import ElementTree
from argparse import ArgumentParser
ALLOWED_FILETYPES = ['.mp3']
CHAPTER = re.compile(r'^([\D ]*\d*)')
def get_file_data(filename, verbose):
    """
    Get chapter info for file

    Runs ``ffprobe -show_format`` on the file and returns the first
    ``key=value`` line whose value contains both ``<Name>`` and ``<Time>``
    tags and parses as XML.

    Args:
        :str filename: the filename of the file to split
        :boolean verbose: true if you want to print errors, false otherwise
    Returns:
        :str: the key which contains the chapter data
        :Element: Element of the data for the chapter
        Both values are None if ffprobe could not be run, reported an error,
        or no parsable chapter metadata was found.
    """
    cmd = ['ffprobe', '-show_format', '-pretty', '-loglevel', 'quiet', filename]
    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = process.communicate()
    except (OSError, ValueError, subprocess.CalledProcessError) as error:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(error)
        return None, None
    if err:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(err)
        return None, None

    # Stay in bytes while scanning: this behaves the same on Python 2 and 3 and
    # lets the XML parser deal with the encoding of non-ASCII chapter names.
    for line in out.splitlines():
        line = line.strip()
        if b'<Name>' not in line or b'<Time>' not in line:
            continue
        # Split on the FIRST '=' only -- the XML value may contain '=' itself.
        raw_key, _, raw_xml = line.partition(b'=')
        try:
            element = ElementTree.fromstring(raw_xml)
        except ElementTree.ParseError as error:
            # Not usable chapter data; keep looking at the remaining lines.
            if verbose:
                print(error)
            continue
        key = raw_key.decode('utf-8', 'replace')
        # The key is expected to carry a 'TAG:' prefix; only strip it when present.
        if key.startswith('TAG:'):
            key = key[4:]
        return key, element
    print('\tSkipping. No chapter metadata found...')
    return None, None
def check_time(time):
    """
    Convert time to be in xx:xx:xx.xxx format -- 1:00:00.000 instead of 60:00.000

    A value that already has an hours field (two or more colons) is returned
    unchanged. A bare seconds value (no colon) is treated as zero minutes.

    Args:
        :str time: time to be checked
    Returns:
        str: fixed time
    """
    parts = time.split(':')
    if len(parts) > 2:
        return time
    seconds = parts[-1]
    # Without a colon there is no minutes field at all, so default it to 0
    # instead of indexing past the start of the list.
    total_minutes = int(parts[0]) if len(parts) == 2 else 0
    hours, minutes = divmod(total_minutes, 60)
    return '{0:02d}:{1:02d}:{2}'.format(hours, minutes, seconds)
def process_chapter_data(xml):
    """
    Gets chapter data from xml

    Args:
        :ElementTree xml: xml data containing Markers with Name and Time tags
            i.e <Marker><Name>{some_name}</Name><Time>{some_time}</Time></Marker>
    Returns:
        :list: list of dicts with name, title, start_time, and end_time information
            the last object has end_time = None
    """
    chapters = []
    seen_names = set()
    for marker in xml.findall('.//Marker'):
        title = marker.findtext('Name')
        raw_time = marker.findtext('Time')
        # a marker without a name or a time cannot become a chapter
        if not title or not raw_time:
            continue

        # for some reason, there are some chapters with repeating names and incorrect time data
        # looks like the first one is usually the right one so we'll skip any subsequent ones
        name = clean_chapter_name(title)
        if not name or name in seen_names:
            continue

        # only convert the time once we know the marker is kept
        time = check_time(raw_time)

        # add start time as end time for previous chapter
        if chapters:
            chapters[-1]['end_time'] = time

        chapters.append({'name': name, 'start_time': time, 'title': title})
        seen_names.add(name)

    # the final chapter runs to the end of the file
    if chapters:
        chapters[-1]['end_time'] = None
    return chapters
def clean_chapter_name(name):
    """
    Turn a chapter name into a token built only from word characters

    Characters outside string.printable are dropped first, then every run of
    non-word characters is collapsed into a single underscore.

    Args:
        :str name: name to clean
    Returns:
        :str: cleaned name
    """
    printable_only = ''.join(char for char in name if char in string.printable)
    return re.sub(r'\W+', '_', printable_only)
def split_into_chapters(filename, key, chapter_data, verbose):
    """
    Split file into files by chapter name

    Each chapter is written next to the source file as
    ``{chapter number}{chapter name}{original extension}`` using an ffmpeg
    stream copy. The chapter title (reduced to ASCII) is stored as the
    ``title`` metadata and the metadata entry named by ``key`` is set to -1.

    Args:
        :str filename: the filename of the file to split
        :str key: key where the metadata was found
        :list chapter_data: list of dicts with name, title, start_time, and end_time information
        :boolean verbose: true if you want to print the ffmpeg command and errors, false otherwise
    Returns:
        :boolean: True if successful, False otherwise
    """
    # Loop-invariant pieces of the output path. Keeping the source directory
    # makes the chapters land beside the original instead of in the current
    # working directory.
    directory = os.path.dirname(filename)
    extension = os.path.splitext(filename)[1]
    success = True
    for i, chapter in enumerate(chapter_data, start=1):
        new_file = os.path.join(
            directory, '{0:02d}{1}{2}'.format(i, chapter['name'], extension))
        cmd = ['ffmpeg', '-loglevel', 'error', '-i', filename, '-ss', chapter['start_time']]
        # the last chapter has no end time and simply runs to the end of the file
        if chapter['end_time']:
            cmd += ['-to', chapter['end_time']]
        # Work on text (not encoded bytes) so this runs on Python 2 and 3 alike.
        title = chapter['title'].strip().replace('"', '')
        title = re.sub(r'[^\x00-\x7f]', r'', title)
        cmd += ['-metadata', 'title={0}'.format(title),
                '-metadata', '{0}=-1'.format(key),
                '-c', 'copy', new_file]
        if verbose:
            print(cmd)
        try:
            process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
            err = process.communicate()[1]
        except (OSError, ValueError, subprocess.CalledProcessError) as error:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(error)
            success = False
            continue
        # treat both stderr output and a non-zero exit status as failure
        if err or process.returncode != 0:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(err)
            success = False
            continue
        print('\tWrote new file for {0} to {1}'.format(chapter['name'], new_file))
    return success
def get_files_from_dir(directory, recursive):
    """
    Collect the paths of the files found in a directory

    Args:
        :str directory: directory to look in
        :bool recursive: True to descend into sub-directories with os.walk,
            False to list only the directory itself with os.listdir
    Returns:
        list: paths of the files found, each joined onto its containing directory
    """
    print('Getting files to process...')

    if not recursive:
        # flat listing: keep regular files only
        candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
        return [candidate for candidate in candidates if os.path.isfile(candidate)]

    # recursive listing: every file os.walk reports, at any depth
    return [
        os.path.join(parent, entry)
        for parent, _, entries in os.walk(directory)
        for entry in entries
    ]
def get_files(input_list, recursive):
    """
    Get list of files to process

    Paths that are neither an existing file nor an existing directory are
    ignored. Only files whose lowercased extension is in ALLOWED_FILETYPES
    are kept, and each path is returned at most once, in first-seen order.

    Args:
        :list input_list: list of files and folders to check
        :bool recursive: True if you want to recursively check the directories
    Returns:
        list: files to process
    """
    filepaths = []
    # set gives O(1) duplicate checks; the list keeps first-seen order
    seen = set()
    for path in input_list:
        if os.path.isfile(path):
            candidates = [path]
        elif os.path.isdir(path):
            candidates = get_files_from_dir(path, recursive)
        else:
            continue

        for candidate in candidates:
            # check filetype
            if os.path.splitext(candidate)[1].lower() not in ALLOWED_FILETYPES:
                continue
            # check that file is not already on our list
            if candidate in seen:
                continue
            seen.add(candidate)
            filepaths.append(candidate)
    return filepaths
def get_arguments():
    """
    Build the command line parser and parse the arguments

    Returns:
        argparse.Namespace: parsed arguments with the attributes
            input (list), delete, recursive and verbose (booleans)
    """
    parser = ArgumentParser(description='Separate an audiobook into files of chapters')
    parser.add_argument(dest='input', help='Input file or directory', nargs='+')

    # every optional switch is a plain on/off flag
    switches = (
        ('-d', '--delete-original', 'delete', 'If files are split, delete original'),
        ('-r', '--recursive', 'recursive', 'Recurse folders'),
        ('-v', '--verbose', 'verbose', 'Print errors'),
    )
    for short_flag, long_flag, target, description in switches:
        parser.add_argument(short_flag, long_flag, dest=target,
                            help=description, action='store_true')

    return parser.parse_args()
def check_ffbinaries():
    """
    Check if ffprobe and ffmpeg are on system and in PATH

    Each binary is started with ``-version`` and waited on, so no child
    process or pipe is left behind.

    Returns:
        :boolean: True if both binaries could be started, False otherwise
    """
    found = True
    for binary in ('ffprobe', 'ffmpeg'):
        try:
            process = subprocess.Popen([binary, '-version'],
                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except (IOError, OSError):
            # Python 2 raises OSError (not IOError) for a missing executable;
            # on Python 3 the two names are aliases.
            print('{0} was not found on system. '
                  'Please install it and make sure it\'s in your PATH.'.format(binary))
            found = False
        else:
            # drain the pipes and reap the child
            process.communicate()
    return found
if __name__ == '__main__':
    # Script entry point: verify the ff* tools, collect the input files, then
    # split every file that carries more than one chapter marker.
    if not check_ffbinaries():
        # SystemExit works without the site-provided exit() helper and
        # reports the failure to the calling shell
        raise SystemExit(1)

    ARGS = get_arguments()
    FILES = get_files(ARGS.input, ARGS.recursive)
    if not FILES:
        print('No files found in: {0}'.format(ARGS.input))
        raise SystemExit(1)

    for FILE in FILES:
        print('Processing {0}...'.format(FILE))
        KEY, XML = get_file_data(FILE, ARGS.verbose)
        if XML is None:
            continue

        CHAPTER_DATA = process_chapter_data(XML)

        # no need to split books with one chapter
        if len(CHAPTER_DATA) < 2:
            print('\tSkipping. File only has one chapter...')
            continue

        # only delete the original when every chapter was written successfully
        if split_into_chapters(FILE, KEY, CHAPTER_DATA, ARGS.verbose) and ARGS.delete:
            print('\tDeleting {0}...'.format(FILE))
            try:
                os.remove(FILE)
            except OSError as ERROR:
                # a failed delete should not abort the remaining files
                print('\tCould not delete {0}'.format(FILE))
                if ARGS.verbose:
                    print(ERROR)
@rbreaves
Copy link
Author

rbreaves commented Jul 6, 2018

Revised to remove the book title from the individual file names; the folder name is used for that purpose. The file names are now the chapter names only, and I added the proper chapter name to the metadata as the title. The album name contains the book title.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment