rbreaves/split_audiobook_chapters.py

## split_audiobook_chapters.py
#!/usr/bin/env python

"""
Script to split audiobook chapters into separate files using metadata
"""

from __future__ import print_function

import os
import re
import string
import codecs
import subprocess
from xml.etree import ElementTree
from argparse import ArgumentParser

# File extensions (lowercase, including the leading dot) that get_files() accepts.
ALLOWED_FILETYPES = ['.mp3']
# Captures an optional leading run of non-digit characters followed by an
# optional run of digits.
# NOTE(review): not referenced anywhere in the visible source -- confirm it is
# still needed before relying on it.
CHAPTER = re.compile(r'^([\D ]*\d*)')


def get_file_data(filename, verbose):
    """
    Get chapter info for file

    Runs ``ffprobe -show_format`` on the file and uses the first output line
    that contains both a ``<Name>`` and a ``<Time>`` tag. That line is
    expected to have the form ``TAG:<key>=<xml>``.

    Args:
        :str filename: the filename of the file to split
        :boolean verbose: true if you want to print errors, false otherwise

    Returns:
        :str: the key which contains the chapter data
        :Element: Element of the data for the chapter

        Both values are None when ffprobe cannot be run or fails, when no
        chapter metadata is found, or when the metadata is not valid XML.
    """
    cmd = ['ffprobe', '-show_format', '-pretty', '-loglevel', 'quiet', filename]
    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = process.communicate()
    except (OSError, ValueError, subprocess.CalledProcessError) as error:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(error)
        return None, None

    # '-loglevel quiet' silences stderr, so the exit status has to be checked
    # as well to notice a failed probe
    if err or process.returncode != 0:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(err)
        return None, None

    # Search the raw bytes: this behaves the same on Python 2 and 3 and keeps
    # UTF-8 chapter names intact for the XML parser
    for line in out.splitlines():
        line = line.strip()
        if b'<Name>' not in line or b'<Time>' not in line:
            continue

        # split on the first '=' only -- the XML payload may contain '=' itself
        tag, _, payload = line.partition(b'=')
        key = tag[4:]  # drop the 4-character 'TAG:' prefix
        if not isinstance(key, str):
            key = key.decode('utf-8', 'replace')

        try:
            return key, ElementTree.fromstring(payload)
        except ElementTree.ParseError as error:
            print('\tSkipping. Chapter metadata is not valid XML...')
            if verbose:
                print(error)
            return None, None

    print('\tSkipping. No chapter metadata found...')
    return None, None


def check_time(time):
    """
    Convert time to be in xx:xx:xx.xxx format -- 1:00:00.000 instead of 60:00.000

    A value that already has an hours field (more than one ':') is returned
    as is. A value without any ':' is treated as seconds only.

    Args:
        :str time: time to be checked

    Returns:
        str: fixed time
    """
    time = time.strip()
    split_time = time.split(':')
    if len(split_time) > 2:
        return time

    seconds = split_time[-1]
    # a seconds-only value has no minutes field to read
    total_minutes = int(split_time[0]) if len(split_time) == 2 else 0
    hours, minutes = divmod(total_minutes, 60)
    return '{0:02d}:{1:02d}:{2}'.format(hours, minutes, seconds)

def process_chapter_data(xml):
    """
    Gets chapter data from xml

    Args:
        :ElementTree xml: xml data containing Markers with Name and Time tags
                          i.e <Marker><Name>{some_name}</Name><Time>{some_time}</Time></Marker>

    Returns:
        :list: list of dicts with name, title, start_time, and end_time
               information; the last object has end_time = None
    """
    data = []
    seen_names = set()
    for marker in xml.findall('.//Marker'):
        title = marker.findtext('Name')
        raw_time = marker.findtext('Time')
        # a marker without a usable name or time cannot become a chapter
        if not title or not raw_time:
            continue

        # for some reason, there are some chapters with repeating names and incorrect time data
        # looks like the first one is usually the right one so we'll skip any subsequent ones
        name = clean_chapter_name(title)
        if not name or name in seen_names:
            continue

        time = check_time(raw_time)

        # add start time as end time for previous chapter
        if data:
            data[-1]['end_time'] = time

        data.append({'name': name, 'start_time': time, 'title': title})
        seen_names.add(name)

    # the final chapter runs to the end of the file
    if data:
        data[-1]['end_time'] = None
    return data

def clean_chapter_name(name):
    """
    Turn a chapter name into a filename-friendly token

    Characters outside string.printable are dropped, then every run of
    non-word characters is collapsed into a single underscore.

    Args:
        :str name: name to clean

    Returns:
        :str: cleaned name
    """
    kept_chars = [char for char in name if char in string.printable]
    printable_name = ''.join(kept_chars)
    return re.sub(r'\W+', '_', printable_name)


def split_into_chapters(filename, key, chapter_data, verbose):
    """
    Split file into files by chapter name

    One ffmpeg stream-copy is run per chapter. Each output file gets the
    chapter title as its ``title`` metadata and has the chapter-marker tag
    (``key``) overwritten with ``-1``.

    Args:
        :str filename: the filename of the file to split
        :str key: key where the metadata was found
        :list chapter_data: list of dicts with name, title, start_time, and
                            end_time information
        :boolean verbose: true if you want to print the ffmpeg command and
                          errors, false otherwise

    Returns:
        :boolean: True if successful, False otherwise
    """
    # NOTE(review): the template ignores field {0} (the source path without
    # extension), so output names are just '<index><chapter name><ext>' and are
    # resolved relative to the current working directory -- confirm intended.
    new_file_name = '{1:02d}{2}{3}'
    success = True
    split_filename = os.path.splitext(filename)

    for i, chapter in enumerate(chapter_data, start=1):
        new_file = new_file_name.format(split_filename[0], i, chapter['name'], split_filename[1])

        # keep the title as text (works on Python 2 and 3), then drop double
        # quotes and any non-ASCII characters
        title = chapter['title'].strip().replace('"', '')
        title = re.sub(r'[^\x00-\x7f]', r'', title)

        cmd = ['ffmpeg', '-loglevel', 'error', '-i', filename, '-ss', chapter['start_time']]
        if chapter['end_time']:
            cmd += ['-to', chapter['end_time']]
        cmd += ['-metadata', 'title={0}'.format(title),
                '-metadata', '{0}=-1'.format(key),
                '-c', 'copy', new_file]
        if verbose:
            print(cmd)

        try:
            process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
            err = process.communicate()[1]
        except (OSError, ValueError, subprocess.CalledProcessError) as error:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(error)
            success = False
            continue

        if err or process.returncode != 0:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(err)
            success = False
            continue

        # only report a file as written when ffmpeg actually succeeded
        print('\tWrote new file for {0} to {1}'.format(chapter['name'], new_file))
    return success

def get_files_from_dir(directory, recursive):
    """
    List the files found in a directory

    No filetype filtering happens here; every regular file is returned.

    Args:
        :str directory: directory to look in
        :bool recursive: True if you want to recursively check the directories

    Returns:
        list: paths of the files in the directory (and, when recursive, in
              all of its subdirectories)
    """
    print('Getting files to process...')

    if recursive:
        # os.walk already yields only files in its third element
        return [os.path.join(root, name)
                for root, _, names in os.walk(directory)
                for name in names]

    # os.listdir also returns directories, so keep regular files only
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

def get_files(input_list, recursive):
    """
    Build the list of files to process

    Each input that is a file is considered directly; each input that is a
    directory is expanded with get_files_from_dir. Inputs that are neither
    are ignored. Only files whose extension is in ALLOWED_FILETYPES are kept,
    and a path is never listed twice.

    Args:
        :list input_list: list of files and folders to check
        :bool recursive: True if you want to recursively check the directories

    Returns:
        list: files to process
    """
    selected = []

    for path in input_list:
        if os.path.isfile(path):
            candidates = [path]
        elif os.path.isdir(path):
            candidates = get_files_from_dir(path, recursive)
        else:
            continue

        for candidate in candidates:
            extension = os.path.splitext(candidate)[1].lower()
            # keep supported filetypes that are not already on our list
            if extension in ALLOWED_FILETYPES and candidate not in selected:
                selected.append(candidate)

    return selected

def get_arguments():
    """
    Parse the command line

    Accepts one or more input paths plus three on/off switches
    (delete original, recursive, verbose).

    Returns:
        argparse.Namespace: parsed arguments
    """
    parser = ArgumentParser(description='Separate an audiobook into files of chapters')
    parser.add_argument(dest='input', nargs='+', help='Input file or directory')

    # (short flag, long flag, destination attribute, help text)
    switches = (
        ('-d', '--delete-original', 'delete', 'If files are split, delete original'),
        ('-r', '--recursive', 'recursive', 'Recurse folders'),
        ('-v', '--verbose', 'verbose', 'Print errors'),
    )
    for short_flag, long_flag, destination, help_text in switches:
        parser.add_argument(short_flag, long_flag, dest=destination,
                            help=help_text, action='store_true')

    return parser.parse_args()


def check_ffbinaries():
    """
    Check if ffprobe and ffmpeg are on system and in PATH

    Each binary is started once with ``-version``. A message is printed for
    every binary that cannot be started.

    Returns:
        :boolean: True if both binaries could be started, False otherwise
    """
    found = True
    for binary in ('ffprobe', 'ffmpeg'):
        try:
            process = subprocess.Popen([binary, '-version'],
                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # drain the pipes and wait, so the probe process is not left behind
            process.communicate()
        except (OSError, IOError):
            # a missing executable raises OSError; on Python 2 that is not an IOError
            print('{0} was not found on system. '
                  'Please install it and make sure it\'s in your PATH.'.format(binary))
            found = False

    return found


if __name__ == '__main__':
    # Script entry point: check the tools, collect the input files, then split
    # every file that carries chapter metadata for at least two chapters.
    # SystemExit is raised directly because exit() is a helper installed by
    # the site module; status 1 tells the caller that nothing was processed.
    if not check_ffbinaries():
        raise SystemExit(1)

    ARGS = get_arguments()

    FILES = get_files(ARGS.input, ARGS.recursive)
    if not FILES:
        print('No files found in: {0}'.format(ARGS.input))
        raise SystemExit(1)

    for FILE in FILES:
        print('Processing {0}...'.format(FILE))
        KEY, XML = get_file_data(FILE, ARGS.verbose)
        if XML is None:
            continue

        CHAPTER_DATA = process_chapter_data(XML)
        # no need to split books with one chapter
        if len(CHAPTER_DATA) < 2:
            print('\tSkipping. File only has one chapter...')
            continue

        # the original is only removed when every chapter was written
        if split_into_chapters(FILE, KEY, CHAPTER_DATA, ARGS.verbose) and ARGS.delete:
            print('\tDeleting {0}...'.format(FILE))
            try:
                os.remove(FILE)
            except OSError as error:
                print('\tCould not delete {0}...'.format(FILE))
                if ARGS.verbose:
                    print(error)
	#!/usr/bin/env python

	"""
	Script to split audiobook chapters into separate files using metadata
	"""

	from __future__ import print_function

	import os
	import re
	import string
	import codecs
	import subprocess
	from xml.etree import ElementTree
	from argparse import ArgumentParser

	# File extensions (lowercase, including the leading dot) that get_files() accepts.
	ALLOWED_FILETYPES = ['.mp3']
	# Captures an optional leading run of non-digit characters followed by an
	# optional run of digits. The '*' quantifiers are restored here so the
	# pattern agrees with the CHAPTER definition earlier in this file.
	# NOTE(review): not referenced anywhere in the visible source.
	CHAPTER = re.compile(r'^([\D ]*\d*)')


	def get_file_data(filename, verbose):
	    """
	    Get chapter info for file

	    Runs ``ffprobe -show_format`` on the file and uses the first output line
	    that contains both a ``<Name>`` and a ``<Time>`` tag. That line is
	    expected to have the form ``TAG:<key>=<xml>``.

	    Args:
	        :str filename: the filename of the file to split
	        :boolean verbose: true if you want to print errors, false otherwise

	    Returns:
	        :str: the key which contains the chapter data
	        :Element: Element of the data for the chapter

	        Both values are None when ffprobe cannot be run or fails, when no
	        chapter metadata is found, or when the metadata is not valid XML.
	    """
	    cmd = ['ffprobe', '-show_format', '-pretty', '-loglevel', 'quiet', filename]
	    try:
	        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        out, err = process.communicate()
	    except (OSError, ValueError, subprocess.CalledProcessError) as error:
	        print('\tSomething went wrong getting file data...')
	        if verbose:
	            print(error)
	        return None, None

	    # '-loglevel quiet' silences stderr, so the exit status has to be checked
	    # as well to notice a failed probe
	    if err or process.returncode != 0:
	        print('\tSomething went wrong getting file data...')
	        if verbose:
	            print(err)
	        return None, None

	    # Search the raw bytes: this behaves the same on Python 2 and 3 and keeps
	    # UTF-8 chapter names intact for the XML parser
	    for line in out.splitlines():
	        line = line.strip()
	        if b'<Name>' not in line or b'<Time>' not in line:
	            continue

	        # split on the first '=' only -- the XML payload may contain '=' itself
	        tag, _, payload = line.partition(b'=')
	        key = tag[4:]  # drop the 4-character 'TAG:' prefix
	        if not isinstance(key, str):
	            key = key.decode('utf-8', 'replace')

	        try:
	            return key, ElementTree.fromstring(payload)
	        except ElementTree.ParseError as error:
	            print('\tSkipping. Chapter metadata is not valid XML...')
	            if verbose:
	                print(error)
	            return None, None

	    print('\tSkipping. No chapter metadata found...')
	    return None, None


	def check_time(time):
	    """
	    Convert time to be in xx:xx:xx.xxx format -- 1:00:00.000 instead of 60:00.000

	    A value that already has an hours field (more than one ':') is returned
	    as is. A value without any ':' is treated as seconds only.

	    Args:
	        :str time: time to be checked

	    Returns:
	        str: fixed time
	    """
	    time = time.strip()
	    split_time = time.split(':')
	    if len(split_time) > 2:
	        return time

	    seconds = split_time[-1]
	    # a seconds-only value has no minutes field to read
	    total_minutes = int(split_time[0]) if len(split_time) == 2 else 0
	    hours, minutes = divmod(total_minutes, 60)
	    return '{0:02d}:{1:02d}:{2}'.format(hours, minutes, seconds)

	def process_chapter_data(xml):
	    """
	    Gets chapter data from xml

	    Args:
	        :ElementTree xml: xml data containing Markers with Name and Time tags
	                          i.e <Marker><Name>{some_name}</Name><Time>{some_time}</Time></Marker>

	    Returns:
	        :list: list of dicts with name, title, start_time, and end_time
	               information; the last object has end_time = None
	    """
	    data = []
	    seen_names = set()
	    for marker in xml.findall('.//Marker'):
	        title = marker.findtext('Name')
	        raw_time = marker.findtext('Time')
	        # a marker without a usable name or time cannot become a chapter
	        if not title or not raw_time:
	            continue

	        # for some reason, there are some chapters with repeating names and incorrect time data
	        # looks like the first one is usually the right one so we'll skip any subsequent ones
	        name = clean_chapter_name(title)
	        if not name or name in seen_names:
	            continue

	        time = check_time(raw_time)

	        # add start time as end time for previous chapter
	        if data:
	            data[-1]['end_time'] = time

	        data.append({'name': name, 'start_time': time, 'title': title})
	        seen_names.add(name)

	    # the final chapter runs to the end of the file
	    if data:
	        data[-1]['end_time'] = None
	    return data

	def clean_chapter_name(name):
	    """
	    Turn a chapter name into a filename-friendly token

	    Characters outside string.printable are dropped, then every run of
	    non-word characters is collapsed into a single underscore.

	    Args:
	        :str name: name to clean

	    Returns:
	        :str: cleaned name
	    """
	    kept_chars = [char for char in name if char in string.printable]
	    printable_name = ''.join(kept_chars)
	    return re.sub(r'\W+', '_', printable_name)


	def split_into_chapters(filename, key, chapter_data, verbose):
	    """
	    Split file into files by chapter name

	    One ffmpeg stream-copy is run per chapter. Each output file gets the
	    chapter title as its ``title`` metadata and has the chapter-marker tag
	    (``key``) overwritten with ``-1``.

	    Args:
	        :str filename: the filename of the file to split
	        :str key: key where the metadata was found
	        :list chapter_data: list of dicts with name, title, start_time, and
	                            end_time information
	        :boolean verbose: true if you want to print the ffmpeg command and
	                          errors, false otherwise

	    Returns:
	        :boolean: True if successful, False otherwise
	    """
	    # NOTE(review): the template ignores field {0} (the source path without
	    # extension), so output names are just '<index><chapter name><ext>' and are
	    # resolved relative to the current working directory -- confirm intended.
	    new_file_name = '{1:02d}{2}{3}'
	    success = True
	    split_filename = os.path.splitext(filename)

	    for i, chapter in enumerate(chapter_data, start=1):
	        new_file = new_file_name.format(split_filename[0], i, chapter['name'], split_filename[1])

	        # keep the title as text (works on Python 2 and 3), then drop double
	        # quotes and any non-ASCII characters
	        title = chapter['title'].strip().replace('"', '')
	        title = re.sub(r'[^\x00-\x7f]', r'', title)

	        cmd = ['ffmpeg', '-loglevel', 'error', '-i', filename, '-ss', chapter['start_time']]
	        if chapter['end_time']:
	            cmd += ['-to', chapter['end_time']]
	        cmd += ['-metadata', 'title={0}'.format(title),
	                '-metadata', '{0}=-1'.format(key),
	                '-c', 'copy', new_file]
	        if verbose:
	            print(cmd)

	        try:
	            process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
	            err = process.communicate()[1]
	        except (OSError, ValueError, subprocess.CalledProcessError) as error:
	            print('\tSomething went wrong splitting chapters...')
	            if verbose:
	                print(error)
	            success = False
	            continue

	        if err or process.returncode != 0:
	            print('\tSomething went wrong splitting chapters...')
	            if verbose:
	                print(err)
	            success = False
	            continue

	        # only report a file as written when ffmpeg actually succeeded
	        print('\tWrote new file for {0} to {1}'.format(chapter['name'], new_file))
	    return success

	def get_files_from_dir(directory, recursive):
	    """
	    List the files found in a directory

	    No filetype filtering happens here; every regular file is returned.

	    Args:
	        :str directory: directory to look in
	        :bool recursive: True if you want to recursively check the directories

	    Returns:
	        list: paths of the files in the directory (and, when recursive, in
	              all of its subdirectories)
	    """
	    print('Getting files to process...')

	    if recursive:
	        # os.walk already yields only files in its third element
	        return [os.path.join(root, name)
	                for root, _, names in os.walk(directory)
	                for name in names]

	    # os.listdir also returns directories, so keep regular files only
	    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
	    return [candidate for candidate in candidates if os.path.isfile(candidate)]

	def get_files(input_list, recursive):
	    """
	    Build the list of files to process

	    Each input that is a file is considered directly; each input that is a
	    directory is expanded with get_files_from_dir. Inputs that are neither
	    are ignored. Only files whose extension is in ALLOWED_FILETYPES are kept,
	    and a path is never listed twice.

	    Args:
	        :list input_list: list of files and folders to check
	        :bool recursive: True if you want to recursively check the directories

	    Returns:
	        list: files to process
	    """
	    selected = []

	    for path in input_list:
	        if os.path.isfile(path):
	            candidates = [path]
	        elif os.path.isdir(path):
	            candidates = get_files_from_dir(path, recursive)
	        else:
	            continue

	        for candidate in candidates:
	            extension = os.path.splitext(candidate)[1].lower()
	            # keep supported filetypes that are not already on our list
	            if extension in ALLOWED_FILETYPES and candidate not in selected:
	                selected.append(candidate)

	    return selected

	def get_arguments():
	    """
	    Parse the command line

	    Accepts one or more input paths plus three on/off switches
	    (delete original, recursive, verbose).

	    Returns:
	        argparse.Namespace: parsed arguments
	    """
	    parser = ArgumentParser(description='Separate an audiobook into files of chapters')
	    parser.add_argument(dest='input', nargs='+', help='Input file or directory')

	    # (short flag, long flag, destination attribute, help text)
	    switches = (
	        ('-d', '--delete-original', 'delete', 'If files are split, delete original'),
	        ('-r', '--recursive', 'recursive', 'Recurse folders'),
	        ('-v', '--verbose', 'verbose', 'Print errors'),
	    )
	    for short_flag, long_flag, destination, help_text in switches:
	        parser.add_argument(short_flag, long_flag, dest=destination,
	                            help=help_text, action='store_true')

	    return parser.parse_args()


	def check_ffbinaries():
	    """
	    Check if ffprobe and ffmpeg are on system and in PATH

	    Each binary is started once with ``-version``. A message is printed for
	    every binary that cannot be started.

	    Returns:
	        :boolean: True if both binaries could be started, False otherwise
	    """
	    found = True
	    for binary in ('ffprobe', 'ffmpeg'):
	        try:
	            process = subprocess.Popen([binary, '-version'],
	                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	            # drain the pipes and wait, so the probe process is not left behind
	            process.communicate()
	        except (OSError, IOError):
	            # a missing executable raises OSError; on Python 2 that is not an IOError
	            print('{0} was not found on system. '
	                  'Please install it and make sure it\'s in your PATH.'.format(binary))
	            found = False

	    return found


	if __name__ == '__main__':
	    # Script entry point: check the tools, collect the input files, then split
	    # every file that carries chapter metadata for at least two chapters.
	    # SystemExit is raised directly because exit() is a helper installed by
	    # the site module; status 1 tells the caller that nothing was processed.
	    if not check_ffbinaries():
	        raise SystemExit(1)

	    ARGS = get_arguments()

	    FILES = get_files(ARGS.input, ARGS.recursive)
	    if not FILES:
	        print('No files found in: {0}'.format(ARGS.input))
	        raise SystemExit(1)

	    for FILE in FILES:
	        print('Processing {0}...'.format(FILE))
	        KEY, XML = get_file_data(FILE, ARGS.verbose)
	        if XML is None:
	            continue

	        CHAPTER_DATA = process_chapter_data(XML)
	        # no need to split books with one chapter
	        if len(CHAPTER_DATA) < 2:
	            print('\tSkipping. File only has one chapter...')
	            continue

	        # the original is only removed when every chapter was written
	        if split_into_chapters(FILE, KEY, CHAPTER_DATA, ARGS.verbose) and ARGS.delete:
	            print('\tDeleting {0}...'.format(FILE))
	            try:
	                os.remove(FILE)
	            except OSError as error:
	                print('\tCould not delete {0}...'.format(FILE))
	                if ARGS.verbose:
	                    print(error)