Skip to content

Instantly share code, notes, and snippets.

@szarroug3
Last active August 15, 2018 22:50
Show Gist options
  • Save szarroug3/b5443235fb8baaf242ff6913aefe30b4 to your computer and use it in GitHub Desktop.
Save szarroug3/b5443235fb8baaf242ff6913aefe30b4 to your computer and use it in GitHub Desktop.
Split audiobook into chapters
#!/usr/bin/env python
"""
Script to split audiobook chapters into separate files using metadata
"""
from __future__ import print_function
import os
import re
import string
import subprocess
from argparse import ArgumentParser
from xml.etree import ElementTree
# File extensions (lower-case, leading dot included) that get_files() accepts
ALLOWED_FILETYPES = ['.mp3']
# Matches a leading run of non-digit characters followed by an optional run of digits
# NOTE(review): not referenced anywhere in the visible code -- confirm before removing
CHAPTER = re.compile(r'^([\D ]*\d*)')
def remove_non_ascii(string):
    """
    Remove non-ascii characters from a string

    Args:
        :str string: the string to remove non-ascii characters from; raw
            bytes (e.g. Python 3 subprocess output) are accepted as well

    Returns:
        str: string of only ascii characters
    """
    # On Python 3, iterating over bytes yields ints, which ord() rejects with
    # a TypeError. Decode instead and let the codec drop every non-ascii byte.
    # On Python 2 bytes *is* str, so this check is False and the original
    # character-by-character path below is used unchanged.
    if isinstance(string, bytes) and not isinstance(string, str):
        return string.decode('ascii', 'ignore')
    return ''.join(s for s in string if ord(s) < 128)
def get_file_data(filename, verbose):
    """
    Get chapter info for file

    Runs ffprobe on the file and looks for the first format line whose value
    contains both a <Name> and a <Time> tag.

    Args:
        :str filename: the filename of the file to split
        :boolean verbose: true if you want to print errors, false otherwise

    Returns:
        :str: the key which contains the chapter data
        :Element: Element of the data for the chapter
        (None, None) if ffprobe failed, the metadata could not be parsed or
        no chapter metadata was found
    """
    cmd = ['ffprobe', '-show_format', '-pretty', '-loglevel', 'quiet', filename]
    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = process.communicate()
    except (OSError, ValueError, subprocess.CalledProcessError) as error:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(error)
        return None, None

    if err:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(err)
        return None, None

    # Python 3 pipes hand back bytes; turn them into text before the
    # character-level filtering and the substring tests below
    if isinstance(out, bytes) and not isinstance(out, str):
        out = out.decode('utf-8', 'ignore')
    out = remove_non_ascii(out)

    for line in out.splitlines():
        if '<Name>' in line and '<Time>' in line:
            # split on the first '=' only -- the XML value may itself contain
            # '=' (e.g. in attributes) and must not be truncated
            tag, _, value = line.partition('=')
            try:
                xml = ElementTree.fromstring(value)
            except ElementTree.ParseError as error:
                print('\tSomething went wrong getting file data...')
                if verbose:
                    print(error)
                return None, None
            # drop the first four characters of the tag name
            # (presumably ffprobe's "TAG:" prefix -- TODO confirm)
            return tag[4:], xml

    print('\tSkipping. No chapter metadata found...')
    return None, None
def check_time(time):
    """
    Convert time to be in xx:xx:xx.xxx format -- 1:00:00.000 instead of 60:00.000

    Args:
        :str time: time to be checked, e.g. 'MM:SS.mmm' where MM may be 60 or more

    Returns:
        str: fixed time; values that already have an hours field, and values
        without any ':' (bare seconds), are returned unchanged
    """
    parts = time.split(':')
    # Three or more fields already carry hours. A single field has no minutes
    # to carry over (indexing it used to raise IndexError), and bare seconds
    # are a valid duration for ffmpeg, so both are passed through as they are.
    if len(parts) != 2:
        return time

    # carry whole hours out of the minutes field
    hours, minutes = divmod(int(parts[0]), 60)
    return '{0:02d}:{1:02d}:{2}'.format(hours, minutes, parts[1])
def process_chapter_data(xml):
    """
    Gets chapter data from xml

    Args:
        :ElementTree xml: xml data containing Markers with Name and Time tags
            i.e <Marker><Name>{some_name}</Name><Time>{some_time}</Time></Marker>

    Returns:
        :list: list of dicts with name, start_time, and end_time information
        the last object has end_time = None
    """
    data = []
    seen_names = set()
    for marker in xml.findall('.//Marker'):
        raw_name = marker.findtext('Name')
        raw_time = marker.findtext('Time')
        # a marker without a usable Name or Time cannot describe a chapter;
        # skip it instead of crashing on the missing text
        if not raw_name or not raw_time:
            continue

        # for some reason, there are some chapters with repeating names and incorrect time data
        # looks like the first one is usually the right one so we'll skip any subsequent ones
        name = clean_chapter_name(raw_name)
        if not name or name in seen_names:
            continue

        # only convert the time for markers that are actually kept
        time = check_time(raw_time)

        # add start time as end time for previous chapter
        if data:
            data[-1]['end_time'] = time
        data.append({'name': name, 'start_time': time})
        seen_names.add(name)

    # the final chapter runs to the end of the file
    if data:
        data[-1]['end_time'] = None
    return data
def clean_chapter_name(name):
    """
    Reduce a chapter name to word characters and underscores

    Args:
        :str name: name to clean

    Returns:
        :str: the name with every character outside string.printable dropped
        and each remaining run of non-word characters replaced by one '_'
    """
    printable_only = ''.join(char for char in name if char in string.printable)
    return re.sub(r'\W+', '_', printable_only)
def split_into_chapters(filename, key, chapter_data, verbose):
    """
    Split file into files by chapter name

    One ffmpeg stream copy is run per chapter; the output is written next to
    the input as '{base}-{index}_{chapter name}{extension}'.

    Args:
        :str filename: the filename of the file to split
        :str key: key where the metadata was found
        :list chapter_data: list of dicts with name, start_time, and end_time information
        :boolean verbose: true if you want to print errors, false otherwise

    Returns:
        :boolean: True if every chapter was written successfully, False otherwise
    """
    new_file_name = '{0}-{1:02d}_{2}{3}'
    # the base name and extension do not change between chapters
    base, extension = os.path.splitext(filename)
    success = True
    for i, chapter in enumerate(chapter_data, start=1):
        new_file = new_file_name.format(base, i, chapter['name'], extension)
        cmd = ['ffmpeg', '-loglevel', 'error', '-i', filename, '-ss', chapter['start_time']]
        # the last chapter has no end time and simply runs to the end of the file
        if chapter['end_time']:
            cmd += ['-to', chapter['end_time']]
        cmd += ['-metadata', '{0}=-1'.format(key), '-c', 'copy', new_file]

        try:
            process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
            err = process.communicate()[1]
        except (OSError, ValueError, subprocess.CalledProcessError) as error:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(error)
            success = False
            continue

        # a non-zero exit status is a failure even when nothing was written to
        # stderr; callers may delete the original based on the returned flag
        if err or process.returncode != 0:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(err)
            success = False
            continue

        # only report the file once it has actually been written
        print('\tWrote new file for {0} to {1}'.format(chapter['name'], new_file))
    return success
def get_files_from_dir(directory, recursive):
    """
    Collect the paths of the files found in a directory

    Args:
        :str directory: directory to look in
        :bool recursive: True if you want to recursively check the directories

    Returns:
        list: path of every file found, joined onto the directory it was
        found in; no filtering by file type happens here
    """
    print('Getting files to process...')

    if not recursive:
        # only the directory's immediate entries, keeping the ones that are files
        candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
        return [candidate for candidate in candidates if os.path.isfile(candidate)]

    # os.walk yields the directory itself and then every directory below it
    return [os.path.join(parent, entry)
            for parent, _, entries in os.walk(directory)
            for entry in entries]
def get_files(input_list, recursive):
    """
    Build the list of files to process

    Args:
        :list input_list: list of files and folders to check
        :bool recursive: True if you want to recursively check the directories

    Returns:
        list: files with an allowed file type, each listed once, in the
        order they were encountered
    """
    selected = []
    for path in input_list:
        # a file stands for itself, a directory for the files inside it;
        # anything else (e.g. a path that does not exist) is ignored
        if os.path.isfile(path):
            candidates = [path]
        elif os.path.isdir(path):
            candidates = get_files_from_dir(path, recursive)
        else:
            continue

        for candidate in candidates:
            extension = os.path.splitext(candidate)[1].lower()
            # keep supported file types that are not already on our list
            if extension in ALLOWED_FILETYPES and candidate not in selected:
                selected.append(candidate)
    return selected
def get_arguments():
    """
    Parse the command line arguments

    Returns:
        argparse.Namespace: parsed arguments -- `input` (list of one or more
        paths) plus the boolean flags `delete`, `recursive` and `verbose`
    """
    parser = ArgumentParser(description='Separate an audiobook into files of chapters')
    parser.add_argument(dest='input', nargs='+', help='Input file or directory')

    # (short flag, long flag, destination, help text) of every on/off switch
    switches = (
        ('-d', '--delete-original', 'delete', 'If files are split, delete original'),
        ('-r', '--recursive', 'recursive', 'Recurse folders'),
        ('-v', '--verbose', 'verbose', 'Print errors'),
    )
    for short_flag, long_flag, destination, description in switches:
        parser.add_argument(short_flag, long_flag, dest=destination,
                            help=description, action='store_true')

    return parser.parse_args()
def check_ffbinaries():
    """
    Check if ffprobe and ffmpeg are on system and in PATH

    Returns:
        bool: True if both binaries could be launched, False otherwise
    """
    found = True
    for binary in ('ffprobe', 'ffmpeg'):
        try:
            process = subprocess.Popen([binary, '-version'],
                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except OSError:
            # OSError rather than IOError: on Python 2 the two are unrelated
            # and a missing executable raises OSError; on Python 3 IOError is
            # just an alias of OSError
            print('{0} was not found on system. '
                  'Please install it and make sure it\'s in your PATH.'.format(binary))
            found = False
        else:
            # wait for the probe process and drain its pipes so that it is
            # not left behind unreaped
            process.communicate()
    return found
if __name__ == '__main__':
    # parse the command line first so that --help and usage errors work even
    # when ffmpeg/ffprobe are not installed
    ARGS = get_arguments()

    if not check_ffbinaries():
        # SystemExit does not depend on the site-provided exit() helper and
        # lets the shell see a failure status
        raise SystemExit(1)

    FILES = get_files(ARGS.input, ARGS.recursive)
    if not FILES:
        print('No files found in: {0}'.format(ARGS.input))
        raise SystemExit(1)

    for FILE in FILES:
        print('Processing {0}...'.format(FILE))
        KEY, XML = get_file_data(FILE, ARGS.verbose)
        # get_file_data has already reported why there is nothing to split
        if XML is None:
            continue

        CHAPTER_DATA = process_chapter_data(XML)

        # no need to split books with one chapter
        if len(CHAPTER_DATA) < 2:
            print('\tSkipping. File only has one chapter...')
            continue

        # the original is only removed when every chapter was written
        if split_into_chapters(FILE, KEY, CHAPTER_DATA, ARGS.verbose) and ARGS.delete:
            print('\tDeleting {0}...'.format(FILE))
            try:
                os.remove(FILE)
            except OSError as error:
                # a failed delete should not abort the remaining files
                print('\tSomething went wrong deleting {0}...'.format(FILE))
                if ARGS.verbose:
                    print(error)
@rbreaves
Copy link

rbreaves commented Jul 6, 2018

I'm unable to make a pull request for a gist, so I am just going to provide a link to my forked version. I updated line 51 to include a fix for an ASCII/Unicode error that I ran into.

https://gist.github.com/rbreaves/9fcd0b7f6c9c9f116ad5cb3a2f94b650

https://docs.python.org/2.7/howto/unicode.html#the-unicode-type

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment