Last active
August 15, 2018 22:50
-
-
Save szarroug3/b5443235fb8baaf242ff6913aefe30b4 to your computer and use it in GitHub Desktop.
Split audiobook into chapters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Script to split audiobook chapters into separate files using metadata | |
""" | |
from __future__ import print_function | |
import os | |
import re | |
import string | |
import subprocess | |
from argparse import ArgumentParser | |
from xml.etree import ElementTree | |
# Lower-case file extensions (including the leading dot) that are eligible for splitting.
ALLOWED_FILETYPES = [
    '.mp3',
]

# Captures a leading run of non-digit characters followed by an optional run of digits,
# e.g. the "Chapter 12" prefix of "Chapter 12 - Title".
CHAPTER = re.compile(
    r'^([\D ]*\d*)'
)
def remove_non_ascii(string):
    """
    Remove non-ascii characters from a string

    Byte strings are accepted as well: on Python 3 iterating over ``bytes``
    yields ints, for which ``ord()`` raises TypeError, so they are decoded
    as ASCII with every undecodable byte dropped instead.

    Args:
        :str string: the string (or Python 3 bytes) to remove non-ascii characters from
    Returns:
        str: string of only ascii characters
    """
    # On Python 2 ``bytes is str``, so plain strings keep using the
    # character-by-character filter below and the return type is unchanged.
    if bytes is not str and isinstance(string, (bytes, bytearray)):
        return bytes(string).decode('ascii', 'ignore')
    return ''.join(s for s in string if ord(s) < 128)
def get_file_data(filename, verbose):
    """
    Get chapter info for file

    Runs ``ffprobe -show_format`` on the file and looks for the first output
    line that contains both a ``<Name>`` and a ``<Time>`` tag. That line is
    expected to look like ``<prefix><key>=<xml>``.

    Args:
        :str filename: the filename of the file to split
        :boolean verbose: true if you want to print errors, false otherwise
    Returns:
        :str: the key which contains the chapter data
        :Element: Element of the data for the chapter
        Both values are None when ffprobe fails or no usable chapter metadata is found.
    """
    cmd = ['ffprobe', '-show_format', '-pretty', '-loglevel', 'quiet', filename]
    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = process.communicate()
    except (OSError, ValueError, subprocess.CalledProcessError) as error:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(error)
        return None, None

    if err:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(err)
        return None, None

    # Python 3 hands back bytes; turn them into text before any string matching.
    if not isinstance(out, str):
        out = out.decode('ascii', 'ignore')
    out = remove_non_ascii(out)

    for line in out.splitlines():
        if '<Name>' not in line or '<Time>' not in line:
            continue
        # Split on the FIRST '=' only: the XML value may itself contain '='
        # (e.g. attributes), which must not be cut off.
        tag, _, value = line.partition('=')
        try:
            # The first four characters of the tag are dropped -- presumably the
            # "TAG:" prefix ffprobe puts in front of format tags (confirm).
            return tag[4:], ElementTree.fromstring(value)
        except ElementTree.ParseError as error:
            print('\tSomething went wrong parsing chapter metadata...')
            if verbose:
                print(error)

    print('\tSkipping. No chapter metadata found...')
    return None, None
def check_time(time):
    """
    Convert time to be in xx:xx:xx.xxx format -- 1:00:00.000 instead of 60:00.000

    A value that already has an hours component (more than one ':') is
    returned untouched. A bare seconds value without any ':' is treated as
    zero minutes instead of raising IndexError.

    Args:
        :str time: time to be checked
    Returns:
        str: fixed time
    """
    parts = time.split(':')
    if len(parts) > 2:
        return time

    # With a single component there is no minutes field at all.
    total_minutes = int(parts[-2]) if len(parts) == 2 else 0
    hours, minutes = divmod(total_minutes, 60)
    # Seconds are passed through verbatim so fractional digits are preserved.
    return '{0:02d}:{1:02d}:{2}'.format(hours, minutes, parts[-1])
def process_chapter_data(xml):
    """
    Gets chapter data from xml

    Args:
        :ElementTree xml: xml data containing Markers with Name and Time tags
            i.e <Marker><Name>{some_name}</Name><Time>{some_time}</Time></Marker>
    Returns:
        :list: list of dicts with name, start_time, and end_time information
            the last object has end_time = None
    """
    data = []
    seen_names = set()
    for marker in xml.findall('.//Marker'):
        name_node = marker.find('Name')
        time_node = marker.find('Time')
        # a marker without a Name or without a usable Time cannot describe a chapter
        if name_node is None or time_node is None or not time_node.text:
            continue

        # for some reason, there are some chapters with repeating names and incorrect time data
        # looks like the first one is usually the right one so we'll skip any subsequent ones
        name = clean_chapter_name(name_node.text or '')
        if not name or name in seen_names:
            continue

        # only parse the time once we know the marker is kept, so bad time
        # data on a discarded duplicate cannot abort the whole file
        time = check_time(time_node.text)

        # add start time as end time for previous chapter
        if data:
            data[-1]['end_time'] = time
        data.append({'name': name, 'start_time': time})
        seen_names.add(name)

    # the final chapter runs to the end of the file
    if data:
        data[-1]['end_time'] = None
    return data
def clean_chapter_name(name):
    """
    Turn a raw chapter name into something safe to embed in a filename

    Characters outside ``string.printable`` are dropped first, then every
    run of non-word characters is collapsed into a single underscore.

    Args:
        :str name: name to clean
    Returns:
        :str: cleaned name
    """
    printable_only = ''.join(char for char in name if char in string.printable)
    return re.sub(r'\W+', '_', printable_only)
def split_into_chapters(filename, key, chapter_data, verbose):
    """
    Split file into files by chapter name

    One ffmpeg invocation is made per chapter, copying the stream between
    the chapter's start and end time into
    ``<original name>-<NN>_<chapter name><original extension>``.

    Args:
        :str filename: the filename of the file to split
        :str key: key where the metadata was found
        :list chapter_data: list of dicts with name, start_time, and end_time information
        :boolean verbose: true if you want to print errors, false otherwise
    Returns:
        :boolean: True if successful, False otherwise
    """
    new_file_name = '{0}-{1:02d}_{2}{3}'
    # the base name and extension are the same for every chapter
    base, extension = os.path.splitext(filename)
    success = True

    for i, chapter in enumerate(chapter_data, start=1):
        new_file = new_file_name.format(base, i, chapter['name'], extension)

        cmd = ['ffmpeg', '-loglevel', 'error', '-i', filename, '-ss', chapter['start_time']]
        # the last chapter has no end time and simply runs to the end of the input
        if chapter['end_time']:
            cmd += ['-to', chapter['end_time']]
        # overwrite the chapter metadata tag in the output file and copy without re-encoding
        cmd += ['-metadata', '{0}=-1'.format(key), '-c', 'copy', new_file]

        failure = None
        try:
            process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
            err = process.communicate()[1]
            # anything on stderr (log level is "error") or a non-zero exit means failure
            if err or process.returncode != 0:
                failure = err or 'ffmpeg exited with status {0}'.format(process.returncode)
        except (OSError, ValueError, subprocess.CalledProcessError) as error:
            failure = error

        if failure is not None:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(failure)
            success = False
            continue

        # only announce the file when ffmpeg actually produced it
        print('\tWrote new file for {0} to {1}'.format(chapter['name'], new_file))

    return success
def get_files_from_dir(directory, recursive):
    """
    List the files found in a directory

    No filetype filtering happens here; every regular file is returned.

    Args:
        :str directory: directory to look in
        :bool recursive: True to descend into sub-directories (os.walk),
            False to look at the directory's direct entries only (os.listdir)
    Returns:
        list: paths of the files found, each joined onto its containing directory
    """
    print('Getting files to process...')

    if not recursive:
        # direct children only; anything that is not a regular file is left out
        candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
        return [candidate for candidate in candidates if os.path.isfile(candidate)]

    found = []
    for parent, _, entries in os.walk(directory):
        found.extend(os.path.join(parent, entry) for entry in entries)
    return found
def get_files(input_list, recursive):
    """
    Get list of files to process

    Files are taken as-is, directories are expanded with get_files_from_dir.
    Only files whose (lower-cased) extension is in ALLOWED_FILETYPES are
    kept, each path at most once, in the order they were first seen.
    Inputs that are neither an existing file nor a directory are ignored.

    Args:
        :list input_list: list of files and folders to check
        :bool recursive: True if you want to recursively check the directories
    Returns:
        list: files to process
    """
    filepaths = []
    # mirrors filepaths for O(1) "already on our list" checks
    seen = set()

    for path in input_list:
        if os.path.isfile(path):
            candidates = [path]
        elif os.path.isdir(path):
            candidates = get_files_from_dir(path, recursive)
        else:
            continue

        for candidate in candidates:
            # check filetype
            if os.path.splitext(candidate)[1].lower() not in ALLOWED_FILETYPES:
                continue
            # check that file is not already on our list
            if candidate in seen:
                continue
            seen.add(candidate)
            filepaths.append(candidate)

    return filepaths
def get_arguments():
    """
    Build the command line parser and parse the process arguments

    Accepts one or more input paths plus the boolean switches
    -d/--delete-original, -r/--recursive and -v/--verbose.

    Returns:
        argparse.Namespace: parsed arguments (input, delete, recursive, verbose)
    """
    parser = ArgumentParser(description='Separate an audiobook into files of chapters')
    parser.add_argument(dest='input', help='Input file or directory', nargs='+')

    # (short flag, long flag, destination attribute, help text)
    switches = (
        ('-d', '--delete-original', 'delete', 'If files are split, delete original'),
        ('-r', '--recursive', 'recursive', 'Recurse folders'),
        ('-v', '--verbose', 'verbose', 'Print errors'),
    )
    for short_flag, long_flag, destination, help_text in switches:
        parser.add_argument(short_flag, long_flag, dest=destination,
                            help=help_text, action='store_true')

    return parser.parse_args()
def check_ffbinaries():
    """
    Check if ffprobe and ffmpeg are on system and in PATH

    Each binary is started once without arguments; failing to start it is
    taken to mean it is not installed. Both binaries are always checked so
    that every missing one is reported.

    Returns:
        :boolean: True if both binaries could be started, False otherwise
    """
    found = True
    for binary in ('ffprobe', 'ffmpeg'):
        try:
            process = subprocess.Popen(binary, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except (IOError, OSError):
            # Python 2 raises OSError (not IOError) for a missing executable;
            # on Python 3 the two names are the same class.
            print('{0} was not found on system. '
                  'Please install it and make sure it\'s in your PATH.'.format(binary))
            found = False
        else:
            # drain the pipes and reap the child so no process or descriptors are left behind
            process.communicate()
    return found
if __name__ == '__main__':
    # sys.exit is used instead of the interactive helper exit(), which is only
    # injected by the site module and is not guaranteed to exist.
    import sys

    # both external tools are required; bail out with a failure status without them
    if not check_ffbinaries():
        sys.exit(1)

    ARGS = get_arguments()
    FILES = get_files(ARGS.input, ARGS.recursive)
    if not FILES:
        print('No files found in: {0}'.format(ARGS.input))
        sys.exit(1)

    for FILE in FILES:
        print('Processing {0}...'.format(FILE))
        KEY, XML = get_file_data(FILE, ARGS.verbose)
        # no chapter metadata (or ffprobe failed): nothing to split
        if XML is None:
            continue

        CHAPTER_DATA = process_chapter_data(XML)
        # no need to split books with one chapter
        if len(CHAPTER_DATA) < 2:
            print('\tSkipping. File only has one chapter...')
            continue

        # the original is only removed when every chapter was written successfully
        if split_into_chapters(FILE, KEY, CHAPTER_DATA, ARGS.verbose) and ARGS.delete:
            print('\tDeleting {0}...'.format(FILE))
            os.remove(FILE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm unable to make a pull request for a gist, so I am just going to provide a link to my forked version. I updated line 51 to include a fix for an ASCII/Unicode error that I ran into.
https://gist.github.com/rbreaves/9fcd0b7f6c9c9f116ad5cb3a2f94b650
https://docs.python.org/2.7/howto/unicode.html#the-unicode-type