arkov/info.md

## info.md

      
    Raw
  

              info.md
            
          
    Mass subtitle export

This script helps you export subtitles from your video files and separate them into different folders - especially useful for Subs2srs.
Should work on all systems, tested on Linux and Windows.
Please bear in mind that this was hastily done in a very short time for a very specific need. The code is not great but should be serviceable, especially since it is universal enough to work on all systems, using only the Python standard library.
How to use

Simply copy the script file into the folder with the video files and change only the `subtitles` variable to fit your needs (works with more than two languages!).
The key (first element) of each entry in the `subtitles` dict is the exact subtitle title to search for in your video files. The value is the folder that will be used for export for that language. Multiple languages can be associated with one and the same folder.
Prerequisites

All you need is Python and FFMPEG.

  
## subtitles.py
# %%
import os
import re
from pathlib import Path
from subprocess import run

# Maps the exact subtitle title to look for in a video file to the name of the
# folder its exported subtitles are written to: {SUBTITLENAME: FOLDERNAME}
subtitles = {
    'English(CC)': 'en',
    'Japanese(CC)': 'jp',
}

DEBUG = False

# %%

# File extensions that are treated as video files; extend the list as needed.
vfmt = ['mp4', 'mkv']

# Create one export folder per configured language; a folder that is already
# present is only reported, not treated as an error.
for folder in subtitles.values():
    try:
        Path(folder).mkdir()
    except FileExistsError:
        print(f"Folder \"{folder}\" already exists")


# %%
def filelist():
    """Return the names of the files located directly in the current folder.

    Only the first entry produced by ``os.walk`` is consumed, so sub-folders
    are not descended into and directory names are not part of the result.
    """
    _root, _dirs, names = next(os.walk(Path('.')))
    return names

# %%
def fclean(filename: str) -> str:
    """Remove every character that might need escaping on a command line.

    Only word characters (letters, digits and the underscore), hyphens and
    dots are kept. Everything else, such as spaces, quotes or brackets, is
    dropped without a replacement.

    Args:
        filename: The file name to clean.

    Returns:
        The file name reduced to the kept characters; may be empty.
    """
    # One substitution over the whole name. ``\d`` and ``_`` are subsets of
    # ``\w``, so they do not need to be listed separately.
    return re.sub(r"[^\w\-.]", "", filename)

# %%
def _subtitle_titles(video):
    """Return the titles of all subtitle streams of *video*, in stream order.

    ffprobe is asked for the subtitle streams only, so the position of a title
    in the returned list is the subtitle-relative index that ffmpeg expects in
    ``-map 0:s:N``. Streams without a title are reported as ``''`` instead of
    being left out, which keeps the positions of the following streams correct.

    Args:
        video: Name of the video file to inspect.

    Returns:
        One title per subtitle stream. The list is empty when ffprobe cannot
        be started or does not produce parsable JSON.
    """
    import json  # Local import: only needed for parsing the ffprobe report.

    try:
        # An argument list (no shell) means the file name needs no quoting.
        probe = run(
            ["ffprobe", "-v", "error", "-select_streams", "s",
             "-show_entries", "stream=index:stream_tags", "-of", "json",
             "-i", video],
            capture_output=True, text=True, errors="replace",
        )
    except OSError as e:
        print(f"Could not run ffprobe: {e}")
        return []

    try:
        streams = json.loads(probe.stdout).get("streams", [])
    except ValueError:  # Empty or non-JSON output.
        return []

    titles = []
    for stream in streams:
        # Look the tag up case-insensitively ("title" vs. "TITLE").
        tags = {key.lower(): value for key, value in stream.get("tags", {}).items()}
        titles.append(tags.get("title", ""))
    return titles


files = filelist()
for f in files:
    # Ignore non-video files. Comparing the real suffix instead of the last
    # three characters keeps this working when vfmt is extended with
    # extensions of a different length.
    if Path(f).suffix[1:].lower() not in vfmt:
        continue

    # Rename the video to its cleaned file name.
    cleaned = fclean(f)
    if cleaned != f:
        if os.path.exists(cleaned):
            # Renaming onto an existing file could replace that file.
            print(f"Cannot rename \"{f}\" to \"{cleaned}\": target already exists")
            continue
        try:
            os.rename(f, cleaned)
        except OSError as e:
            print(f"Cannot rename \"{f}\" to \"{cleaned}\": {e}")
            continue
        f = cleaned

    # ffprobe numbers all streams together (e.g. 0: video, 1: audio,
    # 2: audio, 3-10: subtitles), whereas ffmpeg's "0:s:N" counts subtitle
    # streams only. The list index of a title is that subtitle-only number.
    subs = _subtitle_titles(f)
    if DEBUG:
        print(subs)

    stem = Path(f).stem
    for sub in subtitles:
        try:
            stream = subs.index(sub)
        except ValueError:
            print(f"For language {sub} no subtitle found in {f}")
            continue

        target = f"{subtitles[sub]}/{stem}.srt"
        try:
            ffmpegcmd = run(
                ["ffmpeg", "-i", f, "-map", f"0:s:{stream}", "-y", target],
                capture_output=True, text=True, errors="replace",
            )
        except OSError as e:
            print(f"Could not run ffmpeg: {e}")
            continue
        if ffmpegcmd.returncode != 0:
            # Report failed exports instead of ignoring them.
            print(f"ffmpeg could not export {sub} from {f}:\n{ffmpegcmd.stderr}")
	# %%
	import os
	import re
	from pathlib import Path
	from subprocess import run

	# Maps the exact subtitle title to look for in a video file to the name of the
	# folder its exported subtitles are written to: {SUBTITLENAME: FOLDERNAME}
	subtitles = {
	    'English(CC)': 'en',
	    'Japanese(CC)': 'jp',
	}

	DEBUG = False

	# %%

	# File extensions that are treated as video files; extend the list as needed.
	vfmt = ['mp4', 'mkv']

	# Create one export folder per configured language; a folder that is already
	# present is only reported, not treated as an error.
	for folder in subtitles.values():
	    try:
	        Path(folder).mkdir()
	    except FileExistsError:
	        print(f"Folder \"{folder}\" already exists")


	# %%
	def filelist():
	    """Return the names of the files located directly in the current folder.

	    Only the first entry produced by ``os.walk`` is consumed, so sub-folders
	    are not descended into and directory names are not part of the result.
	    """
	    _root, _dirs, names = next(os.walk(Path('.')))
	    return names

	# %%
	def fclean(filename: str) -> str:
	    """Remove every character that might need escaping on a command line.

	    Only word characters (letters, digits and the underscore), hyphens and
	    dots are kept. Everything else, such as spaces, quotes or brackets, is
	    dropped without a replacement.

	    Args:
	        filename: The file name to clean.

	    Returns:
	        The file name reduced to the kept characters; may be empty.
	    """
	    # One substitution over the whole name. ``\d`` and ``_`` are subsets of
	    # ``\w``, so they do not need to be listed separately.
	    return re.sub(r"[^\w\-.]", "", filename)

	# %%
	def _subtitle_titles(video):
	    """Return the titles of all subtitle streams of *video*, in stream order.

	    ffprobe is asked for the subtitle streams only, so the position of a title
	    in the returned list is the subtitle-relative index that ffmpeg expects in
	    ``-map 0:s:N``. Streams without a title are reported as ``''`` instead of
	    being left out, which keeps the positions of the following streams correct.

	    Args:
	        video: Name of the video file to inspect.

	    Returns:
	        One title per subtitle stream. The list is empty when ffprobe cannot
	        be started or does not produce parsable JSON.
	    """
	    import json  # Local import: only needed for parsing the ffprobe report.

	    try:
	        # An argument list (no shell) means the file name needs no quoting.
	        probe = run(
	            ["ffprobe", "-v", "error", "-select_streams", "s",
	             "-show_entries", "stream=index:stream_tags", "-of", "json",
	             "-i", video],
	            capture_output=True, text=True, errors="replace",
	        )
	    except OSError as e:
	        print(f"Could not run ffprobe: {e}")
	        return []

	    try:
	        streams = json.loads(probe.stdout).get("streams", [])
	    except ValueError:  # Empty or non-JSON output.
	        return []

	    titles = []
	    for stream in streams:
	        # Look the tag up case-insensitively ("title" vs. "TITLE").
	        tags = {key.lower(): value for key, value in stream.get("tags", {}).items()}
	        titles.append(tags.get("title", ""))
	    return titles


	files = filelist()
	for f in files:
	    # Ignore non-video files. Comparing the real suffix instead of the last
	    # three characters keeps this working when vfmt is extended with
	    # extensions of a different length.
	    if Path(f).suffix[1:].lower() not in vfmt:
	        continue

	    # Rename the video to its cleaned file name.
	    cleaned = fclean(f)
	    if cleaned != f:
	        if os.path.exists(cleaned):
	            # Renaming onto an existing file could replace that file.
	            print(f"Cannot rename \"{f}\" to \"{cleaned}\": target already exists")
	            continue
	        try:
	            os.rename(f, cleaned)
	        except OSError as e:
	            print(f"Cannot rename \"{f}\" to \"{cleaned}\": {e}")
	            continue
	        f = cleaned

	    # ffprobe numbers all streams together (e.g. 0: video, 1: audio,
	    # 2: audio, 3-10: subtitles), whereas ffmpeg's "0:s:N" counts subtitle
	    # streams only. The list index of a title is that subtitle-only number.
	    subs = _subtitle_titles(f)
	    if DEBUG:
	        print(subs)

	    stem = Path(f).stem
	    for sub in subtitles:
	        try:
	            stream = subs.index(sub)
	        except ValueError:
	            print(f"For language {sub} no subtitle found in {f}")
	            continue

	        target = f"{subtitles[sub]}/{stem}.srt"
	        try:
	            ffmpegcmd = run(
	                ["ffmpeg", "-i", f, "-map", f"0:s:{stream}", "-y", target],
	                capture_output=True, text=True, errors="replace",
	            )
	        except OSError as e:
	            print(f"Could not run ffmpeg: {e}")
	            continue
	        if ffmpegcmd.returncode != 0:
	            # Report failed exports instead of ignoring them.
	            print(f"ffmpeg could not export {sub} from {f}:\n{ffmpegcmd.stderr}")