dov/pandassrt.py

## pandassrt.py
# This file reads two subtitle srt files. The first one is assumed
# to be wrongly aligned, and the second one is correctly aligned.
# In addition the user needs to create a few alignment points. These
# will be used for rescaling the timescale of the first srt file
# so that it fits the second file.
#
# Dov Grobgeld
# dov.grobgeld@gmail.com

import pandas as pd
import srt

def readsrt(filename):
  '''Read an srt subtitle file into a pandas DataFrame.

  Parameters:
    filename: path of the srt file to read.

  Returns:
    A DataFrame with one row per subtitle record and the columns
    'tstart' and 'tend' (the two times yielded by srt.subreader)
    and 'text' (the subtitle text of the record).
  '''
  tstart_ary, tend_ary, text_ary = [], [], []

  # srt.subreader yields its records lazily, so they have to be consumed
  # while the file is still open; the with-block then closes the handle
  # instead of leaving that to the garbage collector.
  with open(filename) as handle:
    for (tstart, tend), text in srt.subreader(handle):
      tstart_ary.append(tstart)
      tend_ary.append(tend)
      text_ary.append(text)

  return pd.DataFrame({'tstart':tstart_ary,'tend':tend_ary,'text':text_ary})

def tosrt(df, filename,offset=1):
  '''Write a subtitle DataFrame to an srt file.

  Parameters:
    df: DataFrame with 'tstart', 'tend' and 'text' columns, one row
      per subtitle record.
    filename: path of the srt file to write (opened in 'w' mode, so an
      existing file is overwritten).
    offset: index number given to the first record that is written.
  '''
  # The with-block makes sure the output is flushed and closed, also when
  # writing one of the records fails.
  with open(filename,'w') as handle:
    subout = srt.subwriter(handle,offset=offset)
    for _, row in df.iterrows():
      subout.write_record(((row.tstart,row.tend),row.text))

def realign_srt(subject, reference, align_pairs):
  '''Realigns the subject srt data to the reference srt data on the given alignment points.

  Between each two consecutive alignment points the subject times are
  mapped linearly onto the reference times. The segment that starts at
  the second to last alignment point is extended to the end of the
  subject. Rows before the first alignment point keep their original
  times, and with fewer than two alignment points nothing is changed.

  Parameters:
    subject: DataFrame with 'tstart' and 'tend' columns whose timing is
      to be corrected.
    reference: DataFrame with a 'tstart' column holding the wanted timing.
    align_pairs: sequence of (subject_number, reference_number) pairs of
      1-based record numbers that should start at the same time.
    Rows are looked up by index label, so both frames are assumed to
    carry the default 0-based integer index (as readsrt builds them).

  Returns:
    A realigned copy of subject; subject itself is not modified.

  Raises:
    ValueError: if two consecutive alignment points have the same start
      time in subject, which leaves the time scale undefined.
  '''
  df_realigned = subject.copy()

  n = len(align_pairs)
  for i in range(n-1):
    # Record numbers are 1-based, the frame labels are 0-based.
    ts1idx,ts2idx = [v-1 for v in align_pairs[i]]
    te1idx,te2idx = [v-1 for v in align_pairs[i+1]]

    # .loc is the label based lookup that the .ix accessor (removed in
    # pandas 1.0) performed on an integer index.
    tss1,tss2 = subject.tstart.loc[ts1idx],reference.tstart.loc[ts2idx]
    tse1,tse2 = subject.tstart.loc[te1idx],reference.tstart.loc[te2idx]

    if tse1 == tss1:
      raise ValueError(
        'alignment points %d and %d have the same start time in the subject'
        % (i+1, i+2))

    slope = (1.0*(tse2-tss2)/(tse1-tss1))
    print('print tse1,tse2, slope, tse1*=',
          srt.ms2time(tse1),
          srt.ms2time(tse2),
          srt.ms2time((tse1-tss1)*slope+tss2),
          slope)

    # Extrapolate to end of the file for the last point
    if i==n-2:
      te1idx = len(df_realigned)-1

    # Interpolate the times. The .loc slice includes its end label, so the
    # row after te1idx is rewritten as well; for all but the last segment
    # the next iteration overwrites it again.
    for clm in ['tstart','tend']:
      df_realigned.loc[ts1idx:te1idx+1,clm] = (
        (subject.loc[ts1idx:te1idx+1,clm] - tss1) * slope + tss2).astype(int)

  return df_realigned

def realign_srt_files(subject_filename, reference_filename, align_pairs,
                      new_filename,
                      offset=1):
  '''Realign one srt file against a reference srt file and save the result.

  Parameters:
    subject_filename: path of the srt file whose timing is to be corrected.
    reference_filename: path of the srt file with the wanted timing.
    align_pairs: alignment points, passed on to realign_srt.
    new_filename: path of the srt file that receives the result.
    offset: index number of the first record written, passed on to tosrt.
  '''
  subject_df = readsrt(subject_filename)
  reference_df = readsrt(reference_filename)
  realigned_df = realign_srt(subject_df, reference_df, align_pairs)
  tosrt(realigned_df, new_filename, offset=offset)

if __name__ == '__main__':
  # One entry per file to fix: (subject file, reference file, alignment
  # pairs, output file, index number of the first record written).
  jobs = [
    ('/tmp/heb1.srt',       # 'bad.he.srt'
     '/tmp/en-fixed.srt',   # 'good.en.srt'
     [(3, 4), (456,441)],
     '/tmp/fixed1.srt',
     1),
    ('/tmp/heb2.srt',       # 'bad.he.srt'
     '/tmp/en-fixed.srt',   # 'good.en.srt'
     [(1, 442), (380,821)],
     '/tmp/fixed2.srt',
     457),
    ]

  for subject_path, reference_path, pairs, output_path, first_index in jobs:
    realign_srt_files(
      subject_path,
      reference_path,
      align_pairs = pairs,
      new_filename = output_path,
      offset = first_index)


## srt.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#    Copyleft 2011 wistful <wst public mail at gmail com>
#
#    This is a free software; you can redistribute it and/or
#    modify it under the terms of the GNU Lesser General Public
#    License as published by the Free Software Foundation; either
#    version 2.1 of the License, or (at your option) any later version.
#
#    This library is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#    Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public
#    License along with this library; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#  Minor modified to be more pythonesque by Dov Grobgeld

__author__ = 'wistful'

import re


class SrtFormatError(Exception):
    """Error carrying a message about srt data that has an unexpected format."""

    def __init__(self, message):
        # Description of the problem; __str__ renders it.
        self.message = message

    def __str__(self):
        # The repr keeps the quotes around the message visible.
        return "%r" % (self.message,)


def parse_time(str_time):
    """
    convert string format of start-finish to integer(ms) format
    >>> parse_time("00:14:33,460 --> 00:14:35,419")
    (873460, 875419)

    str_time: timing line "hh:mm:ss,msc --> hh:mm:ss,msc"; surrounding
    whitespace is ignored
    returns the tuple (start, finish), both in msc
    raises SrtFormatError when str_time does not have that format
    """
    pattern_time = r"(?P<h1>\d+):(?P<m1>\d+):(?P<s1>\d+),(?P<ms1>\d+)\W*-->\W*(?P<h2>\d+):(?P<m2>\d+):(?P<s2>\d+),(?P<ms2>\d+)$"
    try:
        match = re.match(pattern_time, str_time.strip())
    except (AttributeError, TypeError):
        # str_time is not a string; report it like any other bad input
        match = None
    if match is None:
        message = u"Invalid string format '%s' , expect hh:mm:ss,msc --> hh:mm:ss,msc" % (str_time,)
        raise SrtFormatError(message)
    d = match.groupdict()

    def get_ms(h, m, s, ms):
        return (int(s) + int(m) * 60 + int(h) * 60 * 60) * 1000 + int(ms)

    return get_ms(d['h1'], d['m1'], d['s1'], d['ms1']), get_ms(d['h2'], d['m2'], d['s2'], d['ms2'])


def ms2time(ms):
    """
    convert msc to string format
    >>> ms2time(233243)
    '00:03:53,243'
    >>> ms2time(442)
    '00:00:00,442'
    >>> ms2time(216000000)
    '60:00:00,000'

    ms: number of msc, integer or float (a fractional msc is cut off)
    returns the time as "hh:mm:ss,msc"; the hour field grows beyond two
    digits when needed instead of wrapping around
    """
    seconds_total = int(ms / 1000)
    msc = ms - seconds_total * 1000
    # divmod keeps every field an integer on Python 2 as well as Python 3
    minutes_total, ss = divmod(seconds_total, 60)
    hh, mm = divmod(minutes_total, 60)
    return "%02d:%02d:%02d,%03d" % (hh, mm, ss, msc)


def parse_ms(start, finish):
    """
    render a start/finish pair given in msc as an srt timing line
    >>> parse_ms(442, 233243)
    '00:00:00,442 --> 00:03:53,243'
    """
    start_text = ms2time(start)
    finish_text = ms2time(finish)
    return start_text + " --> " + finish_text


def subreader(handle):
    """
    generate ((time_start, time_finish), subtitle_text) for each record
    handle: iterable of the lines of an srt file, e.g. the open file

    the times are in msc as returned by parse_time; subtitle_text is the
    non-empty text lines of the record joined by newlines, plus a final
    newline. a line made up of digits only is always taken as a record
    index, so such a line inside a subtitle text ends the record.
    """
    pattern_index = re.compile(r"^\d+$")
    times, text = None, list()
    for line in handle:
        # Get rid of bom markers! A byte string holds the raw utf-8 bom,
        # decoded text holds U+FEFF (or the three bom bytes as separate
        # characters when a single byte codec decoded the file).
        if isinstance(line, bytes):
            line = line.replace(b'\xef\xbb\xbf', b'')
        else:
            line = line.replace(u'\ufeff', u'').replace(u'\xef\xbb\xbf', u'')
        line = line.strip()
        if pattern_index.match(line):
            # An index line starts a new record: emit the finished one
            if times:
                yield (times, '\n'.join(text) + '\n')
                times, text = None, list()
        elif '-->' in line:
            times = parse_time(line)
        elif line:
            text.append(line)
    # The last record has no index line after it
    if times:
        yield (times, '\n'.join(text) + '\n')

class subwriter:
  """Writes consecutively numbered srt records to an open handle."""

  def __init__(self, handle, offset=1):
    # handle: object whose write() method receives the srt text
    # offset: index number for the first record written
    self.handle = handle
    self.index = offset

  def write_record(self, record):
    """Write one ((start, finish), text) record and advance the index."""
    times, text = record
    start, finish = times
    timing_line = parse_ms(start, finish)
    self.handle.write("%s\n%s\n%s\n" % (str(self.index), timing_line, text))
    self.index = self.index + 1

if __name__ == '__main__':
    # Run the doctests embedded in this module and print the result
    # summary returned by doctest.testmod(). The call form of print works
    # on Python 2 and on Python 3.
    import doctest
    print(doctest.testmod())
	# This file reads two subtitle srt files. The first one is assumed
	# to be wrongly aligned, and the second one is correctly aligned.
	# In addition the user needs to create a few alignment points. These
	# will be used for rescaling the timescale of the first srt file
	# so that it fits the second file.
	#
	# Dov Grobgeld
	# dov.grobgeld@gmail.com

	import pandas as pd
	import srt

	def readsrt(filename):
	  '''Read an srt subtitle file into a pandas DataFrame.

	  Parameters:
	    filename: path of the srt file to read.

	  Returns:
	    A DataFrame with one row per subtitle record and the columns
	    'tstart' and 'tend' (the two times yielded by srt.subreader)
	    and 'text' (the subtitle text of the record).
	  '''
	  tstart_ary, tend_ary, text_ary = [], [], []

	  # The records are consumed inside the with-block, which then closes
	  # the handle instead of leaving that to the garbage collector.
	  with open(filename) as handle:
	    for (tstart, tend), text in srt.subreader(handle):
	      tstart_ary.append(tstart)
	      tend_ary.append(tend)
	      text_ary.append(text)

	  return pd.DataFrame({'tstart':tstart_ary,'tend':tend_ary,'text':text_ary})

	def tosrt(df, filename,offset=1):
	  '''Write a subtitle DataFrame to an srt file.

	  Parameters:
	    df: DataFrame with 'tstart', 'tend' and 'text' columns, one row
	      per subtitle record.
	    filename: path of the srt file to write (opened in 'w' mode, so an
	      existing file is overwritten).
	    offset: index number given to the first record that is written.
	  '''
	  # The with-block makes sure the output is flushed and closed, also
	  # when writing one of the records fails.
	  with open(filename,'w') as handle:
	    subout = srt.subwriter(handle,offset=offset)
	    for _, row in df.iterrows():
	      subout.write_record(((row.tstart,row.tend),row.text))

	def realign_srt(subject, reference, align_pairs):
	  '''Realigns the subject srt data to the reference srt data on the given alignment points.

	  Between each two consecutive alignment points the subject times are
	  mapped linearly onto the reference times. The segment that starts at
	  the second to last alignment point is extended to the end of the
	  subject. Rows before the first alignment point keep their original
	  times, and with fewer than two alignment points nothing is changed.

	  Parameters:
	    subject: DataFrame with 'tstart' and 'tend' columns whose timing is
	      to be corrected.
	    reference: DataFrame with a 'tstart' column holding the wanted timing.
	    align_pairs: sequence of (subject_number, reference_number) pairs of
	      1-based record numbers that should start at the same time.
	    Rows are looked up by index label, so both frames are assumed to
	    carry the default 0-based integer index.

	  Returns:
	    A realigned copy of subject; subject itself is not modified.

	  Raises:
	    ValueError: if two consecutive alignment points have the same start
	      time in subject, which leaves the time scale undefined.
	  '''
	  df_realigned = subject.copy()

	  n = len(align_pairs)
	  for i in range(n-1):
	    # Record numbers are 1-based, the frame labels are 0-based.
	    ts1idx,ts2idx = [v-1 for v in align_pairs[i]]
	    te1idx,te2idx = [v-1 for v in align_pairs[i+1]]

	    # .loc is the label based lookup that the .ix accessor (removed in
	    # pandas 1.0) performed on an integer index.
	    tss1,tss2 = subject.tstart.loc[ts1idx],reference.tstart.loc[ts2idx]
	    tse1,tse2 = subject.tstart.loc[te1idx],reference.tstart.loc[te2idx]

	    if tse1 == tss1:
	      raise ValueError(
	        'alignment points %d and %d have the same start time in the subject'
	        % (i+1, i+2))

	    slope = (1.0*(tse2-tss2)/(tse1-tss1))
	    print('print tse1,tse2, slope, tse1*=',
	          srt.ms2time(tse1),
	          srt.ms2time(tse2),
	          srt.ms2time((tse1-tss1)*slope+tss2),
	          slope)

	    # Extrapolate to end of the file for the last point
	    if i==n-2:
	      te1idx = len(df_realigned)-1

	    # Interpolate the times. The .loc slice includes its end label, so
	    # the row after te1idx is rewritten as well; for all but the last
	    # segment the next iteration overwrites it again.
	    for clm in ['tstart','tend']:
	      df_realigned.loc[ts1idx:te1idx+1,clm] = (
	        (subject.loc[ts1idx:te1idx+1,clm] - tss1) * slope + tss2).astype(int)

	  return df_realigned

	def realign_srt_files(subject_filename, reference_filename, align_pairs,
	                      new_filename,
	                      offset=1):
	  '''Realign one srt file against a reference srt file and save the result.

	  Parameters:
	    subject_filename: path of the srt file whose timing is to be corrected.
	    reference_filename: path of the srt file with the wanted timing.
	    align_pairs: alignment points, passed on to realign_srt.
	    new_filename: path of the srt file that receives the result.
	    offset: index number of the first record written, passed on to tosrt.
	  '''
	  subject_df = readsrt(subject_filename)
	  reference_df = readsrt(reference_filename)
	  realigned_df = realign_srt(subject_df, reference_df, align_pairs)
	  tosrt(realigned_df, new_filename, offset=offset)

	if __name__ == '__main__':
	  # One entry per file to fix: (subject file, reference file, alignment
	  # pairs, output file, index number of the first record written).
	  jobs = [
	    ('/tmp/heb1.srt',       # 'bad.he.srt'
	     '/tmp/en-fixed.srt',   # 'good.en.srt'
	     [(3, 4), (456,441)],
	     '/tmp/fixed1.srt',
	     1),
	    ('/tmp/heb2.srt',       # 'bad.he.srt'
	     '/tmp/en-fixed.srt',   # 'good.en.srt'
	     [(1, 442), (380,821)],
	     '/tmp/fixed2.srt',
	     457),
	    ]

	  for subject_path, reference_path, pairs, output_path, first_index in jobs:
	    realign_srt_files(
	      subject_path,
	      reference_path,
	      align_pairs = pairs,
	      new_filename = output_path,
	      offset = first_index)
	#! /usr/bin/env python
	# -*- coding: utf-8 -*-
	# Copyleft 2011 wistful <wst public mail at gmail com>
	#
	# This is a free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# This library is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with this library; if not, write to the Free Software
	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	#
	# Minor modified to be more pythonesque by Dov Grobgeld

	__author__ = 'wistful'

	import re


	class SrtFormatError(Exception):
	    """Error carrying a message about srt data that has an unexpected format."""

	    def __init__(self, message):
	        # Description of the problem; __str__ renders it.
	        self.message = message

	    def __str__(self):
	        # The repr keeps the quotes around the message visible.
	        return repr(self.message)


	def parse_time(str_time):
	    """
	    convert string format of start-finish to integer(ms) format
	    >>> parse_time("00:14:33,460 --> 00:14:35,419")
	    (873460, 875419)

	    str_time: timing line "hh:mm:ss,msc --> hh:mm:ss,msc"; surrounding
	    whitespace is ignored
	    returns the tuple (start, finish), both in msc
	    raises SrtFormatError when str_time does not have that format
	    """
	    # \W* accepts any amount of separator characters around the arrow,
	    # including none at all
	    pattern_time = r"(?P<h1>\d+):(?P<m1>\d+):(?P<s1>\d+),(?P<ms1>\d+)\W*-->\W*(?P<h2>\d+):(?P<m2>\d+):(?P<s2>\d+),(?P<ms2>\d+)$"
	    try:
	        match = re.match(pattern_time, str_time.strip())
	    except (AttributeError, TypeError):
	        # str_time is not a string; report it like any other bad input
	        match = None
	    if match is None:
	        message = u"Invalid string format '%s' , expect hh:mm:ss,msc --> hh:mm:ss,msc" % (str_time,)
	        raise SrtFormatError(message)
	    d = match.groupdict()

	    def get_ms(h, m, s, ms):
	        return (int(s) + int(m) * 60 + int(h) * 60 * 60) * 1000 + int(ms)

	    return get_ms(d['h1'], d['m1'], d['s1'], d['ms1']), get_ms(d['h2'], d['m2'], d['s2'], d['ms2'])


	def ms2time(ms):
	    """
	    convert msc to string format
	    >>> ms2time(233243)
	    '00:03:53,243'
	    >>> ms2time(442)
	    '00:00:00,442'
	    >>> ms2time(216000000)
	    '60:00:00,000'

	    ms: number of msc, integer or float (a fractional msc is cut off)
	    returns the time as "hh:mm:ss,msc"; the hour field grows beyond two
	    digits when needed instead of wrapping around
	    """
	    seconds_total = int(ms / 1000)
	    msc = ms - seconds_total * 1000
	    # divmod keeps every field an integer on Python 2 as well as Python 3
	    minutes_total, ss = divmod(seconds_total, 60)
	    hh, mm = divmod(minutes_total, 60)
	    return "%02d:%02d:%02d,%03d" % (hh, mm, ss, msc)


	def parse_ms(start, finish):
	    """
	    convert msc representation to string format
	    >>> parse_ms(442, 233243)
	    '00:00:00,442 --> 00:03:53,243'

	    start, finish: the two times of a record in msc
	    returns the srt timing line "start --> finish"
	    """
	    return "%s --> %s" % (ms2time(start), ms2time(finish))


	def subreader(handle):
	    """
	    generate ((time_start, time_finish), subtitle_text) for each record
	    handle: iterable of the lines of an srt file, e.g. the open file

	    the times are in msc as returned by parse_time; subtitle_text is the
	    non-empty text lines of the record joined by newlines, plus a final
	    newline. a line made up of digits only is always taken as a record
	    index, so such a line inside a subtitle text ends the record.
	    """
	    pattern_index = re.compile(r"^\d+$")
	    times, text = None, list()
	    for line in handle:
	        # Get rid of bom markers! A byte string holds the raw utf-8 bom,
	        # decoded text holds U+FEFF (or the three bom bytes as separate
	        # characters when a single byte codec decoded the file).
	        if isinstance(line, bytes):
	            line = line.replace(b'\xef\xbb\xbf', b'')
	        else:
	            line = line.replace(u'\ufeff', u'').replace(u'\xef\xbb\xbf', u'')
	        line = line.strip()
	        if pattern_index.match(line):
	            # An index line starts a new record: emit the finished one
	            if times:
	                yield (times, '\n'.join(text) + '\n')
	                times, text = None, list()
	        elif '-->' in line:
	            times = parse_time(line)
	        elif line:
	            text.append(line)
	    # The last record has no index line after it
	    if times:
	        yield (times, '\n'.join(text) + '\n')

	class subwriter:
	    """Writes consecutively numbered srt records to an open handle."""

	    def __init__(self, handle, offset=1):
	        # handle: object whose write() method receives the srt text
	        # offset: index number for the first record written
	        self.handle = handle
	        self.index = offset

	    def write_record(self, record):
	        """Write one ((start, finish), text) record and advance the index."""
	        ((start, finish), text) = record
	        self.handle.write("%s\n%s\n%s\n" % (str(self.index), parse_ms(start, finish), text))
	        self.index += 1

	if __name__ == '__main__':
	    # Run the doctests embedded in this module and print the result
	    # summary returned by doctest.testmod(). The call form of print
	    # works on Python 2 and on Python 3.
	    import doctest
	    print(doctest.testmod())