-
-
Save prcutler/2f72b29af7a5dcae0b22db0e55511017 to your computer and use it in GitHub Desktop.
Convert CSV to SRT subtitles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
from pathlib import Path | |
import numpy as np | |
import pandas as pd | |
# Command-line interface: turns a CSV of point timestamps into a CSV of
# start/stop segments (see ``main``).
parser = argparse.ArgumentParser(
    description="Convert a CSV of timestamped points into a CSV of "
    "start/stop timestamped segments",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("in_csv", type=Path, help="Source CSV, one timestamp per row")
parser.add_argument("out_csv", type=Path, help="Destination CSV of segments")
parser.add_argument(
    "--timestamp-col",
    type=str,
    default="timestamp",
    help="Name of the column holding the point timestamps",
)
# ``main`` uses the duration as the stop time of the last segment, so running
# without it can never succeed; require it up front instead of crashing later.
parser.add_argument(
    "--duration",
    type=float,
    required=True,
    help="Total duration in seconds; used as the stop time of the last segment",
)
def main(args):
    """Convert point timestamps into consecutive start/stop segments.

    Reads ``args.in_csv`` and treats each row's timestamp as the start of a
    segment that stops where the next row starts; the final segment stops at
    ``args.duration`` seconds. The segments are written to ``args.out_csv``
    with ``start_<col>`` / ``stop_<col>`` columns followed by every other
    input column, in their original order.

    Args:
        args: parsed command-line arguments providing ``in_csv``, ``out_csv``,
            ``timestamp_col`` and ``duration``.

    Raises:
        ValueError: if ``args.duration`` is ``None``.
    """
    if args.duration is None:
        raise ValueError("--duration is required to close the final segment")

    points = pd.read_csv(args.in_csv)
    point_timestamp_col = args.timestamp_col
    start_timestamp_col = "start_" + point_timestamp_col
    stop_timestamp_col = "stop_" + point_timestamp_col

    start_times = points[point_timestamp_col].apply(timestamp_to_seconds).values
    if len(start_times) > 0:
        # Each segment stops where the next one starts; the last one stops at
        # the overall duration.
        stop_times = np.concatenate([start_times[1:], [args.duration]])
    else:
        # No points means no segments (avoids a length mismatch below).
        stop_times = start_times

    segments = pd.DataFrame(
        {
            start_timestamp_col: [seconds_to_timestamp(s) for s in start_times],
            stop_timestamp_col: [seconds_to_timestamp(s) for s in stop_times],
        }
    )
    # Walk the columns in file order: a set difference has no defined order
    # and would shuffle the output columns between runs.
    for col in points.columns:
        if col != point_timestamp_col:
            segments[col] = points[col].values
    segments.to_csv(args.out_csv, index=False)
def seconds_to_timestamp(total_seconds: float) -> str:
    """Convert seconds into a timestamp

    The value is rounded to whole milliseconds *before* it is split into
    hours, minutes and seconds, so a rounding carry propagates correctly
    (``59.9996`` becomes ``00:01:00.000`` rather than ``00:00:60.000``).

    Args:
        total_seconds: time in seconds; must be finite. Negative values are
            rendered with a leading ``-``.

    Returns:
        timestamp representing ``total_seconds``, formatted as
        ``HH:MM:SS.mmm``

    Examples:
        >>> seconds_to_timestamp(1)
        '00:00:01.000'
        >>> seconds_to_timestamp(1.1)
        '00:00:01.100'
        >>> seconds_to_timestamp(60)
        '00:01:00.000'
        >>> seconds_to_timestamp(61)
        '00:01:01.000'
        >>> seconds_to_timestamp(60 * 60 + 1)
        '01:00:01.000'
        >>> seconds_to_timestamp(60 * 60 + 60 + 1)
        '01:01:01.000'
        >>> seconds_to_timestamp(1225.78500002)
        '00:20:25.785'
        >>> seconds_to_timestamp(59.9996)
        '00:01:00.000'
    """
    # Work in integer milliseconds so no field can round up to 60 or 1000.
    total_ms = int(round(float(total_seconds) * 1000))
    sign = "-" if total_ms < 0 else ""
    whole_seconds, ms = divmod(abs(total_ms), 1000)
    whole_minutes, ss = divmod(whole_seconds, 60)
    hh, mm = divmod(whole_minutes, 60)
    return "{}{:02d}:{:02d}:{:02d}.{:03d}".format(sign, hh, mm, ss, ms)
def timestamp_to_seconds(timestamp: str) -> float:
    """Convert a timestamp into total number of seconds

    Args:
        timestamp: formatted as ``[[HH:]MM:]SS[.FractionalPart]``. Leading
            fields may be omitted (``"01:05.5"`` is 1 minute 5.5 seconds) and
            a comma is accepted as the decimal separator.

    Returns:
        ``timestamp`` converted to seconds

    Raises:
        ValueError: if ``timestamp`` has more than three ``:``-separated
            fields or a field is not a number.

    Examples:
        >>> timestamp_to_seconds("00:00:00")
        0.0
        >>> timestamp_to_seconds("00:00:05")
        5.0
        >>> timestamp_to_seconds("00:00:05.5")
        5.5
        >>> timestamp_to_seconds("00:01:05.5")
        65.5
        >>> timestamp_to_seconds("01:01:05.5")
        3665.5
        >>> timestamp_to_seconds("01:05.5")
        65.5
    """
    _MINUTES_TO_SECONDS = 60
    _HOURS_TO_SECONDS = 60 * _MINUTES_TO_SECONDS

    fields = timestamp.strip().replace(",", ".").split(":")
    if len(fields) > 3:
        raise ValueError(
            "Invalid timestamp {!r}: expected HH:MM:SS[.FractionalPart]".format(
                timestamp
            )
        )
    # Left-pad with zeros so "MM:SS" and "SS" parse as if the missing leading
    # fields were 0.
    padding = [0.0] * (3 - len(fields))
    hours, minutes, seconds = padding + [float(field) for field in fields]
    return hours * _HOURS_TO_SECONDS + minutes * _MINUTES_TO_SECONDS + seconds
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    main(parser.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import sys | |
from dataclasses import dataclass | |
from pathlib import Path | |
from typing import List | |
import numpy as np | |
import pandas as pd | |
# Command-line interface for the CSV -> SRT converter (see ``main``).
parser = argparse.ArgumentParser(
    description="Create a SRT file from timestamped columns",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("csv", type=Path, help="Source CSV")
parser.add_argument("srt", type=Path, help="Destination SRT")
parser.add_argument(
    "--force", action="store_true", help="Overwrite destination if it already exists"
)
parser.add_argument(
    "--start-timestamp-col",
    type=str,
    default="timestamp",
    help="Start timestamp column",
)
parser.add_argument(
    "--subtitle-col",
    type=str,
    default="subtitle",
    help="Column to use as contents of subtitle",
)
parser.add_argument(
    "--stop-timestamp-col", type=str, help="Stop timestamp column, if present"
)
# The help text must name the real flag (--stop-timestamp-col); the previous
# wording pointed users at an option that does not exist.
parser.add_argument(
    "--duration",
    type=float,
    default=2,
    help="Duration for subtitle if --stop-timestamp-col is not specified",
)
parser.add_argument(
    "--center",
    action="store_true",
    help="Center subtitle around start time when --duration is specified instead of "
    "starting from the start timestamp.",
)
def main(args):
    """Build an SRT file from the timestamp and subtitle columns of a CSV.

    Start times come from ``args.start_timestamp_col``. Stop times come from
    ``args.stop_timestamp_col`` when given; otherwise each subtitle lasts
    ``args.duration`` seconds, optionally centred on the start time
    (``args.center``) with the start clipped at zero.

    Args:
        args: parsed command-line arguments providing ``csv``, ``srt``,
            ``force``, ``start_timestamp_col``, ``stop_timestamp_col``,
            ``subtitle_col``, ``duration`` and ``center``.

    Raises:
        SystemExit: if the destination exists and ``--force`` was not given,
            or if a required column is missing from the CSV.
    """
    if args.srt.exists() and not args.force:
        print(f"{args.srt} already exists, quitting. Use --force to overwrite.")
        # Actually stop here; previously execution fell through and the
        # existing file was overwritten despite the message.
        sys.exit(-1)

    df = pd.read_csv(args.csv)

    cols_to_check = [args.start_timestamp_col, args.subtitle_col]
    if args.stop_timestamp_col is not None:
        cols_to_check.insert(1, args.stop_timestamp_col)
    # Report every missing column before exiting, not just the first one.
    check_failed = False
    for col in cols_to_check:
        if col not in df.columns:
            print(f"{col!r} not present in columns ({df.columns}).")
            check_failed = True
    if check_failed:
        sys.exit(-1)

    start_times = df[args.start_timestamp_col].apply(timestamp_to_seconds)
    if args.stop_timestamp_col is None:
        stop_times = start_times + args.duration
        if args.center:
            # Shift the whole window back by half its length, keeping the
            # start from going negative.
            start_times -= args.duration / 2
            start_times = start_times.clip(lower=0)
            stop_times -= args.duration / 2
    else:
        stop_times = df[args.stop_timestamp_col].apply(timestamp_to_seconds)

    subs = df[args.subtitle_col]
    subtitle_entries = [
        SubtitleEntry(
            start_timestamp=seconds_to_timestamp(start),
            stop_timestamp=seconds_to_timestamp(stop),
            subtitle=str(sub),
        )
        for start, stop, sub in zip(start_times, stop_times, subs)
    ]
    srt = subtitles_to_srt(subtitle_entries)
    # Explicit encoding so non-ASCII subtitle text does not depend on the
    # platform's default locale.
    with open(args.srt, "w", encoding="utf-8") as f:
        f.write(srt)
@dataclass
class SubtitleEntry:
    """One subtitle cue: the interval it is shown for and its text."""

    # Timestamp string at which the subtitle appears.
    start_timestamp: str
    # Timestamp string at which the subtitle disappears.
    stop_timestamp: str
    # Text displayed for the cue.
    subtitle: str
def subtitles_to_srt(subtitles: List["SubtitleEntry"]) -> str:
    """Render subtitle entries as the text of an SRT file.

    Each entry becomes a block of three lines (1-based sequence number,
    ``start --> stop`` time range, subtitle text); blocks are separated by a
    blank line.

    Args:
        subtitles: entries to render, in display order.

    Returns:
        The SRT document, or an empty string when ``subtitles`` is empty.
    """

    def _srt_time(timestamp: str) -> str:
        # SRT separates seconds from milliseconds with a comma, not a dot.
        return timestamp.replace(".", ",")

    blocks = [
        f"{seq_number}\n"
        f"{_srt_time(entry.start_timestamp)} --> {_srt_time(entry.stop_timestamp)}\n"
        f"{entry.subtitle}\n"
        for seq_number, entry in enumerate(subtitles, start=1)
    ]
    # A single join avoids repeatedly re-copying the growing string.
    return "\n".join(blocks)
def seconds_to_timestamp(total_seconds: float) -> str:
    """Convert seconds into a timestamp

    The value is rounded to whole milliseconds *before* it is split into
    hours, minutes and seconds, so a rounding carry propagates correctly
    (``59.9996`` becomes ``00:01:00.000`` rather than ``00:00:60.000``).

    Args:
        total_seconds: time in seconds; must be finite. Negative values are
            rendered with a leading ``-``.

    Returns:
        timestamp representing ``total_seconds``, formatted as
        ``HH:MM:SS.mmm``

    Examples:
        >>> seconds_to_timestamp(1)
        '00:00:01.000'
        >>> seconds_to_timestamp(1.1)
        '00:00:01.100'
        >>> seconds_to_timestamp(60)
        '00:01:00.000'
        >>> seconds_to_timestamp(61)
        '00:01:01.000'
        >>> seconds_to_timestamp(60 * 60 + 1)
        '01:00:01.000'
        >>> seconds_to_timestamp(60 * 60 + 60 + 1)
        '01:01:01.000'
        >>> seconds_to_timestamp(1225.78500002)
        '00:20:25.785'
        >>> seconds_to_timestamp(59.9996)
        '00:01:00.000'
    """
    # Work in integer milliseconds so no field can round up to 60 or 1000.
    total_ms = int(round(float(total_seconds) * 1000))
    sign = "-" if total_ms < 0 else ""
    whole_seconds, ms = divmod(abs(total_ms), 1000)
    whole_minutes, ss = divmod(whole_seconds, 60)
    hh, mm = divmod(whole_minutes, 60)
    return f"{sign}{hh:02d}:{mm:02d}:{ss:02d}.{ms:03d}"
def timestamp_to_seconds(timestamp: str) -> float:
    """Convert a timestamp into total number of seconds

    Args:
        timestamp: formatted as ``[[HH:]MM:]SS[.FractionalPart]``. Leading
            fields may be omitted (``"01:05.5"`` is 1 minute 5.5 seconds) and
            a comma is accepted as the decimal separator, as written in SRT
            files.

    Returns:
        ``timestamp`` converted to seconds

    Raises:
        ValueError: if ``timestamp`` has more than three ``:``-separated
            fields or a field is not a number.

    Examples:
        >>> timestamp_to_seconds("00:00:00")
        0.0
        >>> timestamp_to_seconds("00:00:05")
        5.0
        >>> timestamp_to_seconds("00:00:05.5")
        5.5
        >>> timestamp_to_seconds("00:01:05.5")
        65.5
        >>> timestamp_to_seconds("01:01:05.5")
        3665.5
        >>> timestamp_to_seconds("01:05.5")
        65.5
    """
    _MINUTES_TO_SECONDS = 60
    _HOURS_TO_SECONDS = 60 * _MINUTES_TO_SECONDS

    fields = timestamp.strip().replace(",", ".").split(":")
    if len(fields) > 3:
        raise ValueError(
            f"Invalid timestamp {timestamp!r}: expected HH:MM:SS[.FractionalPart]"
        )
    # Left-pad with zeros so "MM:SS" and "SS" parse as if the missing leading
    # fields were 0.
    padding = [0.0] * (3 - len(fields))
    hours, minutes, seconds = padding + [float(field) for field in fields]
    return hours * _HOURS_TO_SECONDS + minutes * _MINUTES_TO_SECONDS + seconds
def to_subtitles(
    df,
    start_col: str = "start_time",
    stop_col: str = "stop_time",
    subtitle_col: str = "thread",
) -> List["SubtitleEntry"]:
    """Build subtitle entries from a DataFrame of timestamped rows.

    Timestamps are parsed and re-formatted, so every entry carries the
    normalized form produced by ``seconds_to_timestamp``.

    Args:
        df: DataFrame with one row per subtitle.
        start_col: column holding the start timestamps.
        stop_col: column holding the stop timestamps.
        subtitle_col: column holding the subtitle text; values are converted
            with ``str``.

    Returns:
        One ``SubtitleEntry`` per row, in row order.
    """
    # Zipping the columns avoids building a Series per row as iterrows does.
    return [
        SubtitleEntry(
            start_timestamp=seconds_to_timestamp(timestamp_to_seconds(start)),
            stop_timestamp=seconds_to_timestamp(timestamp_to_seconds(stop)),
            subtitle=str(text),
        )
        for start, stop, text in zip(df[start_col], df[stop_col], df[subtitle_col])
    ]
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    main(parser.parse_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment