To analyze and generate captions for videos using LangChain, you can leverage the langchain_experimental.video_captioning module. Below is a comprehensive guide, including the necessary Python classes for handling video and audio models and generating captions.

Key Classes

  1. BaseModel: This class serves as a base for video and audio models.
  2. VideoModel: Represents a video segment, including start and end times and a description of the visual content.
  3. AudioModel: Represents an audio segment, including start and end times and subtitle text.
  4. CaptionModel: Represents a caption generated from video or audio models.

Implementation

BaseModel

This class serves as the base for other models.

from datetime import datetime
from typing import Any

class BaseModel:
    def __init__(self, start_time: int, end_time: int) -> None:
        self._start_time = start_time
        self._end_time = end_time

    @property
    def start_time(self) -> int:
        return self._start_time

    @start_time.setter
    def start_time(self, value: int) -> None:
        self._start_time = value

    @property
    def end_time(self) -> int:
        return self._end_time

    @end_time.setter
    def end_time(self, value: int) -> None:
        self._end_time = value

    def __str__(self) -> str:
        return f"start_time: {self.start_time}, end_time: {self.end_time}"

    @classmethod
    def from_srt(cls, start_time: str, end_time: str, *args: Any) -> "BaseModel":
        return cls(
            cls._srt_time_to_ms(start_time),
            cls._srt_time_to_ms(end_time),
            *args
        )

    @staticmethod
    def _srt_time_to_ms(srt_time_string: str) -> int:
        # Convert an SRT timestamp (HH:MM:SS,mmm) into a millisecond offset.
        time_format = "%H:%M:%S,%f"
        dt = datetime.strptime(srt_time_string, time_format)
        hours_ms = dt.hour * 3600000
        minutes_ms = dt.minute * 60000
        seconds_ms = dt.second * 1000
        return hours_ms + minutes_ms + seconds_ms + dt.microsecond // 1000

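For example, from_srt builds a segment directly from SRT-style timestamps (the values below are hypothetical):

# "00:01:05,250" is 1 minute, 5 seconds, 250 ms -> 65250 ms
segment = BaseModel.from_srt("00:01:05,250", "00:01:09,800")
print(segment)  # start_time: 65250, end_time: 69800
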
VideoModel

Represents a video segment.

class VideoModel(BaseModel):
    def __init__(self, start_time: int, end_time: int, image_description: str) -> None:
        super().__init__(start_time, end_time)
        self._image_description = image_description

    @property
    def image_description(self) -> str:
        return self._image_description

    @image_description.setter
    def image_description(self, value: str) -> None:
        self._image_description = value

    def __str__(self) -> str:
        return f"{super().__str__()}, image_description: {self.image_description}"

    def similarity_score(self, other: "VideoModel") -> float:
        # Naive similarity: percentage of lowercased, de-pluralized words
        # shared between the two image descriptions.
        self_tokenized = set(
            word.lower().rstrip("s") for word in self.image_description.split()
        )
        other_tokenized = set(
            word.lower().rstrip("s") for word in other.image_description.split()
        )
        common_words = self_tokenized.intersection(other_tokenized)
        similarity_score = (
            len(common_words) / max(len(self_tokenized), len(other_tokenized)) * 100
        )
        return similarity_score
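
For example, two overlapping clips with similar descriptions:

clip_a = VideoModel(0, 4000, "A dog runs across the park")
clip_b = VideoModel(4000, 8000, "A dog sits in the park")
print(clip_a.similarity_score(clip_b))  # 4 shared tokens out of 6 -> ~66.7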

AudioModel

Represents an audio segment.

class AudioModel(BaseModel):
    def __init__(self, start_time: int, end_time: int, subtitle_text: str) -> None:
        super().__init__(start_time, end_time)
        self._subtitle_text = subtitle_text

    @property
    def subtitle_text(self) -> str:
        return self._subtitle_text

    @subtitle_text.setter
    def subtitle_text(self, value: str) -> None:
        self._subtitle_text = value

    def __str__(self) -> str:
        return f"{super().__str__()}, subtitle_text: {self.subtitle_text}"
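
Like VideoModel, an AudioModel can be built straight from SRT fields via from_srt (the subtitle line below is made up):

line = AudioModel.from_srt("00:00:01,000", "00:00:03,500", "Welcome back, everyone.")
print(line)  # start_time: 1000, end_time: 3500, subtitle_text: Welcome back, everyone.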

CaptionModel

Represents a caption generated from video or audio models.

class CaptionModel(BaseModel):
    def __init__(self, start_time: int, end_time: int, closed_caption: str) -> None:
        super().__init__(start_time, end_time)
        self._closed_caption = closed_caption

    @property
    def closed_caption(self) -> str:
        return self._closed_caption

    @closed_caption.setter
    def closed_caption(self, value: str) -> None:
        self._closed_caption = value

    def add_subtitle_text(self, subtitle_text: str) -> "CaptionModel":
        self._closed_caption = self._closed_caption + " " + subtitle_text
        return self

    def __str__(self) -> str:
        return f"{super().__str__()}, closed_caption: {self.closed_caption}"

    def to_srt_entry(self, index: int) -> str:
        def _ms_to_srt_time(ms: int) -> str:
            # Convert a millisecond offset back to an SRT timestamp (HH:MM:SS,mmm).
            hours = int(ms // 3600000)
            minutes = int((ms % 3600000) // 60000)
            seconds = int((ms % 60000) // 1000)
            milliseconds = int(ms % 1000)
            return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

        return (
            f"{index}\n"
            f"{_ms_to_srt_time(self._start_time)} --> {_ms_to_srt_time(self._end_time)}\n"
            f"{self._closed_caption}"
        )

    @classmethod
    def from_audio_model(cls, audio_model: AudioModel) -> "CaptionModel":
        return cls(
            audio_model.start_time,
            audio_model.end_time,
            audio_model.subtitle_text
        )

    @classmethod
    def from_video_model(cls, video_model: VideoModel) -> "CaptionModel":
        return cls(
            video_model.start_time,
            video_model.end_time,
            f"[{video_model.image_description}]",
        )

Usage Example

You can now use these models to generate captions from video and audio segments.

# Example usage
video_model = VideoModel(start_time=0, end_time=10000, image_description="A person is walking")
audio_model = AudioModel(start_time=5000, end_time=15000, subtitle_text="Hello, how are you?")

caption_from_video = CaptionModel.from_video_model(video_model)
caption_from_audio = CaptionModel.from_audio_model(audio_model)

print(caption_from_video)
print(caption_from_audio)
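
Because the audio segment overlaps the video segment, its subtitle text can be folded into the video caption and emitted as an SRT entry; the CombineProcessor in the next section automates this kind of merging:

merged = caption_from_video.add_subtitle_text(audio_model.subtitle_text)
print(merged.to_srt_entry(1))
# 1
# 00:00:00,000 --> 00:00:10,000
# [A person is walking] Hello, how are you?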

Combining Video and Audio Models

Use the CombineProcessor class to handle overlapping video and audio segments and generate captions.

from langchain_experimental.video_captioning.services.combine_service import CombineProcessor
from langchain_openai import OpenAI

# Initialize your language model (OpenAI in this case)
llm = OpenAI(api_key="YOUR_OPENAI_API_KEY")

# Create the CombineProcessor
processor = CombineProcessor(llm=llm, verbose=True, char_limit=100)

# List of video and audio models
video_models = [VideoModel(start_time=0, end_time=10000, image_description="A person is walking")]
audio_models = [AudioModel(start_time=5000, end_time=15000, subtitle_text="Hello, how are you?")]

# Process to get captions
captions = processor.process(video_models, audio_models)

for caption in captions:
    print(caption.closed_caption)
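
The resulting captions can then be written out as a standard SRT file (the output path here is illustrative):

with open("output.srt", "w") as f:
    f.write("\n\n".join(
        caption.to_srt_entry(i) for i, caption in enumerate(captions, start=1)
    ))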

This approach allows you to analyze and generate captions for videos using LangChain's experimental video captioning capabilities. For more detailed information and advanced usage, refer to the LangChain API documentation.
