ClipsAI · mohit-twelvelabs · Jun 25, 2026
diff --git a/README.md b/README.md
@@ -50,6 +50,34 @@ print("StartTime: ", clips[0].start_time)
 print("EndTime: ", clips[0].end_time)
 ```
 
+### Finding clips with TwelveLabs Pegasus (optional)
+
+The default `ClipFinder` is transcript-only: it finds clips from the words alone. For videos whose highlights are visual rather than purely verbal, `PegasusClipFinder` is an opt-in alternative that uses [TwelveLabs](https://twelvelabs.io) Pegasus, a video-language model that analyzes the video itself. Pegasus proposes highlight time ranges, which are then aligned to the transcript so that the returned `Clip` objects are interchangeable with those from `ClipFinder`.
+
+Install the optional dependency and set a TwelveLabs API key (a free key with a generous free tier is available at [twelvelabs.io](https://twelvelabs.io)):
+
+```bash {{ language: 'python' }}
+pip install "clipsai[twelvelabs]"
+export TWELVELABS_API_KEY="your-key"
+```
+
+```python
+from clipsai import PegasusClipFinder, Transcriber
+
+transcriber = Transcriber()
+transcription = transcriber.transcribe(audio_file_path="/abs/path/to/video.mp4")
+
+clipfinder = PegasusClipFinder()
+# pass exactly one of: video_url (a publicly accessible URL) or video_id (a video
+# already indexed in TwelveLabs); the transcription should be of that same video
+clips = clipfinder.find_clips(
+    transcription=transcription,
+    video_url="https://example.com/video.mp4",
+)
+
+print("StartTime: ", clips[0].start_time)
+print("EndTime: ", clips[0].end_time)
+```
+
 ### Resizing a video
 
 A hugging face access token is required to resize a video since [Pyannote](https://github.com/pyannote/pyannote-audio) is utilized for speaker diarization. You won't be charged for using Pyannote and instructions are on the [Pyannote HuggingFace ](https://huggingface.co/pyannote/speaker-diarization-3.0#requirements) page. For resizing the original video to the desired aspect ratio, refer to the resizing reference.

diff --git a/clipsai/__init__.py b/clipsai/__init__.py
@@ -1,5 +1,6 @@
 # Functions
 from .clip.clipfinder import ClipFinder
+from .clip.pegasus_clipfinder import PegasusClipFinder
 from .media.audio_file import AudioFile
 from .media.audiovideo_file import AudioVideoFile
 from .media.editor import MediaEditor
@@ -22,6 +23,7 @@
     "Clip",
     "Crops",
     "MediaEditor",
+    "PegasusClipFinder",
     "Segment",
     "Sentence",
     "Transcriber",

diff --git a/clipsai/clip/pegasus_clipfinder.py b/clipsai/clip/pegasus_clipfinder.py
@@ -0,0 +1,296 @@
+"""
+Finding clips using TwelveLabs Pegasus video understanding.
+
+Notes
+-----
+- Pegasus is a video-language model that analyzes the actual video (visuals, audio,
+  and pacing) rather than the transcript alone. This makes it an alternative to the
+  transcript-only TextTiling heuristic used by ``ClipFinder``, useful for videos where
+  highlights are not purely verbal.
+- Requires a TwelveLabs API key (set ``TWELVELABS_API_KEY`` or pass ``api_key``). Get a
+  free key with a generous free tier at https://twelvelabs.io.
+- TwelveLabs Python SDK: https://github.com/twelvelabs-io/twelvelabs-python
+"""
+# standard library imports
+from __future__ import annotations
+import json
+import logging
+import math
+import os
+import re
+
+# current package imports
+from .clip import Clip
+from .exceptions import ClipFinderError
+
+# local package imports
+from clipsai.transcribe.transcription import Transcription
+from clipsai.utils.config_manager import ConfigManager
+from clipsai.utils.utils import find_missing_dict_keys
+
+# Prompt template sent to Pegasus. ``find_clips`` fills it in with ``str.format``
+# using the ``max_clips``, ``min_clip_duration`` and ``max_clip_duration``
+# placeholders. The doubled braces in the JSON example are ``str.format`` escapes
+# that render as literal "{" and "}" in the final prompt.
+DEFAULT_PROMPT = (
+    "You are a video editor selecting standalone short-form clips from a longer "
+    "video. Identify the {max_clips} most engaging, self-contained highlights. Each "
+    "highlight must be between {min_clip_duration} and {max_clip_duration} seconds "
+    "long. Respond with ONLY a JSON array of objects, each with numeric "
+    '"start_time" and "end_time" fields in seconds, e.g. '
+    '[{{"start_time": 12.0, "end_time": 48.5}}]. Do not include any other text.'
+)
+
+
+class PegasusClipFinder:
+    """
+    Finds clips within a video using TwelveLabs' Pegasus video understanding model.
+
+    This is an opt-in alternative to :class:`~clipsai.clip.clipfinder.ClipFinder`,
+    which relies solely on the transcript. Pegasus analyzes the video itself to
+    propose highlight time ranges, which are then aligned to the transcript's
+    character offsets so the returned :class:`~clipsai.clip.clip.Clip` objects are
+    interchangeable with those produced by ``ClipFinder``.
+    """
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model_name: str = "pegasus1.5",
+        min_clip_duration: int = 15,
+        max_clip_duration: int = 900,
+        max_clips: int = 10,
+        max_tokens: int = 2048,
+    ) -> None:
+        """
+        Validates the settings and resolves the TwelveLabs API key.
+
+        Parameters
+        ----------
+        api_key: str | None
+            TwelveLabs API key. If None (or empty), read from the
+            ``TWELVELABS_API_KEY`` environment variable.
+        model_name: str
+            Pegasus model to use. Possible values: 'pegasus1.5', 'pegasus1.2'
+        min_clip_duration: int
+            Minimum duration in seconds for a clip
+        max_clip_duration: int
+            Maximum duration in seconds for a clip
+        max_clips: int
+            Maximum number of clips to request from Pegasus
+        max_tokens: int
+            Maximum number of tokens Pegasus may generate
+
+        Raises
+        ------
+        ClipFinderError
+            if no API key is passed and ``TWELVELABS_API_KEY`` is unset or empty
+        """
+        # settings are validated before the API key so a bad config fails first
+        config_manager = PegasusClipFinderConfigManager()
+        config_manager.assert_valid_config(
+            {
+                "model_name": model_name,
+                "min_clip_duration": min_clip_duration,
+                "max_clip_duration": max_clip_duration,
+                "max_clips": max_clips,
+            }
+        )
+
+        api_key = api_key or os.environ.get("TWELVELABS_API_KEY")
+        if not api_key:
+            raise ClipFinderError(
+                "A TwelveLabs API key is required. Pass api_key or set the "
+                "TWELVELABS_API_KEY environment variable. Get a free key at "
+                "https://twelvelabs.io."
+            )
+
+        self._api_key = api_key
+        self._model_name = model_name
+        self._min_clip_duration = min_clip_duration
+        self._max_clip_duration = max_clip_duration
+        self._max_clips = max_clips
+        self._max_tokens = max_tokens
+
+    def find_clips(
+        self,
+        transcription: Transcription,
+        video_url: str | None = None,
+        video_id: str | None = None,
+    ) -> list[Clip]:
+        """
+        Finds clips in a video using TwelveLabs Pegasus.
+
+        Provide exactly one of ``video_url`` or ``video_id``. The transcription is
+        used to map Pegasus' proposed time ranges to character offsets so the returned
+        clips match the format produced by ``ClipFinder``.
+
+        Parameters
+        ----------
+        transcription: Transcription
+            the transcription of the source media, used to map times to char offsets
+        video_url: str | None
+            a publicly accessible URL to the video to analyze
+        video_id: str | None
+            the id of a video already indexed in a TwelveLabs index
+
+        Returns
+        -------
+        list[Clip]
+            list of clips found in the video, sorted by start time
+
+        Raises
+        ------
+        ClipFinderError
+            if neither or both of ``video_url`` and ``video_id`` are given, or if
+            the Pegasus response cannot be parsed
+        ImportError
+            if the optional ``twelvelabs`` package is not installed
+        """
+        if (video_url is None) == (video_id is None):
+            raise ClipFinderError("Provide exactly one of 'video_url' or 'video_id'.")
+
+        prompt = DEFAULT_PROMPT.format(
+            max_clips=self._max_clips,
+            min_clip_duration=self._min_clip_duration,
+            max_clip_duration=self._max_clip_duration,
+        )
+
+        # imported lazily so the optional dependency is only required when used
+        try:
+            from twelvelabs import TwelveLabs
+        except ImportError as exc:
+            raise ImportError(
+                "PegasusClipFinder requires the optional 'twelvelabs' package. "
+                'Install it with: pip install "clipsai[twelvelabs]"'
+            ) from exc
+
+        # the two supported sources differ only in how the video is referenced
+        if video_id is not None:
+            video_kwargs = {"video_id": video_id}
+        else:
+            video_kwargs = {"video": {"type": "url", "url": video_url}}
+
+        client = TwelveLabs(api_key=self._api_key)
+        response = client.analyze(
+            model_name=self._model_name,
+            prompt=prompt,
+            max_tokens=self._max_tokens,
+            **video_kwargs,
+        )
+
+        time_ranges = self._parse_response(response.data)
+        return self._ranges_to_clips(time_ranges, transcription)
+
+    def _parse_response(self, text: str) -> list[dict]:
+        """
+        Parses the JSON array of time ranges out of Pegasus' text response.
+
+        Pegasus may wrap the JSON in prose or markdown fences, and that prose may
+        itself contain square brackets. Rather than grabbing everything between the
+        first '[' and the last ']', each '[' is tried in turn as the start of a JSON
+        array; the first array made up solely of objects wins. If no such array
+        exists, the first array that decoded at all is returned and its elements are
+        left for the caller to validate.
+
+        Parameters
+        ----------
+        text: str
+            the raw text returned by Pegasus
+
+        Returns
+        -------
+        list[dict]
+            list of dicts, expected to carry 'start_time' and 'end_time' keys
+
+        Raises
+        ------
+        ClipFinderError
+            if the response is empty, contains no '[', or no '[' starts valid JSON
+        """
+        if not text:
+            raise ClipFinderError("Pegasus returned an empty response.")
+
+        decoder = json.JSONDecoder()
+        first_array = None
+        last_error = None
+        for bracket in re.finditer(r"\[", text):
+            try:
+                # raw_decode stops at the end of the array, ignoring trailing text
+                candidate, _ = decoder.raw_decode(text, bracket.start())
+            except json.JSONDecodeError as exc:
+                last_error = exc
+                continue
+            if all(isinstance(item, dict) for item in candidate):
+                return candidate
+            if first_array is None:
+                first_array = candidate
+
+        if first_array is not None:
+            return first_array
+        if last_error is not None:
+            raise ClipFinderError(
+                "Failed to parse JSON from the Pegasus response: {}".format(last_error)
+            ) from last_error
+        raise ClipFinderError(
+            "Could not find a JSON array in the Pegasus response: {}".format(text)
+        )
+
+    @staticmethod
+    def _coerce_seconds(value) -> float | None:
+        """
+        Converts a time value from the Pegasus response to a finite float.
+
+        Parameters
+        ----------
+        value
+            the raw 'start_time' or 'end_time' value decoded from JSON
+
+        Returns
+        -------
+        float | None
+            the value in seconds, or None if it is not a finite number
+        """
+        try:
+            seconds = float(value)
+        except (TypeError, ValueError, OverflowError):
+            return None
+        # json.loads accepts NaN/Infinity, which would defeat the duration checks
+        return seconds if math.isfinite(seconds) else None
+
+    def _ranges_to_clips(
+        self,
+        time_ranges: list[dict],
+        transcription: Transcription,
+    ) -> list[Clip]:
+        """
+        Converts Pegasus time ranges into Clip objects, dropping ranges that fall
+        outside the configured duration bounds or the transcript's bounds.
+
+        Malformed entries (not a dict, missing a key, or carrying a time that is not
+        a finite number) are logged and skipped rather than aborting the whole run.
+
+        Parameters
+        ----------
+        time_ranges: list[dict]
+            list of dicts with 'start_time' and 'end_time' keys (seconds)
+        transcription: Transcription
+            transcription used to map times to character offsets
+
+        Returns
+        -------
+        list[Clip]
+            list of clips, sorted by start time
+        """
+        media_end = transcription.end_time
+        clips = []
+        for time_range in time_ranges:
+            if (
+                not isinstance(time_range, dict)
+                or "start_time" not in time_range
+                or "end_time" not in time_range
+            ):
+                logging.warning("Skipping malformed Pegasus range: %s", time_range)
+                continue
+
+            raw_start = self._coerce_seconds(time_range["start_time"])
+            raw_end = self._coerce_seconds(time_range["end_time"])
+            if raw_start is None or raw_end is None:
+                logging.warning("Skipping malformed Pegasus range: %s", time_range)
+                continue
+
+            # clamp the proposed range to [0, end of the transcription]
+            start_time = max(0.0, raw_start)
+            end_time = min(raw_end, media_end)
+            duration = end_time - start_time
+            if duration < self._min_clip_duration:
+                continue
+            if duration > self._max_clip_duration:
+                continue
+
+            start_char = transcription.find_char_index(start_time, "start")
+            end_char = transcription.find_char_index(end_time, "end")
+            clips.append(Clip(start_time, end_time, start_char, end_char))
+
+        clips.sort(key=lambda clip: clip.start_time)
+        return clips
+
+
+class PegasusClipFinderConfigManager(ConfigManager):
+    """
+    A class for validating PegasusClipFinder configuration settings.
+    """
+
+    VALID_MODEL_NAMES = ("pegasus1.5", "pegasus1.2")
+
+    def check_valid_config(self, config: dict) -> str | None:
+        """
+        Checks that 'config' contains valid configuration settings. Returns None if
+        valid, a descriptive error message if invalid.
+
+        Checks run in this order: required keys present, model name recognised,
+        setting types, then value ranges. The first failure is reported.
+
+        Parameters
+        ----------
+        config: dict
+            A dictionary containing the configuration settings for PegasusClipFinder.
+
+        Returns
+        -------
+        str or None
+            None if the inputs are valid, otherwise an error message.
+        """
+        required_keys = [
+            "model_name",
+            "min_clip_duration",
+            "max_clip_duration",
+            "max_clips",
+        ]
+        missing_keys = find_missing_dict_keys(config, required_keys)
+        if len(missing_keys) != 0:
+            return "PegasusClipFinder missing configuration settings: {}".format(
+                missing_keys
+            )
+
+        if config["model_name"] not in self.VALID_MODEL_NAMES:
+            return "model_name must be one of {}, not {}".format(
+                self.VALID_MODEL_NAMES, config["model_name"]
+            )
+
+        # NOTE(review): check_type is assumed to follow this module's check_*
+        # convention of returning an error message (None when valid) -- confirm
+        # against the type checker. Its result is propagated so that a wrongly typed
+        # setting is reported here instead of failing in the comparisons below; if
+        # check_type raises instead, this loop behaves exactly as before.
+        type_checks = (
+            ("min_clip_duration", (float, int)),
+            ("max_clip_duration", (float, int)),
+            ("max_clips", int),
+        )
+        for setting, valid_types in type_checks:
+            err = self._type_checker.check_type(config[setting], setting, valid_types)
+            if err is not None:
+                return err
+
+        if config["min_clip_duration"] < 0:
+            return "min_clip_duration must be 0 or greater, not {}".format(
+                config["min_clip_duration"]
+            )
+        if config["max_clip_duration"] <= config["min_clip_duration"]:
+            return (
+                "max_clip_duration of {} must be greater than min_clip_duration of {}"
+                "".format(config["max_clip_duration"], config["min_clip_duration"])
+            )
+        if config["max_clips"] < 1:
+            return "max_clips must be 1 or greater, not {}".format(config["max_clips"])
+
+        return None
diff --git a/setup.py b/setup.py
@@ -51,6 +51,9 @@
     },
     include_package_data=True,
     extras_require={
+        "twelvelabs": [
+            "twelvelabs>=1.2.8",
+        ],
         "dev": [
             "black",
             "black[jupyter]",