Zero-Shot Video Classification with Feluda

This notebook demonstrates how to use the feluda package to extract video frames and classify videos with the zero-shot classification operator. It covers:

  • Setting up Feluda and its operators.

  • Using video data from a subset of the UCF101 video dataset. We use this subset for the demo, but it can be replaced with any video dataset.

  • Using Feluda’s zero-shot classification operator to classify videos and group them into clusters.

  • Visualizing the clusters with video thumbnails and the classified labels.


Install dependencies conditionally based on whether the notebook is running in Colab or locally.

%%time
import os
import sys

IN_COLAB = "google.colab" in sys.modules
print("Running Notebook in Google Colab" if IN_COLAB else "Running Notebook locally")

if IN_COLAB:
    # Since Google Colab has preinstalled libraries like tensorflow and numba, we create a folder called feluda_custom_venv and isolate the environment there.
    # This is done to avoid any conflicts with the preinstalled libraries.
    %pip install uv
    !mkdir -p /content/feluda_custom_venv
    !uv pip install --target=/content/feluda_custom_venv --prerelease allow feluda feluda-classify-video-zero-shot opencv-python matplotlib > /dev/null 2>&1

    sys.path.insert(0, "/content/feluda_custom_venv")
else:
    !uv pip install feluda feluda-classify-video-zero-shot opencv-python matplotlib > /dev/null 2>&1
Running Notebook locally
Using Python 3.10.12 environment at: /home/aatman/Aatman/Tattle/feluda/.venv
Audited 6 packages in 11ms
CPU times: user 6.38 ms, sys: 4.13 ms, total: 10.5 ms
Wall time: 138 ms
import tarfile

import cv2
import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download

from feluda.factory import VideoFactory

Dataset Structure Breakdown

  • UCF101_subset/: The root directory containing the dataset.

    • train/: Contains training samples.

      • Each subdirectory under train/ corresponds to a specific action class:

        • BenchPress

        • BasketballDunk

        • BalanceBeam

        • ApplyLipstick

        • BabyCrawling

        • ApplyEyeMakeup

        • Archery

        • BandMarching

        • BaseballPitch

        • Basketball

    • test/: Same structure as train/.

    • val/: Same structure as train/.

We’ll use the train subset as our example here.

# Downloading and extracting

dataset_name = "UCF101_subset/train"
hf_dataset_identifier = "sayakpaul/ucf101-subset"
filename = "UCF101_subset.tar.gz"
file_path = hf_hub_download(
    repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset"
)

with tarfile.open(file_path) as t:
    # Pass an explicit filter to avoid the tarfile DeprecationWarning;
    # "data" rejects unsafe archive members (available since Python 3.10.12)
    t.extractall(".", filter="data")
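
After extraction, it’s worth a quick sanity check of the layout. A minimal sketch, assuming the archive unpacked into UCF101_subset/ in the working directory:

import os

train_dir = "UCF101_subset/train"

# List each action class and how many entries its directory contains
for class_name in sorted(os.listdir(train_dir)):
    class_path = os.path.join(train_dir, class_name)
    if os.path.isdir(class_path):
        print(f"{class_name}: {len(os.listdir(class_path))} entries")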

Initializing Feluda operator for this example

from feluda.operators import ClassifyVideoZeroShot

classifier = ClassifyVideoZeroShot()
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
def get_video_thumbnail(video_path: str, save_path: str) -> str | None:
    """Extract and save the first frame from the video as a thumbnail."""
    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()  # Read the first frame
    if ret:
        thumbnail_path = os.path.join(
            save_path, os.path.basename(video_path).replace(".avi", "_thumbnail.jpg")
        )
        cv2.imwrite(thumbnail_path, frame)  # Save the thumbnail as a JPEG
        cap.release()
        return thumbnail_path  # Return the path to the saved thumbnail
    cap.release()
    return None

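For a quick check, you can run the helper on a single file; a small usage sketch (the class directory and file index here are arbitrary choices):

os.makedirs("thumbnails", exist_ok=True)

# Grab the first video from one class directory and save its thumbnail
sample_class = os.path.join(dataset_name, "Archery")
sample_video = os.path.join(sample_class, os.listdir(sample_class)[0])
print(get_video_thumbnail(sample_video, "thumbnails"))
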
Next, we predict the class of each video using the classify_video_zero_shot operator. The operator uses CLIP ViT-B/32 with a zero-shot approach: the class names themselves serve as the candidate labels, so no fine-tuning on UCF101 is required.

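To build intuition, here is a minimal sketch of the zero-shot idea using the Hugging Face transformers API on a single saved frame. This is an illustration only, not Feluda's internal implementation, and the thumbnail path is hypothetical:

# Illustration: score one frame against the class names with CLIP ViT-B/32
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

candidate_labels = sorted(os.listdir(dataset_name))  # the ten action classes
frame = Image.open("thumbnails/example_thumbnail.jpg")  # hypothetical path

inputs = clip_processor(
    text=candidate_labels, images=frame, return_tensors="pt", padding=True
)
probs = clip_model(**inputs).logits_per_image.softmax(dim=1)
print(candidate_labels[probs.argmax().item()])

The actual classification loop applies the Feluda operator to up to five videos per class, saving a thumbnail for each:
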
operator_parameter = []
labels = os.listdir(dataset_name)

thumbnail_save_dir = "thumbnails"
os.makedirs(thumbnail_save_dir, exist_ok=True)

for class_dir in labels:
    temp_list = os.listdir(os.path.join(dataset_name, class_dir))

    # Classify at most five videos per class, skipping the stray "UCF101" entry
    i = 0
    while i < len(temp_list) and i < 5:
        if temp_list[i] == "UCF101":
            i += 1
            continue

        video_full_path = os.path.join(dataset_name, class_dir, temp_list[i])

        # Extract and save the video thumbnail before processing
        get_video_thumbnail(video_full_path, thumbnail_save_dir)

        video = VideoFactory.make_from_file_on_disk(video_full_path)
        operator_parameter.append([classifier.run(video, labels), video_full_path])

        i += 1

An example of the classify_video_zero_shot operator output; here the first video is classified as BalanceBeam:

operator_parameter[0][0]
{'prediction': 'BalanceBeam',
 'probs': [0.9969499707221985,
  4.102734601474367e-05,
  2.7282658265903592e-05,
  0.0002480082039255649,
  2.674547431524843e-05,
  3.819030462182127e-05,
  7.645731966476887e-05,
  0.00010040927008958533,
  0.002456656191498041,
  3.523077248246409e-05]}
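
The probs list appears to follow the order of the labels list passed to classifier.run(); assuming that ordering holds, you can pair each label with its score:

# Pair labels with probabilities (assumes `probs` follows the order of
# the `labels` list passed to classifier.run())
scores = dict(zip(labels, operator_parameter[0][0]["probs"]))
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3])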

Clustering videos

This cell groups the videos by their predicted class.

# preprocessing clusters for visualization

clusters = {key: [] for key in sorted(labels)}

for results in operator_parameter:
    class_predicted = results[0]["prediction"]
    payload_path = results[1]

    clusters[class_predicted].append(payload_path)
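
Before plotting, a quick count shows how the videos were distributed across predicted classes:

# Print the size of each cluster
for label, paths in clusters.items():
    print(f"{label}: {len(paths)} video(s)")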

Visualizing Clusters

thumbnail_folder = "thumbnails"


import numpy as np


def load_thumbnail(payload: str) -> np.ndarray | None:
    """Load the thumbnail image from the pre-saved thumbnail folder."""
    video_filename = os.path.basename(payload)
    thumbnail_filename = video_filename.replace(".avi", "_thumbnail.jpg")
    thumbnail_path = os.path.join(thumbnail_folder, thumbnail_filename)

    if os.path.exists(thumbnail_path):
        return cv2.imread(thumbnail_path)
    print(f"Thumbnail not found for {video_filename}")
    return None


for cluster_label, video_paths in clusters.items():
    num_videos = len(video_paths)
    if not num_videos:
        continue

    fig, axes = plt.subplots(1, num_videos, figsize=(20, 5))

    # Check if axes is a single Axes object or an array of Axes
    if num_videos == 1:
        axes = [axes]

    for i, video_path in enumerate(video_paths):
        video_thumbnail = load_thumbnail(video_path)
        if video_thumbnail is not None:
            video_thumbnail = cv2.cvtColor(
                cv2.resize(video_thumbnail, (60, 60)), cv2.COLOR_BGR2RGB
            )
            axes[i].imshow(video_thumbnail)
            axes[i].axis("off")

    plt.suptitle(f"{cluster_label}", fontsize=16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.85)
    plt.show()
(Output: one figure per predicted class, showing a row of video thumbnails titled with the cluster label.)
# Clean up resources when you're done
import shutil

shutil.rmtree("thumbnails")
shutil.rmtree("UCF101_subset")

classifier.cleanup()