Skip to main content

Video Frame Search

Open In Colab Download View source on GitHub

This notebook adds a cooking video to ApertureDB, extracts frames at regular intervals, embeds each frame with a CLIP model, and runs text-to-frame search to find the most relevant moments in the video.

Connect to ApertureDB

Option A: ApertureDB Cloud (recommended)
Sign up for a free 30-day trial. Get your key from Connect → Generate API Key, and add it to a .env file in this directory:

APERTUREDB_KEY=your_key_here

Option B: Community Edition (local Docker)
Run this in a terminal before starting the notebook:

docker run -d --name aperturedb \
-p 55555:55555 -e ADB_MASTER_KEY=admin -e ADB_FORCE_SSL=false \
aperturedata/aperturedb-community

See client configuration options for all connection methods and server setup options for deployment choices.

# Install the notebook's dependencies (IPython magic; runs inside the notebook).
%pip install aperturedb sentence-transformers Pillow python-dotenv
from dotenv import load_dotenv
# Pull APERTUREDB_KEY (and any other settings) from the local .env file.
load_dotenv()
True
# Alternative for Community Edition: create a local CLI config instead of .env.
# !adb config create localdb --active \
# --host localhost --port 55555 \
# --username admin --password admin \
# --no-use-ssl --no-interactive
from aperturedb.CommonLibrary import create_connector

# create_connector() picks up the environment / CLI configuration set up above.
client = create_connector()
# GetStatus is a cheap round-trip that confirms the connection is live.
response, _ = client.query([{"GetStatus": {}}])
client.print_last_response()
[
{
"GetStatus": {
"info": "OK",
"status": 0,
"system": "ApertureDB",
"version": "0.19.6"
}
}
]

Step: Load the CLIP model

# Load the CLIP encoder; the same model embeds both images (frames) and text
# (search queries) into a shared vector space.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("clip-ViT-B-32")
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Embedding dimensions: {embedding_dim}")

Step: Add a video to ApertureDB

import requests

VIDEO_URL = "https://raw.githubusercontent.com/aperture-data/Cookbook/main/notebooks/simple/data/crepe_flambe.mp4"

# Fetch the sample cooking video over HTTPS; fail loudly on a bad status.
download = requests.get(VIDEO_URL, timeout=30)
download.raise_for_status()
video_bytes = download.content
print(f"Downloaded {len(video_bytes):,} bytes")

# Store the video in ApertureDB with a couple of searchable properties.
add_video_query = [
    {
        "AddVideo": {
            "_ref": 1,
            "properties": {
                "name": "crepe_flambe",
                "cuisine": "French",
            },
        }
    }
]
response, _ = client.query(add_video_query, [video_bytes])
client.print_last_response()

Downloaded 2,911,784 bytes
[
{
"AddVideo": {
"status": 0
}
}
]

Step: Extract frames from the video

# Frame indices to sample from the stored video: 0, 30, 60, 90, 120.
FRAME_NUMBERS = [0, 30, 60, 90, 120]

# Locate the video by name, then pull the requested frames in one transaction;
# ExtractFrames refers back to the FindVideo result through _ref / video_ref.
extract_query = [
    {
        "FindVideo": {
            "constraints": {"name": ["==", "crepe_flambe"]},
            "_ref": 1,
            "blobs": False,
        }
    },
    {
        "ExtractFrames": {
            "video_ref": 1,
            "frame_spec": FRAME_NUMBERS,
        }
    },
]

response, frame_blobs = client.query(extract_query)
n_frames = response[1]["ExtractFrames"].get("returned", 0)
print(f"Extracted {n_frames} frames")

Extracted 5 frames

Step: Create a DescriptorSet for frame embeddings

# A DescriptorSet holds the frame embeddings: 512 dimensions (CLIP ViT-B/32),
# flat FAISS index, cosine-similarity ("CS") metric.
SET_NAME = "video_frame_search"

create_set = {
    "AddDescriptorSet": {
        "name": SET_NAME,
        "dimensions": 512,
        "engine": "FaissFlat",
        "metric": "CS",
    }
}
client.query([create_set])
client.print_last_response()
[
{
"AddDescriptorSet": {
"status": 0
}
}
]

Step: Embed and store each frame

Each frame blob is decoded as a PIL image, embedded with CLIP, and stored as a Descriptor linked to the video with the frame number as a property.

from PIL import Image
from io import BytesIO

# Decode each extracted frame, embed it with CLIP, and store the embedding as
# a Descriptor connected to the video. zip() pairs each blob with its frame
# number — this assumes ExtractFrames returns frames in frame_spec order
# (matches the transcript output above).
# Fix: the original used enumerate() but never used the index; numpy was
# imported but never used in this notebook.
for frame_blob, frame_num in zip(frame_blobs, FRAME_NUMBERS):
    img = Image.open(BytesIO(frame_blob)).convert("RGB")
    # normalize_embeddings=True so cosine similarity reduces to a dot product.
    emb = model.encode(img, normalize_embeddings=True).astype("float32")

    q = [{
        "FindVideo": {
            "constraints": {"name": ["==", "crepe_flambe"]},
            "_ref": 1,
            "blobs": False,
        }
    }, {
        "AddDescriptor": {
            "set": SET_NAME,
            # Connect the descriptor to the video so a match can be traced
            # back to its source clip.
            "connect": {"ref": 1, "class": "has_frame_embedding"},
            "properties": {
                "frame_number": frame_num,
                "video_name": "crepe_flambe",
            }
        }
    }]
    # The raw float32 bytes travel as the blob attached to AddDescriptor.
    client.query(q, [emb.tobytes()])

print(f"Stored embeddings for {len(frame_blobs)} frames")
Stored embeddings for 5 frames

Step: Search frames with a text query

Encode a text query with the same CLIP model and find the frame whose visual content best matches. The frame number tells you where in the video the match occurs.

# Embed the text query in the same CLIP space, then run a k-NN search
# against the descriptor set; smaller distance = closer match.
query_text = "pouring batter into pan"

query_emb = model.encode(query_text, normalize_embeddings=True).astype("float32")

search = [{
    "FindDescriptor": {
        "set": SET_NAME,
        "k_neighbors": 3,
        "distances": True,
        "results": {"all_properties": True},
    }
}]

response, _ = client.query(search, [query_emb.tobytes()])

print(f'Query: "{query_text}"\n')
matches = response[0]["FindDescriptor"].get("entities", [])
for entity in matches:
    print(f" Frame {entity['frame_number']:>4} distance={entity['_distance']:.4f}")
Query: "pouring batter into pan"

Frame 0 distance=0.2711
Frame 60 distance=0.2670
Frame 120 distance=0.2666

Step: Cleanup (optional)

# Remove the descriptor set created by this notebook (and its descriptors).
client.query([{"DeleteDescriptorSet": {"with_name": SET_NAME}}])
client.print_last_response()
[
{
"DeleteDescriptorSet": {
"count": 1,
"status": 0
}
}
]
# Delete the video via a FindVideo/DeleteVideo reference pair.
# NOTE(review): this FindVideo has no constraints, so it presumably matches
# every video on the instance — confirm before running against shared data.
client.query([{"FindVideo":{"_ref":1}}, {"DeleteVideo": {"ref": 1}}])
client.print_last_response()
[
{
"FindVideo": {
"returned": 0,
"status": 0
}
},
{
"DeleteVideo": {
"count": 1,
"status": 0
}
}
]