Video Frame Search
This notebook adds a cooking video to ApertureDB, extracts frames at regular intervals, embeds each frame with a CLIP model, and runs text-to-frame search to find the most relevant moments in the video.
Connect to ApertureDB
Option A: ApertureDB Cloud (recommended)
Sign up for a free 30-day trial. Get your key from Connect → Generate API Key, and add it to a .env file in this directory:
APERTUREDB_KEY=your_key_here
Option B: Community Edition (local Docker)
Run this in a terminal before starting the notebook:
docker run -d --name aperturedb \
-p 55555:55555 -e ADB_MASTER_KEY=admin -e ADB_FORCE_SSL=false \
aperturedata/aperturedb-community
See client configuration options for all connection methods and server setup options for deployment choices.
%pip install aperturedb sentence-transformers Pillow python-dotenv
# Pull APERTUREDB_KEY (and any other settings) from the local .env file
# into the process environment so create_connector() can find it.
from dotenv import load_dotenv

load_dotenv()
True
# Option B (local Community Edition) only — uncomment to create an `adb` profile:
# !adb config create localdb --active \
#     --host localhost --port 55555 \
#     --username admin --password admin \
#     --no-use-ssl --no-interactive
# Connect using whatever configuration is available (APERTUREDB_KEY from
# the environment, or an active `adb` profile), then run a lightweight
# status query to confirm the server is reachable.
from aperturedb.CommonLibrary import create_connector

client = create_connector()
client.query([{"GetStatus": {}}])
client.print_last_response()
[
{
"GetStatus": {
"info": "OK",
"status": 0,
"system": "ApertureDB",
"version": "0.19.6"
}
}
]
Step: Load the CLIP model
from sentence_transformers import SentenceTransformer

# CLIP embeds images and text into the same vector space, which is what
# makes the text-to-frame search at the end of this notebook possible.
model = SentenceTransformer("clip-ViT-B-32")
dims = model.get_sentence_embedding_dimension()
print(f"Embedding dimensions: {dims}")
Step: Add a video to ApertureDB
import requests

# Source clip: a short crepe-flambe cooking video from the Cookbook repo.
VIDEO_URL = (
    "https://raw.githubusercontent.com/aperture-data/Cookbook/"
    "main/notebooks/simple/data/crepe_flambe.mp4"
)

download = requests.get(VIDEO_URL, timeout=30)
download.raise_for_status()
video_bytes = download.content
print(f"Downloaded {len(video_bytes):,} bytes")

# Store the video with searchable metadata; _ref lets later commands in the
# same transaction refer back to this entity.
add_video_query = [
    {
        "AddVideo": {
            "_ref": 1,
            "properties": {
                "name": "crepe_flambe",
                "cuisine": "French",
            },
        }
    }
]
response, _ = client.query(add_video_query, [video_bytes])
client.print_last_response()
Downloaded 2,911,784 bytes
[
{
"AddVideo": {
"status": 0
}
}
]
Step: Extract frames from the video
# Sample the video at five evenly spaced positions (every 30th frame).
FRAME_NUMBERS = [0, 30, 60, 90, 120]

# Locate the stored video by name, then extract the requested frames from it
# in the same transaction via the _ref link.
extract_query = [
    {
        "FindVideo": {
            "constraints": {"name": ["==", "crepe_flambe"]},
            "_ref": 1,
            "blobs": False,
        }
    },
    {
        "ExtractFrames": {
            "video_ref": 1,
            "frame_spec": FRAME_NUMBERS,
        }
    },
]
response, frame_blobs = client.query(extract_query)
n_frames = response[1]["ExtractFrames"].get("returned", 0)
print(f"Extracted {n_frames} frames")
Extracted 5 frames
Step: Create a DescriptorSet for frame embeddings
SET_NAME = "video_frame_search"

# Create the index that will hold one CLIP embedding per extracted frame.
# Derive the dimensionality from the loaded model (512 for clip-ViT-B-32)
# instead of hard-coding it, so the set always matches what model.encode()
# produces even if the model is swapped out above.
client.query([{
    "AddDescriptorSet": {
        "name": SET_NAME,
        "dimensions": model.get_sentence_embedding_dimension(),
        "engine": "FaissFlat",
        # Cosine similarity ("CS") pairs with the normalized embeddings
        # stored in the next cell.
        "metric": "CS",
    }
}])
client.print_last_response()
[
{
"AddDescriptorSet": {
"status": 0
}
}
]
Step: Embed and store each frame
Each frame blob is decoded as a PIL image, embedded with CLIP, and stored as a Descriptor linked to the video with the frame number as a property.
from PIL import Image
from io import BytesIO
import numpy as np

# Decode, embed, and store each frame. zip() pairs each returned blob with
# its frame number — both follow the order of FRAME_NUMBERS requested above.
# (The unused enumerate index from the original loop has been removed.)
for frame_blob, frame_num in zip(frame_blobs, FRAME_NUMBERS):
    img = Image.open(BytesIO(frame_blob)).convert("RGB")
    # normalize_embeddings=True makes the "CS" (cosine) metric of the
    # descriptor set meaningful; float32 matches the set's storage format.
    emb = model.encode(img, normalize_embeddings=True).astype("float32")
    q = [{
        "FindVideo": {
            "constraints": {"name": ["==", "crepe_flambe"]},
            "_ref": 1,
            "blobs": False,
        }
    }, {
        "AddDescriptor": {
            "set": SET_NAME,
            # Link each descriptor back to its source video so frame
            # embeddings can be traversed from the video entity.
            "connect": {"ref": 1, "class": "has_frame_embedding"},
            "properties": {
                "frame_number": frame_num,
                "video_name": "crepe_flambe",
            }
        }
    }]
    client.query(q, [emb.tobytes()])
print(f"Stored embeddings for {len(frame_blobs)} frames")
Stored embeddings for 5 frames
Step: Text-to-frame search
Encode a text query and find the frame whose visual content best matches. The frame number tells you where in the video the match occurs.
query_text = "pouring batter into pan"

# Embed the text with the same CLIP model (and the same normalization) used
# for the frames, so distances in the descriptor set are comparable.
query_emb = model.encode(query_text, normalize_embeddings=True).astype("float32")

search_query = [
    {
        "FindDescriptor": {
            "set": SET_NAME,
            "k_neighbors": 3,
            "distances": True,
            "results": {"all_properties": True},
        }
    }
]
response, _ = client.query(search_query, [query_emb.tobytes()])

print(f'Query: "{query_text}"\n')
hits = response[0]["FindDescriptor"].get("entities", [])
for hit in hits:
    print(f" Frame {hit['frame_number']:>4} distance={hit['_distance']:.4f}")
Query: "pouring batter into pan"
Frame 0 distance=0.2711
Frame 60 distance=0.2670
Frame 120 distance=0.2666
Step: Cleanup (optional)
# Remove the descriptor set (and the descriptors in it) created above.
cleanup_query = [{"DeleteDescriptorSet": {"with_name": SET_NAME}}]
client.query(cleanup_query)
client.print_last_response()
[
{
"DeleteDescriptorSet": {
"count": 1,
"status": 0
}
}
]
# Find the stored video and delete it in the same transaction via its _ref.
delete_query = [
    {"FindVideo": {"_ref": 1}},
    {"DeleteVideo": {"ref": 1}},
]
client.query(delete_query)
client.print_last_response()
[
{
"FindVideo": {
"returned": 0,
"status": 0
}
},
{
"DeleteVideo": {
"count": 1,
"status": 0
}
}
]