Skip to main content

Hybrid Search Examples

Open In Colab Download View source on GitHub

Hybrid search combines KNN vector similarity with metadata filters and aggregates in a single query. ApertureDB applies constraints server-side during the search — not as a post-filter — so results are both semantically relevant and satisfy the condition.

This notebook shows seven patterns:

  1. KNN only
  2. KNN + equality filter
  3. KNN + range filter
  4. Sort KNN results by a property
  5. Count matching descriptors
  6. Average / min / max over results
  7. Group by (facets)

Connect to ApertureDB

Option A: ApertureDB Cloud (recommended)
Sign up for a free 30-day trial. Get your key from Connect > Generate API Key, add it to a .env file in this directory:

APERTUREDB_KEY=your_key_here

Option B: Community Edition (local Docker)
Run this in a terminal before starting the notebook:

docker run -d --name aperturedb \
-p 55555:55555 -e ADB_MASTER_KEY=admin -e ADB_FORCE_SSL=false \
aperturedata/aperturedb-community

See client configuration options for all connection methods and server setup options for deployment choices.

%pip install --upgrade --quiet aperturedb python-dotenv sentence-transformers pandas
# Option A: ApertureDB Cloud
from dotenv import load_dotenv

# Pull APERTUREDB_KEY out of the local .env file and into the process
# environment so create_connector() can pick it up.
load_dotenv()
True
# Option B: Community Edition (local Docker)
# !adb config create localdb --active \
# --host localhost --port 55555 \
# --username admin --password admin \
# --no-use-ssl --no-interactive
from aperturedb.CommonLibrary import create_connector

# Open a connection using whatever configuration is active
# (APERTUREDB_KEY from the environment, or an adb CLI profile).
client = create_connector()

# Smoke-test the connection: GetStatus should return status 0.
status_query = [{"GetStatus": {}}]
response, _ = client.query(status_query)
client.print_last_response()
[
{
"GetStatus": {
"info": "OK",
"status": 0,
"system": "ApertureDB",
"version": "0.19.6"
}
}
]

Load and Ingest the Cookbook Dataset

We embed all 20 dishes with all-MiniLM-L6-v2 and store them with cuisine and calorie metadata.

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Fetch the cookbook metadata straight from GitHub.
CSV_URL = "https://raw.githubusercontent.com/aperture-data/Cookbook/refs/heads/main/images.adb.csv"
dishes = pd.read_csv(CSV_URL)

# One searchable sentence per dish: "<name> - <caption>".
dishes["description"] = dishes["dish_name"] + " - " + dishes["caption"]

# Encode every description; normalized vectors make cosine scores comparable.
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = dishes["description"].tolist()
embeddings = model.encode(texts, normalize_embeddings=True)
print(f"Loaded {len(dishes)} dishes, embedding shape: {embeddings.shape}")
Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key | Status | |
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED | |

Notes:
- UNEXPECTED: can be ignored when loading from different task/architecture; not ok if you expect identical arch.
``````output
Loaded 20 dishes, embedding shape: (20, 384)
SET_NAME = "cookbook_hybrid"

# Create the descriptor set: 384-d vectors, HNSW index, cosine similarity.
client.query([{
    "AddDescriptorSet": {
        "name": SET_NAME, "dimensions": 384, "engine": "HNSW", "metric": "CS",
    }
}])

# Add synthetic calorie counts for the range filter demo
import random
random.seed(42)

# Iterate positionally so each row stays aligned with `embeddings`.
# The previous `for i, row in dishes.iterrows()` used the DataFrame *label*
# to index the embeddings array, which silently pairs the wrong vector with
# the wrong metadata whenever the index is not the default 0..n-1 range.
for pos, (_, row) in enumerate(dishes.iterrows()):
    emb = embeddings[pos].astype("float32")
    client.query([{
        "AddDescriptor": {
            "set": SET_NAME,
            "properties": {
                "dish_name": row["dish_name"],
                "cuisine": row["food_tags"],
                "calories": random.randint(300, 900),
            },
            # Keyed on dish_name so re-running the cell does not duplicate.
            "if_not_found": {"dish_name": ["==", row["dish_name"]]},
        }
    }], [emb.tobytes()])

print(f"Ingested {len(dishes)} descriptors")
Ingested 20 descriptors

Pattern 1: KNN Only

Find the 3 dishes most similar to the query — no metadata filter.

query_text = "spicy curry with rice"
# Embed the query exactly like the corpus was embedded: normalized float32.
query_emb = model.encode([query_text], normalize_embeddings=True)[0].astype("float32")

# Pure vector search — no constraints, just the 3 nearest neighbors.
find = {
    "set": SET_NAME,
    "k_neighbors": 3,
    "distances": True,
    "results": {"all_properties": True},
}
response, _ = client.query([{"FindDescriptor": find}], [query_emb.tobytes()])

print(f"KNN only — top 3 for '{query_text}':")
for hit in response[0]["FindDescriptor"].get("entities", []):
    similarity = 1 - hit["_distance"]
    print(f" {hit['dish_name']:<30} [{hit['cuisine']}] score={similarity:.3f}")
KNN only — top 3 for 'spicy curry with rice':
rajma chawal [Indian] score=0.416
butter chicken with special fried rice and assorted naan breads [Indian] score=0.447
won ton soup, chicken chow mein, katsu chicken [Chinese] score=0.538

Pattern 2: KNN + Equality Filter

Restrict results to a specific cuisine. The filter is applied during the vector search, not after.

# Same query vector, but the server restricts the search to one cuisine
# while it walks the index — not by filtering afterwards.
cuisine_filter = {"cuisine": ["==", "Indian"]}  # only search within Indian dishes

response, _ = client.query([{
    "FindDescriptor": {
        "set": SET_NAME,
        "k_neighbors": 3,
        "constraints": cuisine_filter,
        "distances": True,
        "results": {"all_properties": True},
    }
}], [query_emb.tobytes()])

print(f"KNN + cuisine==Indian — top 3 for '{query_text}':")
for hit in response[0]["FindDescriptor"].get("entities", []):
    score = 1 - hit["_distance"]
    print(f" {hit['dish_name']:<30} [{hit['cuisine']}] score={score:.3f}")
KNN + cuisine==Indian — top 3 for 'spicy curry with rice':
rajma chawal [Indian] score=0.416
butter chicken with special fried rice and assorted naan breads [Indian] score=0.447
paneer bhurji [Indian] score=0.580

Pattern 3: KNN + Range Filter

Combine vector search with a numeric range constraint — for example, dishes under 600 calories.

# Vector search combined with a numeric range constraint (< 600 calories).
response, _ = client.query([{
    "FindDescriptor": {
        "set": SET_NAME,
        "k_neighbors": 5,
        "constraints": {"calories": ["<", 600]},
        "distances": True,
        "results": {"all_properties": True},
    }
}], [query_emb.tobytes()])

print(f"KNN + calories<600 — top 5 for '{query_text}':")
hits = response[0]["FindDescriptor"].get("entities", [])
for hit in hits:
    score = 1 - hit["_distance"]
    print(f" {hit['dish_name']:<30} calories={hit['calories']} score={score:.3f}")
KNN + calories<600 — top 5 for 'spicy curry with rice':
rajma chawal calories=414 score=0.416
won ton soup, chicken chow mein, katsu chicken calories=332 score=0.538
paneer bhurji calories=325 score=0.580
negi miso ramen calories=538 score=0.593
vegetable tian with noodles calories=395 score=0.614

Pattern 4: Sort KNN Results by Property

Find the 5 most similar dishes, then return them sorted by calories (lowest first). sort applies after KNN, so you get the top-k most similar ordered by a metadata property.

# Take the top-5 nearest neighbors, then return them ordered by a metadata
# property instead of by distance.
find_sorted = {
    "set": SET_NAME,
    "k_neighbors": 5,
    "sort": "calories",  # sort top-5 KNN results ascending by calories
    "distances": True,
    "results": {"all_properties": True},
}
response, _ = client.query([{"FindDescriptor": find_sorted}], [query_emb.tobytes()])

print("KNN top-5 sorted by calories:")
for entity in response[0]["FindDescriptor"].get("entities", []):
    sim = 1 - entity["_distance"]
    print(f" {entity['dish_name']:<30} calories={entity['calories']} score={sim:.3f}")
KNN top-5 sorted by calories:
paneer bhurji calories=325 score=0.580
won ton soup, chicken chow mein, katsu chicken calories=332 score=0.538
rajma chawal calories=414 score=0.416
negi miso ramen calories=538 score=0.593
butter chicken with special fried rice and assorted naan breads calories=858 score=0.447

Pattern 5: Count

count: True in results returns only the count — no entities fetched.

# Count all Indian dishes
count_indian = [{
    "FindDescriptor": {
        "set": SET_NAME,
        "constraints": {"cuisine": ["==", "Indian"]},
        "results": {"count": True},
    }
}]
response, _ = client.query(count_indian)
print(f"Indian dishes in set: {response[0]['FindDescriptor']['count']}")

# Count how many of the top-10 similar dishes are under 500 calories
count_light = [{
    "FindDescriptor": {
        "set": SET_NAME,
        "k_neighbors": 10,
        "constraints": {"calories": ["<", 500]},
        "results": {"count": True},
    }
}]
response, _ = client.query(count_light, [query_emb.tobytes()])
print(f"Top-10 similar dishes with <500 cal: {response[0]['FindDescriptor']['count']}")
Indian dishes in set: 5
Top-10 similar dishes with <500 cal: 9

Pattern 6: Average, Min, Max

The results block supports average, min, and max on any numeric property. Note: the corresponding response keys are _avg, _min, and _max.

import numpy as np

# Aggregate stats on all Indian dishes
response, _ = client.query([{
    "FindDescriptor": {
        "set": SET_NAME,
        "constraints": {"cuisine": ["==", "Indian"]},
        "results": {"average": "calories", "min": "calories", "max": "calories"},
    }
}])

# The server returns the aggregates under _avg / _min / _max keys.
fd = response[0]["FindDescriptor"]
avg_cal = fd["_avg"]["calories"]
min_cal = fd["_min"]["calories"]
max_cal = fd["_max"]["calories"]
print(f"Indian dishes — avg cal: {avg_cal:.0f}, "
      f"min: {min_cal}, max: {max_cal}")

# Average calories of the top-5 KNN results
response, _ = client.query([{
    "FindDescriptor": {
        "set": SET_NAME,
        "k_neighbors": 5,
        "results": {"average": "calories"},
    }
}], [query_emb.tobytes()])

top5_avg = response[0]["FindDescriptor"]["_avg"]["calories"]
print(f"Avg calories of top-5 similar dishes: "
      f"{top5_avg:.0f}")
Indian dishes — avg cal: 546, min: 325, max: 858
Avg calories of top-5 similar dishes: 493

Pattern 7: Group By (Facets)

group returns per-value counts — equivalent to faceted search — without returning individual entities. Response comes back in groups, with _group_count, _group_avg, _group_min, etc.

# Count dishes per cuisine across the whole set
response, _ = client.query([{
    "FindDescriptor": {
        "set": SET_NAME,
        "results": {"group": ["cuisine"]},
    }
}])

print("Dishes per cuisine:")
for g in response[0]["FindDescriptor"].get("groups", []):
    print(f" {g['cuisine']:<15} count={g['_group_count']}")

print()

# Average calories per cuisine
response, _ = client.query([{
    "FindDescriptor": {
        "set": SET_NAME,
        "results": {"group": ["cuisine"], "average": "calories"},
    }
}])

print("Average calories per cuisine:")
for g in response[0]["FindDescriptor"].get("groups", []):
    # Format only when a numeric value is present. The previous code put the
    # string "n/a" into the same f-string slot as the number, and "n/a" with a
    # :.0f format spec raises ValueError instead of printing the fallback.
    avg = g.get("_group_avg", {}).get("calories")
    avg_text = f"{avg:.0f}" if avg is not None else "n/a"
    print(f" {g['cuisine']:<15} avg_cal={avg_text}")
Dishes per cuisine:
American count=2
British count=2
Chinese count=2
French count=1
Indian count=5
Italian count=1
Japanese count=2
Scottish count=4

Average calories per cuisine:
American avg_cal=680
British avg_cal=360
Chinese avg_cal=330
French avg_cal=395
Indian avg_cal=546
Italian avg_cal=817
Japanese avg_cal=706
Scottish avg_cal=526

Cleanup

# Drop the demo descriptor set (and its descriptors) so reruns start clean.
cleanup = [{"DeleteDescriptorSet": {"with_name": SET_NAME}}]
client.query(cleanup)
client.print_last_response()
[
{
"DeleteDescriptorSet": {
"count": 1,
"status": 0
}
}
]

What's Next