Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
145 commits
Select commit Hold shift + click to select a range
7a31dda
Update meta_prompt_optimizer.py
vincentkoc Sep 25, 2025
1b7704f
Update mcp_workflow.py
vincentkoc Sep 25, 2025
aba2272
Update gepa_optimizer.py
vincentkoc Sep 25, 2025
aeaf00e
Update few_shot_bayesian_optimizer.py
vincentkoc Sep 25, 2025
fe22b36
Update evolutionary_optimizer.py
vincentkoc Sep 25, 2025
5ff3bf2
Update base_optimizer.py
vincentkoc Sep 25, 2025
7bde2f2
test: added tests for spec
vincentkoc Sep 25, 2025
8f9d65c
test: tests
vincentkoc Sep 25, 2025
68ca2ef
Update mipro_optimizer.py
vincentkoc Sep 25, 2025
b76c495
Update gepa_optimizer.py
vincentkoc Sep 25, 2025
5b79c09
patched examples
vincentkoc Sep 25, 2025
c5586d5
refactor: signature validators
vincentkoc Sep 25, 2025
9802a86
refactor: call counters
vincentkoc Sep 25, 2025
5b4684a
test: counters
vincentkoc Sep 25, 2025
2463f7f
Update test_gepa_adapter.py
vincentkoc Sep 25, 2025
f5c9673
chore: lint
vincentkoc Sep 25, 2025
d90d8b5
test: patch
vincentkoc Sep 25, 2025
dfefda8
Merge branch 'main' into fix/optimizer-signatures
vincentkoc Sep 25, 2025
a00dd5d
Update test_mipro.py
vincentkoc Sep 25, 2025
bab1a38
Merge branch 'main' into fix/optimizer-signatures
vincentkoc Sep 25, 2025
f3abcd8
Update test_mipro.py
vincentkoc Sep 25, 2025
0480112
Merge branch 'fix/optimizer-signatures' of https://github.com/comet-m…
vincentkoc Sep 25, 2025
27b10ef
fix: cache, gc and seed values
vincentkoc Sep 25, 2025
e971525
chore: signature fixes
vincentkoc Sep 25, 2025
78121a1
chore: type fix
vincentkoc Sep 25, 2025
d4035e0
chore: types
vincentkoc Sep 25, 2025
ead12a0
Update _lm.py
vincentkoc Sep 25, 2025
9dffbc7
fix: copilot reccos
vincentkoc Sep 25, 2025
b4f8386
Merge branch 'main' into fix/optimizer-signatures
vincentkoc Sep 25, 2025
6040636
Update adapter.py
vincentkoc Sep 25, 2025
e2541a0
chore: docs for optimizeer
vincentkoc Sep 25, 2025
db57399
Merge branch 'fix/optimizer-signatures' of https://github.com/comet-m…
vincentkoc Sep 25, 2025
8bcc989
fix: actual_llm_calls for MIPRO
vincentkoc Sep 25, 2025
3baed74
chore: lint
vincentkoc Sep 25, 2025
7cb55d0
Update README.md
vincentkoc Sep 25, 2025
9a4b9aa
Update reference.mdx
vincentkoc Sep 25, 2025
b020c5a
Update generate_fern_docs.py
vincentkoc Sep 25, 2025
ef9c5c2
docs: optimizer signatures
vincentkoc Sep 25, 2025
ba4df42
docs: update docs
vincentkoc Sep 25, 2025
08c5f6c
chore: deprecation warnings
vincentkoc Sep 25, 2025
a9cb4a6
chore: align signatures
vincentkoc Sep 26, 2025
47cf1cb
fix: mipro test
vincentkoc Sep 26, 2025
d93406b
Update test_evolutionary.py
vincentkoc Sep 26, 2025
2194ec4
Update test_meta_prompt.py
vincentkoc Sep 26, 2025
f3f5a81
Update meta_prompt_optimizer.py
vincentkoc Sep 25, 2025
806c30d
Update mcp_workflow.py
vincentkoc Sep 25, 2025
6ad0f22
Update gepa_optimizer.py
vincentkoc Sep 25, 2025
400c4b1
Update few_shot_bayesian_optimizer.py
vincentkoc Sep 25, 2025
e253e7f
Update evolutionary_optimizer.py
vincentkoc Sep 25, 2025
7c5a73e
Update base_optimizer.py
vincentkoc Sep 25, 2025
5e2e08a
test: added tests for spec
vincentkoc Sep 25, 2025
e0a70a4
test: tests
vincentkoc Sep 25, 2025
2272129
Update mipro_optimizer.py
vincentkoc Sep 25, 2025
1b96eae
Update gepa_optimizer.py
vincentkoc Sep 25, 2025
97b793d
patched examples
vincentkoc Sep 25, 2025
2d63c83
refactor: signature validators
vincentkoc Sep 25, 2025
c06e19b
refactor: call counters
vincentkoc Sep 25, 2025
926d058
test: counters
vincentkoc Sep 25, 2025
0e4ba13
Update test_gepa_adapter.py
vincentkoc Sep 25, 2025
3548831
chore: lint
vincentkoc Sep 25, 2025
337c104
test: patch
vincentkoc Sep 25, 2025
deba49b
Update test_mipro.py
vincentkoc Sep 25, 2025
228a07f
Update test_mipro.py
vincentkoc Sep 25, 2025
63c93d4
fix: cache, gc and seed values
vincentkoc Sep 25, 2025
50286f4
chore: signature fixes
vincentkoc Sep 25, 2025
954082e
chore: type fix
vincentkoc Sep 25, 2025
3092a2f
chore: types
vincentkoc Sep 25, 2025
7b3af63
Update _lm.py
vincentkoc Sep 25, 2025
79308c6
fix: copilot reccos
vincentkoc Sep 25, 2025
26c28b0
Update adapter.py
vincentkoc Sep 25, 2025
0b4675e
chore: docs for optimizeer
vincentkoc Sep 25, 2025
8659305
fix: actual_llm_calls for MIPRO
vincentkoc Sep 25, 2025
7954f2d
chore: lint
vincentkoc Sep 25, 2025
2eb3f14
Update README.md
vincentkoc Sep 25, 2025
80f78d0
Update reference.mdx
vincentkoc Sep 25, 2025
08abd2b
Update generate_fern_docs.py
vincentkoc Sep 25, 2025
29feffc
docs: optimizer signatures
vincentkoc Sep 25, 2025
d1d1cfe
docs: update docs
vincentkoc Sep 25, 2025
492a8e6
chore: deprecation warnings
vincentkoc Sep 25, 2025
b7677b5
chore: align signatures
vincentkoc Sep 26, 2025
8b2819a
fix: mipro test
vincentkoc Sep 26, 2025
a2a000b
Update test_evolutionary.py
vincentkoc Sep 26, 2025
2f9448f
Update test_meta_prompt.py
vincentkoc Sep 26, 2025
eb7df19
Merge branch 'fix/optimizer-signatures' of https://github.com/comet-m…
vincentkoc Sep 26, 2025
f29b0ac
Update core.py
vincentkoc Sep 26, 2025
6570015
test: codecoverage
vincentkoc Sep 26, 2025
8b107f5
refactor: experiment_config
vincentkoc Sep 26, 2025
0623cbc
refactor: experiment config
vincentkoc Sep 26, 2025
e665835
refactor: experiment_config
vincentkoc Sep 26, 2025
c4fb33c
Update AddOptimizationDialog.tsx
vincentkoc Sep 26, 2025
b205a91
Update evaluation_ops.py
vincentkoc Sep 26, 2025
8f1f2cd
Update AddOptimizationDialog.tsx
vincentkoc Sep 26, 2025
4af50e6
Update AddOptimizationDialog.tsx
vincentkoc Sep 26, 2025
5c2fc2e
Update test_mcp_utils.py
vincentkoc Sep 26, 2025
26b76b2
Update mcp_workflow.py
vincentkoc Sep 26, 2025
98be00f
Update reporting.py
vincentkoc Sep 26, 2025
5ab5eae
Update prompts.py
vincentkoc Sep 26, 2025
dac626a
Update population_ops.py
vincentkoc Sep 26, 2025
1b6e75e
feat: mcp on evolutionary
vincentkoc Sep 26, 2025
d74b548
Create litellm_evolutionary_context7_mcp_example.py
vincentkoc Sep 26, 2025
579aee5
Merge branch 'main' into feat/mcp-upflift
dsblank Sep 29, 2025
5f1b8da
Apply suggestion from @Copilot
dsblank Sep 29, 2025
504bae3
Remove duplicated code
dsblank Sep 29, 2025
c470270
This uses the MIPROv2 args
dsblank Sep 29, 2025
0d06d36
Remove change
dsblank Sep 29, 2025
6a34fe8
Fix formatting issues
dsblank Sep 29, 2025
203f4df
Cleanup again of items fixed in the first PR
dsblank Sep 29, 2025
a333377
mypy and linting fixes
dsblank Sep 29, 2025
de38f6c
Fix mypy issue
dsblank Sep 29, 2025
a4cbb3c
Added missing fallback
dsblank Sep 29, 2025
23ee89e
Removed unnecessary print statement
dsblank Sep 29, 2025
bc5a320
Fix unit test, added optimizer_ref
dsblank Sep 29, 2025
95443db
Merge branch 'main' into feat/mcp-upflift
dsblank Sep 30, 2025
e53841c
Update pyproject.toml
vincentkoc Sep 30, 2025
b08167a
Update Makefile
vincentkoc Sep 30, 2025
78b8ca6
chore: python version
vincentkoc Sep 30, 2025
08496f3
fix: bug default invoker
vincentkoc Sep 30, 2025
3d234cf
fix: bug cache normalization of type
vincentkoc Sep 30, 2025
621ea01
fix: evolutionary json tuple
vincentkoc Sep 30, 2025
c287018
chore: lint
vincentkoc Sep 30, 2025
1ac35b1
chore: mypy
vincentkoc Sep 30, 2025
3278cc6
Merge branch 'main' into feat/mcp-upflift
vincentkoc Sep 30, 2025
d08d2de
feat: sdk supports multi-modal input
vincentkoc Sep 30, 2025
c33e48b
Create test_message_renderer.py
vincentkoc Sep 30, 2025
10a120f
feat: helper for image datasets
vincentkoc Sep 30, 2025
1815201
feat: EO supporting images
vincentkoc Sep 30, 2025
1e8357e
Create litellm_evolutionary_hazard_detection_example.py
vincentkoc Sep 30, 2025
db3f76b
chore: dataset for driving hazards
vincentkoc Sep 30, 2025
359ba4b
chore: judge
vincentkoc Sep 30, 2025
c2be104
Update chat_prompt.py
vincentkoc Sep 30, 2025
0fe8ab6
chore: tests
vincentkoc Sep 30, 2025
b4376f6
chore: improvements
vincentkoc Sep 30, 2025
63b75dc
chore: EO (is_multimodal)
vincentkoc Oct 1, 2025
ffc7442
fix: stricter JSON output on EO (bug)
vincentkoc Oct 1, 2025
6bf3e59
fix: max_token on EO
vincentkoc Oct 1, 2025
6e0869f
feat: image size/quality flags
vincentkoc Oct 1, 2025
a6479a1
Update litellm_evolutionary_hazard_detection_example.py
vincentkoc Oct 1, 2025
ad5ec24
Update litellm_evolutionary_hazard_detection_example.py
vincentkoc Oct 1, 2025
ff0f406
Update litellm_evolutionary_hazard_detection_example.py
vincentkoc Oct 1, 2025
075da21
Update litellm_evolutionary_hazard_detection_example.py
vincentkoc Oct 1, 2025
b848387
Update litellm_evolutionary_hazard_detection_example.py
vincentkoc Oct 1, 2025
cd7a0f5
Merge branch 'main' into feat/imagebased-optimizer
vincentkoc Oct 2, 2025
c441dfd
Update crossover_ops.py
vincentkoc Oct 4, 2025
31b8fa0
Update llm_support.py
vincentkoc Oct 4, 2025
a0a7a5b
Update mutation_ops.py
vincentkoc Oct 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
"""
Multimodal Driving Hazard Detection Example using Evolutionary Optimizer.

This example demonstrates:
1. Loading the DHPR dataset with images encoded as base64
2. Creating a multimodal prompt with structured content (text + images)
3. Using a vision-capable model (set via VISION_MODEL, gpt-5-nano by default) for hazard detection
4. Evaluating with a custom LLM-as-a-Judge metric
5. Optimizing the prompt with the Evolutionary Optimizer

The optimizer will evolve prompts to improve hazard detection accuracy
while preserving the image inputs throughout the evolutionary process.
"""

from typing import Any
import os
import sys
import logging

from opik_optimizer import EvolutionaryOptimizer, ChatPrompt
from opik_optimizer.datasets import driving_hazard_50
from opik_optimizer.metrics import MultimodalLLMJudge

from opik.evaluation.metrics.score_result import ScoreResult


# ============================================================================
# CONFIGURATION - Adjust these settings for your needs
# ============================================================================

# Optimization settings (smaller values = faster, fewer API calls, less context usage)
POPULATION_SIZE = 10  # Number of prompt variations per generation; passed to EvolutionaryOptimizer(population_size=...)
NUM_GENERATIONS = 5  # Number of evolutionary iterations; passed to EvolutionaryOptimizer(num_generations=...)
N_SAMPLES = 5  # Number of dataset samples to use for optimization; passed to optimize_prompt(n_samples=...)

# Image settings - Optimized for GPT-5's 400k context
# Both values are forwarded to driving_hazard_50() where the dataset is loaded below.
MAX_IMAGE_SIZE = (640, 480)  # Width, height in pixels. Options: (400,300) small, (512,384) medium, (640,480) large
IMAGE_QUALITY = 75  # JPEG quality 1-100. Options: 40-50 (small), 60-70 (balanced), 85+ (high)
# Note: (640x480, quality=75) gives ~25-35k tokens per image, fits well in 400k context
# (author's estimate - TODO confirm for the tokenizer/provider you actually use)
# If using GPT-4o (128k context), reduce to: MAX_IMAGE_SIZE = (512, 384), IMAGE_QUALITY = 60

# Model settings - Training cheaper model with better judge
# Using GPT-5 models (400k context)

# PRIMARY: GPT-5 models (400k context)
# VISION_MODEL is the model whose prompt is optimized; JUDGE_MODEL scores its answers
# inside multimodal_hazard_judge().
VISION_MODEL = "gpt-5-nano"  # Vision model being optimized (fast, cheap, 400k context)
JUDGE_MODEL = "gpt-5"  # Evaluation model (powerful judge with reasoning, 400k context)

# FALLBACK: If GPT-5 is not available via your API provider, use GPT-4o:
# VISION_MODEL = "gpt-4o-mini" # 128k context
# JUDGE_MODEL = "gpt-4o" # 128k context
# Then reduce image quality: MAX_IMAGE_SIZE = (512, 384), IMAGE_QUALITY = 60
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Incorrect Model Names in Example Script

The example script litellm_evolutionary_hazard_detection_example.py is configured with non-existent "GPT-5" model names (gpt-5-nano, gpt-5). This causes runtime errors, making the example unusable by default, even though a GPT-4o fallback is mentioned in comments.

Fix in Cursor Fix in Web


# ALTERNATIVE: Via OpenRouter (if you have GPT-5 access there)
# VISION_MODEL = "openrouter/openai/gpt-5-nano"
# JUDGE_MODEL = "openrouter/openai/gpt-5"
# export OPENROUTER_API_KEY="your-key"

# ============================================================================


# Load the driving hazard dataset with images.
# According to the example's author, each item contains:
# - question: Text question about the image
# - image_content: Structured content with text and base64-encoded image
# - hazard: Expected hazard description (ground truth)
# - question_id: Unique identifier
# NOTE(review): this script itself only reads "image_content", "question" and
# "hazard" (see multimodal_hazard_judge); confirm the full schema against
# opik_optimizer.datasets.driving_hazard_50.
dataset = driving_hazard_50(
    test_mode=True,  # Use test_mode=True for quick testing (5 samples) - sample count per author, TODO confirm
    max_image_size=MAX_IMAGE_SIZE,  # Resize images to reduce token usage
    image_quality=IMAGE_QUALITY,  # JPEG compression quality
)


def multimodal_hazard_judge(dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
    """
    Score one hazard-detection answer with an LLM judge.

    A ``MultimodalLLMJudge`` is built on every call using ``JUDGE_MODEL`` and
    the weighted rubric below, then asked to score ``llm_output`` against the
    item's reference hazard description.

    Args:
        dataset_item: Dataset row. ``"image_content"`` is used as the judge
            input, falling back to ``"question"`` and finally to an empty
            string; ``"hazard"`` (default ``""``) is the expected output.
        llm_output: The answer produced by the model under optimization.

    Returns:
        Whatever ``MultimodalLLMJudge.score`` returns (annotated as
        ``ScoreResult``).
    """
    # Rubric text is sent to the judge verbatim; keep it flush-left so no
    # indentation leaks into the prompt.
    rubric = """
Evaluate hazard detection with strict requirements:

1. **Accuracy (40%)**: Did it identify the EXACT hazard type?
- Pedestrian vs vehicle vs obstacle specificity
- Correct location (left/right/center/ahead)
- Distance estimation if applicable

2. **Completeness (30%)**: All hazards mentioned?
- Primary hazard (MUST catch this)
- Secondary hazards (bonus points)
- Environmental factors (weather, lighting, road conditions)

3. **Actionability (20%)**: Clear driver guidance?
- Urgency level (immediate/moderate/low)
- Recommended action (brake/slow/monitor/stop)
- Timing (now vs approaching)

4. **Visual Understanding (10%)**: Correct image interpretation?
- Scene context (highway, city, parking, intersection)
- Traffic flow understanding
- Spatial relationships between objects

**Scoring Guidelines:**
- Score 1.0: ALL critical hazards identified with precise locations and actionable details
- Score 0.7-0.9: Primary hazard caught with good details, minor omissions acceptable
- Score 0.5-0.6: Partial detection but missing key details or secondary hazards
- Score below 0.5: Missed critical hazards or major inaccuracies

Consider semantically equivalent descriptions as correct matches, but reward specificity.
"""
    judge = MultimodalLLMJudge(model=JUDGE_MODEL, evaluation_criteria=rubric)

    # Prefer the structured (text + image) payload; otherwise use the plain question.
    text_only_fallback = dataset_item.get("question", "")
    judge_input = dataset_item.get("image_content", text_only_fallback)

    # Ground-truth hazard description for this item.
    reference_hazard = dataset_item.get("hazard", "")

    return judge.score(
        input=judge_input,
        output=llm_output,
        expected_output=reference_hazard,
    )


# Create multimodal prompt for hazard detection.
# The {image_content} placeholder in the user message is expected to be replaced
# with structured content that includes both the question text and the
# base64-encoded dashcam image (substitution happens inside opik_optimizer,
# not in this script - TODO confirm against ChatPrompt).
system_prompt = """You are an expert driving safety assistant specialized in hazard detection.

Your task is to analyze dashcam images and identify potential hazards that a driver should be aware of.

For each image:
1. Carefully examine the visual scene
2. Identify any potential hazards (pedestrians, vehicles, road conditions, obstacles, etc.)
3. Assess the urgency and severity of each hazard
4. Provide a clear, specific description of the hazard

Be precise and actionable in your hazard descriptions. Focus on safety-critical information."""

# Using messages format to support structured content with images.
# Per the example's author, dataset items carry 'image_content' as structured
# content in OpenAI format: [{"type": "text", "text": "..."}, {"type": "image_url", ...}]
prompt = ChatPrompt(
    messages=[
        # System message: the instructions shown to the vision model.
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": "{image_content}",  # Will be replaced with structured content + image
        },
    ],
)

# Initialize the Evolutionary Optimizer with multimodal support.
# Intended behaviour per the example's author (implemented inside
# opik_optimizer, not verifiable from this script):
# - Mutate the text prompts while preserving images
# - Use crossover to combine effective prompt strategies
# - Evaluate using the vision-capable LLM judge
# - Evolve towards better hazard detection prompts
optimizer = EvolutionaryOptimizer(
    model=VISION_MODEL,  # Model whose prompt is being optimized (must accept image input)
    population_size=POPULATION_SIZE,  # Prompt variations per generation (set in CONFIGURATION)
    num_generations=NUM_GENERATIONS,  # Evolutionary iterations (set in CONFIGURATION)
    enable_moo=False,  # Single objective optimization
    enable_llm_crossover=True,  # Enable intelligent LLM-based crossover (multimodal-aware)
    infer_output_style=False,  # IMPORTANT: Disable for multimodal to avoid context overflow
    verbose=1,  # Show progress
)

# Print a banner summarising the run configuration before optimization starts.
# Every value is read from the live objects/constants so the banner cannot
# drift from what is actually used.
print("=" * 80)
print("MULTIMODAL EVOLUTIONARY OPTIMIZER - DRIVING HAZARD DETECTION")
print("=" * 80)
print(f"\nDataset: {len(dataset.get_items())} driving scenarios with images")
print(f"Model: {optimizer.model} (vision-capable)")
print(f"Population size: {optimizer.population_size}")
print(f"Generations: {optimizer.num_generations}")
print(f"Evaluation: Multimodal LLM-as-a-Judge ({JUDGE_MODEL})")
# Report the configured image size instead of a hard-coded "512x384", which
# contradicted the default MAX_IMAGE_SIZE of (640, 480).
print(
    f"\nNOTE: Images are resized to {MAX_IMAGE_SIZE[0]}x{MAX_IMAGE_SIZE[1]} "
    "and compressed to reduce token usage"
)
print(f"NOTE: Using {N_SAMPLES} samples to avoid context window limits with base64 images")
print("TIP: Adjust POPULATION_SIZE, NUM_GENERATIONS, N_SAMPLES at the top of this script")
print("\n" + "=" * 80 + "\n")

# Optimize the prompt.
# The optimizer evolves prompts to maximize the score returned by
# multimodal_hazard_judge for the model's hazard descriptions.
#
# Note: the example's author reports that transient network errors are retried
# by the library; if the run still fails, the handler below prints
# troubleshooting hints and re-raises so the original traceback is preserved.
try:
    optimization_result = optimizer.optimize_prompt(
        prompt=prompt,
        dataset=dataset,
        metric=multimodal_hazard_judge,
        n_samples=N_SAMPLES,  # Use fewer samples to avoid context overflow with images
    )
except Exception as e:
    # Top-level script boundary: report, give hints, then re-raise.
    print(f"\n❌ Optimization failed with error: {e}")
    print("\n💡 This is often a transient network error. Try:")
    print(" 1. Re-run the script (usually works)")
    print(" 2. Check your internet connection")
    print(" 3. Check Opik API status")
    # A failure can also occur before any candidate is evaluated (for example
    # when the configured models are not available to the API key), so point
    # at the model settings and do not claim progress unconditionally.
    print(f" 4. Check that your API key can access '{VISION_MODEL}' and '{JUDGE_MODEL}'")
    print("\n If the optimization was making progress, your last best score was shown above!")
    raise

# Report the outcome of the optimization run.
print("\n" + "=" * 80)
print("OPTIMIZATION COMPLETE")
print("=" * 80 + "\n")

# Display the optimization results
optimization_result.display()

print("\n" + "=" * 80)
print("BEST PROMPT")
print("=" * 80)
print(optimization_result.prompt)
print("\n" + "=" * 80)

# Example of how to use the optimized prompt with new images.
# The snippet is an f-string so that max_size reflects the MAX_IMAGE_SIZE this
# run actually trained with (the previous hard-coded (512, 384) did not match
# the default configuration). Literal braces in the snippet are doubled.
print("\nTo use the optimized prompt with new driving images:")
print("1. Load a new dashcam image")
print("2. Encode it to base64 using image_helpers.encode_file_to_base64_uri()")
print("3. Create structured content with convert_to_structured_content()")
print("4. Pass it to the optimized prompt")
print("\nExample:")
print(f"""
from opik_optimizer.utils.image_helpers import (
encode_file_to_base64_uri,
convert_to_structured_content,
)

# Load and encode image with YOUR preferred settings
image_uri = encode_file_to_base64_uri(
"dashcam.jpg",
max_size={MAX_IMAGE_SIZE}, # Match training size
)

# Create structured content
image_content = convert_to_structured_content(
text="Identify any driving hazards in this image.",
image_uri=image_uri,
)

# Create ChatPrompt from optimized messages and substitute placeholders
optimized_prompt = ChatPrompt(messages=optimization_result.prompt)
messages = optimized_prompt.get_messages(
dataset_item={{"image_content": image_content}}
)
""")
4 changes: 4 additions & 0 deletions sdks/opik_optimizer/src/opik_optimizer/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .ai2_arc import ai2_arc
from .cnn_dailymail import cnn_dailymail
from .driving_hazard import driving_hazard_50, driving_hazard_100, driving_hazard_test
from .election_questions import election_questions
from .gsm8k import gsm8k
from .halu_eval import halu_eval_300
Expand All @@ -23,4 +24,7 @@
"election_questions",
"medhallu",
"rag_hallucinations",
"driving_hazard_50",
"driving_hazard_100",
"driving_hazard_test",
]
Loading
Loading