Unifies the verifier docs into one page and switches to a GPT‑Vision‑mini–based approach:
Samples frames from the video
Sends a small set of frames to a vision‑capable LLM (e.g., gpt-4o-mini)
Asks the model to produce a strict JSON with the required metrics in percent and a final verdict
This keeps the interface stable while the intelligence lives in the prompt and the model.
Metrics schema
Required JSON keys (all integers in 0..100, final_pct also 0..100, verdict string):
{"accuracy":0,"speed":0,"safety":0,"optimal_track":0,"energy_efficiency":0,"trajectory_stability":0,"final_pct":0,"verdict":"success | failure | inconclusive","reasoning":"one‑paragraph short explanation"}
Reference implementation (Python)
Requirements (install the packages and set your API key as shown in the pip install / export lines below):
Notes:
We sample ~6 frames to keep token cost under control. Tune as needed.
Use JPEG at ~85% quality and width ≤ 640 px for a good cost/quality trade‑off.
If your provider requires a different schema for images (e.g., image_url), adapt the images payload.
Aggregation across verifiers
To emulate multiple verifiers (Alpha/Beta/Gamma), call verify_with_gpt_vision 3 times with different random frame subsets or slightly perturbed prompts (“Verifier Alpha perspective” etc.) and assemble a 2D metrics table like in the demo dashboard. The FINAL row can be the average per‑column or an LLM‑based consensus.
pip install opencv-python numpy openai
export OPENAI_API_KEY=... # set your key
import base64
import os
import cv2
import numpy as np
from typing import List, Dict, Any
from openai import OpenAI
# Vision-capable model used for frame scoring; any GPT Vision "mini" variant works.
MODEL = "gpt-4o-mini"
def sample_frames(path: str, num: int = 6) -> List[np.ndarray]:
    """Sample up to *num* frames, evenly spaced across the video at *path*.

    Args:
        path: Path to a video file readable by OpenCV.
        num: Desired number of frames. Duplicates are dropped, so fewer
            frames may be returned for very short videos (or when the
            container does not report a frame count).

    Returns:
        List of BGR frames (numpy arrays) in chronological order.

    Raises:
        FileNotFoundError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise FileNotFoundError(path)
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        # Evenly spaced indices; np.unique avoids re-reading the same frame
        # when total <= num (linspace would otherwise emit duplicates).
        idxs = np.unique(np.linspace(0, max(0, total - 1), num=num, dtype=int))
        frames = []
        for i in idxs:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
            ok, frame = cap.read()
            if ok:
                frames.append(frame)
        return frames
    finally:
        # Release the capture handle even if a seek/read raises.
        cap.release()
def to_b64_jpeg(img: np.ndarray) -> str:
    """Encode a BGR frame as a base64 JPEG string (quality 85, width <= 640 px).

    Args:
        img: BGR image as a numpy array (H x W x C).

    Returns:
        Base64-encoded JPEG bytes as an ASCII string.

    Raises:
        ValueError: If OpenCV fails to encode the image.
    """
    # Moderate resize to reduce token cost on the vision model.
    max_w = 640
    if img.shape[1] > max_w:
        scale = max_w / img.shape[1]
        img = cv2.resize(img, (max_w, int(img.shape[0] * scale)))
    ok, buf = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 85])
    if not ok:
        # Original code ignored the failure flag; fail loudly instead of
        # sending garbage bytes to the API.
        raise ValueError("JPEG encoding failed")
    return base64.b64encode(buf.tobytes()).decode("utf-8")
def build_prompt(task_prompt: str) -> str:
    """Build the strict-JSON verifier prompt for the vision model.

    The returned text instructs the model to score the execution shown in
    the frames and reply with exactly the fixed metric keys as JSON.

    Args:
        task_prompt: The natural-language instruction being verified.

    Returns:
        The full prompt text with the instruction interpolated.
    """
    verifier_prompt = f"""
You are a strict execution verifier. Inspect the provided frames (chronological) and score the REAL‑WORLD execution
of the instruction strictly in PERCENT (0..100). Output STRICT JSON with keys:
accuracy, speed, safety, optimal_track, energy_efficiency, trajectory_stability, final_pct, verdict, reasoning.
Rules:
- accuracy: how exactly the instruction appears accomplished
- speed: higher if efficient, without long idle
- safety: no collisions/spills/unsafe motions
- optimal_track: path quality and economy of motion
- energy_efficiency: minimal redundant moves
- trajectory_stability: smoothness, low jitter
- final_pct: overall score (not average; your holistic judgment)
- verdict: one of "success", "failure", "inconclusive"
Instruction: {task_prompt}
Return ONLY JSON.
"""
    return verifier_prompt
def verify_with_gpt_vision(video_path: str, instruction: str) -> Dict[str, Any]:
    """Score a video against an instruction with a vision-capable LLM.

    Samples ~6 evenly spaced frames, sends them with a strict-JSON prompt
    to the model via the OpenAI Responses API, and parses the returned JSON.

    Args:
        video_path: Path to the video to verify.
        instruction: Natural-language task the video is expected to show.

    Returns:
        The parsed metrics dict, or ``{"error": "no-json", "raw": <text>}``
        when the model response contains no JSON object.
    """
    import json
    import re

    frames = sample_frames(video_path, num=6)
    # Responses API image parts take a data URL under "image_url"; the
    # previous `"image_data": {"b64": ...}` shape is not a valid payload
    # and is rejected by the API.
    images = [{
        "type": "input_image",
        "image_url": f"data:image/jpeg;base64,{to_b64_jpeg(f)}",
    } for f in frames]
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    prompt = build_prompt(instruction)
    # OpenAI "Responses" API (unified) — if you prefer Chat Completions, adapt accordingly
    resp = client.responses.create(
        model=MODEL,
        input=[
            {"role": "system", "content": "You are a precise verification assistant."},
            # Text parts in a multimodal user turn use "input_text" (not "text").
            {"role": "user", "content": [{"type": "input_text", "text": prompt}, *images]},
        ],
    )
    text = resp.output_text
    # Grab the outermost {...} span: the model may wrap the JSON in prose
    # or code fences despite the "ONLY JSON" instruction.
    m = re.search(r"\{[\s\S]*\}", text)
    return json.loads(m.group(0)) if m else {"error": "no-json", "raw": text}
if __name__ == "__main__":
    # Demo run against the bundled sample video.
    verdict = verify_with_gpt_vision(
        video_path="docs/assets/videos/realistic.mp4",
        instruction="turn on the stove and put the moka pot on it",
    )
    print(verdict)