Skip to content

Batch Evaluation

Examples of evaluating multiple prompts in batch.

Basic Batch Evaluation

from rait_connector import RAITClient

client = RAITClient()

# Fields shared by every prompt in this batch.
common = {
    "model_name": "gpt-4",
    "model_version": "1.0",
    "environment": "production",
    "purpose": "monitoring",
}

prompts = [
    {
        **common,
        "prompt_id": "batch-001",
        "prompt_url": "https://example.com/prompt/001",
        "timestamp": "2025-12-11T10:00:00Z",
        "query": "What is AI?",
        "response": "AI is...",
    },
    {
        **common,
        "prompt_id": "batch-002",
        "prompt_url": "https://example.com/prompt/002",
        "timestamp": "2025-12-11T10:01:00Z",
        "query": "What is ML?",
        "response": "ML is...",
    },
]

summary = client.evaluate_batch(prompts)

print(f"Total: {summary['total']}")
print(f"Successful: {summary['successful']}")
print(f"Failed: {summary['failed']}")

Using EvaluationInput Models

from rait_connector import RAITClient, EvaluationInput

client = RAITClient()

# (prompt_id, prompt_url, timestamp, query, response) per prompt.
_examples = [
    ("batch-003", "https://example.com/prompt/003", "2025-12-11T10:02:00Z",
     "What is deep learning?", "Deep learning is..."),
    ("batch-004", "https://example.com/prompt/004", "2025-12-11T10:03:00Z",
     "What is neural network?", "A neural network is..."),
]

prompts = [
    EvaluationInput(
        prompt_id=pid,
        prompt_url=url,
        timestamp=ts,
        model_name="gpt-4",
        model_version="1.0",
        query=query,
        response=response,
        environment="production",
        purpose="monitoring",
    )
    for pid, url, ts, query, response in _examples
]

summary = client.evaluate_batch(prompts)

With Custom Callback

def on_complete(summary):
    """Print a completion report; list per-prompt errors when present."""
    print("Batch evaluation complete!")
    print(f"Success rate: {summary['successful']}/{summary['total']}")

    errors = summary['errors']
    if errors:
        print("\nErrors:")
        for err in errors:
            print(f"  - {err['prompt_id']}: {err['error']}")

# Register the callback so it fires once the whole batch finishes.
summary = client.evaluate_batch(prompts, on_complete=on_complete)

Processing Results

summary = client.evaluate_batch(prompts)

# Walk every successful result: each carries a list of ethical dimensions,
# each dimension a list of metrics.
for result in summary['results']:
    pid = result['prompt_id']
    print(f"\nProcessing {pid}")

    for dimension in result['ethical_dimensions']:
        print(f"  {dimension['dimension_name']}")
        for metric in dimension['dimension_metrics']:
            print(f"    - {metric['metric_name']}")

# Then report every prompt that failed evaluation.
for failure in summary['errors']:
    print(f"Failed: {failure['prompt_id']} - {failure['error']}")

Fail Fast Mode

from rait_connector.exceptions import EvaluationError

try:
    # fail_fast aborts the whole batch on the first per-prompt error.
    summary = client.evaluate_batch(prompts, fail_fast=True)
except EvaluationError as e:
    print(f"Batch failed: {e}")

Parallel Configuration

# More workers: higher throughput when evaluation is I/O-bound.
summary = client.evaluate_batch(prompts, parallel=True, max_workers=10)

# Sequential mode: evaluate one prompt at a time.
summary = client.evaluate_batch(prompts, parallel=False)

With Custom Fields

Each prompt may carry its own custom_fields dictionary to attach per-prompt metadata (user IDs, session IDs, and so on):

from rait_connector import RAITClient

client = RAITClient()

# Fields common to every prompt; per-prompt entries override/extend these.
_base = {
    "model_name": "gpt-4",
    "model_version": "1.0",
    "environment": "production",
    "purpose": "monitoring",
}

prompts = [
    {
        **_base,
        "prompt_id": "batch-001",
        "prompt_url": "https://example.com/prompt/001",
        "timestamp": "2025-12-11T10:00:00Z",
        "query": "What is AI?",
        "response": "AI is...",
        # Arbitrary per-prompt metadata travels in custom_fields.
        "custom_fields": {
            "user_id": "user_123",
            "session_id": "session_abc",
        },
    },
    {
        **_base,
        "prompt_id": "batch-002",
        "prompt_url": "https://example.com/prompt/002",
        "timestamp": "2025-12-11T10:01:00Z",
        "query": "What is ML?",
        "response": "ML is...",
        "custom_fields": {
            "user_id": "user_456",
            "session_id": "session_def",
        },
    },
]

def on_complete(summary):
    """Print success/failure counts once the batch finishes."""
    print(f"Success: {summary['successful']}")
    print(f"Failed: {summary['failed']}")

summary = client.evaluate_batch(prompts=prompts, on_complete=on_complete)

Loading from CSV

import csv
from datetime import datetime, timezone

prompts = []

# newline='' is required by the csv module so embedded newlines inside
# quoted fields are parsed correctly; encoding is pinned so the read does
# not depend on the platform's locale default.
with open('prompts.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        prompts.append({
            "prompt_id": row['id'],
            "prompt_url": row['url'],
            # Timezone-aware UTC timestamp, consistent with the "Z"-suffixed
            # timestamps used in the other examples; the original naive
            # datetime.now() produced an offset-less local time.
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "model_name": row['model'],
            "model_version": row['version'],
            "query": row['query'],
            "response": row['response'],
            "environment": "production",
            "purpose": "batch_monitoring"
        })

summary = client.evaluate_batch(prompts)

Loading from JSON

import json

# JSON files are conventionally UTF-8; pass the encoding explicitly so the
# read does not silently depend on the platform's locale default.
with open('prompts.json', 'r', encoding='utf-8') as f:
    prompts = json.load(f)

summary = client.evaluate_batch(prompts)

Saving Results

import json

summary = client.evaluate_batch(prompts)

# Save the batch summary. encoding='utf-8' keeps the output portable
# instead of falling back to the platform's locale default.
with open('batch_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

# Save one detailed result file per prompt.
for result in summary['results']:
    filename = f"result_{result['prompt_id']}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

Progress Tracking

def track_progress(summary):
    """Print a completion report for a finished batch.

    Expects the summary dict produced by ``evaluate_batch``, with integer
    ``total``, ``successful`` and ``failed`` counts.
    """
    total = summary['total']
    success = summary['successful']
    failed = summary['failed']

    print(f"\nBatch Complete: {success + failed}/{total}")
    print(f"  Success: {success}")
    print(f"  Failed: {failed}")
    # Guard the empty batch: total == 0 would raise ZeroDivisionError.
    rate = (success / total * 100) if total else 0.0
    print(f"  Success Rate: {rate:.1f}%")

# Run the batch with the progress callback attached.
summary = client.evaluate_batch(prompts, on_complete=track_progress)

Batch with Retry Logic

from rait_connector.exceptions import EvaluationError

max_retries = 3

for attempt in range(max_retries):
    try:
        # Keep the try body minimal: only the call that can raise.
        summary = client.evaluate_batch(prompts)
    except EvaluationError as e:
        print(f"Attempt {attempt + 1} failed: {e}")
        if attempt == max_retries - 1:
            raise
        continue

    if summary['failed'] == 0:
        print("All evaluations successful!")
        break

    # Narrow the next attempt down to only the prompts that failed.
    failed_ids = {err['prompt_id'] for err in summary['errors']}
    prompts = [p for p in prompts if p['prompt_id'] in failed_ids]

    print(f"Retrying {len(prompts)} failed prompts (attempt {attempt + 1})")
else:
    # Fix: the original fell through silently when all retries were used up
    # while some prompts still failed.
    print(f"Giving up after {max_retries} attempts; {len(prompts)} prompts still failing")