The Expect API is your Swiss Army knife for MCP testing. Use it to assert everything from simple content checks to complex path efficiency and LLM quality judgments.
Quick reference
The Expect namespace provides intuitive access to all evaluator factories:
from mcp_eval.catalog import Expect
# Content assertions
Expect.content.contains("success")
Expect.content.regex(r"\d+ results?")
# Tool assertions
Expect.tools.was_called("fetch")
Expect.tools.success_rate(min_rate=0.95)
Expect.tools.sequence(["validate", "process", "format"])
# Performance assertions
Expect.performance.max_iterations(3)
Expect.performance.response_time_under(5000)
# LLM judge assertions
Expect.judge.llm("Must be professional and accurate")
Expect.judge.multi_criteria([criterion1, criterion2])
# Path efficiency assertions
Expect.path.efficiency(
    expected_tool_sequence=["fetch", "parse"],
    allow_extra_steps=1
)
Content namespace
Validate the actual text content returned by your agent:
Basic content checks
# Check if content contains a substring
await session.assert_that(
    Expect.content.contains("Example Domain"),
    response=response,
    name="has_expected_text"
)
# Exact match (use regex or precise substring)
await session.assert_that(
    Expect.content.regex(r"^The answer is 42$"),
    response=response
)
# Regular expression matching
await session.assert_that(
    Expect.content.regex(r"Temperature: \d+°[CF]"),
    response=response,
    name="has_temperature"
)
Advanced content patterns
# Case-insensitive matching
await session.assert_that(
    Expect.content.contains("SUCCESS", case_sensitive=False),
    response=response,
    name="case_insensitive_success"
)
# Multiple conditions (all must pass)
for expected in ["result", "complete", "successful"]:
    await session.assert_that(
        Expect.content.contains(expected),
        response=response
    )
Tools namespace
Verify tool usage patterns and success rates:
# Verify a specific tool was called
await session.assert_that(
    Expect.tools.was_called("calculator"),
    name="used_calculator"
)
# Check tool wasn't called (negative assertion)
await session.assert_that(
    Expect.tools.count("dangerous_tool", 0),
    name="safety_check"
)
# Verify exact sequence of tools
await session.assert_that(
    Expect.tools.sequence(["auth", "fetch", "parse"]),
    name="correct_order"
)
# Check tool was called with specific arguments
await session.assert_that(
    Expect.tools.called_with(
        tool_name="fetch",
        expected_args={"url": "https://api.example.com"}
    )
)
# Verify success rate across all tool calls
await session.assert_that(
    Expect.tools.success_rate(min_rate=0.95),
    name="high_reliability"
)
# Check specific tool's output
await session.assert_that(
    Expect.tools.output_matches(
        tool_name="weather_api",
        expected_output="sunny",
        match_type="contains"  # or "exact", "regex", "partial"
    )
)
Performance namespace
Ensure your agent meets performance requirements:
Response time and efficiency
# Maximum response time in milliseconds
await session.assert_that(
    Expect.performance.response_time_under(5000),
    name="fast_response"
)
# Limit conversation iterations
await session.assert_that(
    Expect.performance.max_iterations(3),
    name="efficient_solution"
)
Resource usage
# Inspect tokens/cost programmatically via session.get_metrics()
metrics = session.get_metrics()
# LLM token usage (if available)
if metrics.llm_metrics:
    print(f"Input tokens: {metrics.llm_metrics.input_tokens}")
    print(f"Output tokens: {metrics.llm_metrics.output_tokens}")
    print(f"Total tokens: {metrics.llm_metrics.total_tokens}")
# Cost estimate (if computed)
if metrics.cost_estimate is not None:
    print(f"Estimated cost: ${metrics.cost_estimate:.4f}")
Judge namespace
Use LLM judges for nuanced quality evaluation:
Simple rubric evaluation
# Basic quality check
await session.assert_that(
    Expect.judge.llm(
        rubric="""
        The response should:
        - Be professional and courteous
        - Provide accurate information
        - Be concise (under 100 words)
        """,
        min_score=0.8,
        include_input=True  # Give the judge full context
    ),
    response=response,
    name="quality_check"
)
Multi-criteria evaluation
from mcp_eval.evaluators import EvaluationCriterion
# Define weighted criteria
criteria = [
    EvaluationCriterion(
        name="accuracy",
        description="All facts are correct",
        weight=3.0,  # Most important
        min_score=0.9
    ),
    EvaluationCriterion(
        name="completeness",
        description="Addresses all user requirements",
        weight=2.0,
        min_score=0.8
    ),
    EvaluationCriterion(
        name="clarity",
        description="Easy to understand",
        weight=1.0,
        min_score=0.7
    )
]
# Apply multi-criteria judge
await session.assert_that(
    Expect.judge.multi_criteria(
        criteria=criteria,
        aggregate_method="weighted",  # or "min", "average"
        require_all_pass=False,
        use_cot=True  # Chain-of-thought reasoning
    ),
    response=response,
    name="comprehensive_quality"
)
Custom judge configuration
# Use specific model for judging
judge = Expect.judge.llm(
    rubric="Evaluate for technical accuracy",
    min_score=0.85,
    model="claude-3-opus-20240229"
)
await session.assert_that(judge, response=response)
Path namespace
Enforce optimal execution paths:
Golden path enforcement
# Define the ideal execution path
await session.assert_that(
    Expect.path.efficiency(
        expected_tool_sequence=["validate", "process", "save"],
        tool_usage_limits={
            "validate": 1,  # Should validate only once
            "process": 1,   # Process only once
            "save": 1       # Save only once
        },
        allow_extra_steps=0,
        penalize_backtracking=True
    ),
    name="golden_path"
)
Flexible path checking
# Allow some variation while still ensuring key waypoints are hit.
# There is no single "waypoint" evaluator: combine Expect.tools.sequence and
# Expect.tools.count with a relaxed Expect.path.efficiency, as sketched below.
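A minimal sketch of that combination, assuming illustrative tool names (fetch, parse, save) and limits; tune the expected sequence and allow_extra_steps to your own workflow:
# Key waypoints must appear in this order, but up to two unrelated
# extra steps are tolerated and backtracking is not penalized.
await session.assert_that(
    Expect.path.efficiency(
        expected_tool_sequence=["fetch", "parse", "save"],
        allow_extra_steps=2,
        penalize_backtracking=False
    ),
    name="flexible_path"
)
# Still require the fetch tool to be called exactly once,
# so "flexible" never becomes "wasteful".
await session.assert_that(
    Expect.tools.count("fetch", 1),
    name="single_fetch"
)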
Combining assertions
Create comprehensive test scenarios by combining multiple assertions:
@task("Complete workflow test")
async def test_full_workflow(agent, session):
"""Test the entire user journey with multiple checkpoints."""
response = await agent.generate_str(
"Fetch weather for NYC and format it nicely"
)
# Content quality
await session.assert_that(
Expect.content.regex(r"\d+°[CF]"),
response=response,
name="has_temperature"
)
# Tool usage
await session.assert_that(
Expect.tools.sequence(["weather_api", "format_tool"]),
name="correct_tools"
)
# Performance
await session.assert_that(
Expect.performance.response_time_under(3000),
name="fast_enough"
)
# Quality judgment
await session.assert_that(
Expect.judge.llm(
"Output should be well-formatted and user-friendly",
min_score=0.8
),
response=response,
name="quality_output"
)
Best practices
Start simple, then add complexity: Begin with basic content assertions, then layer on tool checks, performance requirements, and quality judgments as needed.
Avoid over-constraining: Too many strict assertions can make tests brittle. Focus on what truly matters for your use case.
Use descriptive names: Always provide a name parameter for your assertions. This makes debugging much easier when tests fail, as the sketch below shows.
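Putting these practices together, a deliberately small sketch; the task prompt, thresholds, and assertion names here are illustrative, not part of the API:
@task("Focused summary test")
async def test_summary(agent, session):
    """A few well-named, loosely constrained assertions beat a dozen brittle ones."""
    response = await agent.generate_str("Summarize the latest report")

    # One content check for what actually matters to users
    await session.assert_that(
        Expect.content.contains("summary", case_sensitive=False),
        response=response,
        name="mentions_summary"
    )

    # One tool check, loose enough to survive harmless retries
    await session.assert_that(
        Expect.tools.success_rate(min_rate=0.9),
        name="tools_mostly_succeed"
    )

    # One quality judgment instead of many exact-string assertions
    await session.assert_that(
        Expect.judge.llm("Summary should be accurate and concise", min_score=0.8),
        response=response,
        name="summary_quality"
    )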
See also