The Core API provides the fundamental building blocks for writing mcp-eval tests. These decorators and utilities make your tests clean, reusable, and powerful.
Quick reference
from mcp_eval.core import (
    task,         # Define a test task
    with_agent,   # Specify which agent to use
    parametrize,  # Run tests with multiple inputs
    setup,        # Run before tests
    teardown,     # Run after tests
    TestResult,   # Test execution results
)
The @task decorator
The foundation of every mcp-eval test:
Basic usage
from mcp_eval.core import task
from mcp_eval.catalog import Expect

@task("My first test")
async def test_basic(agent, session):
    """A simple test that checks basic functionality."""
    response = await agent.generate_str("Hello, world!")
    await session.assert_that(
        Expect.content.contains("Hello"),
        response=response
    )
Task parameters
# Compose multiple assertions and give them names for clear reporting
@task("Fetch and summarize")
async def test_fetch_and_summarize(agent, session):
    response = await agent.generate_str(
        "Fetch https://example.com and summarize in one sentence"
    )
    await session.assert_that(
        Expect.tools.was_called("fetch"),
        name="fetch_called"
    )
    await session.assert_that(
        Expect.content.contains("Example Domain"),
        response=response,
        name="has_expected_text"
    )
    await session.assert_that(
        Expect.performance.max_iterations(3),
        name="efficient"
    )
Task with custom configuration
@task("Expensive test")
async def test_with_config(agent, session):
response = await agent.generate_str("Complex analysis task")
# Test continues...
The @with_agent decorator
Specify which agent configuration to use:
Using named agents
@with_agent("default")
@task("Test with default agent")
async def test_default(agent):
# Uses the 'default' agent from config
response = await agent.generate_str("Test prompt")
Multiple agent configurations
# Define agents in mcpeval.yaml first
@with_agent("specialized_agent")
@task("Test specialized behavior")
async def test_specialized(agent):
    # Uses a different agent configuration
    pass

@with_agent("minimal_agent")
@task("Test minimal setup")
async def test_minimal(agent):
    # Uses yet another configuration
    pass
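The exact mcpeval.yaml schema depends on your mcp-eval version; as a hedged sketch (the agents/definitions nesting is an assumption, while the per-agent fields mirror the AgentSpec shown in the next example), named agents might be declared like:
# Sketch only - verify key names against your mcpeval.yaml schema
agents:
  definitions:
    - name: specialized_agent
      instruction: "You are a meticulous specialist. Explain every step."
      server_names: ["fetch"]
    - name: minimal_agent
      instruction: "Answer as briefly as possible."
      server_names: []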
Inline agent configuration
from mcp_agent.agents.agent_spec import AgentSpec

@with_agent(AgentSpec(
    name="code_reviewer",
    instruction="You are a code reviewer. Be thorough and critical.",
    server_names=["filesystem", "git"]
))
@task("Code review test")
async def test_code_review(agent):
    response = await agent.generate_str("Review this code: ...")
Decorator order matters! Always apply @with_agent above @task so the agent is properly configured when the task runs. (Python applies decorators bottom-up, so @task must wrap the raw test function first, before @with_agent attaches the agent configuration.)
# ✅ Correct
@with_agent("default")
@task("test")
async def test_func(agent): ...
# ❌ Wrong - will error
@task("test")
@with_agent("default")
async def test_func(agent): ...
The @parametrize decorator
Run the same test with different inputs:
Basic parametrization
@with_agent("default")
@parametrize("number", [1, 2, 5, 10, 100])
@task("Test with different numbers")
async def test_numbers(agent, number):
response = await agent.generate_str(f"Is {number} prime?")
# Each number creates a separate test case
await agent.assert_that(
Expect.content.regex(r"(yes|no|prime|composite)")
)
Multiple parameters
@with_agent("default")
@parametrize("operation", ["add", "subtract", "multiply"])
@parametrize("x", [1, 10, 100])
@parametrize("y", [2, 5])
@task("Test calculator operations")
async def test_calculator(agent, operation, x, y):
# Creates 3 * 3 * 2 = 18 test cases!
prompt = f"Use the calculator to {operation} {x} and {y}"
response = await agent.generate_str(prompt)
# Verify the right tool was used
await agent.assert_that(
Expect.tools.was_called(f"calculator_{operation}")
)
Named scenarios
Use @parametrize("name,url,expected", [...]) to model named cases.
@with_agent("default")
@parametrize(
"name,url,expected",
[
("home", "https://example.com", "Example Domain"),
("httpbin_json", "https://httpbin.org/json", "slideshow"),
],
)
@task("Fetch {name}")
async def test_fetch_case(agent, session, name: str, url: str, expected: str):
response = await agent.generate_str(f"Fetch {url}")
await session.assert_that(
Expect.tools.was_called("fetch"),
name=f"{name}_fetch_called",
)
await session.assert_that(
Expect.content.contains(expected, case_sensitive=False),
response=response,
name=f"{name}_has_expected",
)
Dynamic parametrization
def get_test_cases():
    """Generate test cases dynamically."""
    import json
    with open("test_data.json") as f:
        return json.load(f)["test_cases"]

@with_agent("default")
@parametrize("test_case", get_test_cases())
@task("Dynamic test cases")
async def test_dynamic(agent, test_case):
    response = await agent.generate_str(test_case["prompt"])
    await agent.assert_that(
        Expect.content.contains(test_case["expected"]),
        response=response
    )
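For reference, the test_data.json file read above needs a shape like the following (keys are taken from the snippet; the example cases reuse prompts from earlier sections):
{
  "test_cases": [
    {"prompt": "Fetch https://example.com and summarize in one sentence", "expected": "Example Domain"},
    {"prompt": "Is 7 prime?", "expected": "yes"}
  ]
}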
Setup and teardown
Run code before and after your tests:
Simple setup/teardown
from mcp_eval.core import setup, teardown
import os
import tempfile

test_dir = None

@setup
def prepare_test_environment():
    """Create temporary test directory."""
    global test_dir
    test_dir = tempfile.mkdtemp(prefix="mcp_test_")
    print(f"🚀 Created test directory: {test_dir}")
    # Set up test files
    with open(f"{test_dir}/test.txt", "w") as f:
        f.write("Test content")

@teardown
def cleanup_test_environment():
    """Clean up after tests."""
    global test_dir
    if test_dir and os.path.exists(test_dir):
        import shutil
        shutil.rmtree(test_dir)
        print(f"🧹 Cleaned up {test_dir}")
Async setup/teardown
@setup
async def async_prepare():
    """Setup that requires async operations."""
    # Connect to the database (`db` is an illustrative async client defined elsewhere)
    await db.connect()
    # Seed test data
    await db.execute("INSERT INTO test_table ...")
    print("✅ Database ready")

@teardown
async def async_cleanup():
    """Async cleanup operations."""
    await db.execute("DELETE FROM test_table WHERE ...")
    await db.disconnect()
Setup with validation
import os
import sys

@setup
def validate_environment():
    """Ensure test environment is properly configured."""
    # Check Python version
    if sys.version_info < (3, 10):
        raise RuntimeError("Tests require Python 3.10+")
    # Check required environment variables
    required_vars = ["ANTHROPIC_API_KEY", "TEST_SERVER_URL"]
    missing = [var for var in required_vars if not os.getenv(var)]
    if missing:
        raise RuntimeError(f"Missing environment variables: {missing}")
    # Check MCP servers are accessible
    from mcp_eval.utils import check_server_health
    if not check_server_health("my_server"):
        raise RuntimeError("MCP server 'my_server' is not responding")
    print("✅ Environment validated successfully")
TestResult object
Understanding test execution results:
TestResult structure
from mcp_eval.core import TestResult

# After a test runs, you get a TestResult:
result = TestResult(
    id="test_123_abc",
    name="Test basic fetch",
    passed=True,
    duration_ms=1234.56,
    parameters={"url": "https://example.com"},
    metrics={
        "tool_calls": 2,
        "tokens_used": 500,
        "cost_usd": 0.01
    },
    evaluations=[
        {"name": "content_check", "passed": True, "score": 1.0},
        {"name": "performance", "passed": True, "details": "Under 2s"}
    ],
    error=None  # Or error message if failed
)
Accessing TestResult in hooks
Rather than intercepting TestResult objects from custom hooks, use the CLI's combined reports, or call session.get_metrics() / session.get_results() inside a test for programmatic summaries.
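As a minimal sketch of the in-test route (attribute names on the returned metrics object are assumptions; inspect what get_metrics() returns in your environment):
@with_agent("default")
@task("Inspect metrics inline")
async def test_inspect_metrics(agent, session):
    await agent.generate_str("Fetch https://example.com")
    metrics = session.get_metrics()
    # `tool_calls` is assumed here; check the actual fields on your metrics object
    print(f"Tool calls recorded: {len(metrics.tool_calls)}")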
Aggregating results
def analyze_test_results(results: list[TestResult]):
    """Analyze a batch of test results."""
    total = len(results)
    passed = sum(1 for r in results if r.passed)
    total_duration = sum(r.duration_ms for r in results)
    total_cost = sum(r.metrics.get('cost_usd', 0) for r in results)

    print(f"\n📊 Test Summary:")
    print(f"  Total tests: {total}")
    print(f"  Passed: {passed}/{total} ({passed/total*100:.1f}%)")
    print(f"  Total duration: {total_duration/1000:.2f}s")
    print(f"  Total cost: ${total_cost:.4f}")

    # Find slowest tests
    slowest = sorted(results, key=lambda r: r.duration_ms, reverse=True)[:3]
    print(f"\n🐢 Slowest tests:")
    for result in slowest:
        print(f"  {result.name}: {result.duration_ms:.0f}ms")

    # Find failed tests
    failed = [r for r in results if not r.passed]
    if failed:
        print(f"\n❌ Failed tests:")
        for result in failed:
            print(f"  {result.name}: {result.error}")
Advanced patterns
Conditional test execution
Prefer selecting tests with your runner and environment rather than custom decorators.
# Run a single decorator-style test with a pytest-style selector supported by the runner
mcp-eval run tests/test_fetch.py::test_fetch_case

# Run pytest-style tests with pytest itself
uv run pytest -q tests
Using pytest marks for conditions (when running under pytest):
import os
import pytest
from mcp_eval.core import task

@pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip on CI")
@task("Local-only behavior")
async def test_local_only(agent, session):
    response = await agent.generate_str("Do something local")
    # assertions...

@pytest.mark.slow
@task("Slow end-to-end scenario")
async def test_slow_scenario(agent, session):
    # long-running flow...
    ...
Then select by mark:
pytest -m "not slow" tests/
Test dependencies
Prefer independent tests; if ordering is required, orchestrate via your runner.
Custom test context
import time
from contextvars import ContextVar

test_context = ContextVar('test_context', default={})

@task("Test with context")
async def test_with_context(agent, session):
    # Set context for this test
    ctx = test_context.get().copy()
    ctx['test_id'] = session.test_id
    ctx['start_time'] = time.time()
    test_context.set(ctx)

    response = await agent.generate_str("Test prompt")

    # Context is available throughout the test
    duration = time.time() - ctx['start_time']
    print(f"Test {ctx['test_id']} took {duration:.2f}s")
Best practices
Name your tests clearly: Use descriptive names that explain what the test validates. This helps when reviewing test reports.
Avoid test interdependence: Each test should be independent and not rely on side effects from other tests; if ordering is unavoidable, orchestrate it via your runner (see Test dependencies above).
Use parametrize wisely: While parametrization is powerful, too many parameter combinations can make tests slow. Consider grouping related parameters.
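For the last point, one way to group related parameters is the tuple-style parametrization shown under Named scenarios: enumerate a few valid combinations instead of crossing every axis, so the calculator example runs 3 hand-picked cases rather than 18.
# 3 hand-picked cases instead of a 3 * 3 * 2 cross product
@with_agent("default")
@parametrize(
    "operation,x,y",
    [
        ("add", 1, 2),
        ("subtract", 100, 5),
        ("multiply", 10, 2),
    ],
)
@task("Calculator spot checks")
async def test_calculator_spot_checks(agent, session, operation, x, y):
    response = await agent.generate_str(f"Use the calculator to {operation} {x} and {y}")
    await session.assert_that(
        Expect.tools.was_called(f"calculator_{operation}"),
        name=f"{operation}_tool_called",
    )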
Common patterns
Testing error handling
@with_agent("default")
@task("Test error recovery")
async def test_error_handling(agent):
# Trigger an error condition
response = await agent.generate_str("Divide 10 by 0")
# Verify graceful handling
await agent.assert_that(
Expect.content.regex(r"(error|cannot|undefined|infinity)"),
name="handles_division_by_zero"
)
# Verify no tool crashes
# Check tool success via success_rate, e.g., Expect.tools.success_rate(1.0)
Testing multi-step workflows
@with_agent("default")
@task("Test complete workflow")
async def test_workflow(agent, session):
# Step 1: Authentication
auth_response = await agent.generate_str("Authenticate as test_user")
await session.assert_that(
Expect.tools.was_called("auth"),
name="authentication_attempted"
)
# Step 2: Fetch data
data_response = await agent.generate_str("Get my profile data")
await session.assert_that(
Expect.tools.was_called("fetch_profile"),
name="profile_fetched"
)
# Step 3: Process
process_response = await agent.generate_str("Summarize my activity")
await session.assert_that(
Expect.content.contains("summary"),
response=process_response,
name="summary_generated"
)
# Verify the complete sequence
await session.assert_that(
Expect.tools.sequence(["auth", "fetch_profile", "summarize"]),
name="correct_workflow_order"
)
See also