The Session API is the heart of mcp-eval testing. It manages your agent’s lifecycle, collects metrics, runs assertions, and produces comprehensive test results.
Quick start
The simplest way to create a test session:
from mcp_eval.session import test_session
from mcp_eval.catalog import Expect
async with test_session("my-test") as agent:
    # Agent is ready with MCP servers connected
    response = await agent.generate_str("Fetch https://example.com")

    # Run assertions
    await agent.assert_that(
        Expect.content.contains("Example Domain"),
        response=response
    )
Core concepts
TestSession
The orchestrator that manages everything:
- Lifecycle management: Starts/stops agents and MCP servers
- Tool discovery: Automatically finds and registers MCP tools
- Metrics collection: Tracks all interactions via OTEL
- Assertion execution: Runs evaluators at the right time
- Report generation: Produces test artifacts
TestAgent
A thin, friendly wrapper around your LLM agent:
- Simple interface: Just generate() and assert_that()
- Automatic tracking: All interactions are recorded
- Context preservation: Maintains conversation state
Creating sessions
Basic session creation
# Using context manager (recommended)
async with test_session("test-name") as agent:
    # Your test code here
    pass

# Manual lifecycle (advanced)
from mcp_eval.session import TestSession  # assumed to live alongside test_session

session = TestSession(test_name="test-name")
agent = await session.__aenter__()
try:
    # Your test code
    ...
finally:
    await session.__aexit__(None, None, None)
    session.cleanup()
Session with custom configuration
from mcp_eval.session import test_session
from mcp_agent.agents.agent_spec import AgentSpec
spec = AgentSpec(
    name="custom",
    instruction="You are a helpful test assistant",
    server_names=["my_server"],
)

async with test_session("custom-test", agent=spec) as agent:
    # Your test code
    pass
Agent interactions
Generating responses
# Simple string generation
response = await agent.generate_str("What is 2+2?")
print(response) # "The answer is 4"
# Full response object may be available depending on provider; prefer generate_str for portability
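If you need more than the final text, the underlying agent may expose a richer generate() call. The sketch below assumes generate() accepts a prompt string like generate_str() and returns a provider-specific object, so treat the field access as illustrative rather than a guaranteed API.
# Hedged sketch: generate() availability and its return shape depend on your provider
result = await agent.generate("What is 2+2?")
# Inspect provider-specific fields defensively rather than assuming a schema
print(getattr(result, "content", result))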
Multi-turn conversations
# Sessions maintain context
response1 = await agent.generate_str("My name is Alice")
response2 = await agent.generate_str("What's my name?")
# response2 will correctly identify "Alice"
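You can assert on the later turn just like any other response, for example checking that the agent recalled the name:
await agent.assert_that(
    Expect.content.contains("Alice"),
    response=response2,
    name="remembers_name"
)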
Assertions in depth
# Immediate: evaluated right away (content, judge)
await session.assert_that(
    Expect.content.contains("success"),
    response=response,  # Required for immediate
    name="has_success"
)

# Deferred: evaluated at session end (tools, performance, path)
await session.assert_that(
    Expect.tools.was_called("calculator"),
    name="used_calculator"  # No response needed
)

# Force deferred evaluation at end
await session.assert_that(
    Expect.content.contains("final"),
    response=response,
    when="end"  # Defer even content checks
)
Assertion timing control
# Evaluate specific assertions immediately
result = await session.evaluate_now_async(
    Expect.performance.response_time_under(5000),
    response=response,
    name="quick_response"
)

if not result.passed:
    print(f"Too slow: {result.details}")
    # Take corrective action

# Batch evaluate multiple assertions
results = await session.evaluate_now_async(
    Expect.tools.success_rate(0.95),
    Expect.performance.max_iterations(3)
)
Named assertions for better reporting
# Always name your assertions for clarity
await session.assert_that(
    Expect.content.regex(r"\d+ items? found"),
    response=response,
    name="item_count_format"  # Appears in reports
)
Metrics and results
Accessing metrics during tests
# Get current metrics
metrics = session.get_metrics()
print(f"Tool calls: {len(metrics.tool_calls)}")
print(f"Total tokens: {metrics.total_tokens}")
print(f"Duration so far: {metrics.total_duration_ms}ms")
print(f"Estimated cost: ${metrics.total_cost_usd:.4f}")
# Detailed tool information
for call in metrics.tool_calls:
    print(f"Tool: {call.name}")
    print(f"Duration: {call.duration_ms}ms")
    print(f"Success: {call.success}")
    if not call.success:
        print(f"Error: {call.error}")
Getting test results
# Check if all assertions passed
if session.all_passed():
    print("✅ All tests passed!")
else:
    print("❌ Some tests failed")

# Get detailed results
results = session.get_results()
for result in results:
    print(f"Assertion: {result.name}")
    print(f"Passed: {result.passed}")
    if not result.passed:
        print(f"Reason: {result.details}")
# Get pass/fail summary
summary = session.get_summary()
print(f"Passed: {summary['passed']}/{summary['total']}")
print(f"Pass rate: {summary['pass_rate']:.1%}")
Duration tracking
# Get test duration
duration_ms = session.get_duration_ms()
print(f"Test took {duration_ms/1000:.2f} seconds")
# Track specific operations
from time import time
start = time()
response = await agent.generate_str("Complex task")
operation_time = (time() - start) * 1000
if operation_time > 5000:
    print(f"Warning: Operation took {operation_time:.0f}ms")
OpenTelemetry traces
Accessing trace data
# Get structured span tree
span_tree = session.get_span_tree()

def print_spans(span, indent=0):
    prefix = " " * indent
    print(f"{prefix}{span.name}: {span.duration_ms}ms")
    for child in span.children:
        print_spans(child, indent + 1)

print_spans(span_tree)

# Ensure traces are written to disk
await session._ensure_traces_flushed()
Custom span attributes
# Add custom attributes to current span
from opentelemetry import trace
tracer = trace.get_tracer(__name__)
with tracer.start_as_current_span("custom_operation") as span:
    span.set_attribute("user_id", "123")
    span.set_attribute("operation_type", "validation")
    response = await agent.generate_str("Validate user input")
Artifacts and reporting
Session artifacts
# Sessions automatically save artifacts
session = await TestSession.create(
    test_name="my-test",
    output_dir="test-reports",  # Custom output location
    save_artifacts=True         # Enable artifact saving
)
# After test completion, find artifacts at:
# test-reports/my-test_[timestamp]/
# ├── trace.jsonl # OTEL traces
# ├── results.json # Test results
# ├── metrics.json # Performance metrics
# └── conversation.json # Full conversation log
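To post-process a run, you can load these artifacts directly. A minimal sketch that picks the newest run directory matching the pattern above (adjust the glob if your layout differs):
import json
from pathlib import Path

# Pick the most recent artifact directory for this test
runs = sorted(Path("test-reports").glob("my-test_*"))
if runs:
    results = json.loads((runs[-1] / "results.json").read_text())
    print(results)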
Programmatic report generation
# Generate reports programmatically
from mcp_eval.reports import ReportGenerator
generator = ReportGenerator(session)
# Generate different formats
await generator.save_json("results.json")
await generator.save_markdown("results.md")
await generator.save_html("results.html")
# Get report data for custom processing
report_data = generator.get_report_data()
print(f"Test: {report_data['test_name']}")
print(f"Duration: {report_data['duration_ms']}ms")
print(f"Passed: {report_data['passed']}/{report_data['total']}")
Advanced patterns
Custom session hooks
class CustomSession(TestSession):
    async def on_tool_call(self, tool_name: str, args: dict):
        """Hook called before each tool execution."""
        print(f"About to call {tool_name} with {args}")

        # Validate tool usage
        if tool_name == "dangerous_tool":
            raise ValueError("Dangerous tool not allowed in tests")

    async def on_assertion_complete(self, result):
        """Hook called after each assertion."""
        if not result.passed:
            # Log failures to external system
            await self.log_to_monitoring(result)
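To drive a custom subclass, you can reuse the manual lifecycle pattern shown earlier; a sketch (the constructor arguments mirror the basic example and may vary with your setup):
session = CustomSession(test_name="hook-test")
agent = await session.__aenter__()
try:
    await agent.generate_str("Add 2 and 2 using the calculator tool")
finally:
    await session.__aexit__(None, None, None)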
Session state management
# Store custom state in session
session.state["test_user_id"] = "user_123"
session.state["test_context"] = {"environment": "staging"}
# Access state in assertions or hooks
user_id = session.state.get("test_user_id")
Parallel session execution
import asyncio
async def run_test(test_name: str, prompt: str):
    async with test_session(test_name) as agent:
        response = await agent.generate_str(prompt)
        await agent.assert_that(
            Expect.content.contains("success"),
            response=response
        )
        return agent.session.all_passed()

# Run multiple tests in parallel
results = await asyncio.gather(
    run_test("test1", "Task 1"),
    run_test("test2", "Task 2"),
    run_test("test3", "Task 3")
)
print(f"All passed: {all(results)}")
Best practices
Use context managers: Always use async with test_session() to ensure proper cleanup, even if tests fail.
Name your assertions: Always provide descriptive names for assertions. This makes debugging much easier when reviewing test reports.
Monitor metrics: Check metrics during long-running tests to catch performance issues early.
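For example, a periodic check during a multi-step scenario inside a session (the threshold is illustrative):
for step in ["Step 1", "Step 2", "Step 3"]:
    await agent.generate_str(step)
    metrics = session.get_metrics()
    if metrics.total_duration_ms > 60_000:
        print(f"Warning: test has been running for {metrics.total_duration_ms}ms")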
Error handling
try:
    async with test_session("error-test") as agent:
        response = await agent.generate_str("Test prompt")
        await agent.assert_that(
            Expect.content.contains("expected"),
            response=response
        )
except TimeoutError:
    print("Test timed out - increase timeout_seconds")
except AssertionError as e:
    print(f"Assertion failed: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")
    # Session cleanup is still guaranteed
See also