The Session API is the heart of mcp-eval testing. It manages your agent’s lifecycle, collects metrics, runs assertions, and produces comprehensive test results.

Quick start

The simplest way to create a test session:
from mcp_eval.session import test_session
from mcp_eval.catalog import Expect

async with test_session("my-test") as agent:
    # Agent is ready with MCP servers connected
    response = await agent.generate_str("Fetch https://example.com")
    
    # Run assertions
    await agent.assert_that(
        Expect.content.contains("Example Domain"),
        response=response
    )

Core concepts

TestSession

The orchestrator that manages everything (see the sketch after this list):
  • Lifecycle management: Starts/stops agents and MCP servers
  • Tool discovery: Automatically finds and registers MCP tools
  • Metrics collection: Tracks all interactions via OTEL
  • Assertion execution: Runs evaluators at the right time
  • Report generation: Produces test artifacts
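
Taken together, a single test exercises all of these. The sketch below combines only calls documented elsewhere on this page (test_session, generate_str, assert_that, get_metrics); treat it as an illustration rather than a canonical recipe:

from mcp_eval.session import test_session
from mcp_eval.catalog import Expect

async def session_overview():
    # Lifecycle: the agent and its MCP servers start on enter, stop on exit
    async with test_session("session-overview") as agent:
        # Tool discovery has already run; the agent can call MCP tools
        response = await agent.generate_str("Fetch https://example.com")

        # Assertion execution: content checks are evaluated immediately
        await agent.assert_that(
            Expect.content.contains("Example Domain"),
            response=response,
            name="fetched_example_domain",
        )

        # Metrics collection: interactions so far are already tracked via OTEL
        metrics = agent.session.get_metrics()
        print(f"Tool calls so far: {len(metrics.tool_calls)}")
    # Report generation: results and artifacts are finalized when the session closes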

TestAgent

A thin, friendly wrapper around your LLM agent:
  • Simple interface: just generate_str() / generate() and assert_that()
  • Automatic tracking: All interactions are recorded
  • Context preservation: Maintains conversation state

Creating sessions

Basic session creation

# Using context manager (recommended)
async with test_session("test-name") as agent:
    # Your test code here
    pass

# Manual lifecycle (advanced)
from mcp_eval.session import TestSession  # same module as test_session

session = TestSession(test_name="test-name")
agent = await session.__aenter__()
try:
    # Your test code
    ...
finally:
    await session.__aexit__(None, None, None)
    session.cleanup()

Session with custom configuration

from mcp_eval.session import test_session
from mcp_agent.agents.agent_spec import AgentSpec

spec = AgentSpec(
    name="custom",
    instruction="You are a helpful test assistant",
    server_names=["my_server"],
)
async with test_session("custom-test", agent=spec) as agent:
    # Your test code
    pass

Agent interactions

Generating responses

# Simple string generation
response = await agent.generate_str("What is 2+2?")
print(response)  # "The answer is 4"

# Full response object may be available depending on provider; prefer generate_str for portability

Multi-turn conversations

# Sessions maintain context
response1 = await agent.generate_str("My name is Alice")
response2 = await agent.generate_str("What's my name?")
# response2 will correctly identify "Alice"

Assertions in depth

Immediate vs. deferred assertions

# Immediate: evaluated right away (content, judge)
await session.assert_that(
    Expect.content.contains("success"),
    response=response,  # Required for immediate
    name="has_success"
)

# Deferred: evaluated at session end (tools, performance, path)
await session.assert_that(
    Expect.tools.was_called("calculator"),
    name="used_calculator"  # No response needed
)

# Force deferred evaluation at end
await session.assert_that(
    Expect.content.contains("final"),
    response=response,
    when="end"  # Defer even content checks
)

Assertion timing control

# Evaluate specific assertions immediately
result = await session.evaluate_now_async(
    Expect.performance.response_time_under(5000),
    response=response,
    name="quick_response"
)

if not result.passed:
    print(f"Too slow: {result.details}")
    # Take corrective action

# Batch evaluate multiple assertions
results = await session.evaluate_now_async(
    Expect.tools.success_rate(0.95),
    Expect.performance.max_iterations(3)
)

Named assertions for better reporting

# Always name your assertions for clarity
await session.assert_that(
    Expect.content.regex(r"\d+ items? found"),
    response=response,
    name="item_count_format"  # Appears in reports
)

Metrics and results

Accessing metrics during tests

# Get current metrics
metrics = session.get_metrics()

print(f"Tool calls: {len(metrics.tool_calls)}")
print(f"Total tokens: {metrics.total_tokens}")
print(f"Duration so far: {metrics.total_duration_ms}ms")
print(f"Estimated cost: ${metrics.total_cost_usd:.4f}")

# Detailed tool information
for call in metrics.tool_calls:
    print(f"Tool: {call.name}")
    print(f"Duration: {call.duration_ms}ms")
    print(f"Success: {call.success}")
    if not call.success:
        print(f"Error: {call.error}")

Getting test results

# Check if all assertions passed
if session.all_passed():
    print("✅ All tests passed!")
else:
    print("❌ Some tests failed")

# Get detailed results
results = session.get_results()
for result in results:
    print(f"Assertion: {result.name}")
    print(f"Passed: {result.passed}")
    if not result.passed:
        print(f"Reason: {result.details}")

# Get pass/fail summary
summary = session.get_summary()
print(f"Passed: {summary['passed']}/{summary['total']}")
print(f"Pass rate: {summary['pass_rate']:.1%}")

Duration tracking

# Get test duration
duration_ms = session.get_duration_ms()
print(f"Test took {duration_ms/1000:.2f} seconds")

# Track specific operations
from time import time

start = time()
response = await agent.generate_str("Complex task")
operation_time = (time() - start) * 1000

if operation_time > 5000:
    print(f"Warning: Operation took {operation_time:.0f}ms")

OpenTelemetry traces

Accessing trace data

# Get structured span tree
span_tree = session.get_span_tree()

def print_spans(span, indent=0):
    prefix = "  " * indent
    print(f"{prefix}{span.name}: {span.duration_ms}ms")
    for child in span.children:
        print_spans(child, indent + 1)

print_spans(span_tree)

# Ensure traces are written to disk
await session._ensure_traces_flushed()

Custom span attributes

# Add custom attributes to current span
from opentelemetry import trace

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("custom_operation") as span:
    span.set_attribute("user_id", "123")
    span.set_attribute("operation_type", "validation")
    
    response = await agent.generate_str("Validate user input")

Artifacts and reporting

Session artifacts

# Sessions automatically save artifacts
session = await TestSession.create(
    test_name="my-test",
    output_dir="test-reports",  # Custom output location
    save_artifacts=True  # Enable artifact saving
)

# After test completion, find artifacts at:
# test-reports/my-test_[timestamp]/
#   ├── trace.jsonl          # OTEL traces
#   ├── results.json         # Test results
#   ├── metrics.json         # Performance metrics
#   └── conversation.json    # Full conversation log
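
Once a run completes you can consume these files directly, for example to feed results into another system. A minimal sketch, assuming the layout above; the timestamped directory name here is made up, and the exact JSON schemas depend on your mcp-eval version:

import json
from pathlib import Path

# Hypothetical run directory; substitute the actual timestamped folder
run_dir = Path("test-reports") / "my-test_2024-01-01T12-00-00"

results = json.loads((run_dir / "results.json").read_text())
metrics = json.loads((run_dir / "metrics.json").read_text())

print(results)  # inspect the structure before relying on specific keys
print(metrics)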

Programmatic report generation

# Generate reports programmatically
from mcp_eval.reports import ReportGenerator

generator = ReportGenerator(session)

# Generate different formats
await generator.save_json("results.json")
await generator.save_markdown("results.md")
await generator.save_html("results.html")

# Get report data for custom processing
report_data = generator.get_report_data()
print(f"Test: {report_data['test_name']}")
print(f"Duration: {report_data['duration_ms']}ms")
print(f"Passed: {report_data['passed']}/{report_data['total']}")

Advanced patterns

Custom session hooks

class CustomSession(TestSession):
    async def on_tool_call(self, tool_name: str, args: dict):
        """Hook called before each tool execution."""
        print(f"About to call {tool_name} with {args}")
        
        # Validate tool usage
        if tool_name == "dangerous_tool":
            raise ValueError("Dangerous tool not allowed in tests")
    
    async def on_assertion_complete(self, result):
        """Hook called after each assertion."""
        if not result.passed:
            # Log failures to external system
            await self.log_to_monitoring(result)

Session state management

# Store custom state in session
session.state["test_user_id"] = "user_123"
session.state["test_context"] = {"environment": "staging"}

# Access state in assertions or hooks
user_id = session.state.get("test_user_id")

Parallel session execution

import asyncio

async def run_test(test_name: str, prompt: str):
    async with test_session(test_name) as agent:
        response = await agent.generate_str(prompt)
        await agent.assert_that(
            Expect.content.contains("success"),
            response=response
        )
        return agent.session.all_passed()

# Run multiple tests in parallel
results = await asyncio.gather(
    run_test("test1", "Task 1"),
    run_test("test2", "Task 2"),
    run_test("test3", "Task 3")
)

print(f"All passed: {all(results)}")

Best practices

  • Use context managers: Always use async with test_session() to ensure proper cleanup, even if tests fail.
  • Name your assertions: Provide descriptive names; they make debugging much easier when reviewing test reports.
  • Monitor metrics: Check metrics during long-running tests to catch performance issues early.
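
The sketch below applies all three practices in one test; the prompt, test name, and assertion name are illustrative:

async def report_format_test():
    # Context manager: cleanup runs even if a call or assertion fails
    async with test_session("report-format") as agent:
        response = await agent.generate_str("Summarize the last 7 days of activity")

        # Named assertion: the name appears in reports, making failures easy to trace
        await agent.assert_that(
            Expect.content.regex(r"\d+ events?"),
            response=response,
            name="summary_mentions_event_count",
        )

        # Mid-test metrics check: catch runaway latency or cost early
        metrics = agent.session.get_metrics()
        if metrics.total_duration_ms > 30_000:
            print(f"Warning: already at {metrics.total_duration_ms}ms")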

Error handling

try:
    async with test_session("error-test") as agent:
        response = await agent.generate_str("Test prompt")
        await agent.assert_that(
            Expect.content.contains("expected"),
            response=response
        )
except TimeoutError:
    print("Test timed out - increase timeout_seconds")
except AssertionError as e:
    print(f"Assertion failed: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")
    # Session cleanup is still guaranteed

See also