The Core API provides the fundamental building blocks for writing mcp-eval tests. These decorators and utilities make your tests clean, reusable, and powerful.

Quick reference

from mcp_eval.core import (
    task,         # Define a test task
    with_agent,   # Specify which agent to use
    parametrize,  # Run tests with multiple inputs
    setup,        # Run before tests
    teardown,     # Run after tests
    TestResult,   # Test execution results
)

The @task decorator

The foundation of every mcp-eval test:

Basic usage

from mcp_eval.core import task
from mcp_eval.catalog import Expect

@task("My first test")
async def test_basic(agent, session):
    """A simple test that checks basic functionality."""
    response = await agent.generate_str("Hello, world!")

    await session.assert_that(
        Expect.content.contains("Hello"),
        response=response
    )

Task parameters

# Compose multiple assertions and give them names for clear reporting
@task("Fetch and summarize")
async def test_fetch_and_summarize(agent, session):
    response = await agent.generate_str(
        "Fetch https://example.com and summarize in one sentence"
    )

    await session.assert_that(
        Expect.tools.was_called("fetch"),
        name="fetch_called"
    )
    await session.assert_that(
        Expect.content.contains("Example Domain"),
        response=response,
        name="has_expected_text"
    )
    await session.assert_that(
        Expect.performance.max_iterations(3),
        name="efficient"
    )

Task with custom configuration

@task("Expensive test") async def test_with_config(agent, session):  response = await agent.generate_str("Complex analysis task")  # Test continues... 

The @with_agent decorator

Specify which agent configuration to use:

Using named agents

@with_agent("default") @task("Test with default agent") async def test_default(agent):  # Uses the 'default' agent from config  response = await agent.generate_str("Test prompt") 

Multiple agent configurations

# Define agents in mcpeval.yaml first
@with_agent("specialized_agent")
@task("Test specialized behavior")
async def test_specialized(agent):
    # Uses a different agent configuration
    pass

@with_agent("minimal_agent")
@task("Test minimal setup")
async def test_minimal(agent):
    # Uses yet another configuration
    pass

Inline agent configuration

from mcp_agent.agents.agent_spec import AgentSpec

@with_agent(AgentSpec(
    name="code_reviewer",
    instruction="You are a code reviewer. Be thorough and critical.",
    server_names=["filesystem", "git"]
))
@task("Code review test")
async def test_code_review(agent):
    response = await agent.generate_str("Review this code: ...")
Decorator order matters! Always apply @with_agent above @task to ensure the agent is properly configured when the task runs.
# ✅ Correct
@with_agent("default")
@task("test")
async def test_func(agent): ...

# ❌ Wrong - will error
@task("test")
@with_agent("default")
async def test_func(agent): ...

The @parametrize decorator

Run the same test with different inputs:

Basic parametrization

@with_agent("default") @parametrize("number", [1, 2, 5, 10, 100]) @task("Test with different numbers") async def test_numbers(agent, number):  response = await agent.generate_str(f"Is {number} prime?")  # Each number creates a separate test case  await agent.assert_that(  Expect.content.regex(r"(yes|no|prime|composite)")  ) 

Multiple parameters

@with_agent("default") @parametrize("operation", ["add", "subtract", "multiply"]) @parametrize("x", [1, 10, 100]) @parametrize("y", [2, 5]) @task("Test calculator operations") async def test_calculator(agent, operation, x, y):  # Creates 3 * 3 * 2 = 18 test cases!  prompt = f"Use the calculator to {operation} {x} and {y}"  response = await agent.generate_str(prompt)    # Verify the right tool was used  await agent.assert_that(  Expect.tools.was_called(f"calculator_{operation}")  ) 

Named scenarios

Use @parametrize("name,url,expected", [...]) to model named cases.
@with_agent("default") @parametrize(  "name,url,expected",  [  ("home", "https://example.com", "Example Domain"),  ("httpbin_json", "https://httpbin.org/json", "slideshow"),  ], ) @task("Fetch {name}") async def test_fetch_case(agent, session, name: str, url: str, expected: str):  response = await agent.generate_str(f"Fetch {url}")   await session.assert_that(  Expect.tools.was_called("fetch"),  name=f"{name}_fetch_called",  )  await session.assert_that(  Expect.content.contains(expected, case_sensitive=False),  response=response,  name=f"{name}_has_expected",  ) 

Dynamic parametrization

def get_test_cases():
    """Generate test cases dynamically."""
    import json
    with open("test_data.json") as f:
        return json.load(f)["test_cases"]

@with_agent("default")
@parametrize("test_case", get_test_cases())
@task("Dynamic test cases")
async def test_dynamic(agent, test_case):
    response = await agent.generate_str(test_case["prompt"])

    await agent.assert_that(
        Expect.content.contains(test_case["expected"]),
        response=response
    )

Setup and teardown

Run code before and after your tests:

Simple setup/teardown

from mcp_eval.core import setup, teardown
import os
import tempfile

test_dir = None

@setup
def prepare_test_environment():
    """Create temporary test directory."""
    global test_dir
    test_dir = tempfile.mkdtemp(prefix="mcp_test_")
    print(f"🚀 Created test directory: {test_dir}")

    # Set up test files
    with open(f"{test_dir}/test.txt", "w") as f:
        f.write("Test content")

@teardown
def cleanup_test_environment():
    """Clean up after tests."""
    global test_dir
    if test_dir and os.path.exists(test_dir):
        import shutil
        shutil.rmtree(test_dir)
        print(f"🧹 Cleaned up {test_dir}")

Async setup/teardown

# `db` stands in for an async database client defined elsewhere in your test suite
@setup
async def async_prepare():
    """Setup that requires async operations."""
    # Connect to database
    await db.connect()

    # Seed test data
    await db.execute("INSERT INTO test_table ...")

    print("✅ Database ready")

@teardown
async def async_cleanup():
    """Async cleanup operations."""
    await db.execute("DELETE FROM test_table WHERE ...")
    await db.disconnect()

Setup with validation

@setup
def validate_environment():
    """Ensure test environment is properly configured."""
    import os
    import sys

    # Check Python version
    if sys.version_info < (3, 10):
        raise RuntimeError("Tests require Python 3.10+")

    # Check required environment variables
    required_vars = ["ANTHROPIC_API_KEY", "TEST_SERVER_URL"]
    missing = [var for var in required_vars if not os.getenv(var)]

    if missing:
        raise RuntimeError(f"Missing environment variables: {missing}")

    # Check MCP servers are accessible
    from mcp_eval.utils import check_server_health
    if not check_server_health("my_server"):
        raise RuntimeError("MCP server 'my_server' is not responding")

    print("✅ Environment validated successfully")

TestResult object

Understanding test execution results:

TestResult structure

from mcp_eval.core import TestResult

# After a test runs, you get a TestResult:
result = TestResult(
    id="test_123_abc",
    name="Test basic fetch",
    passed=True,
    duration_ms=1234.56,
    parameters={"url": "https://example.com"},
    metrics={
        "tool_calls": 2,
        "tokens_used": 500,
        "cost_usd": 0.01
    },
    evaluations=[
        {"name": "content_check", "passed": True, "score": 1.0},
        {"name": "performance", "passed": True, "details": "Under 2s"}
    ],
    error=None  # Or error message if failed
)

Accessing TestResult in hooks

Use the CLI's combined reports, or session.get_metrics() / session.get_results(), for summaries.
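If you want a summary programmatically, here is a minimal sketch of pulling it from the session inside a test. It assumes session.get_metrics() and session.get_results() return printable summary objects, per the note above; their exact structure is not documented here, so the sketch only prints them.

from mcp_eval.core import task, with_agent
from mcp_eval.catalog import Expect

@with_agent("default")
@task("Inspect metrics inside a test")
async def test_inspect_metrics(agent, session):
    response = await agent.generate_str("Fetch https://example.com")

    await session.assert_that(
        Expect.tools.was_called("fetch"),
        name="fetch_called",
    )

    # get_metrics()/get_results() are the session summaries mentioned above;
    # we only print them here, since their exact fields may vary.
    metrics = session.get_metrics()
    results = session.get_results()
    print(f"Metrics snapshot: {metrics}")
    print(f"Evaluations so far: {results}")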

Aggregating results

def analyze_test_results(results: list[TestResult]):
    """Analyze a batch of test results."""
    total = len(results)
    passed = sum(1 for r in results if r.passed)

    total_duration = sum(r.duration_ms for r in results)
    total_cost = sum(r.metrics.get('cost_usd', 0) for r in results)

    print("\n📊 Test Summary:")
    print(f"  Total tests: {total}")
    print(f"  Passed: {passed}/{total} ({passed/total*100:.1f}%)")
    print(f"  Total duration: {total_duration/1000:.2f}s")
    print(f"  Total cost: ${total_cost:.4f}")

    # Find slowest tests
    slowest = sorted(results, key=lambda r: r.duration_ms, reverse=True)[:3]
    print("\n🐢 Slowest tests:")
    for result in slowest:
        print(f"  {result.name}: {result.duration_ms:.0f}ms")

    # Find failed tests
    failed = [r for r in results if not r.passed]
    if failed:
        print("\n❌ Failed tests:")
        for result in failed:
            print(f"  {result.name}: {result.error}")

Advanced patterns

Conditional test execution

Prefer selecting tests with your runner and environment rather than custom decorators.
# Run a single test function (pytest-style selector supported by the runner for decorator tests)
mcp-eval run tests/test_fetch.py::test_fetch_case

# Run pytest tests (use pytest)
uv run pytest -q tests
Using pytest marks for conditions (when running under pytest):
import os

import pytest

from mcp_eval.core import task

@pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip on CI")
@task("Local-only behavior")
async def test_local_only(agent, session):
    response = await agent.generate_str("Do something local")
    # assertions...

@pytest.mark.slow
@task("Slow end-to-end scenario")
async def test_slow_scenario(agent, session):
    # long-running flow...
    ...
Then select by mark:
pytest -m "not slow" tests/ 

Test dependencies

Prefer independent tests; if ordering is required, orchestrate via your runner.
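Where two tests would otherwise share state, one independent alternative is to create the shared precondition in a module-level @setup hook, so neither test relies on the other having run first. A minimal sketch, reusing the setup/teardown pattern from above; the directory and file names are illustrative:

import tempfile

from mcp_eval.core import setup, teardown

shared_dir = None

@setup
def seed_shared_fixture():
    """Create the precondition every test needs, instead of letting one test produce it."""
    global shared_dir
    shared_dir = tempfile.mkdtemp(prefix="mcp_shared_")
    with open(f"{shared_dir}/input.txt", "w") as f:
        f.write("fixture data each test reads independently")

@teardown
def remove_shared_fixture():
    """Clean up so later runs also start from a known state."""
    import shutil
    if shared_dir:
        shutil.rmtree(shared_dir, ignore_errors=True)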

Custom test context

import time
from contextvars import ContextVar

test_context = ContextVar('test_context', default={})

@task("Test with context")
async def test_with_context(agent, session):
    # Set context for this test
    ctx = test_context.get().copy()
    ctx['test_id'] = session.test_id
    ctx['start_time'] = time.time()
    test_context.set(ctx)

    response = await agent.generate_str("Test prompt")

    # Context is available throughout the test
    duration = time.time() - ctx['start_time']
    print(f"Test {ctx['test_id']} took {duration:.2f}s")

Best practices

Name your tests clearly: Use descriptive names that explain what the test validates. This helps when reviewing test reports.
Avoid test interdependence: Each test should be independent and not rely on side effects from other tests; if ordering is unavoidable, orchestrate it via your runner (see Test dependencies above).
Use parametrize wisely: While parametrization is powerful, too many parameter combinations can make tests slow. Consider grouping related parameters.
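For example, rather than stacking three @parametrize decorators and running the full cross-product from the calculator example above, related values can be grouped into a single decorator so only the combinations that matter execute. The specific cases below are illustrative:

@with_agent("default")
@parametrize(
    "operation,x,y",
    [
        ("add", 1, 2),
        ("subtract", 100, 5),
        ("multiply", 10, 2),
    ],
)
@task("Grouped calculator cases")
async def test_calculator_grouped(agent, session, operation, x, y):
    # 3 targeted cases instead of the 18 a full cross-product would create
    response = await agent.generate_str(f"Use the calculator to {operation} {x} and {y}")

    await session.assert_that(
        Expect.tools.was_called(f"calculator_{operation}"),
        name=f"{operation}_tool_called",
    )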

Common patterns

Testing error handling

@with_agent("default") @task("Test error recovery") async def test_error_handling(agent):  # Trigger an error condition  response = await agent.generate_str("Divide 10 by 0")    # Verify graceful handling  await agent.assert_that(  Expect.content.regex(r"(error|cannot|undefined|infinity)"),  name="handles_division_by_zero"  )    # Verify no tool crashes  # Check tool success via success_rate, e.g., Expect.tools.success_rate(1.0) 

Testing multi-step workflows

@with_agent("default") @task("Test complete workflow") async def test_workflow(agent, session):  # Step 1: Authentication  auth_response = await agent.generate_str("Authenticate as test_user")  await session.assert_that(  Expect.tools.was_called("auth"),  name="authentication_attempted"  )    # Step 2: Fetch data  data_response = await agent.generate_str("Get my profile data")  await session.assert_that(  Expect.tools.was_called("fetch_profile"),  name="profile_fetched"  )    # Step 3: Process  process_response = await agent.generate_str("Summarize my activity")  await session.assert_that(  Expect.content.contains("summary"),  response=process_response,  name="summary_generated"  )    # Verify the complete sequence  await session.assert_that(  Expect.tools.sequence(["auth", "fetch_profile", "summarize"]),  name="correct_workflow_order"  ) 

See also