mcpmark/pipeline.py at main · eval-sys/mcpmark · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python3
"""
MCPMark Unified Evaluation Pipeline
===================================

This script provides an automated evaluation pipeline for testing Large Language Models (LLMs)
on various Multi-Step Cognitive Processes (MCP) services like Notion, GitHub, and PostgreSQL.
"""

import argparse
import sys
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv

from src.logger import get_logger
from src.evaluator import MCPEvaluator
from src.agents import AGENT_REGISTRY
from src.factory import MCPServiceFactory
from src.model_config import ModelConfig


# Suppress httpcore/anyio cleanup exceptions that don't affect functionality.
# These "Exception ignored" messages are caused by MCP library's streamablehttp_client
# timing issues during cleanup, but don't impact actual task execution.
def _suppress_cleanup_exceptions(unraisable):
    """Suppress known cleanup exceptions from httpcore/anyio."""
    msg = str(unraisable.exc_value)
    if any(
        pattern in msg
        for pattern in [
            "async generator ignored GeneratorExit",
            "cancel scope in a different task",
            "no running event loop",
        ]
    ):
        return  # Silently ignore
    # Use default handler for other exceptions
    sys.__unraisablehook__(unraisable)


sys.unraisablehook = _suppress_cleanup_exceptions

# Initialize logger
logger = get_logger(__name__)


def main():
    """Main entry point for the evaluation pipeline."""
    parser = argparse.ArgumentParser(description="MCPMark Unified Evaluation Pipeline.")

    supported_mcp_services = MCPServiceFactory.get_supported_mcp_services()
    supported_models = ModelConfig.get_supported_models()

    # Main configuration
    parser.add_argument(
        "--mcp",
        default="filesystem",
        choices=supported_mcp_services,
        help="MCP service to use (default: filesystem)",
    )
    parser.add_argument(
        "--models",
        required=True,
        help="Comma-separated list of models to evaluate (e.g., 'o3,k2,gpt-4.1')",
    )

    parser.add_argument(
        "--agent",
        default="mcpmark",
        choices=sorted(AGENT_REGISTRY.keys()),
        help="Agent implementation to use (default: mcpmark)",
    )
    parser.add_argument(
        "--tasks",
        default="all",
        help='Tasks to run: (1). "all"; (2). "category"; or (3). "category/task".',
    )
    parser.add_argument(
        "--task-suite",
        default="standard",
        choices=["standard", "easy"],
        help="Task suite to run (default: standard). Use 'easy' to run the lightweight dataset.",
    )
    parser.add_argument(
        "--exp-name",
        default=None,
        help="Experiment name; results are saved under results/<exp-name>/ (default: YYYY-MM-DD-HH-MM-SS)",
    )
    parser.add_argument(
        "--k",
        type=int,
        default=4,
        help="Number of evaluation runs (default: 1)",
    )

    # Execution configuration
    parser.add_argument(
        "--timeout",
        type=int,
        default=3600,
        help="Timeout in seconds for agent execution",
    )
    parser.add_argument(
        "--compaction-token",
        type=int,
        default=999_999_999,
        help=(
            "Auto-compact conversation when prompt tokens (from API usage) reach this limit. "
            "Use 999999999 to disable compaction."
        ),
    )
    parser.add_argument(
        "--reasoning-effort",
        default="default",
        choices=["default", "minimal", "low", "medium", "high"],
        help="Reasoning effort level for supported models (default: None)",
    )

    # Output configuration
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("./results"),
        help="Directory to save results",
    )

    # Load arguments and environment variables
    args = parser.parse_args()
    load_dotenv(dotenv_path=".mcp_env", override=False)

    # Validate k parameter and exp-name requirement
    if args.k > 1 and args.exp_name is None:
        parser.error("--exp-name is required when k > 1")

    # Generate default exp-name if not provided
    if args.exp_name is None:
        args.exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    # Parse models (no validation - allow unsupported models)
    model_list = [m.strip() for m in args.models.split(",") if m.strip()]
    if not model_list:
        parser.error("No valid models provided")

    # Log warning for unsupported models but don't error
    unsupported_models = [m for m in model_list if m not in supported_models]
    if unsupported_models:
        logger.warning(
            f"Using unsupported models: {', '.join(unsupported_models)}. Will use OPENAI_BASE_URL and OPENAI_API_KEY from environment."
        )

    logger.info("MCPMark Evaluation")
    logger.info(
        f"Experiment: {args.exp_name} | {len(model_list)} Model(s): {', '.join(model_list)}"
    )
    logger.info(f"Task suite: {args.task_suite}")
    if args.k > 1:
        logger.info(f"Running {args.k} evaluation runs for pass@k metrics")

    # Run k evaluation runs
    for run_idx in range(1, args.k + 1):
        if args.k > 1:
            logger.info(f"\n{'=' * 80}")
            logger.info(f"Starting Run {run_idx}/{args.k}")
            logger.info(f"{'=' * 80}\n")

            # For k-runs, results/{exp}/{mcp}__{model}/run-N
            run_exp_name = f"run-{run_idx}"
            run_output_dir = args.output_dir / args.exp_name
        else:
            # For single run, still use run-1 under service_model
            run_exp_name = "run-1"
            run_output_dir = args.output_dir / args.exp_name

        # Run evaluation for each model
        for i, model in enumerate(model_list, 1):
            logger.info(f"\n{'=' * 60}")
            if args.k > 1:
                logger.info(
                    f"Run {run_idx}/{args.k} | Model {i}/{len(model_list)}: {model}"
                )
            else:
                logger.info(f"Starting evaluation {i}/{len(model_list)}: {model}")
            logger.info(f"{'=' * 60}\n")

            # Initialize and run the evaluation pipeline for this model
            pipeline = MCPEvaluator(
                mcp_service=args.mcp,
                model=model,
                timeout=args.timeout,
                exp_name=run_exp_name,
                output_dir=run_output_dir,
                reasoning_effort=args.reasoning_effort,
                agent_name=args.agent,
                task_suite=args.task_suite,
                compaction_token=args.compaction_token,
            )

            pipeline.run_evaluation(args.tasks)
            logger.info(f"📁 Results: {pipeline.base_experiment_dir}")

    logger.info(f"\n{'=' * 60}")
    if args.k > 1:
        logger.info(f"✓ All {args.k} runs completed for {len(model_list)} model(s)")
        logger.info(
            f"Run `python -m src.aggregators.aggregate_results --exp-name {args.exp_name}` to compute all metrics"
        )
    else:
        logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)")
    logger.info(f"{'=' * 60}")


if __name__ == "__main__":
    main()