2 changes: 2 additions & 0 deletions main.py
@@ -5,6 +5,7 @@
import utils.calculate_average_score
import utils.calculate_score_from_log
import common_benchmark
import run_benchmark_with_monitor
import dotenv
import utils.eval_answer_from_log
import fire
@@ -35,6 +36,7 @@ def print_config(*args):
"print-config": print_config,
"trace": utils.trace_single_task.main,
"common-benchmark": common_benchmark.main,
"run-benchmark-with-monitor": run_benchmark_with_monitor.main,
"eval-answer": utils.eval_answer_from_log.main,
"avg-score": utils.calculate_average_score.main,
"score-from-log": utils.calculate_score_from_log.main,
75 changes: 75 additions & 0 deletions monitor_guide.md
@@ -0,0 +1,75 @@
# Web Monitoring Guide for Benchmark Evaluation

This document provides guidance for using the web monitoring dashboard while evaluating benchmarks with MiroFlow.

## Overview

The web monitoring system provides real-time progress tracking, statistics, and task reports through a web interface. It runs alongside the benchmark evaluation process.

## Architecture

```txt
run_benchmark_with_monitor.py (Wrapper)
├─> Process 1: common_benchmark.py (Executor)
│   └─> Executes tasks and generates log files
└─> Process 2: benchmark_monitor.py (Monitor)
    ├─> Reads log files and displays monitoring interface
    └─> Generates task reports via generate_benchmark_report.py
```
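
The monitor started as Process 2 is a standalone script; if you only want the dashboard for logs that already exist, it should be possible to launch it directly, using the same invocation the wrapper itself issues (the log path below is illustrative):

```bash
uv run utils/progress_check/benchmark_monitor.py \
logs/gaia-validation-claude37sonnet/20250101_0900 \
--web-port=8080
```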

## Features

- **Real-time Dashboard**: Monitor progress, statistics, and task status in real-time
- **Web Interface**: Access the dashboard at `http://localhost:8080` (or the next available port; see the port override example after this list)
- **Task Reports**: View detailed reports for individual tasks
- **Benchmark-Specific Metrics**: Tailored statistics for different benchmark types (GAIA, FutureX, FinSearchComp, xBench)
- **Auto-refresh**: Dashboard updates automatically every 30 seconds

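The dashboard port comes from the wrapper's `web_port` argument (default `8080`, defined in `run_benchmark_with_monitor.py`), so it can be moved off a busy port by appending the flag to any of the commands in the Usage Examples below, for example:

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_gaia-validation_claude37sonnet \
--output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" \
--web_port=9090
```
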
## Supported Benchmarks

`run_benchmark_with_monitor.py` currently supports the following benchmark evaluations:

- **GAIA Validation**
- **FutureX**
- **FinSearchComp**
- **xBench-DeepSearch**

## Usage Examples

#### GAIA Benchmark

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_gaia-validation_claude37sonnet \
--output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")"
```

#### FutureX Benchmark

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_quickstart_reading \
benchmark=futurex \
--output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")"
```

#### FinSearchComp Benchmark

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_finsearchcomp_claude37sonnet \
--output_dir="logs/finsearchcomp-claude37sonnet/$(date +"%Y%m%d_%H%M")"
```

#### xBench-DeepSearch Benchmark

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_xbench-ds_claude37sonnet \
benchmark=xbench-ds \
--output_dir="logs/xbench-ds/$(date +"%Y%m%d_%H%M")"
```

💡 To resume an interrupted evaluation, point `--output_dir` at the existing log directory from the previous run instead of creating a new timestamped one.
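
For instance, to resume the GAIA run from the first example, reuse its original directory (the timestamped path below is illustrative) rather than generating a new one:

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_gaia-validation_claude37sonnet \
--output_dir="logs/gaia-validation-claude37sonnet/20250101_0900"
```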

132 changes: 132 additions & 0 deletions run_benchmark_with_monitor.py
@@ -0,0 +1,132 @@
# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

import os
import subprocess
import signal
import sys
import time
from typing import Optional


def main(*args, config_file_name: str = "", output_dir: str = "", web_port: int = 8080):
"""Run benchmark with integrated web monitoring"""

# Validate required arguments
if not output_dir:
print("Error: output_dir is required")
print(
"Usage: uv run main.py run-benchmark-with-monitor --config_file_name=name --output_dir=path"
)
return 1

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

print("=" * 50)
print("Benchmark Runner with Monitor")
print("=" * 50)
print(f"Output directory: {output_dir}")
print(f"Config name: {config_file_name}")
print(f"Web port: {web_port}")
print("=" * 50)

# Process handles for the benchmark and monitor (captured by the nested cleanup/signal handlers)
benchmark_process: Optional[subprocess.Popen] = None
monitor_process: Optional[subprocess.Popen] = None

def cleanup_processes():
"""Clean up running processes"""
print("\nShutting down processes...")

if benchmark_process and benchmark_process.poll() is None:
print(f"Stopping benchmark (PID: {benchmark_process.pid})...")
benchmark_process.terminate()
try:
benchmark_process.wait(timeout=5)
except subprocess.TimeoutExpired:
benchmark_process.kill()

if monitor_process and monitor_process.poll() is None:
print(f"Stopping monitor (PID: {monitor_process.pid})...")
monitor_process.terminate()
try:
monitor_process.wait(timeout=5)
except subprocess.TimeoutExpired:
monitor_process.kill()

print("Cleanup complete.")

def signal_handler(signum, frame):
"""Handle Ctrl+C gracefully"""
cleanup_processes()
sys.exit(0)

# Set up signal handlers
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

try:
# Start benchmark
print("Starting benchmark...")
benchmark_cmd = [
"uv",
"run",
"main.py",
"common-benchmark",
f"--config_file_name={config_file_name}",
f"output_dir={output_dir}",
]
# Add any additional arguments (e.g., benchmark=futurex)
benchmark_cmd.extend(list(args))
benchmark_process = subprocess.Popen(benchmark_cmd)
print(f"Benchmark started with PID: {benchmark_process.pid}")

# Wait a moment for benchmark to initialize
time.sleep(3)

# Start monitor
print("Starting web monitor...")
monitor_cmd = [
"uv",
"run",
"utils/progress_check/benchmark_monitor.py",
output_dir,
f"--web-port={web_port}",
]
monitor_process = subprocess.Popen(monitor_cmd)
print(f"Monitor started with PID: {monitor_process.pid}")
print(f"Web dashboard available at: http://localhost:{web_port}")

print("\n" + "=" * 50)
print("Both processes are running!")
print("Press Ctrl+C to stop both processes")
print("Monitor will continue running even if benchmark finishes")
print("=" * 50)

# Monitor the processes
while True:
time.sleep(5)

# Check if benchmark process is still running
if benchmark_process and benchmark_process.poll() is not None:
print("Benchmark process ended")
benchmark_process = None

# Check if monitor process is still running
if monitor_process and monitor_process.poll() is not None:
print("Monitor process died unexpectedly. Restarting...")
monitor_process = subprocess.Popen(monitor_cmd)
print(f"Monitor restarted with PID: {monitor_process.pid}")

except KeyboardInterrupt:
cleanup_processes()

return 0


if __name__ == "__main__":
import fire

fire.Fire(main)