2 changes: 2 additions & 0 deletions main.py
@@ -5,6 +5,7 @@
import utils.calculate_average_score
import utils.calculate_score_from_log
import common_benchmark
import run_benchmark_with_monitor
import dotenv
import utils.eval_answer_from_log
import fire
@@ -35,6 +36,7 @@ def print_config(*args):
"print-config": print_config,
"trace": utils.trace_single_task.main,
"common-benchmark": common_benchmark.main,
"run-benchmark-with-monitor": run_benchmark_with_monitor.main,
"eval-answer": utils.eval_answer_from_log.main,
"avg-score": utils.calculate_average_score.main,
"score-from-log": utils.calculate_score_from_log.main,
75 changes: 75 additions & 0 deletions monitor_guide.md
@@ -0,0 +1,75 @@
# Web Monitoring Guide for Benchmark Evaluation

This document provides guidance for using the web monitoring dashboard while evaluating benchmarks with MiroFlow.

## Overview

The web monitoring system provides real-time progress tracking, statistics, and task reports through a web interface. It runs alongside the benchmark evaluation process.

## Architecture

```txt
run_benchmark_with_monitor.py (Wrapper)
├─> Process 1: common_benchmark.py (Executor)
│   └─> Executes tasks and generates log files
└─> Process 2: benchmark_monitor.py (Monitor)
    ├─> Reads log files and displays monitoring interface
    └─> Generates task reports via generate_benchmark_report.py
```
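
The monitor started as Process 2 is a standalone script; if you only want the dashboard for logs that already exist, it should be possible to launch it directly, using the same invocation the wrapper itself issues (the log path below is illustrative):

```bash
uv run utils/progress_check/benchmark_monitor.py \
logs/gaia-validation-claude37sonnet/20250101_0900 \
--web-port=8080
```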

## Features

- **Real-time Dashboard**: Monitor progress, statistics, and task status in real-time
- **Web Interface**: Access the dashboard at `http://localhost:8080` (or the next available port; see the port override example after this list)
- **Task Reports**: View detailed reports for individual tasks
- **Benchmark-Specific Metrics**: Tailored statistics for different benchmark types (GAIA, FutureX, FinSearchComp, xBench)
- **Auto-refresh**: Dashboard updates automatically every 30 seconds

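The dashboard port comes from the wrapper's `web_port` argument (default `8080`, defined in `run_benchmark_with_monitor.py`), so it can be moved off a busy port by appending the flag to any of the commands in the Usage Examples below, for example:

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_gaia-validation_claude37sonnet \
--output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" \
--web_port=9090
```
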
## Supported Benchmarks

`run_benchmark_with_monitor.py` currently supports the following benchmark evaluations:

- **GAIA Validation**
- **FutureX**
- **FinSearchComp**
- **xBench-DeepSearch**

## Usage Examples

#### GAIA Benchmark

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_gaia-validation_claude37sonnet \
--output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")"
```

#### FutureX Benchmark

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_quickstart_reading \
benchmark=futurex \
--output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")"
```

#### FinSearchComp Benchmark

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_finsearchcomp_claude37sonnet \
--output_dir="logs/finsearchcomp-claude37sonnet/$(date +"%Y%m%d_%H%M")"
```

#### xBench-DeepSearch Benchmark

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_xbench-ds_claude37sonnet \
benchmark=xbench-ds \
--output_dir="logs/xbench-ds/$(date +"%Y%m%d_%H%M")"
```

💡 To resume an interrupted evaluation, point `--output_dir` at the existing log directory from the previous run instead of creating a new timestamped one.
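
For instance, to resume the GAIA run from the first example, reuse its original directory (the timestamped path below is illustrative) rather than generating a new one:

```bash
uv run main.py run-benchmark-with-monitor \
--config_file_name=agent_gaia-validation_claude37sonnet \
--output_dir="logs/gaia-validation-claude37sonnet/20250101_0900"
```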

132 changes: 132 additions & 0 deletions run_benchmark_with_monitor.py
@@ -0,0 +1,132 @@
# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

import os
import subprocess
import signal
import sys
import time
from typing import Optional


def main(*args, config_file_name: str = "", output_dir: str = "", web_port: int = 8080):
"""Run benchmark with integrated web monitoring"""

# Validate required arguments
if not output_dir:
print("Error: output_dir is required")
print(
"Usage: uv run main.py run-benchmark-with-monitor --config_file_name=name --output_dir=path"
)
return 1

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

print("=" * 50)
print("Benchmark Runner with Monitor")
print("=" * 50)
print(f"Output directory: {output_dir}")
print(f"Config name: {config_file_name}")
print(f"Web port: {web_port}")
print("=" * 50)

# Process handles for the benchmark and monitor (captured by the nested cleanup/signal handlers)
benchmark_process: Optional[subprocess.Popen] = None
monitor_process: Optional[subprocess.Popen] = None

def cleanup_processes():
"""Clean up running processes"""
print("\nShutting down processes...")

if benchmark_process and benchmark_process.poll() is None:
print(f"Stopping benchmark (PID: {benchmark_process.pid})...")
benchmark_process.terminate()
try:
benchmark_process.wait(timeout=5)
except subprocess.TimeoutExpired:
benchmark_process.kill()

if monitor_process and monitor_process.poll() is None:
print(f"Stopping monitor (PID: {monitor_process.pid})...")
monitor_process.terminate()
try:
monitor_process.wait(timeout=5)
except subprocess.TimeoutExpired:
monitor_process.kill()

print("Cleanup complete.")

def signal_handler(signum, frame):
"""Handle Ctrl+C gracefully"""
cleanup_processes()
sys.exit(0)

# Set up signal handlers
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

try:
# Start benchmark
print("Starting benchmark...")
benchmark_cmd = [
"uv",
"run",
"main.py",
"common-benchmark",
f"--config_file_name={config_file_name}",
f"output_dir={output_dir}",
]
# Add any additional arguments (e.g., benchmark=futurex)
benchmark_cmd.extend(list(args))
benchmark_process = subprocess.Popen(benchmark_cmd)
print(f"Benchmark started with PID: {benchmark_process.pid}")

# Wait a moment for benchmark to initialize
time.sleep(3)

# Start monitor
print("Starting web monitor...")
monitor_cmd = [
"uv",
"run",
"utils/progress_check/benchmark_monitor.py",
output_dir,
f"--web-port={web_port}",
]
monitor_process = subprocess.Popen(monitor_cmd)
print(f"Monitor started with PID: {monitor_process.pid}")
print(f"Web dashboard available at: http://localhost:{web_port}")

print("\n" + "=" * 50)
print("Both processes are running!")
print("Press Ctrl+C to stop both processes")
print("Monitor will continue running even if benchmark finishes")
print("=" * 50)

# Monitor the processes
while True:
time.sleep(5)

# Check if benchmark process is still running
if benchmark_process and benchmark_process.poll() is not None:
print("Benchmark process ended")
benchmark_process = None

# Check if monitor process is still running
if monitor_process and monitor_process.poll() is not None:
print("Monitor process died unexpectedly. Restarting...")
monitor_process = subprocess.Popen(monitor_cmd)
print(f"Monitor restarted with PID: {monitor_process.pid}")

except KeyboardInterrupt:
cleanup_processes()

return 0


if __name__ == "__main__":
import fire

fire.Fire(main)