from abc import ABC, abstractmethod
from typing import Any, Dict, List
import logging
logger = logging.getLogger(__name__)
class Evaluator(ABC):
    """Abstract base class for evaluators.

    A concrete evaluator scores one sample at a time with ``evaluate`` and
    combines the per-sample scores with ``aggregate``.
    """

    @abstractmethod
    def evaluate(self, prediction: Any, ground_truth: Any) -> Dict[str, float]:
        """Evaluate a single sample.

        Args:
            prediction: The output being scored.
            ground_truth: The reference the prediction is scored against.

        Returns:
            A mapping from metric name to the score for this sample.
        """

    # The return type is ``Dict[str, float]`` (not a dedicated metrics class):
    # no such class is imported in this module, and an unresolved name in an
    # annotation raises NameError as soon as the class body is executed.
    @abstractmethod
    def aggregate(self, results: List[Dict[str, float]]) -> Dict[str, float]:
        """Aggregate the evaluation results of multiple samples.

        Args:
            results: Per-sample metric mappings, one per evaluated sample.

        Returns:
            A mapping from aggregate metric name to its value.
        """
class StepLevelEvaluator(Evaluator):
    """Step-level evaluator: scores individual tool calls."""

    # ``ToolCall`` is not imported in this module, so it is referenced through
    # string (forward-reference) annotations; a bare name would raise
    # NameError when the method is defined.

    def evaluate(
        self,
        predicted_call: "ToolCall",
        ground_truth_call: "ToolCall",
    ) -> Dict[str, float]:
        """Evaluate a single tool call against its reference call.

        Args:
            predicted_call: The call that was made; its ``tool_name``,
                ``args`` and ``success`` attributes are read.
            ground_truth_call: The reference call; its ``tool_name`` and
                ``args`` attributes are read.

        Returns:
            A dict with three 0.0/1.0 flags:

            * ``tool_correct`` - the tool names are equal.
            * ``params_correct`` - the ``args`` values are equal (compared
              independently of whether the tool name matched).
            * ``execution_success`` - ``predicted_call.success`` is truthy.
        """
        same_tool = predicted_call.tool_name == ground_truth_call.tool_name
        same_args = predicted_call.args == ground_truth_call.args
        return {
            "tool_correct": 1.0 if same_tool else 0.0,
            "params_correct": 1.0 if same_args else 0.0,
            "execution_success": 1.0 if predicted_call.success else 0.0,
        }

    def aggregate(self, results: List[Dict[str, float]]) -> Dict[str, float]:
        """Average the per-call flags produced by ``evaluate``.

        Args:
            results: Per-call dicts with the keys ``tool_correct``,
                ``params_correct`` and ``execution_success``.

        Returns:
            A dict with ``tool_accuracy``, ``param_accuracy`` and
            ``success_rate``, each the mean of the corresponding flag. All
            three are 0.0 when ``results`` is empty.
        """
        # Output metric name -> per-sample key it is averaged from.
        sources = {
            "tool_accuracy": "tool_correct",
            "param_accuracy": "params_correct",
            "success_rate": "execution_success",
        }
        if not results:
            # Floats (not ints) so the declared Dict[str, float] holds.
            return {metric: 0.0 for metric in sources}
        total = len(results)
        return {
            metric: sum(r[key] for r in results) / total
            for metric, key in sources.items()
        }
class TrajectoryLevelEvaluator(Evaluator):
    """Trajectory-level evaluator: scores a whole sequence of tool calls."""

    # ``ToolCall`` is not imported in this module, so it is referenced through
    # string (forward-reference) annotations; a bare name would raise
    # NameError when the method is defined.

    def evaluate(
        self,
        trajectory: List["ToolCall"],
        optimal_trajectory: List["ToolCall"],
    ) -> Dict[str, float]:
        """Evaluate a tool-call sequence against an optimal sequence.

        Args:
            trajectory: The calls that were actually made, in order; each
                call's ``success`` and ``tool_name`` attributes are read.
            optimal_trajectory: The reference sequence; only its length is
                used.

        Returns:
            A dict with:

            * ``efficiency`` - ``len(optimal_trajectory) / len(trajectory)``.
            * ``recovery_rate`` - failed calls that are immediately followed
              by a successful call, divided by the number of failed calls;
              0.0 when no call failed.
            * ``duplicate_rate`` - adjacent pairs of calls sharing the same
              ``tool_name``, divided by ``len(trajectory)``.

            All three are 0.0 for an empty ``trajectory``.
        """
        actual_len = len(trajectory)
        optimal_len = len(optimal_trajectory)
        if actual_len == 0:
            return {"efficiency": 0.0, "recovery_rate": 0.0, "duplicate_rate": 0.0}

        # Each call paired with its immediate successor.
        adjacent = list(zip(trajectory, trajectory[1:]))

        # NOTE(review): a failure in the last position counts as an error but
        # can never count as recovered, so a trailing failure lowers the rate.
        errors = sum(1 for call in trajectory if not call.success)
        recovered = sum(
            1 for current, following in adjacent
            if not current.success and following.success
        )

        # Only the tool name is compared; arguments are not considered.
        duplicates = sum(
            1 for current, following in adjacent
            if current.tool_name == following.tool_name
        )

        return {
            # NOTE(review): exceeds 1.0 when the trajectory is shorter than
            # the optimal one - confirm whether that should be capped.
            "efficiency": optimal_len / actual_len,
            "recovery_rate": recovered / errors if errors > 0 else 0.0,
            "duplicate_rate": duplicates / actual_len,
        }

    def aggregate(self, results: List[Dict[str, float]]) -> Dict[str, float]:
        """Average the per-trajectory metrics produced by ``evaluate``.

        Args:
            results: Per-trajectory dicts with the keys ``efficiency``,
                ``recovery_rate`` and ``duplicate_rate``.

        Returns:
            A dict with ``avg_efficiency``, ``avg_recovery_rate`` and
            ``avg_duplicate_rate``; all 0.0 when ``results`` is empty.
        """
        # Output metric name -> per-sample key it is averaged from.
        sources = {
            "avg_efficiency": "efficiency",
            "avg_recovery_rate": "recovery_rate",
            "avg_duplicate_rate": "duplicate_rate",
        }
        if not results:
            return {metric: 0.0 for metric in sources}
        count = len(results)
        return {
            metric: sum(r[key] for r in results) / count
            for metric, key in sources.items()
        }
class TaskLevelEvaluator(Evaluator):
    """Task-level evaluator: scores the outcome of a whole task."""

    # Price used to turn a token count into a cost. The aggregate key
    # ``avg_cost_usd`` labels the result as USD. Subclasses may override it.
    COST_PER_1K_TOKENS_USD: float = 0.001

    # ``TaskResult`` is not imported in this module, so it is referenced
    # through a string (forward-reference) annotation; a bare name would raise
    # NameError when the method is defined.

    def evaluate(self, result: "TaskResult", ground_truth: Any = None) -> Dict[str, float]:
        """Evaluate a single task result.

        Args:
            result: The task outcome; its ``success``, ``start_time``,
                ``end_time`` and ``tokens_used`` attributes are read.
            ground_truth: Unused. Accepted (and optional) only so the method
                can also be called through the two-argument
                ``Evaluator.evaluate`` interface.

        Returns:
            A dict with:

            * ``success`` - 1.0 if ``result.success`` is truthy, else 0.0.
            * ``time_seconds`` - ``(end_time - start_time).total_seconds()``.
            * ``tokens`` - ``tokens_used`` as a float.
            * ``cost`` - ``tokens_used`` priced at ``COST_PER_1K_TOKENS_USD``
              per 1000 tokens.
        """
        elapsed = result.end_time - result.start_time
        return {
            "success": 1.0 if result.success else 0.0,
            "time_seconds": elapsed.total_seconds(),
            "tokens": float(result.tokens_used),
            "cost": result.tokens_used * self.COST_PER_1K_TOKENS_USD / 1000,
        }

    def aggregate(self, results: List[Dict[str, float]]) -> Dict[str, float]:
        """Average the per-task metrics produced by ``evaluate``.

        Args:
            results: Per-task dicts with the keys ``success``,
                ``time_seconds``, ``tokens`` and ``cost``.

        Returns:
            A dict with ``success_rate``, ``avg_time_seconds``,
            ``avg_tokens`` and ``avg_cost_usd``. The same four keys are
            returned (all 0.0) when ``results`` is empty, so callers can read
            them without checking for the empty case.
        """
        # Output metric name -> per-sample key it is averaged from.
        sources = {
            "success_rate": "success",
            "avg_time_seconds": "time_seconds",
            "avg_tokens": "tokens",
            "avg_cost_usd": "cost",
        }
        if not results:
            return {metric: 0.0 for metric in sources}
        count = len(results)
        return {
            metric: sum(r[key] for r in results) / count
            for metric, key in sources.items()
        }