from typing import List, Dict, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
from enum import Enum
import json

import numpy as np

class EvaluationStatus(Enum):
    """Lifecycle status of an evaluation run."""
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"

@dataclass
class EvaluationConfig:
    """Configuration for a single evaluation run."""
    prompt_version: str
    test_set_path: str
    metrics: List[str]
    min_pass_rate: float = 0.8
    min_quality_score: float = 7.0   # metric scores are on a 0-10 scale
    model: str = "gpt-4"
    temperature: float = 0.7
    max_tokens: int = 1024           # read by _batch_generate; default is an assumption
    timeout_seconds: int = 300

class PromptEvaluationPipeline:
    """End-to-end evaluation pipeline for prompt versions."""

    def __init__(self, llm_client, storage_client):
        self.llm = llm_client
        self.storage = storage_client
        self.history = []

    def run_evaluation(self, config: EvaluationConfig) -> Dict:
        """Run a full evaluation and return the aggregated report."""
        # 1. Load the test set
        test_cases = self._load_test_set(config.test_set_path)
        # 2. Load the prompt template
        prompt_template = self._load_prompt(config.prompt_version)
        # 3. Generate model outputs for every test case
        outputs = self._batch_generate(prompt_template, test_cases, config)
        # 4. Score the outputs against the configured metrics
        metric_results = self._evaluate_outputs(
            outputs,
            test_cases,
            config.metrics
        )
        # 5. Aggregate per-metric statistics and the overall pass rate
        report = self._aggregate_results(metric_results, config)
        # 6. Persist the report
        self._save_results(config.prompt_version, report)
        return report
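
    # _load_test_set and _load_prompt are called above but not shown in the
    # listing. Minimal sketches follow, assuming the test set is a JSON list
    # of dicts and that the storage client exposes a load() counterpart to
    # save(), with prompt templates stored under "prompts/<version>.txt".
    def _load_test_set(self, path: str) -> List[Dict]:
        """Load test cases from a JSON file on local disk."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _load_prompt(self, version: str) -> str:
        """Fetch the prompt template for a given version from storage."""
        return self.storage.load(f"prompts/{version}.txt")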

    def _batch_generate(self, prompt_template: str, test_cases: List,
                        config: EvaluationConfig) -> List[Optional[str]]:
        """Generate one model output per test case; None marks a failed case."""
        outputs = []
        for case in test_cases:
            try:
                # Fill the template with this test case's fields
                formatted_prompt = prompt_template.format(**case)
                # Call the model
                response = self.llm.generate(
                    prompt=formatted_prompt,
                    model=config.model,
                    temperature=config.temperature,
                    max_tokens=config.max_tokens
                )
                outputs.append(response)
            except Exception as e:
                print(f"Error processing case {case.get('id')}: {e}")
                outputs.append(None)
        return outputs

    def _evaluate_outputs(self, outputs: List[Optional[str]], test_cases: List,
                          metrics: List[str]) -> Dict:
        """Score each output against the requested metrics."""
        results = {metric: [] for metric in metrics}
        for output, case in zip(outputs, test_cases):
            if output is None:
                continue  # generation failed; skip scoring
            for metric in metrics:
                if metric == "accuracy":
                    score = self._metric_accuracy(output, case["expected"])
                elif metric == "relevance":
                    score = self._metric_relevance(output, case["input"])
                elif metric == "consistency":
                    score = self._metric_consistency(output)
                elif metric == "safety":
                    score = self._metric_safety(output)
                else:
                    score = 0.0  # unknown metric names score zero
                results[metric].append(score)
        return results

    def _aggregate_results(self, metric_results: Dict,
                           config: EvaluationConfig) -> Dict:
        """Aggregate per-metric statistics and decide pass/fail."""
        aggregated = {}
        for metric, scores in metric_results.items():
            if not scores:
                continue  # every generation for this metric failed
            # Cast to float so the report stays JSON-serializable
            aggregated[metric] = {
                "mean": float(np.mean(scores)),
                "std": float(np.std(scores)),
                "min": float(np.min(scores)),
                "max": float(np.max(scores))
            }
        # Pass rate: the fraction of all individual scores, across every
        # metric, that reach the minimum quality bar
        all_scores = [s for scores in metric_results.values() for s in scores]
        pass_rate = float(np.mean([
            1 if s >= config.min_quality_score else 0
            for s in all_scores
        ])) if all_scores else 0.0
        return {
            "config": asdict(config),
            "metrics": aggregated,
            "pass_rate": pass_rate,
            "passed": pass_rate >= config.min_pass_rate,
            "timestamp": self._get_timestamp()
        }

    def _save_results(self, version: str, report: Dict):
        """Persist the report and append it to the in-memory history."""
        self.storage.save(
            f"evaluations/{version}/{self._get_timestamp()}.json",
            json.dumps(report, indent=2)
        )
        self.history.append(report)

    # Metric implementations. These are deliberately simple placeholders on a
    # 0-10 scale so the pipeline runs end to end; swap in real scorers
    # (exact evaluators, LLM judges, classifiers) for production use.
    def _metric_accuracy(self, output: str, expected: str) -> float:
        """Accuracy: exact match after whitespace/case normalization."""
        return 10.0 if output.strip().lower() == expected.strip().lower() else 0.0

    def _metric_relevance(self, output: str, input_text: str) -> float:
        """Relevance: fraction of input words that reappear in the output."""
        input_words = set(input_text.lower().split())
        if not input_words:
            return 0.0
        output_words = set(output.lower().split())
        return 10.0 * len(input_words & output_words) / len(input_words)

    def _metric_consistency(self, output: str) -> float:
        """Consistency: placeholder that accepts any non-empty output."""
        return 10.0 if output.strip() else 0.0

    def _metric_safety(self, output: str) -> float:
        """Safety: naive keyword screen; a real implementation would call a
        moderation model or classifier."""
        blocklist = {"exploit", "malware"}  # illustrative only
        lowered = output.lower()
        return 0.0 if any(term in lowered for term in blocklist) else 10.0

    def _get_timestamp(self) -> str:
        return datetime.now().isoformat()
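
# Each test case is assumed to be a dict whose keys match the placeholders in
# the prompt template, plus the "id", "input", and "expected" fields the
# pipeline reads. A hypothetical test_cases.json entry:
#
#   {"id": "tc-001", "input": "Summarize the ticket ...",
#    "expected": "The customer reports ..."}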

# Usage example
config = EvaluationConfig(
    prompt_version="v2.3",
    test_set_path="test_cases.json",
    metrics=["accuracy", "relevance", "consistency", "safety"],
    min_pass_rate=0.85,
    model="gpt-4"
)
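
# llm_client and storage_client are user-supplied objects. Hypothetical stubs
# that satisfy the interface the pipeline calls (generate / save / load):
class DummyLLMClient:
    def generate(self, prompt: str, model: str, temperature: float,
                 max_tokens: int) -> str:
        return "stub output"  # replace with a real model API call


class DummyStorageClient:
    def save(self, path: str, content: str) -> None:
        print(f"saved {path}")

    def load(self, path: str) -> str:
        return "Answer the question: {input}"  # placeholder prompt template


llm_client = DummyLLMClient()
storage_client = DummyStorageClient()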

# Instantiate the pipeline
pipeline = PromptEvaluationPipeline(llm_client, storage_client)

# Run the evaluation
report = pipeline.run_evaluation(config)

# Inspect the result
if report["passed"]:
    print("✓ Evaluation passed; safe to release")
else:
    print("✗ Evaluation failed; needs improvement")
    for metric, stats in report["metrics"].items():
        print(f"{metric}: {stats['mean']:.2f} ± {stats['std']:.2f}")