> For the complete documentation index, see [llms.txt](https://yeasy.gitbook.io/harness_engineering_guide/llms.txt). Markdown versions of documentation pages are available by appending `.md` to page URLs; this page is available as [Markdown](https://yeasy.gitbook.io/harness_engineering_guide/di-si-bu-fen-an-quan-ping-gu-yu-yan-jin/13_evaluation/13.2_e2e_testing.md).

# 13.2 端到端测试策略

本节讨论端到端测试的多种策略，包括Mock测试与真实测试的权衡、测试夹具设计、回归基线管理以及完整的E2E测试套件构建。

## 13.2.1 测试分类

端到端(E2E)测试涉及多种不同的测试策略，需要根据目的选择：

| 维度     | 分类                   |
| ------ | -------------------- |
| 模型调用方式 | Mock LLM vs 真实LLM    |
| 工具调用方式 | Mock Tools vs 真实工具   |
| 数据来源   | 合成数据 vs 真实数据 vs 众包数据 |
| 运行环境   | 离线 vs 实时在线           |
| 评估方式   | 自动化评估 vs 人工评估        |

## 13.2.2 Mock测试 vs 真实测试

### Mock 测试

Mock测试的实现代码示例：

```python
from unittest.mock import Mock, patch, MagicMock
import asyncio

class MockLLMResponse:
    """Canned LLM response that carries only the tool calls to execute."""

    def __init__(self, tool_calls: list) -> None:
        # Stored as-is (no copy): the caller's list object is what gets exposed.
        self.tool_calls = tool_calls

class E2ETestWithMocks:
    """End-to-end test that runs the agent against a scripted LLM and stub tools."""

    def setup_mock_llm(self):
        """Create ``self.mock_llm`` whose ``generate`` replays a three-step plan."""
        self.mock_llm = Mock()

        # side_effect hands out one scripted response per generate() call, in
        # order: read the file, analyze its text, write the result.
        # NOTE(review): only three responses are scripted, so a fourth
        # generate() call would raise StopIteration — confirm the Agent stops
        # after the last scripted tool call.
        self.mock_llm.generate.side_effect = [
            MockLLMResponse([
                {"tool_name": "read_file", "args": {"path": "data.txt"}}
            ]),
            MockLLMResponse([
                {"tool_name": "analyze_text", "args": {"text": "file content"}}
            ]),
            MockLLMResponse([
                {"tool_name": "write_file", "args": {"path": "result.txt", "content": "analysis"}}
            ])
        ]

    def setup_mock_tools(self):
        """Create ``self.mock_tools`` and a recorder that logs cross-tool call order."""
        self.mock_tools = {
            "read_file": Mock(return_value="file content"),
            "analyze_text": Mock(return_value="analysis result"),
            "write_file": Mock(return_value=True),
        }

        # Each tool mock only knows about its own calls. Attaching them all to
        # one parent makes the parent's mock_calls list record the calls of
        # every tool in the order they actually happened.
        self.tool_recorder = Mock()
        for tool_name, tool in self.mock_tools.items():
            self.tool_recorder.attach_mock(tool, tool_name)

    async def test_file_analysis_workflow(self):
        """Run the file-analysis task and check that every step ran, in order."""

        self.setup_mock_llm()
        self.setup_mock_tools()

        # Build the agent entirely from mocks
        agent = Agent(
            llm=self.mock_llm,
            tools=self.mock_tools
        )

        # Run the task
        result = await agent.run("分析data.txt文件并保存结果到result.txt")

        # Every tool in the scripted plan must have been used
        assert result.success
        assert self.mock_tools["read_file"].called
        assert self.mock_tools["analyze_text"].called
        assert self.mock_tools["write_file"].called

        # Verify the call order: each recorded entry is (name, args, kwargs)
        executed = [call[0] for call in self.tool_recorder.mock_calls]
        assert executed == ["read_file", "analyze_text", "write_file"]

        # One LLM round-trip per scripted step
        assert self.mock_llm.generate.call_count == 3
```

**Mock测试的优点**：

* 快速(<100ms)
* 确定性（相同输入→相同输出）
* 易于隔离问题

**缺点**：

* 无法测试LLM实际行为
* Mock可能不够真实
* 无法覆盖未预设的工具调用序列（例如新的工具组合）

### 真实测试

使用真实LLM的测试实现代码：

```python
import os
from anthropic import Anthropic

class E2ETestWithRealLLM:
    """End-to-end test that lets a real LLM drive real file tools.

    The test asks the model to read a sample file and write an analysis to
    ``result.txt``, executing every tool call the model requests and feeding
    the results back until the model stops asking for tools.
    """

    def __init__(self):
        # The API key is read from the environment (None when it is unset).
        self.client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
        self.test_data_dir = "/tmp/e2e_test_data"
        os.makedirs(self.test_data_dir, exist_ok=True)

    @staticmethod
    def _execute_tool(tool_name, tool_input):
        """Run one tool call and return ``(result_text, is_error)``.

        Failures are reported back as text instead of raised, so the model
        gets a chance to react to them within the conversation.
        """
        try:
            if tool_name == "read_file":
                with open(tool_input["path"], "r", encoding="utf-8") as f:
                    return f.read(), False
            if tool_name == "write_file":
                with open(tool_input["path"], "w", encoding="utf-8") as f:
                    f.write(tool_input["content"])
                return "文件已写入", False
        except (OSError, KeyError) as exc:
            return f"工具执行失败: {exc}", True
        return "未知工具", True

    async def test_real_file_analysis(self):
        """Run the read/analyze/write task against the real LLM.

        Returns:
            A dict with ``success``, the number of ``tool_calls`` executed,
            the number of LLM ``iterations`` and the ``result`` file content.

        Raises:
            AssertionError: if the result file is missing or empty, or fewer
                than two tool calls were made.
        """

        # Prepare the input file
        test_file = os.path.join(self.test_data_dir, "sample.txt")
        with open(test_file, "w", encoding="utf-8") as f:
            f.write("This is a sample document for testing.\nIt contains multiple lines.")

        # The data directory is shared between runs: drop any result left by
        # an earlier run so the existence check below cannot pass by accident.
        result_file = os.path.join(self.test_data_dir, "result.txt")
        if os.path.exists(result_file):
            os.remove(result_file)

        # Tool schemas advertised to the model
        tools = [
            {
                "name": "read_file",
                "description": "读取文件内容",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string", "description": "文件路径"}
                    },
                    "required": ["path"]
                }
            },
            {
                "name": "write_file",
                "description": "写入文件",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "content": {"type": "string"}
                    },
                    "required": ["path", "content"]
                }
            }
        ]

        # System prompt
        system_prompt = """你是一个文件分析助手。
        可用工具：read_file(读取文件)、write_file(写入文件)
        任务：读取文件并分析其内容,然后将分析结果写入输出文件。
        """

        # Initial user message
        messages = [
            {"role": "user", "content": f"请分析{test_file}的内容,并将结果写入 {result_file}"}
        ]

        # Conversation loop, capped so a confused model cannot run forever
        max_iterations = 5
        iteration = 0
        tool_calls_made = []

        while iteration < max_iterations:
            iteration += 1

            # Call the real LLM
            response = self.client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=1024,
                system=system_prompt,
                tools=tools,
                messages=messages
            )

            if response.stop_reason != "tool_use":
                # The model finished without asking for another tool
                break

            tool_use_blocks = [
                block for block in response.content if block.type == "tool_use"
            ]
            if not tool_use_blocks:
                # Nothing to execute; re-sending the same messages would only
                # repeat the identical request.
                break

            # A single response may request several tools. Each tool_use block
            # needs its own tool_result in the follow-up user message.
            tool_results = []
            for block in tool_use_blocks:
                tool_calls_made.append({
                    "tool": block.name,
                    "input": block.input
                })
                result_text, is_error = self._execute_tool(block.name, block.input)
                tool_result = {
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": result_text
                }
                if is_error:
                    tool_result["is_error"] = True
                tool_results.append(tool_result)

            # Continue the conversation with the tool outputs
            messages.append({"role": "assistant", "content": response.content})
            messages.append({"role": "user", "content": tool_results})

        # Verify the outcome
        assert os.path.exists(result_file), "结果文件未创建"

        with open(result_file, "r", encoding="utf-8") as f:
            result_content = f.read()

        assert len(result_content) > 0, "结果文件为空"
        assert len(tool_calls_made) >= 2, f"工具调用不足,只有{len(tool_calls_made)}次"

        return {
            "success": True,
            "tool_calls": len(tool_calls_made),
            "iterations": iteration,
            "result": result_content
        }
```

**真实测试的优点**：

* 真实性强
* 能发现Mock无法发现的问题
* 能评估LLM在新任务上的表现

**缺点**：

* 慢（每个测试5-30秒）
* API费用（真实调用）
* 不确定性（LLM输出变化）

## 13.2.3 测试夹具设计

测试夹具(Test Fixtures)是测试的基础设施，包括测试数据、环境配置等。

```python
import pytest
import tempfile
import json
from pathlib import Path

class TestFixture:
    """Shared pytest fixtures for the end-to-end tests.

    Fixtures defined on a class are only visible to tests in that class and
    its subclasses, so test classes must inherit from this one (or the
    fixtures must be moved to module level, e.g. into ``conftest.py``).
    """

    @pytest.fixture(scope="session")
    def test_data_dir(self):
        """Yield a temporary directory that is removed when the fixture is torn down."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield tmpdir

    @pytest.fixture
    def sample_files(self, test_data_dir):
        """Write sample text/JSON/CSV files and return their paths keyed by kind."""
        data_dir = Path(test_data_dir)
        contents = {
            "text": ("sample.txt", "Sample text content for testing"),
            "json": ("data.json", json.dumps({"key": "value", "number": 42})),
            "csv": ("data.csv", "name,age,city\nAlice,30,NYC\nBob,25,LA"),
        }

        files = {}
        for kind, (filename, body) in contents.items():
            path = data_dir / filename
            # Explicit encoding keeps the files identical across platforms
            path.write_text(body, encoding="utf-8")
            files[kind] = str(path)

        return files

    @pytest.fixture
    def mock_tools(self):
        """Return a dict of stub tools, one ``Mock`` per tool name."""
        from unittest.mock import Mock

        return {
            "read_file": Mock(return_value="mocked file content"),
            "analyze_text": Mock(return_value="analysis result"),
            # write_file is listed in agent_config["tools_enabled"], so the
            # mocked tool set has to provide it as well.
            "write_file": Mock(return_value=True),
            "list_files": Mock(return_value=["file1.txt", "file2.txt"]),
        }

    @pytest.fixture
    def agent_config(self):
        """Return the agent configuration dict used by the tests."""
        return {
            "model": "claude-sonnet-4-6",
            "max_iterations": 10,
            "timeout": 30,
            "tools_enabled": ["read_file", "analyze_text", "write_file"]
        }
```

## 13.2.4 回归测试基线

当系统演进时，需要维护一个基线来检测性能退化。

```python
class RegressionTestBaseline:
    """Persist baseline metrics to a JSON file and flag regressions against them."""

    # Name fragments used to infer a metric's direction. The error fragments
    # are checked first so that e.g. "error_rate" is not misread as a
    # higher-is-better "rate".
    _ERROR_HINTS = ("error", "fail")
    _HIGHER_IS_BETTER_HINTS = ("accuracy", "success", "rate")
    _LOWER_IS_BETTER_HINTS = ("time", "cost", "latency", "token")

    def __init__(self, baseline_file: str = "test_baseline.json"):
        self.baseline_file = baseline_file
        self.baseline = self._load_baseline()

    def _load_baseline(self) -> dict:
        """Load the baseline from disk; a missing file yields an empty baseline."""
        try:
            with open(self.baseline_file, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def save_baseline(self, metrics: dict):
        """Write ``metrics`` to the baseline file and make them the active baseline."""
        with open(self.baseline_file, "w", encoding="utf-8") as f:
            json.dump(metrics, f, indent=2)
        # Keep the in-memory baseline in sync, otherwise check_regression()
        # would keep comparing against whatever was loaded at construction.
        self.baseline = dict(metrics)
        print(f"基线已保存: {self.baseline_file}")

    @classmethod
    def _higher_is_better(cls, metric_name: str):
        """Return True/False for the metric's direction, or None if it is unknown."""
        def mentions(hints):
            return any(hint in metric_name for hint in hints)

        if mentions(cls._ERROR_HINTS):
            return False
        if mentions(cls._HIGHER_IS_BETTER_HINTS):
            return True
        if mentions(cls._LOWER_IS_BETTER_HINTS):
            return False
        return None

    def check_regression(self, current_metrics: dict, threshold: float = 0.05) -> list:
        """
        Compare current metrics with the baseline and collect regressions.

        Metrics missing from ``current_metrics`` and metrics whose direction
        cannot be inferred from their name are skipped.

        Args:
            current_metrics: metric name -> current value
            threshold: tolerated relative degradation (0.05 means 5%)

        Returns:
            List of ``(metric_name, baseline_value, current_value,
            degradation_percent)`` tuples, in baseline order. The percentage
            is ``inf`` when the baseline value is 0.
        """
        regressions = []

        for metric_name, baseline_value in self.baseline.items():
            if metric_name not in current_metrics:
                continue

            higher_is_better = self._higher_is_better(metric_name)
            if higher_is_better is None:
                continue

            current_value = current_metrics[metric_name]

            if higher_is_better:
                regressed = current_value < baseline_value * (1 - threshold)
                change = baseline_value - current_value
            else:
                regressed = current_value > baseline_value * (1 + threshold)
                change = current_value - baseline_value

            if not regressed:
                continue

            # A zero baseline has no meaningful relative change; report it as
            # infinite instead of dividing by zero.
            degradation = change / baseline_value if baseline_value else float("inf")
            regressions.append((
                metric_name,
                baseline_value,
                current_value,
                degradation * 100
            ))

        return regressions

# Usage example
baseline_mgr = RegressionTestBaseline()

# First run: record the baseline
baseline_mgr.save_baseline({
    "success_rate": 0.95,
    "avg_tokens": 250,
    "avg_time_sec": 5.2
})

# Later run: load the stored baseline afresh, as a separate test run would
baseline_mgr = RegressionTestBaseline()
current = {
    "success_rate": 0.92,  # dropped ~3.2%, still within the default 5% tolerance
    "avg_tokens": 280,     # up 12% from the baseline
    "avg_time_sec": 4.8    # lower than the baseline (an improvement)
}

regressions = baseline_mgr.check_regression(current)
if regressions:
    print("⚠ 检测到性能退化：")
    # Distinct loop names keep the `current` metrics dict from being rebound.
    for metric, baseline_value, current_value, degradation in regressions:
        # "退化" covers both directions: a drop for higher-is-better metrics
        # and an increase for lower-is-better ones.
        print(f"  {metric}: {baseline_value} → {current_value} (退化{degradation:.1f}%)")
```

## 13.2.5 完整的E2E测试套件

下面是一个E2E测试套件的骨架，列出了需要覆盖的场景，各用例的具体实现留空：

```python
# tests/test_e2e.py
import pytest
import asyncio
from pathlib import Path

class TestE2EWorkflows:
    """Skeleton of the end-to-end workflow suite; every case is a placeholder.

    NOTE(review): the ``sample_files`` and ``mock_tools`` fixtures are assumed
    to be discoverable from this module (e.g. via ``conftest.py``) — confirm.
    """

    @pytest.mark.asyncio
    async def test_simple_file_read(self, sample_files, mock_tools):
        """Read a single file; this case is expected to succeed."""
        # TODO: implement.

    @pytest.mark.asyncio
    async def test_multi_step_workflow(self, sample_files, mock_tools):
        """Run the multi-step workflow: read, then analyze, then write."""
        # TODO: implement.

    @pytest.mark.asyncio
    async def test_error_recovery(self, sample_files, mock_tools):
        """Check whether the agent recovers after a tool call fails."""
        # TODO: implement.

    @pytest.mark.asyncio
    async def test_edge_cases(self, sample_files):
        """Cover edge cases: empty files, large files, special characters."""
        # TODO: implement.

    def test_token_efficiency(self):
        """Measure the average number of calls and the token usage."""
        # TODO: implement.

    def test_performance_baseline(self):
        """Compare current performance against the stored baseline."""
        # TODO: implement.
```

***

**本节总结**：E2E测试需要结合Mock测试（快速反馈）和真实测试（真实验证），使用明确的测试夹具和回归测试基线来确保质量。