# 13.2 端到端测试策略

本节讨论端到端测试的多种策略，包括Mock测试与真实测试的权衡、测试夹具设计、回归基线管理以及完整的E2E测试套件构建。

## 13.2.1 测试分类

端到端(E2E)测试涉及多种不同的测试策略，需要根据目的选择：

| 维度     | 分类                   |
| ------ | -------------------- |
| 模型调用方式 | Mock LLM vs 真实LLM    |
| 工具调用方式 | Mock Tools vs 真实工具   |
| 数据来源   | 合成数据 vs 真实数据 vs 众包数据 |
| 运行环境   | 离线 vs 实时在线           |
| 评估方式   | 自动化评估 vs 人工评估        |

## 13.2.2 Mock测试 vs 真实测试

### Mock 测试

Mock测试的实现代码示例：

```python
from unittest.mock import Mock, patch, MagicMock
import asyncio

class MockLLMResponse:
    """Canned stand-in for an LLM reply that carries nothing but tool calls."""

    def __init__(self, tool_calls: list) -> None:
        # Kept by reference: the caller's list object is stored, not copied.
        self.tool_calls = tool_calls

class E2ETestWithMocks:
    """End-to-end test of the agent loop driven by a mocked LLM and mocked tools.

    The mocked LLM replays a fixed script of tool calls, and every mocked tool
    is attached to one recorder mock so the order in which the agent invokes
    the tools can be asserted, not just whether each tool was invoked.
    """

    def setup_mock_llm(self):
        """Install a mock LLM that replays a scripted three-step plan."""
        self.mock_llm = Mock()

        # With a list as side_effect, each generate() call returns the next
        # item: read the file -> analyze its text -> write the result.
        # NOTE: once the list is exhausted, a further generate() call raises
        # StopIteration, so the agent is expected to stop after three steps.
        self.mock_llm.generate.side_effect = [
            MockLLMResponse([
                {"tool_name": "read_file", "args": {"path": "data.txt"}}
            ]),
            MockLLMResponse([
                {"tool_name": "analyze_text", "args": {"text": "file content"}}
            ]),
            MockLLMResponse([
                {"tool_name": "write_file", "args": {"path": "result.txt", "content": "analysis"}}
            ]),
        ]

    def setup_mock_tools(self):
        """Install mocked tools and a recorder that tracks their call order."""
        self.mock_tools = {
            "read_file": Mock(return_value="file content"),
            "analyze_text": Mock(return_value="analysis result"),
            "write_file": Mock(return_value=True),
        }

        # Attaching each tool to a common parent makes every tool call show up,
        # in invocation order, in the parent's mock_calls list.
        self._tool_recorder = Mock()
        for tool_name, tool in self.mock_tools.items():
            self._tool_recorder.attach_mock(tool, tool_name)

    def _called_tool_names(self) -> list:
        """Return the names of the invoked tools in the order they were called."""
        # Each recorded call unpacks as (name, args, kwargs).
        return [recorded_call[0] for recorded_call in self._tool_recorder.mock_calls]

    async def test_file_analysis_workflow(self):
        """Run the file-analysis task and verify the tools ran in script order."""

        self.setup_mock_llm()
        self.setup_mock_tools()

        # Build the agent on top of the mocks.
        # NOTE(review): Agent is defined outside this snippet; this assumes it
        # invokes the mock objects from `tools` directly -- confirm.
        agent = Agent(
            llm=self.mock_llm,
            tools=self.mock_tools
        )

        # Run the task.
        result = await agent.run("分析data.txt文件并保存结果到result.txt")

        # Every scripted tool must have been used.
        assert result.success
        assert self.mock_tools["read_file"].called
        assert self.mock_tools["analyze_text"].called
        assert self.mock_tools["write_file"].called

        # Verify the call order: read -> analyze -> write.
        assert self._called_tool_names() == ["read_file", "analyze_text", "write_file"]

        # The LLM must have been consulted exactly once per scripted step.
        assert self.mock_llm.generate.call_count == 3
```

**Mock测试的优点**：

* 快速(<100ms)
* 确定性（相同输入→相同输出）
* 易于隔离问题

**缺点**：

* 无法测试LLM实际行为
* Mock的预设响应可能与真实LLM和工具的实际行为不一致
* 无法测试新的工具组合

### 真实测试

使用真实LLM的测试实现代码：

```python
import os
from anthropic import Anthropic

class E2ETestWithRealLLM:
    """End-to-end test that drives a real Anthropic model through a tool-use loop.

    The model is given two file tools (read_file / write_file). The test feeds
    tool results back to the model until it stops requesting tools or the
    iteration budget is used up, then checks that a result file was produced.
    """

    def __init__(self):
        """Create the API client and make sure the working directory exists."""
        self.client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
        self.test_data_dir = "/tmp/e2e_test_data"
        os.makedirs(self.test_data_dir, exist_ok=True)

    def _resolve_path(self, path: str) -> str:
        """Anchor a relative tool path inside ``test_data_dir``."""
        # os.path.join drops the first component when `path` is absolute, so
        # absolute paths pass through unchanged while a bare "result.txt"
        # lands in the directory the final assertions inspect.
        return os.path.join(self.test_data_dir, path)

    def _execute_tool(self, tool_name: str, tool_input: dict) -> tuple:
        """Run one tool call for real.

        Returns:
            (result_text, is_error). Failures are returned instead of raised so
            they can be reported back to the model as a tool error.
        """
        try:
            if tool_name == "read_file":
                with open(self._resolve_path(tool_input["path"]), "r", encoding="utf-8") as f:
                    return f.read(), False
            if tool_name == "write_file":
                with open(self._resolve_path(tool_input["path"]), "w", encoding="utf-8") as f:
                    f.write(tool_input["content"])
                return "文件已写入", False
        except (OSError, KeyError) as exc:
            # Missing file, permission problem, or a required argument absent.
            return f"工具执行失败: {exc}", True
        return "未知工具", False

    async def test_real_file_analysis(self):
        """Ask the real model to analyze a sample file and write the result.

        Returns:
            dict with ``success``, ``tool_calls`` (number of tool invocations),
            ``iterations`` (number of API calls) and ``result`` (file content).

        Raises:
            AssertionError: if no result file was produced, it is empty, or
                fewer than two tool calls were made.
        """

        # Prepare the input document.
        test_file = os.path.join(self.test_data_dir, "sample.txt")
        with open(test_file, "w", encoding="utf-8") as f:
            f.write("This is a sample document for testing.\nIt contains multiple lines.")

        # The working directory outlives a single run, so drop any result left
        # by an earlier run; otherwise the existence check below could pass
        # without the model having written anything.
        result_file = os.path.join(self.test_data_dir, "result.txt")
        if os.path.exists(result_file):
            os.remove(result_file)

        # Tool schemas advertised to the model.
        tools = [
            {
                "name": "read_file",
                "description": "读取文件内容",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string", "description": "文件路径"}
                    },
                    "required": ["path"]
                }
            },
            {
                "name": "write_file",
                "description": "写入文件",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "content": {"type": "string"}
                    },
                    "required": ["path", "content"]
                }
            }
        ]

        # System prompt.
        system_prompt = """你是一个文件分析助手。
        可用工具：read_file(读取文件)、write_file(写入文件)
        任务：读取文件并分析其内容,然后将分析结果写入输出文件。
        """

        # Opening user message.
        messages = [
            {"role": "user", "content": f"请分析{test_file}的内容,并将结果写入result.txt"}
        ]

        max_iterations = 5
        iteration = 0
        tool_calls_made = []

        while iteration < max_iterations:
            iteration += 1

            # NOTE: this is the synchronous client, so the call blocks the
            # event loop for the duration of the request.
            response = self.client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=1024,
                system=system_prompt,
                tools=tools,
                messages=messages
            )

            if response.stop_reason != "tool_use":
                # The model finished without asking for another tool.
                break

            # One response may request several tools; each tool_use block is
            # answered with its own tool_result in the next user message.
            tool_use_blocks = [
                block for block in response.content if block.type == "tool_use"
            ]
            if not tool_use_blocks:
                # Nothing to execute, so re-sending the same messages is pointless.
                break

            tool_results = []
            for tool_use_block in tool_use_blocks:
                tool_name = tool_use_block.name
                tool_input = tool_use_block.input

                tool_calls_made.append({
                    "tool": tool_name,
                    "input": tool_input
                })

                tool_result, failed = self._execute_tool(tool_name, tool_input)
                result_entry = {
                    "type": "tool_result",
                    "tool_use_id": tool_use_block.id,
                    "content": tool_result
                }
                if failed:
                    result_entry["is_error"] = True
                tool_results.append(result_entry)

            # Continue the conversation with the assistant turn and all results.
            messages.append({"role": "assistant", "content": response.content})
            messages.append({"role": "user", "content": tool_results})

        # Verify the outcome.
        assert os.path.exists(result_file), "结果文件未创建"

        with open(result_file, "r", encoding="utf-8") as f:
            result_content = f.read()

        assert len(result_content) > 0, "结果文件为空"
        assert len(tool_calls_made) >= 2, f"工具调用不足,只有{len(tool_calls_made)}次"

        return {
            "success": True,
            "tool_calls": len(tool_calls_made),
            "iterations": iteration,
            "result": result_content
        }
```

**真实测试的优点**：

* 真实性强
* 能发现Mock无法发现的问题
* 能评估LLM在新任务上的表现

**缺点**：

* 慢（每个测试5-30秒）
* API费用（真实调用）
* 不确定性（LLM输出变化）

## 13.2.3 测试夹具设计

测试夹具(Test Fixtures)是测试的基础设施，包括测试数据、环境配置等。

```python
import pytest
import tempfile
import json
from pathlib import Path

class TestFixture:
    """Base class bundling the fixtures shared by the end-to-end tests.

    NOTE: pytest only exposes fixtures defined on a class to tests of that
    class and its subclasses. Tests elsewhere need to subclass this class, or
    the fixtures need to live in a ``conftest.py``.
    """

    @pytest.fixture(scope="session")
    def test_data_dir(self):
        """Yield a temporary directory that is removed when the session ends."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield tmpdir

    @pytest.fixture
    def sample_files(self, test_data_dir):
        """Create sample text/JSON/CSV files and return their paths.

        Returns:
            dict mapping "text", "json" and "csv" to the file path (str).
        """
        files = {}

        # Plain-text file.
        text_file = Path(test_data_dir) / "sample.txt"
        text_file.write_text("Sample text content for testing", encoding="utf-8")
        files["text"] = str(text_file)

        # JSON file.
        json_file = Path(test_data_dir) / "data.json"
        json_file.write_text(json.dumps({"key": "value", "number": 42}), encoding="utf-8")
        files["json"] = str(json_file)

        # CSV file.
        csv_file = Path(test_data_dir) / "data.csv"
        csv_file.write_text("name,age,city\nAlice,30,NYC\nBob,25,LA", encoding="utf-8")
        files["csv"] = str(csv_file)

        return files

    @pytest.fixture
    def mock_tools(self):
        """Return a fresh dict of mocked tools keyed by tool name."""
        from unittest.mock import Mock

        return {
            "read_file": Mock(return_value="mocked file content"),
            "analyze_text": Mock(return_value="analysis result"),
            # agent_config enables write_file, so it needs a mock as well.
            "write_file": Mock(return_value=True),
            "list_files": Mock(return_value=["file1.txt", "file2.txt"]),
        }

    @pytest.fixture
    def agent_config(self):
        """Return the agent configuration used by the tests."""
        return {
            "model": "claude-sonnet-4-6",
            "max_iterations": 10,
            "timeout": 30,
            "tools_enabled": ["read_file", "analyze_text", "write_file"]
        }
```

## 13.2.4 回归测试基线

当系统演进时，需要维护一个基线来检测性能退化。

```python
class RegressionTestBaseline:
    """Stores baseline metrics in a JSON file and flags regressions against them.

    A metric's direction is inferred from substrings of its name. Metrics whose
    name matches none of the keyword groups are not checked.
    """

    # Checked first: these win even when the name also contains a
    # higher-is-better keyword (e.g. "error_rate" contains "rate").
    _LOWER_OVERRIDE_KEYWORDS = ("error", "fail")
    _HIGHER_IS_BETTER_KEYWORDS = ("accuracy", "success", "rate")
    _LOWER_IS_BETTER_KEYWORDS = ("time", "cost", "latency", "token")

    def __init__(self, baseline_file: str = "test_baseline.json"):
        """Remember the baseline path and load the baseline stored there, if any."""
        self.baseline_file = baseline_file
        self.baseline = self._load_baseline()

    def _load_baseline(self) -> dict:
        """Load the baseline from disk; return {} when the file does not exist."""
        try:
            with open(self.baseline_file, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def save_baseline(self, metrics: dict):
        """Persist ``metrics`` as the new baseline and use it for later checks."""
        with open(self.baseline_file, "w", encoding="utf-8") as f:
            json.dump(metrics, f, indent=2)
        # Keep the in-memory copy in sync so a check_regression() call right
        # after saving compares against the new baseline, not the one loaded
        # in __init__.
        self.baseline = dict(metrics)
        print(f"基线已保存: {self.baseline_file}")

    @classmethod
    def _metric_direction(cls, metric_name: str):
        """Return "higher", "lower", or None when the direction is unknown."""
        if any(word in metric_name for word in cls._LOWER_OVERRIDE_KEYWORDS):
            return "lower"
        if any(word in metric_name for word in cls._HIGHER_IS_BETTER_KEYWORDS):
            return "higher"
        if any(word in metric_name for word in cls._LOWER_IS_BETTER_KEYWORDS):
            return "lower"
        return None

    def check_regression(self, current_metrics: dict, threshold: float = 0.05) -> list:
        """
        Compare current metrics with the baseline and report regressions.

        Args:
            current_metrics: metric name -> current value. Metrics missing here
                are skipped.
            threshold: tolerated relative degradation (0.05 means 5%).

        Returns:
            List of tuples (metric name, baseline value, current value,
            degradation in percent). The percentage is ``inf`` when the
            baseline value is 0.
        """
        regressions = []

        for metric_name, baseline_value in self.baseline.items():
            if metric_name not in current_metrics:
                continue

            current_value = current_metrics[metric_name]
            direction = self._metric_direction(metric_name)

            if direction == "higher":
                regressed = current_value < baseline_value * (1 - threshold)
                delta = baseline_value - current_value
            elif direction == "lower":
                regressed = current_value > baseline_value * (1 + threshold)
                delta = current_value - baseline_value
            else:
                # Direction unknown: there is no safe way to judge this metric.
                continue

            if not regressed:
                continue

            # A zero baseline has no meaningful relative change; report inf
            # instead of raising ZeroDivisionError.
            degradation = delta / baseline_value if baseline_value else float("inf")
            regressions.append((
                metric_name,
                baseline_value,
                current_value,
                degradation * 100
            ))

        return regressions

# Usage example

# First run: record the baseline.
RegressionTestBaseline().save_baseline({
    "success_rate": 0.95,
    "avg_tokens": 250,
    "avg_time_sec": 5.2
})

# Later run: a new manager loads the baseline persisted on disk, which is the
# situation a subsequent test run is in.
baseline_mgr = RegressionTestBaseline()

current = {
    "success_rate": 0.88,  # dropped by more than the 5% tolerance
    "avg_tokens": 280,     # increased
    "avg_time_sec": 4.8    # decreased (an improvement)
}

regressions = baseline_mgr.check_regression(current)
if regressions:
    print("⚠ 检测到性能退化：")
    # Distinct loop names keep the `current` metrics dict from being rebound.
    for metric, baseline_value, current_value, degradation in regressions:
        # "退化" covers both directions: a drop in success rate as well as a
        # rise in time or cost.
        print(f"  {metric}: {baseline_value} → {current_value} (退化{degradation:.1f}%)")
```

## 13.2.5 完整的E2E测试套件

下面给出完整E2E测试套件的骨架，各测试用例的主体留待按项目实际情况实现：

```python
# tests/test_e2e.py
import pytest
import asyncio
from pathlib import Path

class TestE2EWorkflows:
    """Skeleton of the end-to-end workflow test suite.

    Every case is a placeholder and is marked as skipped, so an unimplemented
    test is reported as SKIPPED rather than as a misleading PASSED. Remove the
    skip marker once the body is written.

    NOTE(review): the ``sample_files`` and ``mock_tools`` fixtures are not
    defined in this snippet; presumably they are provided by a conftest.py --
    confirm.
    """

    @pytest.mark.skip(reason="placeholder: not implemented yet")
    @pytest.mark.asyncio
    async def test_simple_file_read(self, sample_files, mock_tools):
        """A simple file read; expected to succeed."""
        # TODO: run the agent on a single read_file task.

    @pytest.mark.skip(reason="placeholder: not implemented yet")
    @pytest.mark.asyncio
    async def test_multi_step_workflow(self, sample_files, mock_tools):
        """Multi-step workflow: read -> analyze -> write."""
        # TODO: assert that all three steps run, in order.

    @pytest.mark.skip(reason="placeholder: not implemented yet")
    @pytest.mark.asyncio
    async def test_error_recovery(self, sample_files, mock_tools):
        """Error recovery: can the agent continue after a tool call fails?"""
        # TODO: make one tool raise and check that the agent recovers.

    @pytest.mark.skip(reason="placeholder: not implemented yet")
    @pytest.mark.asyncio
    async def test_edge_cases(self, sample_files):
        """Edge cases: empty file, large file, special characters, etc."""
        # TODO: cover each edge case listed above.

    @pytest.mark.skip(reason="placeholder: not implemented yet")
    def test_token_efficiency(self):
        """Token efficiency: average number of calls and token usage."""
        # TODO: collect call counts and token usage, then assert on them.

    @pytest.mark.skip(reason="placeholder: not implemented yet")
    def test_performance_baseline(self):
        """Performance baseline: compare current metrics with the baseline."""
        # TODO: compare against the stored baseline.
```

***

**本节总结**：E2E测试需要结合Mock测试（快速反馈）和真实测试（真实验证），使用明确的测试夹具和回归测试基线来确保质量。


---

# Agent Instructions: Querying This Documentation

If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://yeasy.gitbook.io/harness_engineering_guide/di-si-bu-fen-an-quan-ping-gu-yu-yan-jin/13_evaluation/13.2_e2e_testing.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
