class WebArenaBenchmark:
    """WebArena-style benchmark built on a small set of hard-coded sample tasks."""

    def __init__(self, domain: str = "ecommerce"):
        """Select the task domain and load its sample tasks.

        Args:
            domain: One of 'ecommerce', 'social_media', 'forum', 'government'.
                Only 'ecommerce' and 'social_media' currently have sample
                tasks; any other value yields an empty task list.
        """
        self.domain = domain
        self.tasks = self._get_sample_tasks()

    def _get_sample_tasks(self) -> list:
        """Return the sample tasks for ``self.domain`` (empty list if none)."""
        tasks = {
            "ecommerce": [
                {
                    "id": "shop_001",
                    "instruction": "在Amazon上搜索'笔记本电脑',按价格从低到高排序",
                    "expected_steps": ["navigate", "search", "sort"],
                    "ground_truth": "sorted by price ascending",
                },
                {
                    "id": "shop_002",
                    "instruction": "添加商品到购物车并查看总价",
                    "expected_steps": ["add_to_cart", "view_cart"],
                    "ground_truth": "total price displayed",
                },
            ],
            "social_media": [
                {
                    "id": "social_001",
                    "instruction": "在Twitter上发布一条包含#AgentBench标签的推文",
                    "expected_steps": ["navigate", "compose", "post"],
                    "ground_truth": "tweet posted",
                },
            ],
        }
        return tasks.get(self.domain, [])

    async def run_evaluation(self, agent, size: int = 5) -> dict:
        """Run the agent on up to ``size`` tasks and summarize the outcome.

        For each task, ``agent.interact_with_web(instruction=...)`` is awaited
        and the ``tool_name`` of every item in the returned trajectory is
        compared against the task's ``expected_steps``. A task succeeds when
        every expected step name appears in the trajectory (order is not
        checked).

        Args:
            agent: Object exposing an awaitable ``interact_with_web`` method
                whose result is an iterable of items with a ``tool_name``
                attribute.
            size: Upper bound on the number of tasks taken from the front of
                ``self.tasks``.

        Returns:
            A dict with ``domain``, ``success_rate``, ``total`` and the
            per-task ``results``. ``success_rate`` is 0.0 when no task was
            run (unknown domain or ``size=0``).
        """
        results = []
        for task in self.tasks[:size]:
            expected_steps = task["expected_steps"]
            try:
                # Simulated web interaction driven by the agent.
                trajectory = await agent.interact_with_web(
                    instruction=task["instruction"]
                )
                # Collect the tool names once instead of once per expected step.
                used_tools = {call.tool_name for call in trajectory}
                completed_steps = [
                    step for step in expected_steps if step in used_tools
                ]
                results.append({
                    "task_id": task["id"],
                    "success": len(completed_steps) == len(expected_steps),
                    "steps_completed": len(completed_steps),
                    "steps_expected": len(expected_steps),
                })
            except Exception as e:
                # Best-effort: one failing task must not abort the whole run.
                results.append({
                    "task_id": task["id"],
                    "success": False,
                    "error": str(e),
                })

        succeeded = sum(1 for r in results if r["success"])
        # Guard against ZeroDivisionError when there was nothing to evaluate.
        success_rate = succeeded / len(results) if results else 0.0
        return {
            "domain": self.domain,
            "success_rate": success_rate,
            "total": len(results),
            "results": results,
        }