# Self-contained, directly runnable example
import numpy as np
from typing import List, Tuple
class GraphRAGFeasibilityEvaluation:
    """Graph RAG feasibility evaluation tool (with concrete scoring methods).

    Scores a domain dataset along four dimensions (graph density, query
    patterns, entity disambiguation, data stability), each normalized to
    [0, 1], then combines them into a weighted overall score and a textual
    recommendation.
    """

    def assess_complete(self, domain_data: dict) -> dict:
        """Run the full feasibility-assessment pipeline.

        Args:
            domain_data: dict with the following keys (all optional;
                missing keys fall back to empty/"unknown" defaults):
                - documents: List[str] - sample documents
                - entities: List[str] - extracted entity names
                - relations: List[Tuple] - (src, relation_type, dst) triples
                - update_frequency: str - data update cadence
                  (e.g. daily/weekly/monthly)

        Returns:
            Detailed evaluation report: one sub-dict per dimension plus
            'overall_score' (float) and 'recommendation' (str).
        """
        results = {}
        # 1. Graph density analysis
        results['graph_density'] = self._analyze_graph_density(
            domain_data.get('entities', []),
            domain_data.get('relations', [])
        )
        # 2. Query pattern profiling
        results['query_patterns'] = self._profile_query_patterns(
            domain_data.get('documents', [])
        )
        # 3. Disambiguation feasibility
        results['disambiguation'] = self._assess_disambiguation_complexity(
            domain_data.get('entities', [])
        )
        # 4. Data stability assessment
        results['stability'] = self._assess_data_stability(
            domain_data.get('update_frequency', 'unknown')
        )
        # 5. Weighted aggregate score and final recommendation
        results['overall_score'] = self._compute_overall_score(results)
        results['recommendation'] = self._get_recommendation(results['overall_score'])
        return results

    def _analyze_graph_density(self, entities: List[str], relations: List[Tuple]) -> dict:
        """Graph density analysis: how well-connected the entities are.

        Metrics:
            - avg degree: mean number of relation endpoints per entity
            - isolated nodes ratio: share of entities touching no relation

        NOTE(review): an earlier docstring also promised a "connectivity
        ratio" (largest-connected-component share); it was never computed
        here, so it is not documented as an output.
        """
        if not entities or not relations:
            return {"status": "insufficient_data", "score": 0}
        # Degree counter per entity; relation endpoints that are not in
        # the entity list are silently ignored.
        adjacency = {entity: 0 for entity in entities}
        for src, _relation_type, dst in relations:
            if src in adjacency:
                adjacency[src] += 1
            if dst in adjacency:
                adjacency[dst] += 1
        # Compute metrics
        degrees = list(adjacency.values())
        avg_degree = np.mean(degrees) if degrees else 0
        isolated_ratio = sum(1 for d in degrees if d == 0) / len(degrees) if degrees else 1
        # Scoring heuristic: ideal is avg degree > 3 and isolation < 20%.
        degree_score = min(avg_degree / 3, 1.0)
        isolation_score = max(1 - isolated_ratio * 5, 0)
        density_score = (degree_score + isolation_score) / 2
        return {
            "avg_degree": round(avg_degree, 2),
            "isolated_ratio": round(isolated_ratio, 2),
            "score": round(density_score, 2),
            "recommendation": (
                "✓ 图结构良好" if density_score > 0.7 else
                "~ 图结构尚可,需优化" if density_score > 0.4 else
                "✗ 图结构稀疏,Graph RAG不适合"
            )
        }

    def _profile_query_patterns(self, documents: List[str]) -> dict:
        """Query pattern profiling: estimate typical query complexity.

        Buckets (heuristic, by document word count — short texts proxy
        for simple queries):
            - simple: single-entity queries (< 20 words)
            - medium: 2-3 hop queries (20-99 words)
            - complex: 4+ hop queries (everything else)

        Graph RAG suits simple and medium queries; it is a poor fit for
        complex multi-hop reasoning.
        """
        if not documents:
            return {"status": "insufficient_data", "score": 0.5}
        # Sample at most 100 documents to keep profiling cheap.
        sample_queries_needed = min(100, len(documents))
        simple_count = sum(1 for doc in documents[:sample_queries_needed]
                           if len(doc.split()) < 20)  # short text ≈ simple query
        medium_count = sum(1 for doc in documents[:sample_queries_needed]
                           if 20 <= len(doc.split()) < 100)
        complex_count = sample_queries_needed - simple_count - medium_count
        simple_ratio = simple_count / sample_queries_needed
        medium_ratio = medium_count / sample_queries_needed
        complex_ratio = complex_count / sample_queries_needed
        # Graph RAG fits best when simple + medium cover > 80% of queries;
        # medium queries are discounted to 0.8 weight.
        query_score = simple_ratio + medium_ratio * 0.8
        return {
            "simple_queries_ratio": round(simple_ratio, 2),
            "medium_queries_ratio": round(medium_ratio, 2),
            "complex_queries_ratio": round(complex_ratio, 2),
            "score": round(query_score, 2),
            "recommendation": (
                "✓ 查询模式适合 Graph RAG" if query_score > 0.7 else
                "~ 部分查询可用 Graph RAG" if query_score > 0.5 else
                "✗ 查询过于复杂,Graph RAG不适合"
            )
        }

    def _assess_disambiguation_complexity(self, entities: List[str]) -> dict:
        """Disambiguation feasibility: how ambiguous entity names are.

        Metrics:
            - repeat ratio: share of entity mentions that duplicate an
              earlier mention (high repetition ≈ high ambiguity)
            - unique entity count: coverage of distinct entities
        """
        if not entities:
            return {"status": "insufficient_data", "score": 0.5}
        # Count duplicate entity-name mentions.
        from collections import Counter
        entity_counts = Counter(entities)
        unique_count = len(entity_counts)
        total_count = len(entities)
        # Each mention beyond the first of a name counts as a repeat.
        repeat_count = sum(count - 1 for count in entity_counts.values())
        repeat_ratio = repeat_count / total_count if total_count > 0 else 0
        # Scoring: a low repeat ratio (< 10%) is good; the score hits 0
        # once repetition reaches 10%.
        disambiguation_score = max(1 - repeat_ratio * 10, 0)
        return {
            "total_entities": total_count,
            "unique_entities": unique_count,
            "repeat_ratio": round(repeat_ratio, 2),
            "score": round(disambiguation_score, 2),
            "recommendation": (
                "✓ 消歧容易,实体清晰" if disambiguation_score > 0.8 else
                "~ 消歧有难度,但可管理" if disambiguation_score > 0.6 else
                "✗ 高度歧义,Graph RAG维护成本高"
            )
        }

    def _assess_data_stability(self, update_frequency: str) -> dict:
        """Data stability assessment: knowledge-graph maintenance burden.

        Update frequency mapping:
            - yearly: stable, Graph RAG friendly
            - monthly: acceptable
            - weekly: needs an automated update pipeline
            - daily: nearly infeasible, prefer vector retrieval
        Unknown frequencies fall back to a neutral 0.5.
        """
        frequency_scores = {
            "yearly": 1.0,
            "monthly": 0.8,
            "quarterly": 0.7,
            "weekly": 0.4,
            "daily": 0.1,
            "realtime": 0.0,
            "unknown": 0.5
        }
        stability_score = frequency_scores.get(update_frequency.lower(), 0.5)
        return {
            "update_frequency": update_frequency,
            "score": round(stability_score, 2),
            "recommendation": (
                "✓ 数据稳定,维护成本低" if stability_score > 0.7 else
                "~ 需要定期维护" if stability_score > 0.4 else
                "✗ 数据更新频繁,Graph RAG不推荐"
            )
        }

    def _compute_overall_score(self, results: dict) -> float:
        """Weighted aggregate of the per-dimension scores (equal weights)."""
        weights = {
            'graph_density': 0.25,
            'query_patterns': 0.25,
            'disambiguation': 0.25,
            'stability': 0.25
        }
        total_score = 0
        for key, weight in weights.items():
            # Sub-results lacking a 'score' simply contribute nothing.
            if key in results and 'score' in results[key]:
                total_score += results[key]['score'] * weight
        return round(total_score, 2)

    def _get_recommendation(self, score: float) -> str:
        """Map the overall score to a textual recommendation."""
        if score >= 0.75:
            return "✓ Graph RAG可行,强烈推荐"
        elif score >= 0.6:
            return "~ 混合方案更优(向量为主,图为辅)"
        elif score >= 0.4:
            return "⚠ Graph RAG风险高,需要充分试点"
        else:
            return "✗ 不建议 Graph RAG,使用纯向量检索"
# Usage example
if __name__ == "__main__":
    # Build sample data: four short Chinese fact sentences repeated to
    # 100 documents, 5 entities repeated 40x, 4 relation triples repeated 50x.
    sample_data = {
        "documents": [
            "张三在 ABC公司担任 CTO",
            "ABC公司融资 5000万元",
            "张三是 MIT毕业生",
            "ABC公司总部在北京"
        ] * 25,  # 100 documents
        "entities": ["张三", "ABC公司", "CTO", "MIT", "北京"] * 40,
        "relations": [
            ("张三", "任职", "ABC公司"),
            ("ABC公司", "融资", "5000万"),
            ("张三", "毕业", "MIT"),
            ("ABC公司", "位于", "北京")
        ] * 50,
        "update_frequency": "monthly"
    }
    evaluator = GraphRAGFeasibilityEvaluation()
    results = evaluator.assess_complete(sample_data)
    # Pretty-print the report: nested dicts are indented one level.
    print("Graph RAG 可行性评估报告")
    print("=" * 50)
    for key, value in results.items():
        if isinstance(value, dict):
            print(f"\n{key}:")
            for k, v in value.items():
                print(f"  {k}: {v}")
        else:
            print(f"{key}: {value}")