
RAG Evaluation

This tutorial covers how to evaluate and optimize the quality of a RAG (Retrieval-Augmented Generation) system built on Unifiles.

Evaluation Framework Overview

RAG evaluation dimensions:
├── Retrieval quality
│   ├── Recall
│   ├── Precision
│   └── MRR (Mean Reciprocal Rank)
├── Generation quality
│   ├── Answer relevance
│   ├── Factual accuracy
│   └── Completeness
└── End-to-end
    ├── User satisfaction
    └── Task completion rate
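
To make the retrieval dimensions concrete before diving into the SDK code, the sketch below computes recall, precision, and MRR for a single query. The function name and document IDs are illustrative only, not part of the Unifiles SDK.

from typing import Dict, List, Set

def retrieval_metrics_for_query(retrieved_ids: List[str], relevant_ids: Set[str]) -> Dict[str, float]:
    """Illustrative per-query recall, precision, and MRR."""
    hits = set(retrieved_ids) & relevant_ids
    recall = len(hits) / len(relevant_ids) if relevant_ids else 0.0
    precision = len(hits) / len(retrieved_ids) if retrieved_ids else 0.0
    # MRR contribution: reciprocal rank of the first relevant result, 0 if none appears
    mrr = 0.0
    for rank, doc_id in enumerate(retrieved_ids, start=1):
        if doc_id in relevant_ids:
            mrr = 1 / rank
            break
    return {"recall": recall, "precision": precision, "mrr": mrr}

# 2 of 3 relevant docs retrieved; the first relevant hit is at rank 2
print(retrieval_metrics_for_query(["doc_4", "doc_1", "doc_2"], {"doc_1", "doc_2", "doc_3"}))
# {'recall': 0.67, 'precision': 0.67, 'mrr': 0.5} (values rounded)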

Retrieval Quality Evaluation

Building an Evaluation Dataset

from dataclasses import dataclass
from typing import List

@dataclass
class RetrievalTestCase:
    query: str
    relevant_doc_ids: List[str]  # Annotated relevant documents
    description: str = ""

# Build a test dataset
test_cases = [
    RetrievalTestCase(
        query="What is the annual leave application process?",
        relevant_doc_ids=["doc_hr_001", "doc_hr_002"],
        description="Annual leave query"
    ),
    RetrievalTestCase(
        query="What documents are required for expense reimbursement?",
        relevant_doc_ids=["doc_finance_001"],
        description="Reimbursement query"
    ),
    RetrievalTestCase(
        query="How do I apply to work remotely?",
        relevant_doc_ids=["doc_hr_005", "doc_it_001"],
        description="Remote work query"
    ),
]
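
Hand-written test cases work for a handful of queries, but larger annotated sets are easier to maintain in a file. A minimal sketch, assuming a hypothetical retrieval_cases.jsonl with one {"query", "relevant_doc_ids", "description"} object per line:

import json
from pathlib import Path

def load_test_cases(path: str) -> List[RetrievalTestCase]:
    """Load annotated retrieval test cases from a JSONL file (hypothetical format)."""
    cases = []
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        record = json.loads(line)
        cases.append(RetrievalTestCase(
            query=record["query"],
            relevant_doc_ids=record["relevant_doc_ids"],
            description=record.get("description", "")
        ))
    return cases

# test_cases = load_test_cases("retrieval_cases.jsonl")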

Computing Retrieval Metrics

from unifiles import UnifilesClient
from typing import List, Dict
import numpy as np

client = UnifilesClient(api_key="sk_...")

def evaluate_retrieval(
    kb_id: str,
    test_cases: List[RetrievalTestCase],
    top_k: int = 5
) -> Dict:
    """评估检索质量"""

    metrics = {
        "recall": [],
        "precision": [],
        "mrr": [],
        "hit_rate": []
    }

    for case in test_cases:
        # Run the search
        results = client.knowledge_bases.search(
            kb_id=kb_id,
            query=case.query,
            top_k=top_k
        )

        # Deduplicate document IDs while keeping rank order (several chunks may share a document)
        retrieved_ids = list(dict.fromkeys(chunk.document_id for chunk in results.chunks))
        relevant_ids = set(case.relevant_doc_ids)

        # Recall: fraction of annotated relevant documents retrieved
        hits = len(set(retrieved_ids) & relevant_ids)
        recall = hits / len(relevant_ids) if relevant_ids else 0
        metrics["recall"].append(recall)

        # Precision: fraction of retrieved documents that are relevant
        precision = hits / len(retrieved_ids) if retrieved_ids else 0
        metrics["precision"].append(precision)

        # MRR: reciprocal rank of the first relevant document
        mrr = 0
        for i, doc_id in enumerate(retrieved_ids):
            if doc_id in relevant_ids:
                mrr = 1 / (i + 1)
                break
        metrics["mrr"].append(mrr)

        # Hit rate: 1 if at least one relevant document was retrieved
        hit = 1 if hits > 0 else 0
        metrics["hit_rate"].append(hit)

    # Average across all test cases
    return {
        "avg_recall": np.mean(metrics["recall"]),
        "avg_precision": np.mean(metrics["precision"]),
        "avg_mrr": np.mean(metrics["mrr"]),
        "hit_rate": np.mean(metrics["hit_rate"]),
        "num_cases": len(test_cases)
    }

# Run the evaluation
results = evaluate_retrieval(kb_id, test_cases, top_k=5)
print(f"Recall: {results['avg_recall']:.2%}")
print(f"Precision: {results['avg_precision']:.2%}")
print(f"MRR: {results['avg_mrr']:.3f}")
print(f"Hit rate: {results['hit_rate']:.2%}")

Comparing Chunking Strategies

def compare_chunking_strategies(
    file_ids: List[str],
    test_cases: List[RetrievalTestCase],
    strategies: List[Dict]
) -> Dict:
    """对比不同分块策略的效果"""

    results = {}

    for strategy in strategies:
        # 创建知识库
        kb = client.knowledge_bases.create(
            name=f"eval_{strategy['name']}",
            chunking_strategy=strategy["config"]
        )

        # 添加文档
        for file_id in file_ids:
            doc = client.knowledge_bases.documents.create(
                kb_id=kb.id,
                file_id=file_id
            )
            doc.wait()

        # 评估
        metrics = evaluate_retrieval(kb.id, test_cases)
        results[strategy["name"]] = metrics

        # 清理
        client.knowledge_bases.delete(kb.id)

    return results

# Strategies to compare
strategies = [
    {"name": "small_fixed", "config": {"type": "fixed", "chunk_size": 256}},
    {"name": "medium_semantic", "config": {"type": "semantic", "chunk_size": 512}},
    {"name": "large_semantic", "config": {"type": "semantic", "chunk_size": 1024}},
]

comparison = compare_chunking_strategies(file_ids, test_cases, strategies)

for name, metrics in comparison.items():
    print(f"\n{name}:")
    print(f"  Recall: {metrics['avg_recall']:.2%}")
    print(f"  MRR: {metrics['avg_mrr']:.3f}")

Generation Quality Evaluation

Using an LLM to Judge Answer Quality

import json

from openai import OpenAI

openai = OpenAI(api_key="sk_openai_...")

@dataclass
class QATestCase:
    question: str
    reference_answer: str  # Reference (gold) answer
    context_doc_ids: List[str]

def evaluate_answer_quality(
    question: str,
    generated_answer: str,
    reference_answer: str
) -> Dict:
    """使用 LLM 评估答案质量"""

    prompt = f"""评估生成答案的质量,与参考答案对比。

问题:{question}

参考答案:{reference_answer}

生成答案:{generated_answer}

请从以下维度评分(1-5分)并说明理由:
1. 相关性:答案是否回答了问题
2. 准确性:答案是否事实正确
3. 完整性:答案是否完整覆盖问题
4. 简洁性:答案是否简明扼要

输出 JSON 格式:
{{"relevance": 分数, "accuracy": 分数, "completeness": 分数, "conciseness": 分数, "reasoning": "评估理由"}}
"""

    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    return json.loads(response.choices[0].message.content)
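
LLM judges occasionally return malformed JSON or scores outside the 1-5 range. The defensive wrapper below is a sketch around the function above: it clamps scores and falls back to neutral values when the output cannot be parsed, so one bad judgment does not abort a long evaluation run.

def safe_evaluate_answer_quality(question: str, generated: str, reference: str) -> Dict:
    """Wrap the LLM judge with score clamping and a neutral fallback on bad output."""
    dims = ["relevance", "accuracy", "completeness", "conciseness"]
    try:
        raw = evaluate_answer_quality(question, generated, reference)
        scores = {d: min(5, max(1, round(float(raw.get(d, 3))))) for d in dims}
        scores["reasoning"] = raw.get("reasoning", "")
        return scores
    except (json.JSONDecodeError, ValueError, TypeError):
        # Neutral scores keep the pipeline running; the case can be re-judged later
        return {**{d: 3 for d in dims}, "reasoning": "judge output could not be parsed"}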

def evaluate_rag_pipeline(
    kb_id: str,
    test_cases: List[QATestCase],
    llm_model: str = "gpt-4"
) -> Dict:
    """评估完整 RAG 管道"""

    scores = {
        "relevance": [],
        "accuracy": [],
        "completeness": [],
        "conciseness": []
    }

    for case in test_cases:
        # Retrieve
        results = client.knowledge_bases.search(
            kb_id=kb_id,
            query=case.question,
            top_k=5
        )

        context = "\n\n".join([c.content for c in results.chunks])

        # Generate an answer from the retrieved context
        generated = generate_answer(case.question, context, llm_model)

        # Judge the generated answer against the reference
        eval_result = evaluate_answer_quality(
            case.question,
            generated,
            case.reference_answer
        )

        for key in scores:
            scores[key].append(eval_result.get(key, 0))

    return {
        "avg_relevance": np.mean(scores["relevance"]),
        "avg_accuracy": np.mean(scores["accuracy"]),
        "avg_completeness": np.mean(scores["completeness"]),
        "avg_conciseness": np.mean(scores["conciseness"]),
        "overall": np.mean([np.mean(v) for v in scores.values()])
    }

def generate_answer(question: str, context: str, model: str) -> str:
    """基于上下文生成答案"""

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "基于提供的上下文回答问题。如果上下文不包含答案,说明无法回答。"
            },
            {
                "role": "user",
                "content": f"上下文:\n{context}\n\n问题:{question}"
            }
        ]
    )

    return response.choices[0].message.content

Automated Evaluation Pipeline

import json
from datetime import datetime
from pathlib import Path

class RAGEvaluator:
    """RAG 评估器"""

    def __init__(self, unifiles_client, openai_client, kb_id: str):
        self.unifiles = unifiles_client
        self.openai = openai_client
        self.kb_id = kb_id

    def run_evaluation(
        self,
        retrieval_cases: List[RetrievalTestCase],
        qa_cases: List[QATestCase],
        output_dir: str = "./eval_results"
    ) -> Dict:
        """运行完整评估"""

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # 检索评估
        print("评估检索质量...")
        retrieval_metrics = evaluate_retrieval(
            self.kb_id,
            retrieval_cases
        )

        # Generation evaluation
        print("Evaluating generation quality...")
        generation_metrics = evaluate_rag_pipeline(
            self.kb_id,
            qa_cases
        )

        # Build the combined report
        report = {
            "timestamp": timestamp,
            "kb_id": self.kb_id,
            "retrieval": retrieval_metrics,
            "generation": generation_metrics,
            "overall_score": self._calculate_overall_score(
                retrieval_metrics,
                generation_metrics
            )
        }

        # Save results
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        with open(output_path / f"eval_{timestamp}.json", "w", encoding="utf-8") as f:
            # default=float keeps numpy scalar types JSON-serializable
            json.dump(report, f, indent=2, ensure_ascii=False, default=float)

        return report

    def _calculate_overall_score(
        self,
        retrieval: Dict,
        generation: Dict
    ) -> float:
        """计算综合分数"""

        # 加权平均
        retrieval_score = (
            retrieval["avg_recall"] * 0.3 +
            retrieval["avg_mrr"] * 0.3 +
            retrieval["hit_rate"] * 0.4
        )

        generation_score = generation["overall"] / 5  # normalize the 1-5 scale to 0-1

        return retrieval_score * 0.4 + generation_score * 0.6

# Usage
evaluator = RAGEvaluator(client, openai, kb_id)
report = evaluator.run_evaluation(retrieval_cases, qa_cases)

print(f"\n=== 评估报告 ===")
print(f"检索召回率: {report['retrieval']['avg_recall']:.2%}")
print(f"检索 MRR: {report['retrieval']['avg_mrr']:.3f}")
print(f"生成质量: {report['generation']['overall']:.2f}/5")
print(f"综合得分: {report['overall_score']:.2%}")

Continuous Optimization Strategies

A/B Testing Framework

import hashlib

class ABTestManager:
    """A/B 测试管理器"""

    def __init__(self, variants: Dict[str, str]):
        """
        variants: {"A": kb_id_a, "B": kb_id_b}
        """
        self.variants = variants
        self.results = {v: [] for v in variants}

    def get_variant(self, user_id: str) -> str:
        """根据用户 ID 分配变体"""
        # 确定性分配
        hash_val = hash(user_id) % 100
        return "A" if hash_val < 50 else "B"

    def search(self, user_id: str, query: str, **kwargs):
        """执行搜索(自动选择变体)"""
        variant = self.get_variant(user_id)
        kb_id = self.variants[variant]

        results = client.knowledge_bases.search(
            kb_id=kb_id,
            query=query,
            **kwargs
        )

        return results, variant

    def record_feedback(self, variant: str, query: str, helpful: bool):
        """记录用户反馈"""
        self.results[variant].append({
            "query": query,
            "helpful": helpful
        })

    def get_statistics(self) -> Dict:
        """获取 A/B 测试统计"""
        stats = {}

        for variant, feedbacks in self.results.items():
            if feedbacks:
                helpful_count = sum(1 for f in feedbacks if f["helpful"])
                stats[variant] = {
                    "total": len(feedbacks),
                    "helpful": helpful_count,
                    "helpful_rate": helpful_count / len(feedbacks)
                }

        return stats

# Usage
ab_test = ABTestManager({
    "A": "kb_chunking_256",
    "B": "kb_chunking_512"
})

# User search
results, variant = ab_test.search("user_123", "annual leave policy")

# Record the user's feedback
ab_test.record_feedback(variant, "annual leave policy", helpful=True)

# Inspect results
print(ab_test.get_statistics())
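
Helpful-rate differences between variants are easily noise at small sample sizes, so it is worth checking significance before switching knowledge bases. The sketch below runs a standard two-proportion z-test on the statistics returned above, with no extra dependencies:

import math

def ab_significance(stats: Dict) -> float:
    """Two-sided p-value for the difference in helpful rates between variants A and B."""
    a, b = stats["A"], stats["B"]
    n1, n2 = a["total"], b["total"]
    pooled = (a["helpful"] + b["helpful"]) / (n1 + n2)
    se = math.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2))
    if se == 0:
        return 1.0
    z = (a["helpful_rate"] - b["helpful_rate"]) / se
    return math.erfc(abs(z) / math.sqrt(2))  # two-sided p under the normal approximation

stats = ab_test.get_statistics()
if "A" in stats and "B" in stats:
    print(f"p-value: {ab_significance(stats):.3f}")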

Feedback Collection and Analysis

from collections import defaultdict

class FeedbackCollector:
    """反馈收集器"""

    def __init__(self):
        self.feedbacks = defaultdict(list)

    def collect(
        self,
        query: str,
        retrieved_chunks: List,
        answer: str,
        user_rating: int,  # 1-5
        user_comment: str = ""
    ):
        """收集反馈"""
        self.feedbacks[query].append({
            "chunks": [c.id for c in retrieved_chunks],
            "answer": answer,
            "rating": user_rating,
            "comment": user_comment
        })

    def analyze(self) -> Dict:
        """分析反馈数据"""

        all_ratings = []
        low_rating_queries = []

        for query, feedbacks in self.feedbacks.items():
            ratings = [f["rating"] for f in feedbacks]
            avg_rating = sum(ratings) / len(ratings)
            all_ratings.extend(ratings)

            if avg_rating < 3:
                low_rating_queries.append({
                    "query": query,
                    "avg_rating": avg_rating,
                    "count": len(feedbacks)
                })

        return {
            "overall_avg_rating": sum(all_ratings) / len(all_ratings),
            "total_feedbacks": len(all_ratings),
            "low_rating_queries": sorted(
                low_rating_queries,
                key=lambda x: x["avg_rating"]
            )[:10]  # 10 lowest-rated queries
        }

# Usage
collector = FeedbackCollector()

# Record feedback
collector.collect(
    query="annual leave policy",
    retrieved_chunks=results.chunks,
    answer=generated_answer,
    user_rating=4,
    user_comment="Mostly satisfied, but the specific number of days is missing"
)

# Analyze
analysis = collector.analyze()
print(f"Average rating: {analysis['overall_avg_rating']:.2f}")
print(f"Queries needing improvement: {analysis['low_rating_queries']}")

Evaluation Report Template

# RAG System Evaluation Report

## Overview
- Evaluation date: {timestamp}
- Knowledge base ID: {kb_id}
- Number of test cases: {num_cases}

## Retrieval Quality
| Metric | Value | Target | Status |
|--------|-------|--------|--------|
| Recall | {recall}% | >80% | ✓/✗ |
| Precision | {precision}% | >70% | ✓/✗ |
| MRR | {mrr} | >0.7 | ✓/✗ |
| Hit rate | {hit_rate}% | >90% | ✓/✗ |

## Generation Quality
| Dimension | Score | Target | Status |
|-----------|-------|--------|--------|
| Relevance | {relevance}/5 | >4.0 | ✓/✗ |
| Accuracy | {accuracy}/5 | >4.0 | ✓/✗ |
| Completeness | {completeness}/5 | >3.5 | ✓/✗ |

## Recommendations
1. {recommendation_1}
2. {recommendation_2}
3. {recommendation_3}
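
The placeholders in the template map directly onto the report dict produced by RAGEvaluator, so it can be filled with a plain str.format call. A sketch, assuming the template is saved to a hypothetical report_template.md and leaving the recommendations for a human to write:

from pathlib import Path

def render_report(report: Dict, template_path: str = "report_template.md") -> str:
    """Fill the markdown template with values from a saved evaluation report."""
    template = Path(template_path).read_text(encoding="utf-8")
    return template.format(
        timestamp=report["timestamp"],
        kb_id=report["kb_id"],
        num_cases=report["retrieval"]["num_cases"],
        recall=f"{report['retrieval']['avg_recall'] * 100:.1f}",
        precision=f"{report['retrieval']['avg_precision'] * 100:.1f}",
        mrr=f"{report['retrieval']['avg_mrr']:.3f}",
        hit_rate=f"{report['retrieval']['hit_rate'] * 100:.1f}",
        relevance=f"{report['generation']['avg_relevance']:.1f}",
        accuracy=f"{report['generation']['avg_accuracy']:.1f}",
        completeness=f"{report['generation']['avg_completeness']:.1f}",
        recommendation_1="(filled in manually)",
        recommendation_2="(filled in manually)",
        recommendation_3="(filled in manually)",
    )

print(render_report(report))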

Next Steps