evals

package

v0.35.0 Latest Latest Go to latest Published: Dec 29, 2025 License: MIT Imports: 13 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/astercloud/aster

Links

Open Source Insights

Documentation ¶

Index ¶

type BatchConfig
type BatchEvalResult
- func RunBatch(ctx context.Context, cfg *BatchConfig) (*BatchEvalResult, error)
- func RunBatchConcurrent(ctx context.Context, testCases []*BatchTestCase, scorers []Scorer, ...) (*BatchEvalResult, error)
- func RunBatchSimple(ctx context.Context, testCases []*BatchTestCase, scorers []Scorer) (*BatchEvalResult, error)
type BatchResult
type BatchSummary
type BatchTestCase
type KeywordCoverageConfig
type KeywordCoverageScorer
- func NewKeywordCoverageScorer(cfg KeywordCoverageConfig) *KeywordCoverageScorer
- func (s *KeywordCoverageScorer) Score(ctx context.Context, input *TextEvalInput) (*ScoreResult, error)
type LLMScorer
- func NewAnswerRelevancyScorer(provider provider.Provider) *LLMScorer
- func NewCoherenceScorer(provider provider.Provider) *LLMScorer
- func NewCompletenessScorer(provider provider.Provider) *LLMScorer
- func NewContextRelevancyScorer(provider provider.Provider) *LLMScorer
- func NewFaithfulnessScorer(provider provider.Provider) *LLMScorer
- func NewHallucinationScorer(provider provider.Provider) *LLMScorer
- func NewLLMScorer(cfg LLMScorerConfig) *LLMScorer
- func NewToneConsistencyScorer(provider provider.Provider) *LLMScorer
- func NewToxicityScorer(provider provider.Provider) *LLMScorer
- func (s *LLMScorer) Score(ctx context.Context, input *TextEvalInput) (*ScoreResult, error)
type LLMScorerConfig
type LexicalSimilarityConfig
type LexicalSimilarityScorer
- func NewLexicalSimilarityScorer(cfg LexicalSimilarityConfig) *LexicalSimilarityScorer
- func (s *LexicalSimilarityScorer) Score(ctx context.Context, input *TextEvalInput) (*ScoreResult, error)
type ScoreResult
type Scorer
type TextEvalInput
- func BuildTextEvalInputFromEvents(events []session.Event) *TextEvalInput

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type BatchConfig ¶

// BatchConfig is the configuration for a batch evaluation.
type BatchConfig struct {
	// TestCases is the list of test cases.
	TestCases []*BatchTestCase
	// Scorers is the list of scorers.
	Scorers []Scorer
	// Concurrency is the concurrency level (default: 1, sequential execution).
	Concurrency int
	// StopOnError reports whether to stop when an error is encountered (default: false).
	StopOnError bool
	// ProgressCallback is an optional progress callback function.
	ProgressCallback func(completed, total int)
}

BatchConfig 批量评估配置

type BatchEvalResult ¶

// BatchEvalResult is the aggregated result of a batch evaluation.
type BatchEvalResult struct {
	// Results holds the results of all test cases.
	Results []*BatchResult `json:"results"`
	// Summary holds the aggregate statistics.
	Summary *BatchSummary `json:"summary"`
	// TotalDuration is the total execution time.
	TotalDuration time.Duration `json:"total_duration"`
}

BatchEvalResult 批量评估的汇总结果

func RunBatch ¶

func RunBatch(ctx context.Context, cfg *BatchConfig) (*BatchEvalResult, error)

RunBatch 批量运行评估

func RunBatchConcurrent ¶

func RunBatchConcurrent(ctx context.Context, testCases []*BatchTestCase, scorers []Scorer, concurrency int) (*BatchEvalResult, error)

RunBatchConcurrent 并发批量评估

func RunBatchSimple ¶

func RunBatchSimple(ctx context.Context, testCases []*BatchTestCase, scorers []Scorer) (*BatchEvalResult, error)

RunBatchSimple 简化版批量评估（顺序执行）

type BatchResult ¶

// BatchResult is the evaluation result of a single test case.
type BatchResult struct {
	// TestCaseID is the ID of the test case.
	TestCaseID string `json:"test_case_id"`
	// Scores holds the results from all scorers.
	Scores []*ScoreResult `json:"scores"`
	// Duration is the execution time.
	Duration time.Duration `json:"duration"`
	// Error is the error message, if any.
	Error string `json:"error,omitempty"`
	// Metadata is the test case's metadata.
	Metadata map[string]any `json:"metadata,omitempty"`
}

BatchResult 单个测试用例的评估结果

type BatchSummary ¶

// BatchSummary holds the aggregate statistics of a batch evaluation.
type BatchSummary struct {
	// TotalCases is the total number of test cases.
	TotalCases int `json:"total_cases"`
	// SuccessfulCases is the number of successful cases.
	SuccessfulCases int `json:"successful_cases"`
	// FailedCases is the number of failed cases.
	FailedCases int `json:"failed_cases"`
	// AverageScores is the average score of each scorer.
	AverageScores map[string]float64 `json:"average_scores"`
	// AverageDuration is the average execution time.
	AverageDuration time.Duration `json:"average_duration"`
}

BatchSummary 批量评估的汇总统计

type BatchTestCase ¶

// BatchTestCase is a single test case in a batch evaluation.
type BatchTestCase struct {
	// ID is the test case ID.
	ID string `json:"id"`
	// Input is the evaluation input.
	Input *TextEvalInput `json:"input"`
	// Metadata is optional metadata.
	Metadata map[string]any `json:"metadata,omitempty"`
}

BatchTestCase 批量评估的单个测试用例

type KeywordCoverageConfig ¶

// KeywordCoverageConfig is the configuration for keyword coverage scoring.
type KeywordCoverageConfig struct {
	// Keywords are the key phrases that need to appear in the answer.
	Keywords []string
	// CaseInsensitive reports whether matching is case-insensitive (default: true).
	// NOTE(review): the zero value of a Go bool is false, so an unset field cannot
	// be told apart from an explicit false — confirm how the true default is applied.
	CaseInsensitive bool
}

KeywordCoverageConfig 关键词覆盖率配置

type KeywordCoverageScorer ¶

// KeywordCoverageScorer scores an answer by keyword coverage:
// score = number of covered keywords / total number of keywords, in the range [0,1].
type KeywordCoverageScorer struct {
	// contains filtered or unexported fields
}

KeywordCoverageScorer 根据关键词覆盖率对答案打分。得分 = 覆盖到的关键词数量 / 总关键词数量, 范围 [0,1]。

func NewKeywordCoverageScorer ¶

func NewKeywordCoverageScorer(cfg KeywordCoverageConfig) *KeywordCoverageScorer

NewKeywordCoverageScorer 创建关键词覆盖率评估器

func (*KeywordCoverageScorer) Score ¶

func (s *KeywordCoverageScorer) Score(ctx context.Context, input *TextEvalInput) (*ScoreResult, error)

Score 实现 Scorer 接口

type LLMScorer ¶

// LLMScorer is the base type for LLM-based scorers.
// Design principle: use an LLM as a judge to evaluate text quality.
type LLMScorer struct {
	// contains filtered or unexported fields
}

LLMScorer 基于 LLM 的评分器基类。设计原理：使用 LLM 作为 judge 来评估文本质量。

func NewAnswerRelevancyScorer ¶

func NewAnswerRelevancyScorer(provider provider.Provider) *LLMScorer

NewAnswerRelevancyScorer 创建答案相关性评分器。答案相关性衡量答案是否直接回答了问题。

func NewCoherenceScorer ¶

func NewCoherenceScorer(provider provider.Provider) *LLMScorer

NewCoherenceScorer 创建连贯性评分器。连贯性衡量文本的逻辑结构和流畅度。

func NewCompletenessScorer ¶

func NewCompletenessScorer(provider provider.Provider) *LLMScorer

NewCompletenessScorer 创建完整性评分器。完整性衡量答案是否全面回答了问题。

func NewContextRelevancyScorer ¶

func NewContextRelevancyScorer(provider provider.Provider) *LLMScorer

NewContextRelevancyScorer 创建上下文相关性评分器。上下文相关性衡量提供的上下文是否对回答问题有帮助。

func NewFaithfulnessScorer ¶

func NewFaithfulnessScorer(provider provider.Provider) *LLMScorer

NewFaithfulnessScorer 创建忠实度评分器。忠实度衡量答案是否基于提供的上下文，没有添加虚假信息。

func NewHallucinationScorer ¶

func NewHallucinationScorer(provider provider.Provider) *LLMScorer

NewHallucinationScorer 创建幻觉检测评分器。幻觉检测衡量答案是否包含虚假或无法验证的信息。

func NewLLMScorer ¶

func NewLLMScorer(cfg LLMScorerConfig) *LLMScorer

NewLLMScorer 创建LLM评分器

func NewToneConsistencyScorer ¶

func NewToneConsistencyScorer(provider provider.Provider) *LLMScorer

NewToneConsistencyScorer 创建语气一致性评分器。语气一致性衡量文本的语气是否统一。

func NewToxicityScorer ¶

func NewToxicityScorer(provider provider.Provider) *LLMScorer

NewToxicityScorer 创建毒性检测评分器。毒性检测衡量文本是否包含有害或不当内容。

func (*LLMScorer) Score ¶

func (s *LLMScorer) Score(ctx context.Context, input *TextEvalInput) (*ScoreResult, error)

Score 实现Scorer接口

type LLMScorerConfig ¶

// LLMScorerConfig is the configuration for an LLM scorer.
type LLMScorerConfig struct {
	// Provider is the LLM provider (used for scoring).
	Provider provider.Provider
	// Name is the scorer name.
	Name string
	// Prompt is the scoring prompt template.
	Prompt string
	// MaxTokens is the maximum number of tokens (default: 500).
	MaxTokens int
	// Temperature is the sampling temperature (default: 0, more deterministic).
	Temperature float64
}

LLMScorerConfig LLM评分器配置

type LexicalSimilarityConfig ¶

// LexicalSimilarityConfig is the configuration for lexical similarity scoring.
type LexicalSimilarityConfig struct {
	// MinTokenLength is the minimum token length that takes part in the comparison
	// (filters out words that are too short; default: 2).
	MinTokenLength int
}

LexicalSimilarityConfig 词汇相似度配置

type LexicalSimilarityScorer ¶

// LexicalSimilarityScorer is a simple Jaccard similarity evaluator based on word sets:
// score = |A ∩ B| / |A ∪ B|, in the range [0,1].
type LexicalSimilarityScorer struct {
	// contains filtered or unexported fields
}

LexicalSimilarityScorer 基于词汇集合的简单 Jaccard 相似度评估器。 score = |A ∩ B| / |A ∪ B|, 范围 [0,1]。

func NewLexicalSimilarityScorer ¶

func NewLexicalSimilarityScorer(cfg LexicalSimilarityConfig) *LexicalSimilarityScorer

NewLexicalSimilarityScorer 创建词汇相似度评估器

func (*LexicalSimilarityScorer) Score ¶

func (s *LexicalSimilarityScorer) Score(ctx context.Context, input *TextEvalInput) (*ScoreResult, error)

Score 实现 Scorer 接口

type ScoreResult ¶

// ScoreResult is the result of an evaluation.
type ScoreResult struct {
	// Name is the evaluation name, e.g. "keyword_coverage", "lexical_similarity".
	Name string `json:"name"`
	// Value is the score, usually in the range [0,1].
	Value float64 `json:"value"`
	// Details holds extra information, such as the list of matched keywords.
	Details map[string]any `json:"details,omitempty"`
}

ScoreResult 评估结果

type Scorer ¶

// Scorer is the text evaluator interface.
type Scorer interface {
	// Score takes a context and a *TextEvalInput and returns a *ScoreResult or an error.
	Score(ctx context.Context, input *TextEvalInput) (*ScoreResult, error)
}

Scorer 文本评估器接口。设计参考: @mastra/evals。本包既提供不依赖 LLM 的本地启发式 scorer(如 KeywordCoverageScorer、LexicalSimilarityScorer),也提供基于 LLM 的 LLMScorer。

type TextEvalInput ¶

// TextEvalInput is the input for a text evaluation.
type TextEvalInput struct {
	// Answer is the model output to be evaluated.
	Answer string `json:"answer"`
	// Context is optional context (such as reference material, prompt information, etc.).
	Context []string `json:"context,omitempty"`
	// Reference is an optional reference answer / expected output, used for similarity comparison.
	Reference string `json:"reference,omitempty"`
}

TextEvalInput 文本评估输入

func BuildTextEvalInputFromEvents ¶

func BuildTextEvalInputFromEvents(events []session.Event) *TextEvalInput

BuildTextEvalInputFromEvents 根据一组 Session 事件构建 TextEvalInput。

约定:

- 默认将最后一个 assistant 消息视为 Answer。
- 将之前的 user / assistant 消息串联为 Context,用于评估时参考。
- Reference 由调用方自行填充(例如从标注数据集中读取)。

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL