initial commit
This commit is contained in:
@@ -0,0 +1,72 @@
|
||||
"""LangSmith evaluation utilities.
|
||||
|
||||
This module provides utilities for evaluating Chains and other language model
|
||||
applications using LangChain evaluators and LangSmith.
|
||||
|
||||
For more information on the LangSmith API, see the
|
||||
[LangSmith API documentation](https://docs.langchain.com/langsmith/home).
|
||||
|
||||
**Example**
|
||||
|
||||
```python
|
||||
from langsmith import Client
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain_classic.chains import LLMChain
|
||||
from langchain_classic.smith import EvaluatorType, RunEvalConfig, run_on_dataset
|
||||
|
||||
|
||||
def construct_chain():
|
||||
model = ChatOpenAI(temperature=0)
|
||||
chain = LLMChain.from_string(model, "What's the answer to {your_input_key}")
|
||||
return chain
|
||||
|
||||
|
||||
evaluation_config = RunEvalConfig(
|
||||
evaluators=[
|
||||
EvaluatorType.QA, # "Correctness" against a reference answer
|
||||
EvaluatorType.EMBEDDING_DISTANCE,
|
||||
RunEvalConfig.Criteria("helpfulness"),
|
||||
RunEvalConfig.Criteria(
|
||||
{
|
||||
"fifth-grader-score": "Do you have to be smarter than a fifth "
|
||||
"grader to answer this question?"
|
||||
}
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
client = Client()
|
||||
run_on_dataset(
|
||||
client, "<my_dataset_name>", construct_chain, evaluation=evaluation_config
|
||||
)
|
||||
```
|
||||
|
||||
**Attributes**
|
||||
|
||||
- `arun_on_dataset`: Asynchronous function to evaluate a chain or other LangChain
|
||||
component over a dataset.
|
||||
- `run_on_dataset`: Function to evaluate a chain or other LangChain component over a
|
||||
dataset.
|
||||
- `RunEvalConfig`: Class representing the configuration for running evaluation.
|
||||
- `StringRunEvaluatorChain`: Class representing a string run evaluator chain.
|
||||
- `InputFormatError`: Exception raised when the input format is incorrect.
|
||||
|
||||
"""
|
||||
|
||||
from langchain_classic.smith.evaluation.config import RunEvalConfig
|
||||
from langchain_classic.smith.evaluation.runner_utils import (
|
||||
InputFormatError,
|
||||
arun_on_dataset,
|
||||
run_on_dataset,
|
||||
)
|
||||
from langchain_classic.smith.evaluation.string_run_evaluator import (
|
||||
StringRunEvaluatorChain,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"InputFormatError",
|
||||
"RunEvalConfig",
|
||||
"StringRunEvaluatorChain",
|
||||
"arun_on_dataset",
|
||||
"run_on_dataset",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,273 @@
|
||||
"""Configuration for run evaluators."""
|
||||
|
||||
from collections.abc import Callable, Sequence
|
||||
from typing import Any
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.prompts import BasePromptTemplate
|
||||
from langsmith import RunEvaluator
|
||||
from langsmith.evaluation.evaluator import EvaluationResult, EvaluationResults
|
||||
from langsmith.schemas import Example, Run
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
from typing_extensions import override
|
||||
|
||||
from langchain_classic.evaluation.criteria.eval_chain import CRITERIA_TYPE
|
||||
from langchain_classic.evaluation.embedding_distance.base import (
|
||||
EmbeddingDistance as EmbeddingDistanceEnum,
|
||||
)
|
||||
from langchain_classic.evaluation.schema import EvaluatorType, StringEvaluator
|
||||
from langchain_classic.evaluation.string_distance.base import (
|
||||
StringDistance as StringDistanceEnum,
|
||||
)
|
||||
|
||||
# Callable form of a per-run evaluator: it is called with a run and an optional
# example, and returns an evaluation result, a results container, or a plain dict.
RUN_EVALUATOR_LIKE = Callable[
    [Run, Example | None], EvaluationResult | EvaluationResults | dict
]

# Callable form of a batch evaluator: same return contract, but it is called with
# sequences of runs and (optionally) examples instead of single items.
BATCH_EVALUATOR_LIKE = Callable[
    [Sequence[Run], Sequence[Example] | None],
    EvaluationResult | EvaluationResults | dict,
]
|
||||
|
||||
|
||||
class EvalConfig(BaseModel):
    """Base configuration for a single run evaluator.

    Attributes:
        evaluator_type: The type of evaluator to use.
    """

    evaluator_type: EvaluatorType

    def get_kwargs(self) -> dict[str, Any]:
        """Collect the keyword arguments for the `load_evaluator` call.

        Every field of this model is forwarded except `evaluator_type` itself
        and any field whose value is `None`.

        Returns:
            The keyword arguments for the `load_evaluator` call.
        """
        return {
            name: value
            for name, value in self
            if name != "evaluator_type" and value is not None
        }
|
||||
|
||||
|
||||
class SingleKeyEvalConfig(EvalConfig):
    """Configuration for a run evaluator that only requires a single key."""

    reference_key: str | None = None
    """The key in the dataset run to use as the reference string.
    If not provided, we will attempt to infer automatically."""
    prediction_key: str | None = None
    """The key from the traced run's outputs dictionary to use to
    represent the prediction. If not provided, it will be inferred
    automatically."""
    input_key: str | None = None
    """The key from the traced run's inputs dictionary to use to represent the
    input. If not provided, it will be inferred automatically."""

    @override
    def get_kwargs(self) -> dict[str, Any]:
        """Return the parent's kwargs without the key-selection fields.

        Returns:
            The keyword arguments from `EvalConfig.get_kwargs`, minus
            `reference_key`, `prediction_key` and `input_key`.
        """
        inherited = super().get_kwargs()
        # Filter out the keys that are not needed for the evaluator.
        key_fields = ("reference_key", "prediction_key", "input_key")
        return {
            name: value for name, value in inherited.items() if name not in key_fields
        }
|
||||
|
||||
|
||||
# A built-in evaluator selected by enum member, by string, or by a config object.
SINGLE_EVAL_CONFIG_TYPE = EvaluatorType | str | EvalConfig
# A user-supplied evaluator: a plain callable, a `RunEvaluator`, or a
# `StringEvaluator`.
CUSTOM_EVALUATOR_TYPE = RUN_EVALUATOR_LIKE | RunEvaluator | StringEvaluator
|
||||
|
||||
|
||||
class RunEvalConfig(BaseModel):
    """Configuration for a run evaluation.

    The nested classes (`RunEvalConfig.QA`, `RunEvalConfig.Criteria`, ...) are
    per-evaluator configurations that can be placed in `evaluators`.
    """

    evaluators: list[SINGLE_EVAL_CONFIG_TYPE | CUSTOM_EVALUATOR_TYPE] = Field(
        default_factory=list
    )
    """Configurations for which evaluators to apply to the dataset run.

    Each can be an `EvaluatorType` member such as `EvaluatorType.QA`, the
    evaluator type string ("qa"), or a configuration for a given evaluator
    (e.g., `RunEvalConfig.QA`)."""
    custom_evaluators: list[CUSTOM_EVALUATOR_TYPE] | None = None
    """Custom evaluators to apply to the dataset run."""
    batch_evaluators: list[BATCH_EVALUATOR_LIKE] | None = None
    """Evaluators that run on an aggregate/batch level.

    These generate one or more metrics that are assigned to the full test run.
    As a result, they are not associated with individual traces.
    """

    reference_key: str | None = None
    """The key in the dataset run to use as the reference string.
    If not provided, we will attempt to infer automatically."""
    prediction_key: str | None = None
    """The key from the traced run's outputs dictionary to use to
    represent the prediction. If not provided, it will be inferred
    automatically."""
    input_key: str | None = None
    """The key from the traced run's inputs dictionary to use to represent the
    input. If not provided, it will be inferred automatically."""
    eval_llm: BaseLanguageModel | None = None
    """The language model to pass to any evaluators that require one."""

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )

    class Criteria(SingleKeyEvalConfig):
        """Configuration for a reference-free criteria evaluator.

        Attributes:
            criteria: The criteria to evaluate.
            llm: The language model to use for the evaluation chain.
        """

        criteria: CRITERIA_TYPE | None = None
        llm: BaseLanguageModel | None = None
        evaluator_type: EvaluatorType = EvaluatorType.CRITERIA

    class LabeledCriteria(SingleKeyEvalConfig):
        """Configuration for a labeled (with references) criteria evaluator.

        Attributes:
            criteria: The criteria to evaluate.
            llm: The language model to use for the evaluation chain.
        """

        criteria: CRITERIA_TYPE | None = None
        llm: BaseLanguageModel | None = None
        evaluator_type: EvaluatorType = EvaluatorType.LABELED_CRITERIA

    class EmbeddingDistance(SingleKeyEvalConfig):
        """Configuration for an embedding distance evaluator.

        Attributes:
            embeddings: The embeddings to use for computing the distance.
            distance_metric: The distance metric to use for computing the distance.
        """

        evaluator_type: EvaluatorType = EvaluatorType.EMBEDDING_DISTANCE
        embeddings: Embeddings | None = None
        distance_metric: EmbeddingDistanceEnum | None = None

        model_config = ConfigDict(
            arbitrary_types_allowed=True,
        )

    class StringDistance(SingleKeyEvalConfig):
        """Configuration for a string distance evaluator.

        Attributes:
            distance: The string distance metric to use (`damerau_levenshtein`,
                `levenshtein`, `jaro`, or `jaro_winkler`).
            normalize_score: Whether to normalize the distance to between 0 and 1.
                Applies only to the Levenshtein and Damerau-Levenshtein distances.
        """

        evaluator_type: EvaluatorType = EvaluatorType.STRING_DISTANCE
        distance: StringDistanceEnum | None = None
        normalize_score: bool = True

    class QA(SingleKeyEvalConfig):
        """Configuration for a QA evaluator.

        Attributes:
            prompt: The prompt template to use for generating the question.
            llm: The language model to use for the evaluation chain.
        """

        evaluator_type: EvaluatorType = EvaluatorType.QA
        llm: BaseLanguageModel | None = None
        prompt: BasePromptTemplate | None = None

    class ContextQA(SingleKeyEvalConfig):
        """Configuration for a context-based QA evaluator.

        Attributes:
            prompt: The prompt template to use for generating the question.
            llm: The language model to use for the evaluation chain.
        """

        evaluator_type: EvaluatorType = EvaluatorType.CONTEXT_QA
        llm: BaseLanguageModel | None = None
        prompt: BasePromptTemplate | None = None

    class CoTQA(SingleKeyEvalConfig):
        """Configuration for a chain-of-thought (CoT) QA evaluator.

        Attributes:
            prompt: The prompt template to use for generating the question.
            llm: The language model to use for the evaluation chain.
        """

        # This previously reused `EvaluatorType.CONTEXT_QA`, which made `CoTQA`
        # indistinguishable from `ContextQA`; select the CoT evaluator instead.
        evaluator_type: EvaluatorType = EvaluatorType.COT_QA
        llm: BaseLanguageModel | None = None
        prompt: BasePromptTemplate | None = None

    class JsonValidity(SingleKeyEvalConfig):
        """Configuration for a json validity evaluator."""

        evaluator_type: EvaluatorType = EvaluatorType.JSON_VALIDITY

    class JsonEqualityEvaluator(EvalConfig):
        """Configuration for a json equality evaluator."""

        evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY

    class ExactMatch(SingleKeyEvalConfig):
        """Configuration for an exact match string evaluator.

        Attributes:
            ignore_case: Whether to ignore case when comparing strings.
            ignore_punctuation: Whether to ignore punctuation when comparing strings.
            ignore_numbers: Whether to ignore numbers when comparing strings.
        """

        evaluator_type: EvaluatorType = EvaluatorType.EXACT_MATCH
        ignore_case: bool = False
        ignore_punctuation: bool = False
        ignore_numbers: bool = False

    class RegexMatch(SingleKeyEvalConfig):
        """Configuration for a regex match string evaluator.

        Attributes:
            flags: The flags to pass to the regex. Example: `re.IGNORECASE`.
        """

        evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
        flags: int = 0

    class ScoreString(SingleKeyEvalConfig):
        """Configuration for a score string evaluator.

        This is like the criteria evaluator but it is configured by
        default to return a score on the scale from 1-10.

        It is recommended to normalize these scores
        by setting `normalize_by` to 10.

        Attributes:
            criteria: The criteria to evaluate.
            llm: The language model to use for the evaluation chain.
            normalize_by: If you want to normalize the score, the denominator to use.
                If not provided, the score will be between 1 and 10.
            prompt: The prompt template to use for evaluation.
        """

        evaluator_type: EvaluatorType = EvaluatorType.SCORE_STRING
        criteria: CRITERIA_TYPE | None = None
        llm: BaseLanguageModel | None = None
        normalize_by: float | None = None
        prompt: BasePromptTemplate | None = None

    class LabeledScoreString(ScoreString):
        """Configuration for a labeled score string evaluator."""

        evaluator_type: EvaluatorType = EvaluatorType.LABELED_SCORE_STRING
|
||||
@@ -0,0 +1,727 @@
|
||||
import random
|
||||
|
||||
adjectives = [
|
||||
"abandoned",
|
||||
"aching",
|
||||
"advanced",
|
||||
"ample",
|
||||
"artistic",
|
||||
"back",
|
||||
"best",
|
||||
"bold",
|
||||
"brief",
|
||||
"clear",
|
||||
"cold",
|
||||
"complicated",
|
||||
"cooked",
|
||||
"crazy",
|
||||
"crushing",
|
||||
"damp",
|
||||
"dear",
|
||||
"definite",
|
||||
"dependable",
|
||||
"diligent",
|
||||
"drab",
|
||||
"earnest",
|
||||
"elderly",
|
||||
"enchanted",
|
||||
"essential",
|
||||
"excellent",
|
||||
"extraneous",
|
||||
"fixed",
|
||||
"flowery",
|
||||
"formal",
|
||||
"fresh",
|
||||
"frosty",
|
||||
"giving",
|
||||
"glossy",
|
||||
"healthy",
|
||||
"helpful",
|
||||
"impressionable",
|
||||
"kind",
|
||||
"large",
|
||||
"left",
|
||||
"long",
|
||||
"loyal",
|
||||
"mealy",
|
||||
"memorable",
|
||||
"monthly",
|
||||
"new",
|
||||
"notable",
|
||||
"only",
|
||||
"ordinary",
|
||||
"passionate",
|
||||
"perfect",
|
||||
"pertinent",
|
||||
"proper",
|
||||
"puzzled",
|
||||
"reflecting",
|
||||
"respectful",
|
||||
"roasted",
|
||||
"scholarly",
|
||||
"shiny",
|
||||
"slight",
|
||||
"sparkling",
|
||||
"spotless",
|
||||
"stupendous",
|
||||
"sunny",
|
||||
"tart",
|
||||
"terrific",
|
||||
"timely",
|
||||
"unique",
|
||||
"upbeat",
|
||||
"vacant",
|
||||
"virtual",
|
||||
"warm",
|
||||
"weary",
|
||||
"whispered",
|
||||
"worthwhile",
|
||||
"yellow",
|
||||
]
|
||||
|
||||
nouns = [
|
||||
"account",
|
||||
"acknowledgment",
|
||||
"address",
|
||||
"advertising",
|
||||
"airplane",
|
||||
"animal",
|
||||
"appointment",
|
||||
"arrival",
|
||||
"artist",
|
||||
"attachment",
|
||||
"attitude",
|
||||
"availability",
|
||||
"backpack",
|
||||
"bag",
|
||||
"balance",
|
||||
"bass",
|
||||
"bean",
|
||||
"beauty",
|
||||
"bibliography",
|
||||
"bill",
|
||||
"bite",
|
||||
"blossom",
|
||||
"boat",
|
||||
"book",
|
||||
"box",
|
||||
"boy",
|
||||
"bread",
|
||||
"bridge",
|
||||
"broccoli",
|
||||
"building",
|
||||
"butter",
|
||||
"button",
|
||||
"cabbage",
|
||||
"cake",
|
||||
"camera",
|
||||
"camp",
|
||||
"candle",
|
||||
"candy",
|
||||
"canvas",
|
||||
"car",
|
||||
"card",
|
||||
"carrot",
|
||||
"cart",
|
||||
"case",
|
||||
"cat",
|
||||
"chain",
|
||||
"chair",
|
||||
"chalk",
|
||||
"chance",
|
||||
"change",
|
||||
"channel",
|
||||
"character",
|
||||
"charge",
|
||||
"charm",
|
||||
"chart",
|
||||
"check",
|
||||
"cheek",
|
||||
"cheese",
|
||||
"chef",
|
||||
"cherry",
|
||||
"chicken",
|
||||
"child",
|
||||
"church",
|
||||
"circle",
|
||||
"class",
|
||||
"clay",
|
||||
"click",
|
||||
"clock",
|
||||
"cloth",
|
||||
"cloud",
|
||||
"clove",
|
||||
"club",
|
||||
"coach",
|
||||
"coal",
|
||||
"coast",
|
||||
"coat",
|
||||
"cod",
|
||||
"coffee",
|
||||
"collar",
|
||||
"color",
|
||||
"comb",
|
||||
"comfort",
|
||||
"comic",
|
||||
"committee",
|
||||
"community",
|
||||
"company",
|
||||
"comparison",
|
||||
"competition",
|
||||
"condition",
|
||||
"connection",
|
||||
"control",
|
||||
"cook",
|
||||
"copper",
|
||||
"copy",
|
||||
"corn",
|
||||
"cough",
|
||||
"country",
|
||||
"cover",
|
||||
"crate",
|
||||
"crayon",
|
||||
"cream",
|
||||
"creator",
|
||||
"crew",
|
||||
"crown",
|
||||
"current",
|
||||
"curtain",
|
||||
"curve",
|
||||
"cushion",
|
||||
"dad",
|
||||
"daughter",
|
||||
"day",
|
||||
"death",
|
||||
"debt",
|
||||
"decision",
|
||||
"deer",
|
||||
"degree",
|
||||
"design",
|
||||
"desire",
|
||||
"desk",
|
||||
"detail",
|
||||
"development",
|
||||
"digestion",
|
||||
"dime",
|
||||
"dinner",
|
||||
"direction",
|
||||
"dirt",
|
||||
"discovery",
|
||||
"discussion",
|
||||
"disease",
|
||||
"disgust",
|
||||
"distance",
|
||||
"distribution",
|
||||
"division",
|
||||
"doctor",
|
||||
"dog",
|
||||
"door",
|
||||
"drain",
|
||||
"drawer",
|
||||
"dress",
|
||||
"drink",
|
||||
"driving",
|
||||
"dust",
|
||||
"ear",
|
||||
"earth",
|
||||
"edge",
|
||||
"education",
|
||||
"effect",
|
||||
"egg",
|
||||
"end",
|
||||
"energy",
|
||||
"engine",
|
||||
"error",
|
||||
"event",
|
||||
"example",
|
||||
"exchange",
|
||||
"existence",
|
||||
"expansion",
|
||||
"experience",
|
||||
"expert",
|
||||
"eye",
|
||||
"face",
|
||||
"fact",
|
||||
"fall",
|
||||
"family",
|
||||
"farm",
|
||||
"father",
|
||||
"fear",
|
||||
"feeling",
|
||||
"field",
|
||||
"finger",
|
||||
"fire",
|
||||
"fish",
|
||||
"flag",
|
||||
"flight",
|
||||
"floor",
|
||||
"flower",
|
||||
"fold",
|
||||
"food",
|
||||
"football",
|
||||
"force",
|
||||
"form",
|
||||
"frame",
|
||||
"friend",
|
||||
"frog",
|
||||
"fruit",
|
||||
"fuel",
|
||||
"furniture",
|
||||
"game",
|
||||
"garden",
|
||||
"gate",
|
||||
"girl",
|
||||
"glass",
|
||||
"glove",
|
||||
"goat",
|
||||
"gold",
|
||||
"government",
|
||||
"grade",
|
||||
"grain",
|
||||
"grass",
|
||||
"green",
|
||||
"grip",
|
||||
"group",
|
||||
"growth",
|
||||
"guide",
|
||||
"guitar",
|
||||
"hair",
|
||||
"hall",
|
||||
"hand",
|
||||
"harbor",
|
||||
"harmony",
|
||||
"hat",
|
||||
"head",
|
||||
"health",
|
||||
"heart",
|
||||
"heat",
|
||||
"hill",
|
||||
"history",
|
||||
"hobbies",
|
||||
"hole",
|
||||
"hope",
|
||||
"horn",
|
||||
"horse",
|
||||
"hospital",
|
||||
"hour",
|
||||
"house",
|
||||
"humor",
|
||||
"idea",
|
||||
"impulse",
|
||||
"income",
|
||||
"increase",
|
||||
"industry",
|
||||
"ink",
|
||||
"insect",
|
||||
"instrument",
|
||||
"insurance",
|
||||
"interest",
|
||||
"invention",
|
||||
"iron",
|
||||
"island",
|
||||
"jelly",
|
||||
"jet",
|
||||
"jewel",
|
||||
"join",
|
||||
"judge",
|
||||
"juice",
|
||||
"jump",
|
||||
"kettle",
|
||||
"key",
|
||||
"kick",
|
||||
"kiss",
|
||||
"kitten",
|
||||
"knee",
|
||||
"knife",
|
||||
"knowledge",
|
||||
"land",
|
||||
"language",
|
||||
"laugh",
|
||||
"law",
|
||||
"lead",
|
||||
"learning",
|
||||
"leather",
|
||||
"leg",
|
||||
"lettuce",
|
||||
"level",
|
||||
"library",
|
||||
"lift",
|
||||
"light",
|
||||
"limit",
|
||||
"line",
|
||||
"linen",
|
||||
"lip",
|
||||
"liquid",
|
||||
"list",
|
||||
"look",
|
||||
"loss",
|
||||
"love",
|
||||
"lunch",
|
||||
"machine",
|
||||
"man",
|
||||
"manager",
|
||||
"map",
|
||||
"marble",
|
||||
"mark",
|
||||
"market",
|
||||
"mass",
|
||||
"match",
|
||||
"meal",
|
||||
"measure",
|
||||
"meat",
|
||||
"meeting",
|
||||
"memory",
|
||||
"metal",
|
||||
"middle",
|
||||
"milk",
|
||||
"mind",
|
||||
"mine",
|
||||
"minute",
|
||||
"mist",
|
||||
"mitten",
|
||||
"mom",
|
||||
"money",
|
||||
"monkey",
|
||||
"month",
|
||||
"moon",
|
||||
"morning",
|
||||
"mother",
|
||||
"motion",
|
||||
"mountain",
|
||||
"mouth",
|
||||
"muscle",
|
||||
"music",
|
||||
"nail",
|
||||
"name",
|
||||
"nation",
|
||||
"neck",
|
||||
"need",
|
||||
"news",
|
||||
"night",
|
||||
"noise",
|
||||
"note",
|
||||
"number",
|
||||
"nut",
|
||||
"observation",
|
||||
"offer",
|
||||
"oil",
|
||||
"operation",
|
||||
"opinion",
|
||||
"orange",
|
||||
"order",
|
||||
"organization",
|
||||
"ornament",
|
||||
"oven",
|
||||
"page",
|
||||
"pail",
|
||||
"pain",
|
||||
"paint",
|
||||
"pan",
|
||||
"pancake",
|
||||
"paper",
|
||||
"parcel",
|
||||
"parent",
|
||||
"part",
|
||||
"passenger",
|
||||
"paste",
|
||||
"payment",
|
||||
"peace",
|
||||
"pear",
|
||||
"pen",
|
||||
"pencil",
|
||||
"person",
|
||||
"pest",
|
||||
"pet",
|
||||
"picture",
|
||||
"pie",
|
||||
"pin",
|
||||
"pipe",
|
||||
"pizza",
|
||||
"place",
|
||||
"plane",
|
||||
"plant",
|
||||
"plastic",
|
||||
"plate",
|
||||
"play",
|
||||
"pleasure",
|
||||
"plot",
|
||||
"plough",
|
||||
"pocket",
|
||||
"point",
|
||||
"poison",
|
||||
"police",
|
||||
"pollution",
|
||||
"popcorn",
|
||||
"porter",
|
||||
"position",
|
||||
"pot",
|
||||
"potato",
|
||||
"powder",
|
||||
"power",
|
||||
"price",
|
||||
"print",
|
||||
"process",
|
||||
"produce",
|
||||
"product",
|
||||
"profit",
|
||||
"property",
|
||||
"prose",
|
||||
"protest",
|
||||
"pull",
|
||||
"pump",
|
||||
"punishment",
|
||||
"purpose",
|
||||
"push",
|
||||
"quarter",
|
||||
"question",
|
||||
"quiet",
|
||||
"quill",
|
||||
"quilt",
|
||||
"quince",
|
||||
"rabbit",
|
||||
"rail",
|
||||
"rain",
|
||||
"range",
|
||||
"rat",
|
||||
"rate",
|
||||
"ray",
|
||||
"reaction",
|
||||
"reading",
|
||||
"reason",
|
||||
"record",
|
||||
"regret",
|
||||
"relation",
|
||||
"religion",
|
||||
"representative",
|
||||
"request",
|
||||
"respect",
|
||||
"rest",
|
||||
"reward",
|
||||
"rhythm",
|
||||
"rice",
|
||||
"river",
|
||||
"road",
|
||||
"roll",
|
||||
"room",
|
||||
"root",
|
||||
"rose",
|
||||
"route",
|
||||
"rub",
|
||||
"rule",
|
||||
"run",
|
||||
"sack",
|
||||
"sail",
|
||||
"salt",
|
||||
"sand",
|
||||
"scale",
|
||||
"scarecrow",
|
||||
"scarf",
|
||||
"scene",
|
||||
"scent",
|
||||
"school",
|
||||
"science",
|
||||
"scissors",
|
||||
"screw",
|
||||
"sea",
|
||||
"seat",
|
||||
"secretary",
|
||||
"seed",
|
||||
"selection",
|
||||
"self",
|
||||
"sense",
|
||||
"servant",
|
||||
"shade",
|
||||
"shake",
|
||||
"shame",
|
||||
"shape",
|
||||
"sheep",
|
||||
"sheet",
|
||||
"shelf",
|
||||
"ship",
|
||||
"shirt",
|
||||
"shock",
|
||||
"shoe",
|
||||
"shop",
|
||||
"show",
|
||||
"side",
|
||||
"sign",
|
||||
"silk",
|
||||
"sink",
|
||||
"sister",
|
||||
"size",
|
||||
"sky",
|
||||
"sleep",
|
||||
"smash",
|
||||
"smell",
|
||||
"smile",
|
||||
"smoke",
|
||||
"snail",
|
||||
"snake",
|
||||
"sneeze",
|
||||
"snow",
|
||||
"soap",
|
||||
"society",
|
||||
"sock",
|
||||
"soda",
|
||||
"sofa",
|
||||
"son",
|
||||
"song",
|
||||
"sort",
|
||||
"sound",
|
||||
"soup",
|
||||
"space",
|
||||
"spark",
|
||||
"speed",
|
||||
"sponge",
|
||||
"spoon",
|
||||
"spray",
|
||||
"spring",
|
||||
"spy",
|
||||
"square",
|
||||
"stamp",
|
||||
"star",
|
||||
"start",
|
||||
"statement",
|
||||
"station",
|
||||
"steam",
|
||||
"steel",
|
||||
"stem",
|
||||
"step",
|
||||
"stew",
|
||||
"stick",
|
||||
"stitch",
|
||||
"stocking",
|
||||
"stomach",
|
||||
"stone",
|
||||
"stop",
|
||||
"store",
|
||||
"story",
|
||||
"stove",
|
||||
"stranger",
|
||||
"straw",
|
||||
"stream",
|
||||
"street",
|
||||
"stretch",
|
||||
"string",
|
||||
"structure",
|
||||
"substance",
|
||||
"sugar",
|
||||
"suggestion",
|
||||
"suit",
|
||||
"summer",
|
||||
"sun",
|
||||
"support",
|
||||
"surprise",
|
||||
"sweater",
|
||||
"swim",
|
||||
"system",
|
||||
"table",
|
||||
"tail",
|
||||
"talk",
|
||||
"tank",
|
||||
"taste",
|
||||
"tax",
|
||||
"tea",
|
||||
"teaching",
|
||||
"team",
|
||||
"tendency",
|
||||
"test",
|
||||
"texture",
|
||||
"theory",
|
||||
"thing",
|
||||
"thought",
|
||||
"thread",
|
||||
"throat",
|
||||
"thumb",
|
||||
"thunder",
|
||||
"ticket",
|
||||
"time",
|
||||
"tin",
|
||||
"title",
|
||||
"toad",
|
||||
"toe",
|
||||
"tooth",
|
||||
"toothpaste",
|
||||
"touch",
|
||||
"town",
|
||||
"toy",
|
||||
"trade",
|
||||
"train",
|
||||
"transport",
|
||||
"tray",
|
||||
"treatment",
|
||||
"tree",
|
||||
"trick",
|
||||
"trip",
|
||||
"trouble",
|
||||
"trousers",
|
||||
"truck",
|
||||
"tub",
|
||||
"turkey",
|
||||
"turn",
|
||||
"twist",
|
||||
"umbrella",
|
||||
"uncle",
|
||||
"underwear",
|
||||
"unit",
|
||||
"use",
|
||||
"vacation",
|
||||
"value",
|
||||
"van",
|
||||
"vase",
|
||||
"vegetable",
|
||||
"veil",
|
||||
"vein",
|
||||
"verse",
|
||||
"vessel",
|
||||
"view",
|
||||
"visitor",
|
||||
"voice",
|
||||
"volcano",
|
||||
"walk",
|
||||
"wall",
|
||||
"war",
|
||||
"wash",
|
||||
"waste",
|
||||
"watch",
|
||||
"water",
|
||||
"wave",
|
||||
"wax",
|
||||
"way",
|
||||
"wealth",
|
||||
"weather",
|
||||
"week",
|
||||
"weight",
|
||||
"wheel",
|
||||
"whip",
|
||||
"whistle",
|
||||
"window",
|
||||
"wine",
|
||||
"wing",
|
||||
"winter",
|
||||
"wire",
|
||||
"wish",
|
||||
"woman",
|
||||
"wood",
|
||||
"wool",
|
||||
"word",
|
||||
"work",
|
||||
"worm",
|
||||
"wound",
|
||||
"wrist",
|
||||
"writer",
|
||||
"yard",
|
||||
"yoke",
|
||||
"zebra",
|
||||
"zinc",
|
||||
"zipper",
|
||||
"zone",
|
||||
]
|
||||
|
||||
|
||||
def random_name() -> str:
    """Build a random ``<adjective>-<noun>-<number>`` name.

    The adjective and noun are drawn from the module-level word lists and the
    number is an integer from 1 to 100 inclusive.
    """
    parts = (
        random.choice(adjectives),  # noqa: S311
        random.choice(nouns),  # noqa: S311
        str(random.randint(1, 100)),  # noqa: S311
    )
    return "-".join(parts)
|
||||
@@ -0,0 +1,145 @@
|
||||
"""A simple progress bar for the console."""
|
||||
|
||||
import threading
|
||||
from collections.abc import Sequence
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from langchain_core.callbacks import base as base_callbacks
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.outputs import LLMResult
|
||||
from typing_extensions import override
|
||||
|
||||
|
||||
class ProgressBarCallback(base_callbacks.BaseCallbackHandler):
    """A simple progress bar for the console.

    The bar advances by one each time a top-level run (one whose
    `parent_run_id` is `None`) ends or errors, whether that run is a chain,
    retriever, LLM or tool. Nested runs do not move the bar.
    """

    def __init__(
        self,
        total: int,
        ncols: int = 50,
        end_with: str = "\n",
    ) -> None:
        """Initialize the progress bar and draw it once in its empty state.

        Args:
            total: The total number of items to be processed.
            ncols: The character width of the progress bar.
            end_with: Last string to print after progress bar reaches end.
        """
        self.total = total
        self.ncols = ncols
        self.end_with = end_with
        self.counter = 0
        self.lock = threading.Lock()
        self._print_bar()

    def increment(self) -> None:
        """Increment the counter and update the progress bar."""
        # Count and redraw under one lock so concurrent callbacks cannot
        # interleave their updates.
        with self.lock:
            self.counter += 1
            self._print_bar()

    def _print_bar(self) -> None:
        """Print the progress bar to the console."""
        # With nothing to process the bar is drawn as complete; dividing by a
        # zero `total` would otherwise raise ZeroDivisionError (including from
        # `__init__`, which draws the bar immediately).
        progress = self.counter / self.total if self.total else 1.0
        arrow = "-" * int(round(progress * self.ncols) - 1) + ">"
        spaces = " " * (self.ncols - len(arrow))
        # Stay on the same line (via "\r") until the final item is reached.
        end = "" if self.counter < self.total else self.end_with
        print(f"\r[{arrow + spaces}] {self.counter}/{self.total}", end=end)  # noqa: T201

    @override
    def on_chain_error(
        self,
        error: BaseException,
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level chain run errors."""
        if parent_run_id is None:
            self.increment()

    @override
    def on_chain_end(
        self,
        outputs: dict[str, Any],
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level chain run ends."""
        if parent_run_id is None:
            self.increment()

    @override
    def on_retriever_error(
        self,
        error: BaseException,
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level retriever run errors."""
        if parent_run_id is None:
            self.increment()

    @override
    def on_retriever_end(
        self,
        documents: Sequence[Document],
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level retriever run ends."""
        if parent_run_id is None:
            self.increment()

    @override
    def on_llm_error(
        self,
        error: BaseException,
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level LLM run errors."""
        if parent_run_id is None:
            self.increment()

    @override
    def on_llm_end(
        self,
        response: LLMResult,
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level LLM run ends."""
        if parent_run_id is None:
            self.increment()

    @override
    def on_tool_error(
        self,
        error: BaseException,
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level tool run errors."""
        if parent_run_id is None:
            self.increment()

    @override
    def on_tool_end(
        self,
        output: str,
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level tool run ends."""
        if parent_run_id is None:
            self.increment()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,477 @@
|
||||
"""Run evaluator wrapper for string evaluators."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from abc import abstractmethod
|
||||
from typing import Any, cast
|
||||
|
||||
from langchain_core.callbacks.manager import (
|
||||
AsyncCallbackManagerForChainRun,
|
||||
CallbackManagerForChainRun,
|
||||
)
|
||||
from langchain_core.load.dump import dumpd
|
||||
from langchain_core.load.load import load
|
||||
from langchain_core.load.serializable import Serializable
|
||||
from langchain_core.messages import BaseMessage, get_buffer_string, messages_from_dict
|
||||
from langsmith import EvaluationResult, RunEvaluator
|
||||
from langsmith.schemas import DataType, Example, Run
|
||||
from typing_extensions import override
|
||||
|
||||
from langchain_classic.chains.base import Chain
|
||||
from langchain_classic.evaluation.schema import StringEvaluator
|
||||
from langchain_classic.schema import RUN_KEY
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_messages_from_run_dict(messages: list[dict]) -> list[BaseMessage]:
|
||||
if not messages:
|
||||
return []
|
||||
first_message = messages[0]
|
||||
if "lc" in first_message:
|
||||
return [load(dumpd(message)) for message in messages]
|
||||
return messages_from_dict(messages)
|
||||
|
||||
|
||||
class StringRunMapper(Serializable):
    """Extract items to evaluate from the run object."""

    @property
    def output_keys(self) -> list[str]:
        """The keys to extract from the run."""
        return ["prediction", "input"]

    @abstractmethod
    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary."""

    def __call__(self, run: Run) -> dict[str, str]:
        """Map the Run to a dictionary, rejecting runs without outputs.

        Raises:
            ValueError: If `run.outputs` is empty or missing.
        """
        if run.outputs:
            return self.map(run)
        msg = f"Run {run.id} has no outputs to evaluate."
        raise ValueError(msg)
|
||||
|
||||
|
||||
class LLMStringRunMapper(StringRunMapper):
|
||||
"""Extract items to evaluate from the run object."""
|
||||
|
||||
def serialize_chat_messages(self, messages: list[dict] | list[list[dict]]) -> str:
    """Render the chat messages recorded on a run as a single string.

    Args:
        messages: Either a list of message dicts, or a list of lists of
            message dicts (in which case only the first inner list is used).

    Returns:
        The buffer string built from the decoded messages.

    Raises:
        ValueError: If `messages` is not a non-empty list, or its first
            element is neither a dict nor a list.
    """
    if not (isinstance(messages, list) and messages):
        msg = f"Could not extract messages to evaluate {messages}"
        raise ValueError(msg)
    head = messages[0]
    if isinstance(head, dict):
        chat_messages = _get_messages_from_run_dict(cast("list[dict]", messages))
    elif isinstance(head, list):
        # Runs from Tracer have messages as a list of lists of dicts
        chat_messages = _get_messages_from_run_dict(head)
    else:
        msg = f"Could not extract messages to evaluate {messages}"  # type: ignore[unreachable]
        raise ValueError(msg)
    return get_buffer_string(chat_messages)
|
||||
|
||||
def serialize_inputs(self, inputs: dict) -> str:
|
||||
"""Serialize inputs.
|
||||
|
||||
Args:
|
||||
inputs: The inputs from the run, expected to contain prompts or messages.
|
||||
|
||||
Returns:
|
||||
The serialized input text from the prompts or messages.
|
||||
|
||||
Raises:
|
||||
ValueError: If neither prompts nor messages are found in the inputs.
|
||||
"""
|
||||
if "prompts" in inputs: # Should we even accept this?
|
||||
input_ = "\n\n".join(inputs["prompts"])
|
||||
elif "prompt" in inputs:
|
||||
input_ = inputs["prompt"]
|
||||
elif "messages" in inputs:
|
||||
input_ = self.serialize_chat_messages(inputs["messages"])
|
||||
else:
|
||||
msg = "LLM Run must have either messages or prompts as inputs."
|
||||
raise ValueError(msg)
|
||||
return input_
|
||||
|
||||
def serialize_outputs(self, outputs: dict) -> str:
|
||||
"""Serialize outputs.
|
||||
|
||||
Args:
|
||||
outputs: The outputs from the run, expected to contain generations.
|
||||
|
||||
Returns:
|
||||
The serialized output text from the first generation.
|
||||
|
||||
Raises:
|
||||
ValueError: If no generations are found in the outputs or if the generations
|
||||
are empty.
|
||||
"""
|
||||
if not outputs.get("generations"):
|
||||
msg = "Cannot evaluate LLM Run without generations."
|
||||
raise ValueError(msg)
|
||||
generations: list[dict] | list[list[dict]] = outputs["generations"]
|
||||
if not generations:
|
||||
msg = "Cannot evaluate LLM run with empty generations."
|
||||
raise ValueError(msg)
|
||||
first_generation: dict | list[dict] = generations[0]
|
||||
if isinstance(first_generation, list):
|
||||
# Runs from Tracer have generations as a list of lists of dicts
|
||||
# Whereas Runs from the API have a list of dicts
|
||||
first_generation = first_generation[0]
|
||||
if "message" in first_generation:
|
||||
output_ = self.serialize_chat_messages([first_generation["message"]])
|
||||
else:
|
||||
output_ = first_generation["text"]
|
||||
return output_
|
||||
|
||||
def map(self, run: Run) -> dict[str, str]:
|
||||
"""Maps the Run to a dictionary."""
|
||||
if run.run_type != "llm":
|
||||
msg = "LLM RunMapper only supports LLM runs."
|
||||
raise ValueError(msg)
|
||||
if not run.outputs:
|
||||
if run.error:
|
||||
msg = f"Cannot evaluate errored LLM run {run.id}: {run.error}"
|
||||
raise ValueError(msg)
|
||||
msg = f"Run {run.id} has no outputs. Cannot evaluate this run."
|
||||
raise ValueError(msg)
|
||||
try:
|
||||
inputs = self.serialize_inputs(run.inputs)
|
||||
except Exception as e:
|
||||
msg = f"Could not parse LM input from run inputs {run.inputs}"
|
||||
raise ValueError(msg) from e
|
||||
try:
|
||||
output_ = self.serialize_outputs(run.outputs)
|
||||
except Exception as e:
|
||||
msg = f"Could not parse LM prediction from run outputs {run.outputs}"
|
||||
raise ValueError(msg) from e
|
||||
return {"input": inputs, "prediction": output_}
|
||||
|
||||
|
||||
class ChainStringRunMapper(StringRunMapper):
    """Pull the input and prediction to evaluate out of a chain run."""

    input_key: str | None = None
    """Key in the run's inputs whose value is used as the eval input.

    When unset, the single input value is used; an error is raised if the
    inputs do not contain exactly one key."""
    prediction_key: str | None = None
    """Key in the run's outputs whose value is used as the eval prediction.

    When unset, the single output value is used; an error is raised if the
    outputs do not contain exactly one key."""

    def _get_key(self, source: dict, key: str | None, which: str) -> str:
        """Select one value from `source`.

        Args:
            source: The run inputs or outputs.
            key: Explicit key to read, or `None` to take the only value.
            which: Label ("input" or "prediction") used in the error message.

        Returns:
            `source[key]` when a key is given, otherwise the sole value.

        Raises:
            ValueError: If no key is given and `source` does not hold exactly
                one entry.
        """
        if key is None:
            if len(source) != 1:
                msg = (
                    f"Could not map run {which} with multiple keys: "
                    f"{source}\nPlease manually specify a {which}_key"
                )
                raise ValueError(msg)
            (only_value,) = source.values()
            return only_value
        return source[key]

    def map(self, run: Run) -> dict[str, str]:
        """Convert a chain run into its "input" and "prediction" values.

        Args:
            run: The chain run to convert.

        Returns:
            A dictionary with the "input" and "prediction" entries.

        Raises:
            ValueError: If the run has no outputs, a configured key is missing
                from the run, or a key must be inferred but cannot be.
        """
        if not run.outputs:
            msg = (
                f"Run with ID {run.id} lacks outputs required for evaluation."
                " Ensure the Run has valid outputs."
            )
            raise ValueError(msg)

        # Validate explicitly configured keys up front so the error can list
        # what the run actually contains.
        if not (self.input_key is None or self.input_key in run.inputs):
            msg = (
                f"Run with ID {run.id} is missing the expected input key"
                f" '{self.input_key}'.\nAvailable input keys in this Run"
                f" are: {run.inputs.keys()}.\nAdjust the evaluator's"
                f" input_key or ensure your input data includes key"
                f" '{self.input_key}'."
            )
            raise ValueError(msg)
        if not (self.prediction_key is None or self.prediction_key in run.outputs):
            available_keys = ", ".join(run.outputs.keys())
            msg = (
                f"Run with ID {run.id} doesn't have the expected prediction key"
                f" '{self.prediction_key}'. Available prediction keys in this Run are:"
                f" {available_keys}. Adjust the evaluator's prediction_key or"
                " ensure the Run object's outputs the expected key."
            )
            raise ValueError(msg)

        return {
            "input": self._get_key(run.inputs, self.input_key, "input"),
            "prediction": self._get_key(
                run.outputs, self.prediction_key, "prediction"
            ),
        }
|
||||
|
||||
|
||||
class ToolStringRunMapper(StringRunMapper):
    """Extract the input and output of a tool run for evaluation."""

    @override
    def map(self, run: Run) -> dict[str, str]:
        # The run's "input" entry becomes the eval input and its "output"
        # entry becomes the prediction.
        if run.outputs:
            return {
                "input": run.inputs["input"],
                "prediction": run.outputs["output"],
            }
        error_text = f"Run {run.id} has no outputs to evaluate."
        raise ValueError(error_text)
|
||||
|
||||
|
||||
class StringExampleMapper(Serializable):
    """Map an example, or row in the dataset, to the inputs of an evaluation."""

    # Key in the example's outputs to use as the reference. When None, the
    # example must have exactly one output, which is used as the reference.
    reference_key: str | None = None

    @property
    def output_keys(self) -> list[str]:
        """The keys produced by this mapper."""
        return ["reference"]

    def serialize_chat_messages(self, messages: list[dict]) -> str:
        """Render serialized chat messages as a single buffer string.

        Args:
            messages: Message dictionaries to deserialize and format.

        Returns:
            The messages formatted with `get_buffer_string`.
        """
        chat_messages = _get_messages_from_run_dict(messages)
        return get_buffer_string(chat_messages)

    def map(self, example: Example) -> dict[str, str]:
        """Maps the Example, or dataset row to a dictionary.

        Args:
            example: The dataset example to read the reference from.

        Returns:
            A dictionary with a single "reference" entry.

        Raises:
            ValueError: If the example has no outputs, has several outputs
                while no `reference_key` is set, or lacks the configured
                `reference_key`.
        """
        if not example.outputs:
            msg = f"Example {example.id} has no outputs to use as a reference."
            raise ValueError(msg)
        if self.reference_key is None:
            if len(example.outputs) > 1:
                msg = (
                    f"Example {example.id} has multiple outputs, so you must"
                    " specify a reference_key."
                )
                raise ValueError(msg)
            output = next(iter(example.outputs.values()))
        elif self.reference_key not in example.outputs:
            msg = (
                f"Example {example.id} does not have reference key"
                f" {self.reference_key}."
            )
            raise ValueError(msg)
        else:
            output = example.outputs[self.reference_key]
        # A dict carrying truthy "type" and "data" entries is treated as a
        # serialized chat message and rendered to text; anything else is
        # passed through unchanged.
        return {
            "reference": self.serialize_chat_messages([output])
            if isinstance(output, dict) and output.get("type") and output.get("data")
            else output,
        }

    def __call__(self, example: Example) -> dict[str, str]:
        """Check that the example has outputs, then delegate to `map`.

        Args:
            example: The dataset example to read the reference from.

        Returns:
            Whatever `map` returns for the example.

        Raises:
            ValueError: If the example has no outputs.
        """
        if not example.outputs:
            msg = f"Example {example.id} has no outputs to use as a reference label."
            raise ValueError(msg)
        return self.map(example)
|
||||
|
||||
|
||||
class StringRunEvaluatorChain(Chain, RunEvaluator):
    """Evaluate Run and optional examples."""

    run_mapper: StringRunMapper
    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
    example_mapper: StringExampleMapper | None = None
    """Maps the Example (dataset row) to a dictionary
    with a 'reference' string."""
    name: str
    """The name of the evaluation metric."""
    string_evaluator: StringEvaluator
    """The evaluation chain."""

    @property
    @override
    def input_keys(self) -> list[str]:
        return ["run", "example"]

    @property
    @override
    def output_keys(self) -> list[str]:
        return ["feedback"]

    def _prepare_input(self, inputs: dict[str, Any]) -> dict[str, str]:
        """Build the keyword arguments passed to the string evaluator.

        Args:
            inputs: Chain inputs holding the "run" and, optionally, the "example".

        Returns:
            The mapped run values, plus the reference when the evaluator
            requires one.

        Raises:
            ValueError: If the evaluator requires a reference but no example or
                no example mapper is available.
        """
        run: Run = inputs["run"]
        example: Example | None = inputs.get("example")
        evaluate_strings_inputs = self.run_mapper(run)
        if not self.string_evaluator.requires_input:
            # Hide warning about unused input
            evaluate_strings_inputs.pop("input", None)
        if example and self.example_mapper and self.string_evaluator.requires_reference:
            evaluate_strings_inputs.update(self.example_mapper(example))
        elif self.string_evaluator.requires_reference:
            msg = (
                f"Evaluator {self.name} requires a reference"
                " example from the dataset,"
                f" but none was provided for run {run.id}."
            )
            raise ValueError(msg)
        return evaluate_strings_inputs

    def _prepare_output(self, output: dict[str, Any]) -> dict[str, Any]:
        """Wrap the evaluator output in an `EvaluationResult` under "feedback".

        Args:
            output: The dictionary returned by the string evaluator.

        Returns:
            A dictionary with a single "feedback" entry.
        """
        # The metric name is the key, the evaluator's "reasoning" (if any) is
        # the comment, and every output entry is forwarded as a keyword.
        evaluation_result = EvaluationResult(
            key=self.name,
            comment=output.get("reasoning"),
            **output,
        )
        if RUN_KEY in output:
            # TODO: Not currently surfaced. Update
            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return {"feedback": evaluation_result}

    def _call(
        self,
        inputs: dict[str, str],
        run_manager: CallbackManagerForChainRun | None = None,
    ) -> dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = self.string_evaluator.evaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    async def _acall(
        self,
        inputs: dict[str, str],
        run_manager: AsyncCallbackManagerForChainRun | None = None,
    ) -> dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = await self.string_evaluator.aevaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    def _prepare_evaluator_output(self, output: dict[str, Any]) -> EvaluationResult:
        """Extract the feedback from the chain output.

        If the feedback's evaluator info has no run info yet, it is copied from
        the chain output.
        """
        feedback: EvaluationResult = output["feedback"]
        if RUN_KEY not in feedback.evaluator_info:
            feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return feedback

    @override
    def evaluate_run(
        self,
        run: Run,
        example: Example | None = None,
        evaluator_run_id: uuid.UUID | None = None,
    ) -> EvaluationResult:
        """Evaluate an example.

        Args:
            run: The run to evaluate.
            example: The dataset example providing the reference, if any.
            evaluator_run_id: Accepted as part of the signature; not used by
                this implementation.

        Returns:
            The evaluation feedback. If evaluation raises, the exception is
            logged and a result carrying the error text as its comment is
            returned instead of propagating the error.
        """
        try:
            result = self({"run": run, "example": example}, include_run_info=True)
            return self._prepare_evaluator_output(result)
        except Exception as e:
            _logger.exception("Error evaluating run %s", run.id)
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
                # TODO: Add run ID once we can declare it via callbacks
            )

    @override
    async def aevaluate_run(
        self,
        run: Run,
        example: Example | None = None,
        evaluator_run_id: uuid.UUID | None = None,
    ) -> EvaluationResult:
        """Evaluate an example asynchronously.

        Args:
            run: The run to evaluate.
            example: The dataset example providing the reference, if any.
            evaluator_run_id: Accepted as part of the signature; not used by
                this implementation.

        Returns:
            The evaluation feedback. If evaluation raises, the exception is
            logged and a result carrying the error text as its comment is
            returned instead of propagating the error.
        """
        try:
            result = await self.acall(
                {"run": run, "example": example},
                include_run_info=True,
            )
            return self._prepare_evaluator_output(result)
        except Exception as e:
            _logger.exception("Error evaluating run %s", run.id)
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
            )

    @classmethod
    def from_run_and_data_type(
        cls,
        evaluator: StringEvaluator,
        run_type: str,
        data_type: DataType,
        input_key: str | None = None,
        prediction_key: str | None = None,
        reference_key: str | None = None,
        tags: list[str] | None = None,
    ) -> StringRunEvaluatorChain:
        """Create a StringRunEvaluatorChain.

        Create a StringRunEvaluatorChain from an evaluator and the run and dataset
        types.

        This method provides an easy way to instantiate a StringRunEvaluatorChain, by
        taking an evaluator and information about the type of run and the data.
        The method supports LLM and chain runs.

        Args:
            evaluator: The string evaluator to use.
            run_type: The type of run being evaluated.
                Supported types are LLM and Chain.
            data_type: The type of dataset used in the run.
            input_key: The key used to map the input from the run.
            prediction_key: The key used to map the prediction from the run.
            reference_key: The key used to map the reference from the dataset.
                When omitted and a reference is needed, the example mapper is
                created without a key.
            tags: List of tags to attach to the evaluation chain.

        Returns:
            The instantiated evaluation chain.

        Raises:
            ValueError: If the run type is not supported.

        """
        # Configure how run inputs/predictions are passed to the evaluator
        if run_type == "llm":
            run_mapper: StringRunMapper = LLMStringRunMapper()
        elif run_type == "chain":
            run_mapper = ChainStringRunMapper(
                input_key=input_key,
                prediction_key=prediction_key,
            )
        else:
            msg = f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
            raise ValueError(msg)

        # Configure how example rows are fed as a reference string to the evaluator.
        # An example mapper is built whenever a reference key is given, the
        # dataset is an LLM/chat dataset, or the evaluator requires a reference.
        if (
            reference_key is not None
            or data_type in (DataType.llm, DataType.chat)
            or evaluator.requires_reference
        ):
            example_mapper: StringExampleMapper | None = StringExampleMapper(
                reference_key=reference_key
            )
        else:
            example_mapper = None
        return cls(
            name=evaluator.evaluation_name,
            run_mapper=run_mapper,
            example_mapper=example_mapper,
            string_evaluator=evaluator,
            tags=tags,
        )
|
||||
Reference in New Issue
Block a user