initial commit

2026-05-11 12:36:20 +05:30
commit 384cbe8019
15377 changed files with 2360544 additions and 0 deletions
--- a/venv/Lib/site-packages/langchain_classic/evaluation/schema.py
+++ b/venv/Lib/site-packages/langchain_classic/evaluation/schema.py
@@ -0,0 +1,507 @@
+"""Interfaces to be implemented by general evaluators."""
+
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from collections.abc import Sequence
+from enum import Enum
+from typing import Any
+from warnings import warn
+
+from langchain_core.agents import AgentAction
+from langchain_core.language_models import BaseLanguageModel
+from langchain_core.runnables.config import run_in_executor
+
+from langchain_classic.chains.base import Chain
+
+logger = logging.getLogger(__name__)
+
+
+class EvaluatorType(str, Enum):
+    """The types of the evaluators."""
+
+    QA = "qa"
+    """Question answering evaluator, which grades answers to questions
+    directly using an LLM."""
+    COT_QA = "cot_qa"
+    """Chain of thought question answering evaluator, which grades
+    answers to questions using
+    chain of thought 'reasoning'."""
+    CONTEXT_QA = "context_qa"
+    """Question answering evaluator that incorporates 'context' in the response."""
+    PAIRWISE_STRING = "pairwise_string"
+    """The pairwise string evaluator, which predicts the preferred prediction from
+    between two models."""
+    SCORE_STRING = "score_string"
+    """The scored string evaluator, which gives a score between 1 and 10
+    to a prediction."""
+    LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
+    """The labeled pairwise string evaluator, which predicts the preferred prediction
+    from between two models based on a ground truth reference label."""
+    LABELED_SCORE_STRING = "labeled_score_string"
+    """The labeled scored string evaluator, which gives a score between 1 and 10
+    to a prediction based on a ground truth reference label."""
+    AGENT_TRAJECTORY = "trajectory"
+    """The agent trajectory evaluator, which grades the agent's intermediate steps."""
+    CRITERIA = "criteria"
+    """The criteria evaluator, which evaluates a model based on a
+    custom set of criteria without any reference labels."""
+    LABELED_CRITERIA = "labeled_criteria"
+    """The labeled criteria evaluator, which evaluates a model based on a
+    custom set of criteria, with a reference label."""
+    STRING_DISTANCE = "string_distance"
+    """Compare predictions to a reference answer using string edit distances."""
+    EXACT_MATCH = "exact_match"
+    """Compare predictions to a reference answer using exact matching."""
+    REGEX_MATCH = "regex_match"
+    """Compare predictions to a reference answer using regular expressions."""
+    PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
+    """Compare predictions based on string edit distances."""
+    EMBEDDING_DISTANCE = "embedding_distance"
+    """Compare a prediction to a reference label using embedding distance."""
+    PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
+    """Compare two predictions using embedding distance."""
+    JSON_VALIDITY = "json_validity"
+    """Check if a prediction is valid JSON."""
+    JSON_EQUALITY = "json_equality"
+    """Check if a prediction is equal to a reference JSON."""
+    JSON_EDIT_DISTANCE = "json_edit_distance"
+    """Compute the edit distance between two JSON strings after canonicalization."""
+    JSON_SCHEMA_VALIDATION = "json_schema_validation"
+    """Check if a prediction is valid JSON according to a JSON schema."""
+
+
+class LLMEvalChain(Chain):
+    """A base class for evaluators that use an LLM."""
+
+    @classmethod
+    @abstractmethod
+    def from_llm(cls, llm: BaseLanguageModel, **kwargs: Any) -> LLMEvalChain:
+        """Create a new evaluator from an LLM."""
+
+
+class _EvalArgsMixin:
+    """Mixin for checking evaluation arguments."""
+
+    @property
+    def requires_reference(self) -> bool:
+        """Whether this evaluator requires a reference label."""
+        return False
+
+    @property
+    def requires_input(self) -> bool:
+        """Whether this evaluator requires an input string."""
+        return False
+
+    @property
+    def _skip_input_warning(self) -> str:
+        """Warning to show when input is ignored."""
+        return f"Ignoring input in {self.__class__.__name__}, as it is not expected."
+
+    @property
+    def _skip_reference_warning(self) -> str:
+        """Warning to show when reference is ignored."""
+        return (
+            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
+        )
+
+    def _check_evaluation_args(
+        self,
+        reference: str | None = None,
+        input_: str | None = None,
+    ) -> None:
+        """Check if the evaluation arguments are valid.
+
+        Args:
+            reference: The reference label.
+            input_: The input string.
+
+        Raises:
+            ValueError: If the evaluator requires an input string but none is provided,
+                or if the evaluator requires a reference label but none is provided.
+        """
+        if self.requires_input and input_ is None:
+            msg = f"{self.__class__.__name__} requires an input string."
+            raise ValueError(msg)
+        if input_ is not None and not self.requires_input:
+            warn(self._skip_input_warning, stacklevel=3)
+        if self.requires_reference and reference is None:
+            msg = f"{self.__class__.__name__} requires a reference string."
+            raise ValueError(msg)
+        if reference is not None and not self.requires_reference:
+            warn(self._skip_reference_warning, stacklevel=3)
+
+
+class StringEvaluator(_EvalArgsMixin, ABC):
+    """String evaluator interface.
+
+    Grade, tag, or otherwise evaluate predictions relative to their inputs
+    and/or reference labels.
+    """
+
+    @property
+    def evaluation_name(self) -> str:
+        """The name of the evaluation."""
+        return self.__class__.__name__
+
+    @property
+    def requires_reference(self) -> bool:
+        """Whether this evaluator requires a reference label."""
+        return False
+
+    @abstractmethod
+    def _evaluate_strings(
+        self,
+        *,
+        prediction: str | Any,
+        reference: str | Any | None = None,
+        input: str | Any | None = None,  # noqa: A002
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate Chain or LLM output, based on optional input and label.
+
+        Args:
+            prediction: The LLM or chain prediction to evaluate.
+            reference: The reference label to evaluate against.
+            input: The input to consider during evaluation.
+            **kwargs: Additional keyword arguments, including callbacks, tags, etc.
+
+        Returns:
+            The evaluation results containing the score or value.
+            It is recommended that the dictionary contain the following keys:
+                 - score: the score of the evaluation, if applicable.
+                 - value: the string value of the evaluation, if applicable.
+                 - reasoning: the reasoning for the evaluation, if applicable.
+        """
+
+    async def _aevaluate_strings(
+        self,
+        *,
+        prediction: str | Any,
+        reference: str | Any | None = None,
+        input: str | Any | None = None,  # noqa: A002
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate Chain or LLM output, based on optional input and label.
+
+        Args:
+            prediction: The LLM or chain prediction to evaluate.
+            reference: The reference label to evaluate against.
+            input: The input to consider during evaluation.
+            **kwargs: Additional keyword arguments, including callbacks, tags, etc.
+
+        Returns:
+            The evaluation results containing the score or value.
+            It is recommended that the dictionary contain the following keys:
+                 - score: the score of the evaluation, if applicable.
+                 - value: the string value of the evaluation, if applicable.
+                 - reasoning: the reasoning for the evaluation, if applicable.
+        """  # noqa: E501
+        return await run_in_executor(
+            None,
+            self._evaluate_strings,
+            prediction=prediction,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
+
+    def evaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: str | None = None,
+        input: str | None = None,  # noqa: A002
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate Chain or LLM output, based on optional input and label.
+
+        Args:
+            prediction: The LLM or chain prediction to evaluate.
+            reference: The reference label to evaluate against.
+            input: The input to consider during evaluation.
+            **kwargs: Additional keyword arguments, including callbacks, tags, etc.
+
+        Returns:
+            The evaluation results containing the score or value.
+        """
+        self._check_evaluation_args(reference=reference, input_=input)
+        return self._evaluate_strings(
+            prediction=prediction,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
+
+    async def aevaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: str | None = None,
+        input: str | None = None,  # noqa: A002
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate Chain or LLM output, based on optional input and label.
+
+        Args:
+            prediction: The LLM or chain prediction to evaluate.
+            reference: The reference label to evaluate against.
+            input: The input to consider during evaluation.
+            **kwargs: Additional keyword arguments, including callbacks, tags, etc.
+
+        Returns:
+            The evaluation results containing the score or value.
+        """  # noqa: E501
+        self._check_evaluation_args(reference=reference, input_=input)
+        return await self._aevaluate_strings(
+            prediction=prediction,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
+
+
+class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
+    """Compare the output of two models (or two outputs of the same model)."""
+
+    @abstractmethod
+    def _evaluate_string_pairs(
+        self,
+        *,
+        prediction: str,
+        prediction_b: str,
+        reference: str | None = None,
+        input: str | None = None,  # noqa: A002
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate the output string pairs.
+
+        Args:
+            prediction: The output string from the first model.
+            prediction_b: The output string from the second model.
+            reference: The expected output / reference string.
+            input: The input string.
+            **kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
+
+        Returns:
+            `dict` containing the preference, scores, and/or other information.
+        """  # noqa: E501
+
+    async def _aevaluate_string_pairs(
+        self,
+        *,
+        prediction: str,
+        prediction_b: str,
+        reference: str | None = None,
+        input: str | None = None,  # noqa: A002
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate the output string pairs.
+
+        Args:
+            prediction: The output string from the first model.
+            prediction_b: The output string from the second model.
+            reference: The expected output / reference string.
+            input: The input string.
+            **kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
+
+        Returns:
+            `dict` containing the preference, scores, and/or other information.
+        """  # noqa: E501
+        return await run_in_executor(
+            None,
+            self._evaluate_string_pairs,
+            prediction=prediction,
+            prediction_b=prediction_b,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
+
+    def evaluate_string_pairs(
+        self,
+        *,
+        prediction: str,
+        prediction_b: str,
+        reference: str | None = None,
+        input: str | None = None,  # noqa: A002
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate the output string pairs.
+
+        Args:
+            prediction: The output string from the first model.
+            prediction_b: The output string from the second model.
+            reference: The expected output / reference string.
+            input: The input string.
+            **kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
+
+        Returns:
+            `dict` containing the preference, scores, and/or other information.
+        """  # noqa: E501
+        self._check_evaluation_args(reference=reference, input_=input)
+        return self._evaluate_string_pairs(
+            prediction=prediction,
+            prediction_b=prediction_b,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
+
+    async def aevaluate_string_pairs(
+        self,
+        *,
+        prediction: str,
+        prediction_b: str,
+        reference: str | None = None,
+        input: str | None = None,  # noqa: A002
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate the output string pairs.
+
+        Args:
+            prediction: The output string from the first model.
+            prediction_b: The output string from the second model.
+            reference: The expected output / reference string.
+            input: The input string.
+            **kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
+
+        Returns:
+            `dict` containing the preference, scores, and/or other information.
+        """  # noqa: E501
+        self._check_evaluation_args(reference=reference, input_=input)
+        return await self._aevaluate_string_pairs(
+            prediction=prediction,
+            prediction_b=prediction_b,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
+
+
+class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
+    """Interface for evaluating agent trajectories."""
+
+    @property
+    def requires_input(self) -> bool:
+        """Whether this evaluator requires an input string."""
+        return True
+
+    @abstractmethod
+    def _evaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[tuple[AgentAction, str]],
+        input: str,  # noqa: A002
+        reference: str | None = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate a trajectory.
+
+        Args:
+            prediction: The final predicted response.
+            agent_trajectory:
+                The intermediate steps forming the agent trajectory.
+            input: The input to the agent.
+            reference: The reference answer.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            The evaluation result.
+        """
+
+    async def _aevaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[tuple[AgentAction, str]],
+        input: str,  # noqa: A002
+        reference: str | None = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate a trajectory.
+
+        Args:
+            prediction: The final predicted response.
+            agent_trajectory:
+                The intermediate steps forming the agent trajectory.
+            input: The input to the agent.
+            reference: The reference answer.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            The evaluation result.
+        """
+        return await run_in_executor(
+            None,
+            self._evaluate_agent_trajectory,
+            prediction=prediction,
+            agent_trajectory=agent_trajectory,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
+
+    def evaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[tuple[AgentAction, str]],
+        input: str,  # noqa: A002
+        reference: str | None = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate a trajectory.
+
+        Args:
+            prediction: The final predicted response.
+            agent_trajectory:
+                The intermediate steps forming the agent trajectory.
+            input: The input to the agent.
+            reference: The reference answer.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            The evaluation result.
+        """
+        self._check_evaluation_args(reference=reference, input_=input)
+        return self._evaluate_agent_trajectory(
+            prediction=prediction,
+            input=input,
+            agent_trajectory=agent_trajectory,
+            reference=reference,
+            **kwargs,
+        )
+
+    async def aevaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[tuple[AgentAction, str]],
+        input: str,  # noqa: A002
+        reference: str | None = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate a trajectory.
+
+        Args:
+            prediction: The final predicted response.
+            agent_trajectory:
+                The intermediate steps forming the agent trajectory.
+            input: The input to the agent.
+            reference: The reference answer.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            The evaluation result.
+        """
+        self._check_evaluation_args(reference=reference, input_=input)
+        return await self._aevaluate_agent_trajectory(
+            prediction=prediction,
+            input=input,
+            agent_trajectory=agent_trajectory,
+            reference=reference,
+            **kwargs,
+        )