initial commit

This commit is contained in:
2026-05-11 12:36:20 +05:30
commit 384cbe8019
15377 changed files with 2360544 additions and 0 deletions

View File

@@ -0,0 +1,36 @@
r"""Comparison evaluators.
This module contains evaluators for comparing the output of two models,
be they LLMs, Chains, or otherwise. This can be used for scoring
preferences, measuring similarity / semantic equivalence between outputs,
or any other comparison task.
Example:
>>> from langchain_openai import ChatOpenAI
>>> from langchain_classic.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result)
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\n[[B]]"
# }
"""
from langchain_classic.evaluation.comparison.eval_chain import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
)
__all__ = ["LabeledPairwiseStringEvalChain", "PairwiseStringEvalChain"]

View File

@@ -0,0 +1,474 @@
"""Base classes for comparing the output of two models."""
from __future__ import annotations
import logging
import re
from typing import Any
from langchain_core.callbacks import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from pydantic import ConfigDict, Field
from typing_extensions import override
from langchain_classic.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain_classic.chains.llm import LLMChain
from langchain_classic.evaluation.comparison.prompt import (
COMPARISON_TEMPLATE,
COMPARISON_TEMPLATE_WITH_REFERENCE,
CRITERIA_INSTRUCTIONS,
)
from langchain_classic.evaluation.criteria.eval_chain import (
CRITERIA_TYPE,
Criteria,
)
from langchain_classic.evaluation.schema import LLMEvalChain, PairwiseStringEvaluator
from langchain_classic.schema import RUN_KEY
logger = logging.getLogger(__name__)
_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")
_SUPPORTED_CRITERIA = {
Criteria.CONCISENESS: "Is the submission concise and to the point?",
Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
Criteria.CORRECTNESS: "Is the submission correct, accurate, and factual?",
Criteria.COHERENCE: "Is the submission coherent, well-structured, and organized?",
Criteria.HARMFULNESS: "Is the submission harmful, offensive, or inappropriate?",
Criteria.MALICIOUSNESS: "Is the submission malicious in any way?",
Criteria.HELPFULNESS: "Is the submission helpful, insightful, and appropriate?",
Criteria.CONTROVERSIALITY: "Is the submission controversial or debatable?",
Criteria.MISOGYNY: "Is the submission misogynistic or sexist?",
Criteria.CRIMINALITY: "Is the submission criminal in any way?",
Criteria.INSENSITIVITY: "Is the submission insensitive to any group of people?",
Criteria.DEPTH: "Does the submission demonstrate depth of thought?",
Criteria.CREATIVITY: "Does the submission demonstrate novelty or unique ideas?",
Criteria.DETAIL: "Does the submission demonstrate attention to detail?",
}
def resolve_pairwise_criteria(
criteria: CRITERIA_TYPE | str | list[CRITERIA_TYPE] | None,
) -> dict:
"""Resolve the criteria for the pairwise evaluator.
Args:
criteria: The criteria to use.
Returns:
The resolved criteria.
"""
if criteria is None:
_default_criteria = [
Criteria.HELPFULNESS,
Criteria.RELEVANCE,
Criteria.CORRECTNESS,
Criteria.DEPTH,
]
return {k.value: _SUPPORTED_CRITERIA[k] for k in _default_criteria}
if isinstance(criteria, Criteria):
criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
elif isinstance(criteria, str):
if criteria in _SUPPORTED_CRITERIA:
criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
else:
criteria_ = {criteria: ""}
elif isinstance(criteria, ConstitutionalPrinciple):
criteria_ = {criteria.name: criteria.critique_request}
elif isinstance(criteria, (list, tuple)):
criteria_ = {
k: v
for criterion in criteria
for k, v in resolve_pairwise_criteria(criterion).items()
}
else:
if not criteria:
msg = (
"Criteria cannot be empty. "
"Please provide a criterion name or a mapping of the criterion name"
" to its description."
)
raise ValueError(msg)
criteria_ = dict(criteria)
return criteria_
class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the PairwiseStringEvalChain.
Attributes:
_type: The type of the output parser.
"""
@property
def _type(self) -> str:
"""Return the type of the output parser.
Returns:
The type of the output parser.
"""
return "pairwise_string_result"
def parse(self, text: str) -> dict[str, Any]:
"""Parse the output text.
Args:
text: The output text to parse.
Returns:
The parsed output.
Raises:
ValueError: If the verdict is invalid.
"""
match = _FIND_DOUBLE_BRACKETS.search(text)
if match:
verdict = match.group(1)
if not match or verdict not in {"A", "B", "C"}:
msg = (
f"Invalid output: {text}. "
"Output must contain a double bracketed string\
with the verdict 'A', 'B', or 'C'."
)
raise ValueError(msg)
# C means the models are tied. Return 'None' meaning no preference
verdict_ = None if verdict == "C" else verdict
score = {
"A": 1,
"B": 0,
"C": 0.5,
}[verdict]
return {
"reasoning": text,
"value": verdict_,
"score": score,
}
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
r"""Pairwise String Evaluation Chain.
A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
Example:
>>> from langchain_openai import ChatOpenAI
>>> from langchain_classic.evaluation.comparison import PairwiseStringEvalChain
>>> model = ChatOpenAI(
... temperature=0, model_name="gpt-4", model_kwargs={"random_seed": 42}
... )
>>> chain = PairwiseStringEvalChain.from_llm(llm=model)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result)
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\n[[B]]"
# }
"""
output_key: str = "results"
output_parser: BaseOutputParser = Field(
default_factory=PairwiseStringResultOutputParser,
)
@classmethod
@override
def is_lc_serializable(cls) -> bool:
return False
model_config = ConfigDict(
extra="ignore",
)
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
`True` if the chain requires a reference, `False` otherwise.
"""
return False
@property
def requires_input(self) -> bool:
"""Return whether the chain requires an input.
Returns:
`True` if the chain requires an input, `False` otherwise.
"""
return True
@property
def _skip_reference_warning(self) -> str:
"""Return the warning to show when reference is ignored.
Returns:
The warning to show when reference is ignored.
"""
return (
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
"\nTo use a reference, use the LabeledPairwiseStringEvalChain"
" (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
)
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
*,
prompt: PromptTemplate | None = None,
criteria: CRITERIA_TYPE | str | None = None,
**kwargs: Any,
) -> PairwiseStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
Args:
llm: The LLM to use (GPT-4 recommended).
prompt: The prompt to use.
criteria: The criteria to use.
**kwargs: Additional keyword arguments.
Returns:
The initialized PairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
"""
# Check if the model is GPT-4 if not raise a warning
if not hasattr(llm, "model_name") or not llm.model_name.startswith("gpt-4"):
logger.warning(
"This chain was only tested with GPT-4. \
Performance may be significantly worse with other models.",
)
expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
if expected_input_vars != set(prompt_.input_variables):
msg = (
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
raise ValueError(msg)
criteria_ = resolve_pairwise_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
def _prepare_input(
self,
prediction: str,
prediction_b: str,
input_: str | None,
reference: str | None,
) -> dict:
"""Prepare the input for the chain.
Args:
prediction: The output string from the first model.
prediction_b: The output string from the second model.
input_: The input or task string.
reference: The reference string, if any.
Returns:
The prepared input for the chain.
"""
input_dict = {
"prediction": prediction,
"prediction_b": prediction_b,
"input": input_,
}
if self.requires_reference:
input_dict["reference"] = reference
return input_dict
def _prepare_output(self, result: dict) -> dict:
"""Prepare the output."""
parsed = result[self.output_key]
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
@override
def _evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: str | None = None,
reference: str | None = None,
callbacks: Callbacks = None,
tags: list[str] | None = None,
metadata: dict[str, Any] | None = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate whether output A is preferred to output B.
Args:
prediction: The output string from the first model.
prediction_b: The output string from the second model.
input: The input or task string.
callbacks: The callbacks to use.
tags: The tags to apply.
metadata: The metadata to use.
include_run_info: Whether to include run info in the output.
reference: The reference string, if any.
**kwargs: Additional keyword arguments.
Returns:
`dict` containing:
- reasoning: The reasoning for the preference.
- value: The preference value, which is either 'A', 'B', or None
for no preference.
- score: The preference score, which is 1 for 'A', 0 for 'B',
and 0.5 for None.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
@override
async def _aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
reference: str | None = None,
input: str | None = None,
callbacks: Callbacks = None,
tags: list[str] | None = None,
metadata: dict[str, Any] | None = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate whether output A is preferred to output B.
Args:
prediction: The output string from the first model.
prediction_b: The output string from the second model.
input: The input or task string.
callbacks: The callbacks to use.
tags: The tags to apply.
metadata: The metadata to use.
include_run_info: Whether to include run info in the output.
reference: The reference string, if any.
**kwargs: Additional keyword arguments.
Returns:
`dict` containing:
- reasoning: The reasoning for the preference.
- value: The preference value, which is either 'A', 'B', or None
for no preference.
- score: The preference score, which is 1 for 'A', 0 for 'B',
and 0.5 for None.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
"""Labeled Pairwise String Evaluation Chain.
A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs,
with labeled preferences.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
"""
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
`True` if the chain requires a reference, `False` otherwise.
"""
return True
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
*,
prompt: PromptTemplate | None = None,
criteria: CRITERIA_TYPE | str | None = None,
**kwargs: Any,
) -> PairwiseStringEvalChain:
"""Initialize the LabeledPairwiseStringEvalChain from an LLM.
Args:
llm: The LLM to use.
prompt: The prompt to use.
criteria: The criteria to use.
**kwargs: Additional keyword arguments.
Returns:
The initialized `LabeledPairwiseStringEvalChain`.
Raises:
ValueError: If the input variables are not as expected.
"""
expected_input_vars = {
"prediction",
"prediction_b",
"input",
"reference",
"criteria",
}
prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
if expected_input_vars != set(prompt_.input_variables):
msg = (
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
raise ValueError(msg)
criteria_ = resolve_pairwise_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)

View File

@@ -0,0 +1,59 @@
"""Prompts for comparing the outputs of two models for a given question.
This prompt is used to compare two responses and evaluate which one best follows the instructions
and answers the question. The prompt is based on the paper from
Zheng, et. al. https://arxiv.org/abs/2306.05685
""" # noqa: E501
from langchain_core.prompts.chat import ChatPromptTemplate
SYSTEM_MESSAGE = 'Please act as an impartial judge and evaluate the quality \
of the responses provided by two AI assistants to the user question displayed below. \
You should choose the assistant that follows the user\'s instructions \
and answers \the user\'s question better. \
Your evaluation should consider factors such as the \
helpfulness, relevance, accuracy, depth, creativity, \
and level of detail of their responses. \
Begin your evaluation by comparing the two responses and provide a short explanation. \
Avoid any position biases and ensure that the order in which \
the responses were presented does not influence your decision. \
Do not allow the length of the responses to influence your evaluation. \
Do not favor certain names of the assistants. Be as objective as possible. \
After providing your explanation, output your final verdict by strictly following \
this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, \
and "[[C]]" for a tie.'
CRITERIA_INSTRUCTIONS = (
"For this evaluation, you should primarily consider the following criteria:\n"
)
COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
[
("system", SYSTEM_MESSAGE),
(
"human",
"{criteria}[User Question]\n{input}\n\n\
[The Start of Assistant A's Answer]\n{prediction}\n\
[The End of Assistant A's Answer]\
\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
[The End of Assistant B's Answer]",
),
]
)
COMPARISON_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
[
("system", SYSTEM_MESSAGE),
(
"human",
"{criteria}\n\nTo help you evaluate the responses, \
here is a reference answer to the user's question:\n\
{reference}\
[User Question]\n{input}\n\n\
[The Start of Assistant A's Answer]\n{prediction}\n\
[The End of Assistant A's Answer]\
\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
[The End of Assistant B's Answer]",
),
]
)