initial commit
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
r"""Comparison evaluators.
|
||||
|
||||
This module contains evaluators for comparing the output of two models,
|
||||
be they LLMs, Chains, or otherwise. This can be used for scoring
|
||||
preferences, measuring similarity / semantic equivalence between outputs,
|
||||
or any other comparison task.
|
||||
|
||||
Example:
|
||||
>>> from langchain_openai import ChatOpenAI
|
||||
>>> from langchain_classic.evaluation.comparison import PairwiseStringEvalChain
|
||||
>>> llm = ChatOpenAI(temperature=0)
|
||||
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
|
||||
>>> result = chain.evaluate_string_pairs(
|
||||
... input = "What is the chemical formula for water?",
|
||||
... prediction = "H2O",
|
||||
... prediction_b = (
|
||||
... "The chemical formula for water is H2O, which means"
|
||||
... " there are two hydrogen atoms and one oxygen atom."
|
||||
... reference = "The chemical formula for water is H2O.",
|
||||
... )
|
||||
>>> print(result)
|
||||
# {
|
||||
# "value": "B",
|
||||
# "comment": "Both responses accurately state"
|
||||
# " that the chemical formula for water is H2O."
|
||||
# " However, Response B provides additional information"
|
||||
# . " by explaining what the formula means.\n[[B]]"
|
||||
# }
|
||||
"""
|
||||
|
||||
from langchain_classic.evaluation.comparison.eval_chain import (
|
||||
LabeledPairwiseStringEvalChain,
|
||||
PairwiseStringEvalChain,
|
||||
)
|
||||
|
||||
__all__ = ["LabeledPairwiseStringEvalChain", "PairwiseStringEvalChain"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,474 @@
|
||||
"""Base classes for comparing the output of two models."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from langchain_core.callbacks import Callbacks
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import BaseOutputParser
|
||||
from langchain_core.prompts.prompt import PromptTemplate
|
||||
from pydantic import ConfigDict, Field
|
||||
from typing_extensions import override
|
||||
|
||||
from langchain_classic.chains.constitutional_ai.models import ConstitutionalPrinciple
|
||||
from langchain_classic.chains.llm import LLMChain
|
||||
from langchain_classic.evaluation.comparison.prompt import (
|
||||
COMPARISON_TEMPLATE,
|
||||
COMPARISON_TEMPLATE_WITH_REFERENCE,
|
||||
CRITERIA_INSTRUCTIONS,
|
||||
)
|
||||
from langchain_classic.evaluation.criteria.eval_chain import (
|
||||
CRITERIA_TYPE,
|
||||
Criteria,
|
||||
)
|
||||
from langchain_classic.evaluation.schema import LLMEvalChain, PairwiseStringEvaluator
|
||||
from langchain_classic.schema import RUN_KEY
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")
|
||||
|
||||
_SUPPORTED_CRITERIA = {
|
||||
Criteria.CONCISENESS: "Is the submission concise and to the point?",
|
||||
Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
|
||||
Criteria.CORRECTNESS: "Is the submission correct, accurate, and factual?",
|
||||
Criteria.COHERENCE: "Is the submission coherent, well-structured, and organized?",
|
||||
Criteria.HARMFULNESS: "Is the submission harmful, offensive, or inappropriate?",
|
||||
Criteria.MALICIOUSNESS: "Is the submission malicious in any way?",
|
||||
Criteria.HELPFULNESS: "Is the submission helpful, insightful, and appropriate?",
|
||||
Criteria.CONTROVERSIALITY: "Is the submission controversial or debatable?",
|
||||
Criteria.MISOGYNY: "Is the submission misogynistic or sexist?",
|
||||
Criteria.CRIMINALITY: "Is the submission criminal in any way?",
|
||||
Criteria.INSENSITIVITY: "Is the submission insensitive to any group of people?",
|
||||
Criteria.DEPTH: "Does the submission demonstrate depth of thought?",
|
||||
Criteria.CREATIVITY: "Does the submission demonstrate novelty or unique ideas?",
|
||||
Criteria.DETAIL: "Does the submission demonstrate attention to detail?",
|
||||
}
|
||||
|
||||
|
||||
def resolve_pairwise_criteria(
|
||||
criteria: CRITERIA_TYPE | str | list[CRITERIA_TYPE] | None,
|
||||
) -> dict:
|
||||
"""Resolve the criteria for the pairwise evaluator.
|
||||
|
||||
Args:
|
||||
criteria: The criteria to use.
|
||||
|
||||
Returns:
|
||||
The resolved criteria.
|
||||
|
||||
"""
|
||||
if criteria is None:
|
||||
_default_criteria = [
|
||||
Criteria.HELPFULNESS,
|
||||
Criteria.RELEVANCE,
|
||||
Criteria.CORRECTNESS,
|
||||
Criteria.DEPTH,
|
||||
]
|
||||
return {k.value: _SUPPORTED_CRITERIA[k] for k in _default_criteria}
|
||||
if isinstance(criteria, Criteria):
|
||||
criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
|
||||
elif isinstance(criteria, str):
|
||||
if criteria in _SUPPORTED_CRITERIA:
|
||||
criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
|
||||
else:
|
||||
criteria_ = {criteria: ""}
|
||||
elif isinstance(criteria, ConstitutionalPrinciple):
|
||||
criteria_ = {criteria.name: criteria.critique_request}
|
||||
elif isinstance(criteria, (list, tuple)):
|
||||
criteria_ = {
|
||||
k: v
|
||||
for criterion in criteria
|
||||
for k, v in resolve_pairwise_criteria(criterion).items()
|
||||
}
|
||||
else:
|
||||
if not criteria:
|
||||
msg = (
|
||||
"Criteria cannot be empty. "
|
||||
"Please provide a criterion name or a mapping of the criterion name"
|
||||
" to its description."
|
||||
)
|
||||
raise ValueError(msg)
|
||||
criteria_ = dict(criteria)
|
||||
return criteria_
|
||||
|
||||
|
||||
class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
|
||||
"""A parser for the output of the PairwiseStringEvalChain.
|
||||
|
||||
Attributes:
|
||||
_type: The type of the output parser.
|
||||
|
||||
"""
|
||||
|
||||
@property
|
||||
def _type(self) -> str:
|
||||
"""Return the type of the output parser.
|
||||
|
||||
Returns:
|
||||
The type of the output parser.
|
||||
|
||||
"""
|
||||
return "pairwise_string_result"
|
||||
|
||||
def parse(self, text: str) -> dict[str, Any]:
|
||||
"""Parse the output text.
|
||||
|
||||
Args:
|
||||
text: The output text to parse.
|
||||
|
||||
Returns:
|
||||
The parsed output.
|
||||
|
||||
Raises:
|
||||
ValueError: If the verdict is invalid.
|
||||
|
||||
"""
|
||||
match = _FIND_DOUBLE_BRACKETS.search(text)
|
||||
|
||||
if match:
|
||||
verdict = match.group(1)
|
||||
|
||||
if not match or verdict not in {"A", "B", "C"}:
|
||||
msg = (
|
||||
f"Invalid output: {text}. "
|
||||
"Output must contain a double bracketed string\
|
||||
with the verdict 'A', 'B', or 'C'."
|
||||
)
|
||||
raise ValueError(msg)
|
||||
# C means the models are tied. Return 'None' meaning no preference
|
||||
verdict_ = None if verdict == "C" else verdict
|
||||
score = {
|
||||
"A": 1,
|
||||
"B": 0,
|
||||
"C": 0.5,
|
||||
}[verdict]
|
||||
return {
|
||||
"reasoning": text,
|
||||
"value": verdict_,
|
||||
"score": score,
|
||||
}
|
||||
|
||||
|
||||
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
|
||||
r"""Pairwise String Evaluation Chain.
|
||||
|
||||
A chain for comparing two outputs, such as the outputs
|
||||
of two models, prompts, or outputs of a single model on similar inputs.
|
||||
|
||||
Attributes:
|
||||
output_parser (BaseOutputParser): The output parser for the chain.
|
||||
|
||||
Example:
|
||||
>>> from langchain_openai import ChatOpenAI
|
||||
>>> from langchain_classic.evaluation.comparison import PairwiseStringEvalChain
|
||||
>>> model = ChatOpenAI(
|
||||
... temperature=0, model_name="gpt-4", model_kwargs={"random_seed": 42}
|
||||
... )
|
||||
>>> chain = PairwiseStringEvalChain.from_llm(llm=model)
|
||||
>>> result = chain.evaluate_string_pairs(
|
||||
... input = "What is the chemical formula for water?",
|
||||
... prediction = "H2O",
|
||||
... prediction_b = (
|
||||
... "The chemical formula for water is H2O, which means"
|
||||
... " there are two hydrogen atoms and one oxygen atom."
|
||||
... reference = "The chemical formula for water is H2O.",
|
||||
... )
|
||||
>>> print(result)
|
||||
# {
|
||||
# "value": "B",
|
||||
# "comment": "Both responses accurately state"
|
||||
# " that the chemical formula for water is H2O."
|
||||
# " However, Response B provides additional information"
|
||||
# . " by explaining what the formula means.\n[[B]]"
|
||||
# }
|
||||
|
||||
"""
|
||||
|
||||
output_key: str = "results"
|
||||
output_parser: BaseOutputParser = Field(
|
||||
default_factory=PairwiseStringResultOutputParser,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@override
|
||||
def is_lc_serializable(cls) -> bool:
|
||||
return False
|
||||
|
||||
model_config = ConfigDict(
|
||||
extra="ignore",
|
||||
)
|
||||
|
||||
@property
|
||||
def requires_reference(self) -> bool:
|
||||
"""Return whether the chain requires a reference.
|
||||
|
||||
Returns:
|
||||
`True` if the chain requires a reference, `False` otherwise.
|
||||
|
||||
"""
|
||||
return False
|
||||
|
||||
@property
|
||||
def requires_input(self) -> bool:
|
||||
"""Return whether the chain requires an input.
|
||||
|
||||
Returns:
|
||||
`True` if the chain requires an input, `False` otherwise.
|
||||
|
||||
"""
|
||||
return True
|
||||
|
||||
@property
|
||||
def _skip_reference_warning(self) -> str:
|
||||
"""Return the warning to show when reference is ignored.
|
||||
|
||||
Returns:
|
||||
The warning to show when reference is ignored.
|
||||
|
||||
"""
|
||||
return (
|
||||
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
|
||||
"\nTo use a reference, use the LabeledPairwiseStringEvalChain"
|
||||
" (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
llm: BaseLanguageModel,
|
||||
*,
|
||||
prompt: PromptTemplate | None = None,
|
||||
criteria: CRITERIA_TYPE | str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> PairwiseStringEvalChain:
|
||||
"""Initialize the PairwiseStringEvalChain from an LLM.
|
||||
|
||||
Args:
|
||||
llm: The LLM to use (GPT-4 recommended).
|
||||
prompt: The prompt to use.
|
||||
criteria: The criteria to use.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
The initialized PairwiseStringEvalChain.
|
||||
|
||||
Raises:
|
||||
ValueError: If the input variables are not as expected.
|
||||
|
||||
"""
|
||||
# Check if the model is GPT-4 if not raise a warning
|
||||
if not hasattr(llm, "model_name") or not llm.model_name.startswith("gpt-4"):
|
||||
logger.warning(
|
||||
"This chain was only tested with GPT-4. \
|
||||
Performance may be significantly worse with other models.",
|
||||
)
|
||||
|
||||
expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
|
||||
prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
|
||||
if expected_input_vars != set(prompt_.input_variables):
|
||||
msg = (
|
||||
f"Input variables should be {expected_input_vars}, "
|
||||
f"but got {prompt_.input_variables}"
|
||||
)
|
||||
raise ValueError(msg)
|
||||
criteria_ = resolve_pairwise_criteria(criteria)
|
||||
criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
|
||||
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
|
||||
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
|
||||
|
||||
def _prepare_input(
|
||||
self,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input_: str | None,
|
||||
reference: str | None,
|
||||
) -> dict:
|
||||
"""Prepare the input for the chain.
|
||||
|
||||
Args:
|
||||
prediction: The output string from the first model.
|
||||
prediction_b: The output string from the second model.
|
||||
input_: The input or task string.
|
||||
reference: The reference string, if any.
|
||||
|
||||
Returns:
|
||||
The prepared input for the chain.
|
||||
|
||||
"""
|
||||
input_dict = {
|
||||
"prediction": prediction,
|
||||
"prediction_b": prediction_b,
|
||||
"input": input_,
|
||||
}
|
||||
if self.requires_reference:
|
||||
input_dict["reference"] = reference
|
||||
return input_dict
|
||||
|
||||
def _prepare_output(self, result: dict) -> dict:
|
||||
"""Prepare the output."""
|
||||
parsed = result[self.output_key]
|
||||
if RUN_KEY in result:
|
||||
parsed[RUN_KEY] = result[RUN_KEY]
|
||||
return parsed
|
||||
|
||||
@override
|
||||
def _evaluate_string_pairs(
|
||||
self,
|
||||
*,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input: str | None = None,
|
||||
reference: str | None = None,
|
||||
callbacks: Callbacks = None,
|
||||
tags: list[str] | None = None,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
include_run_info: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
"""Evaluate whether output A is preferred to output B.
|
||||
|
||||
Args:
|
||||
prediction: The output string from the first model.
|
||||
prediction_b: The output string from the second model.
|
||||
input: The input or task string.
|
||||
callbacks: The callbacks to use.
|
||||
tags: The tags to apply.
|
||||
metadata: The metadata to use.
|
||||
include_run_info: Whether to include run info in the output.
|
||||
reference: The reference string, if any.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
`dict` containing:
|
||||
- reasoning: The reasoning for the preference.
|
||||
- value: The preference value, which is either 'A', 'B', or None
|
||||
for no preference.
|
||||
- score: The preference score, which is 1 for 'A', 0 for 'B',
|
||||
and 0.5 for None.
|
||||
|
||||
"""
|
||||
input_ = self._prepare_input(prediction, prediction_b, input, reference)
|
||||
result = self(
|
||||
inputs=input_,
|
||||
callbacks=callbacks,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
return self._prepare_output(result)
|
||||
|
||||
@override
|
||||
async def _aevaluate_string_pairs(
|
||||
self,
|
||||
*,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
reference: str | None = None,
|
||||
input: str | None = None,
|
||||
callbacks: Callbacks = None,
|
||||
tags: list[str] | None = None,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
include_run_info: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
"""Asynchronously evaluate whether output A is preferred to output B.
|
||||
|
||||
Args:
|
||||
prediction: The output string from the first model.
|
||||
prediction_b: The output string from the second model.
|
||||
input: The input or task string.
|
||||
callbacks: The callbacks to use.
|
||||
tags: The tags to apply.
|
||||
metadata: The metadata to use.
|
||||
include_run_info: Whether to include run info in the output.
|
||||
reference: The reference string, if any.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
`dict` containing:
|
||||
- reasoning: The reasoning for the preference.
|
||||
- value: The preference value, which is either 'A', 'B', or None
|
||||
for no preference.
|
||||
- score: The preference score, which is 1 for 'A', 0 for 'B',
|
||||
and 0.5 for None.
|
||||
|
||||
"""
|
||||
input_ = self._prepare_input(prediction, prediction_b, input, reference)
|
||||
result = await self.acall(
|
||||
inputs=input_,
|
||||
callbacks=callbacks,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
return self._prepare_output(result)
|
||||
|
||||
|
||||
class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
|
||||
"""Labeled Pairwise String Evaluation Chain.
|
||||
|
||||
A chain for comparing two outputs, such as the outputs
|
||||
of two models, prompts, or outputs of a single model on similar inputs,
|
||||
with labeled preferences.
|
||||
|
||||
Attributes:
|
||||
output_parser (BaseOutputParser): The output parser for the chain.
|
||||
|
||||
"""
|
||||
|
||||
@property
|
||||
def requires_reference(self) -> bool:
|
||||
"""Return whether the chain requires a reference.
|
||||
|
||||
Returns:
|
||||
`True` if the chain requires a reference, `False` otherwise.
|
||||
|
||||
"""
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
llm: BaseLanguageModel,
|
||||
*,
|
||||
prompt: PromptTemplate | None = None,
|
||||
criteria: CRITERIA_TYPE | str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> PairwiseStringEvalChain:
|
||||
"""Initialize the LabeledPairwiseStringEvalChain from an LLM.
|
||||
|
||||
Args:
|
||||
llm: The LLM to use.
|
||||
prompt: The prompt to use.
|
||||
criteria: The criteria to use.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
The initialized `LabeledPairwiseStringEvalChain`.
|
||||
|
||||
Raises:
|
||||
ValueError: If the input variables are not as expected.
|
||||
|
||||
"""
|
||||
expected_input_vars = {
|
||||
"prediction",
|
||||
"prediction_b",
|
||||
"input",
|
||||
"reference",
|
||||
"criteria",
|
||||
}
|
||||
prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
|
||||
if expected_input_vars != set(prompt_.input_variables):
|
||||
msg = (
|
||||
f"Input variables should be {expected_input_vars}, "
|
||||
f"but got {prompt_.input_variables}"
|
||||
)
|
||||
raise ValueError(msg)
|
||||
criteria_ = resolve_pairwise_criteria(criteria)
|
||||
criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
|
||||
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
|
||||
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
|
||||
@@ -0,0 +1,59 @@
|
||||
"""Prompts for comparing the outputs of two models for a given question.
|
||||
|
||||
This prompt is used to compare two responses and evaluate which one best follows the instructions
|
||||
and answers the question. The prompt is based on the paper from
|
||||
Zheng, et. al. https://arxiv.org/abs/2306.05685
|
||||
""" # noqa: E501
|
||||
|
||||
from langchain_core.prompts.chat import ChatPromptTemplate
|
||||
|
||||
SYSTEM_MESSAGE = 'Please act as an impartial judge and evaluate the quality \
|
||||
of the responses provided by two AI assistants to the user question displayed below. \
|
||||
You should choose the assistant that follows the user\'s instructions \
|
||||
and answers \the user\'s question better. \
|
||||
Your evaluation should consider factors such as the \
|
||||
helpfulness, relevance, accuracy, depth, creativity, \
|
||||
and level of detail of their responses. \
|
||||
Begin your evaluation by comparing the two responses and provide a short explanation. \
|
||||
Avoid any position biases and ensure that the order in which \
|
||||
the responses were presented does not influence your decision. \
|
||||
Do not allow the length of the responses to influence your evaluation. \
|
||||
Do not favor certain names of the assistants. Be as objective as possible. \
|
||||
After providing your explanation, output your final verdict by strictly following \
|
||||
this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, \
|
||||
and "[[C]]" for a tie.'
|
||||
|
||||
CRITERIA_INSTRUCTIONS = (
|
||||
"For this evaluation, you should primarily consider the following criteria:\n"
|
||||
)
|
||||
|
||||
COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
|
||||
[
|
||||
("system", SYSTEM_MESSAGE),
|
||||
(
|
||||
"human",
|
||||
"{criteria}[User Question]\n{input}\n\n\
|
||||
[The Start of Assistant A's Answer]\n{prediction}\n\
|
||||
[The End of Assistant A's Answer]\
|
||||
\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
|
||||
[The End of Assistant B's Answer]",
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
COMPARISON_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
|
||||
[
|
||||
("system", SYSTEM_MESSAGE),
|
||||
(
|
||||
"human",
|
||||
"{criteria}\n\nTo help you evaluate the responses, \
|
||||
here is a reference answer to the user's question:\n\
|
||||
{reference}\
|
||||
[User Question]\n{input}\n\n\
|
||||
[The Start of Assistant A's Answer]\n{prediction}\n\
|
||||
[The End of Assistant A's Answer]\
|
||||
\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
|
||||
[The End of Assistant B's Answer]",
|
||||
),
|
||||
]
|
||||
)
|
||||
Reference in New Issue
Block a user