initial commit

This commit is contained in:
2026-05-11 12:36:20 +05:30
commit 384cbe8019
15377 changed files with 2360544 additions and 0 deletions

View File

@@ -0,0 +1,89 @@
"""Evaluation Helpers."""
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from langsmith.evaluation._arunner import (
aevaluate,
aevaluate_existing,
)
from langsmith.evaluation._runner import (
evaluate,
evaluate_comparative,
evaluate_existing,
)
from langsmith.evaluation.evaluator import (
EvaluationResult,
EvaluationResults,
RunEvaluator,
run_evaluator,
)
def __getattr__(
    name: str,
) -> Any:
    """Resolve a public evaluation helper the first time it is accessed.

    .. deprecated:: 0.5.0
       Importing from langsmith.evaluation is deprecated. Use client.evaluate() instead.

    Args:
        name: The attribute being looked up on this module.

    Returns:
        The object called ``name`` taken from the submodule that defines it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported helpers.
    """
    # Only the submodule that owns ``name`` is imported, and only at lookup
    # time; the attribute is then read from that submodule by name.
    if name in ("evaluate", "evaluate_existing", "evaluate_comparative"):
        import langsmith.evaluation._runner as provider
    elif name in ("aevaluate", "aevaluate_existing"):
        import langsmith.evaluation._arunner as provider
    elif name in (
        "EvaluationResult",
        "EvaluationResults",
        "RunEvaluator",
        "run_evaluator",
    ):
        import langsmith.evaluation.evaluator as provider
    elif name == "StringEvaluator":
        import langsmith.evaluation.string_evaluator as provider
    else:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    return getattr(provider, name)
# Public names of this package. Every entry is resolved lazily by the module
# level ``__getattr__``, and ``__dir__`` returns this same list, so the order
# here is the order callers observe.
__all__ = [
"run_evaluator",
"EvaluationResult",
"EvaluationResults",
"RunEvaluator",
"StringEvaluator",
"aevaluate",
"aevaluate_existing",
"evaluate",
"evaluate_existing",
"evaluate_comparative",
]
def __dir__() -> list[str]:
    """Return the names listed in ``__all__`` as this module's attributes."""
    return __all__

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,727 @@
import random
# Adjective vocabulary; `random_name` picks one entry as the first part of a
# generated name.
adjectives = [
    "abandoned",
    "aching",
    "advanced",
    "ample",
    "artistic",
    "back",
    "best",
    "bold",
    "brief",
    "clear",
    "cold",
    "complicated",
    "cooked",
    "crazy",
    "crushing",
    "damp",
    "dear",
    "definite",
    "dependable",
    "diligent",
    "drab",
    "earnest",
    "elderly",
    "enchanted",
    "essential",
    "excellent",
    "extraneous",
    "fixed",
    "flowery",
    "formal",
    "fresh",
    "frosty",
    "giving",
    "glossy",
    "healthy",
    "helpful",
    "impressionable",
    "kind",
    "large",
    "left",
    "long",
    "loyal",
    "mealy",
    "memorable",
    "monthly",
    "new",
    "notable",
    "only",
    "ordinary",
    "passionate",
    "perfect",
    "pertinent",
    "proper",
    "puzzled",
    "reflecting",
    "respectful",
    "roasted",
    "scholarly",
    "shiny",
    "slight",
    "sparkling",
    "spotless",
    "stupendous",
    "sunny",
    "tart",
    "terrific",
    "timely",
    "unique",
    "upbeat",
    "vacant",
    "virtual",
    "warm",
    "weary",
    "whispered",
    "worthwhile",
    "yellow",
]
# Noun vocabulary; `random_name` picks one entry as the middle part of a
# generated name.
nouns = [
    "account",
    "acknowledgment",
    "address",
    "advertising",
    "airplane",
    "animal",
    "appointment",
    "arrival",
    "artist",
    "attachment",
    "attitude",
    "availability",
    "backpack",
    "bag",
    "balance",
    "bass",
    "bean",
    "beauty",
    "bibliography",
    "bill",
    "bite",
    "blossom",
    "boat",
    "book",
    "box",
    "boy",
    "bread",
    "bridge",
    "broccoli",
    "building",
    "butter",
    "button",
    "cabbage",
    "cake",
    "camera",
    "camp",
    "candle",
    "candy",
    "canvas",
    "car",
    "card",
    "carrot",
    "cart",
    "case",
    "cat",
    "chain",
    "chair",
    "chalk",
    "chance",
    "change",
    "channel",
    "character",
    "charge",
    "charm",
    "chart",
    "check",
    "cheek",
    "cheese",
    "chef",
    "cherry",
    "chicken",
    "child",
    "church",
    "circle",
    "class",
    "clay",
    "click",
    "clock",
    "cloth",
    "cloud",
    "clove",
    "club",
    "coach",
    "coal",
    "coast",
    "coat",
    "cod",
    "coffee",
    "collar",
    "color",
    "comb",
    "comfort",
    "comic",
    "committee",
    "community",
    "company",
    "comparison",
    "competition",
    "condition",
    "connection",
    "control",
    "cook",
    "copper",
    "copy",
    "corn",
    "cough",
    "country",
    "cover",
    "crate",
    "crayon",
    "cream",
    "creator",
    "crew",
    "crown",
    "current",
    "curtain",
    "curve",
    "cushion",
    "dad",
    "daughter",
    "day",
    "death",
    "debt",
    "decision",
    "deer",
    "degree",
    "design",
    "desire",
    "desk",
    "detail",
    "development",
    "digestion",
    "dime",
    "dinner",
    "direction",
    "dirt",
    "discovery",
    "discussion",
    "disease",
    "disgust",
    "distance",
    "distribution",
    "division",
    "doctor",
    "dog",
    "door",
    "drain",
    "drawer",
    "dress",
    "drink",
    "driving",
    "dust",
    "ear",
    "earth",
    "edge",
    "education",
    "effect",
    "egg",
    "end",
    "energy",
    "engine",
    "error",
    "event",
    "example",
    "exchange",
    "existence",
    "expansion",
    "experience",
    "expert",
    "eye",
    "face",
    "fact",
    "fall",
    "family",
    "farm",
    "father",
    "fear",
    "feeling",
    "field",
    "finger",
    "fire",
    "fish",
    "flag",
    "flight",
    "floor",
    "flower",
    "fold",
    "food",
    "football",
    "force",
    "form",
    "frame",
    "friend",
    "frog",
    "fruit",
    "fuel",
    "furniture",
    "game",
    "garden",
    "gate",
    "girl",
    "glass",
    "glove",
    "goat",
    "gold",
    "government",
    "grade",
    "grain",
    "grass",
    "green",
    "grip",
    "group",
    "growth",
    "guide",
    "guitar",
    "hair",
    "hall",
    "hand",
    "harbor",
    "harmony",
    "hat",
    "head",
    "health",
    "heart",
    "heat",
    "hill",
    "history",
    "hobbies",
    "hole",
    "hope",
    "horn",
    "horse",
    "hospital",
    "hour",
    "house",
    "humor",
    "idea",
    "impulse",
    "income",
    "increase",
    "industry",
    "ink",
    "insect",
    "instrument",
    "insurance",
    "interest",
    "invention",
    "iron",
    "island",
    "jelly",
    "jet",
    "jewel",
    "join",
    "judge",
    "juice",
    "jump",
    "kettle",
    "key",
    "kick",
    "kiss",
    "kitten",
    "knee",
    "knife",
    "knowledge",
    "land",
    "language",
    "laugh",
    "law",
    "lead",
    "learning",
    "leather",
    "leg",
    "lettuce",
    "level",
    "library",
    "lift",
    "light",
    "limit",
    "line",
    "linen",
    "lip",
    "liquid",
    "list",
    "look",
    "loss",
    "love",
    "lunch",
    "machine",
    "man",
    "manager",
    "map",
    "marble",
    "mark",
    "market",
    "mass",
    "match",
    "meal",
    "measure",
    "meat",
    "meeting",
    "memory",
    "metal",
    "middle",
    "milk",
    "mind",
    "mine",
    "minute",
    "mist",
    "mitten",
    "mom",
    "money",
    "monkey",
    "month",
    "moon",
    "morning",
    "mother",
    "motion",
    "mountain",
    "mouth",
    "muscle",
    "music",
    "nail",
    "name",
    "nation",
    "neck",
    "need",
    "news",
    "night",
    "noise",
    "note",
    "number",
    "nut",
    "observation",
    "offer",
    "oil",
    "operation",
    "opinion",
    "orange",
    "order",
    "organization",
    "ornament",
    "oven",
    "page",
    "pail",
    "pain",
    "paint",
    "pan",
    "pancake",
    "paper",
    "parcel",
    "parent",
    "part",
    "passenger",
    "paste",
    "payment",
    "peace",
    "pear",
    "pen",
    "pencil",
    "person",
    "pest",
    "pet",
    "picture",
    "pie",
    "pin",
    "pipe",
    "pizza",
    "place",
    "plane",
    "plant",
    "plastic",
    "plate",
    "play",
    "pleasure",
    "plot",
    "plough",
    "pocket",
    "point",
    "poison",
    "police",
    "pollution",
    "popcorn",
    "porter",
    "position",
    "pot",
    "potato",
    "powder",
    "power",
    "price",
    "print",
    "process",
    "produce",
    "product",
    "profit",
    "property",
    "prose",
    "protest",
    "pull",
    "pump",
    "punishment",
    "purpose",
    "push",
    "quarter",
    "question",
    "quiet",
    "quill",
    "quilt",
    "quince",
    "rabbit",
    "rail",
    "rain",
    "range",
    "rat",
    "rate",
    "ray",
    "reaction",
    "reading",
    "reason",
    "record",
    "regret",
    "relation",
    "religion",
    "representative",
    "request",
    "respect",
    "rest",
    "reward",
    "rhythm",
    "rice",
    "river",
    "road",
    "roll",
    "room",
    "root",
    "rose",
    "route",
    "rub",
    "rule",
    "run",
    "sack",
    "sail",
    "salt",
    "sand",
    "scale",
    "scarecrow",
    "scarf",
    "scene",
    "scent",
    "school",
    "science",
    "scissors",
    "screw",
    "sea",
    "seat",
    "secretary",
    "seed",
    "selection",
    "self",
    "sense",
    "servant",
    "shade",
    "shake",
    "shame",
    "shape",
    "sheep",
    "sheet",
    "shelf",
    "ship",
    "shirt",
    "shock",
    "shoe",
    "shop",
    "show",
    "side",
    "sign",
    "silk",
    "sink",
    "sister",
    "size",
    "sky",
    "sleep",
    "smash",
    "smell",
    "smile",
    "smoke",
    "snail",
    "snake",
    "sneeze",
    "snow",
    "soap",
    "society",
    "sock",
    "soda",
    "sofa",
    "son",
    "song",
    "sort",
    "sound",
    "soup",
    "space",
    "spark",
    "speed",
    "sponge",
    "spoon",
    "spray",
    "spring",
    "spy",
    "square",
    "stamp",
    "star",
    "start",
    "statement",
    "station",
    "steam",
    "steel",
    "stem",
    "step",
    "stew",
    "stick",
    "stitch",
    "stocking",
    "stomach",
    "stone",
    "stop",
    "store",
    "story",
    "stove",
    "stranger",
    "straw",
    "stream",
    "street",
    "stretch",
    "string",
    "structure",
    "substance",
    "sugar",
    "suggestion",
    "suit",
    "summer",
    "sun",
    "support",
    "surprise",
    "sweater",
    "swim",
    "system",
    "table",
    "tail",
    "talk",
    "tank",
    "taste",
    "tax",
    "tea",
    "teaching",
    "team",
    "tendency",
    "test",
    "texture",
    "theory",
    "thing",
    "thought",
    "thread",
    "throat",
    "thumb",
    "thunder",
    "ticket",
    "time",
    "tin",
    "title",
    "toad",
    "toe",
    "tooth",
    "toothpaste",
    "touch",
    "town",
    "toy",
    "trade",
    "train",
    "transport",
    "tray",
    "treatment",
    "tree",
    "trick",
    "trip",
    "trouble",
    "trousers",
    "truck",
    "tub",
    "turkey",
    "turn",
    "twist",
    "umbrella",
    "uncle",
    "underwear",
    "unit",
    "use",
    "vacation",
    "value",
    "van",
    "vase",
    "vegetable",
    "veil",
    "vein",
    "verse",
    "vessel",
    "view",
    "visitor",
    "voice",
    "volcano",
    "walk",
    "wall",
    "war",
    "wash",
    "waste",
    "watch",
    "water",
    "wave",
    "wax",
    "way",
    "wealth",
    "weather",
    "week",
    "weight",
    "wheel",
    "whip",
    "whistle",
    "window",
    "wine",
    "wing",
    "winter",
    "wire",
    "wish",
    "woman",
    "wood",
    "wool",
    "word",
    "work",
    "worm",
    "wound",
    "wrist",
    "writer",
    "yard",
    "yoke",
    "zebra",
    "zinc",
    "zipper",
    "zone",
]
def random_name() -> str:
    """Build a random ``<adjective>-<noun>-<number>`` name.

    Returns:
        A string made of one random entry from ``adjectives``, one random
        entry from ``nouns`` and a random integer between 1 and 100
        (inclusive), joined with hyphens.
    """
    # The tuple is evaluated left to right, so the random draws happen in the
    # order adjective, noun, number.
    parts = (
        random.choice(adjectives),
        random.choice(nouns),
        str(random.randint(1, 100)),
    )
    return "-".join(parts)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,302 @@
"""Contains the `LLMEvaluator` class for building LLM-as-a-judge evaluators."""
from typing import Any, Callable, Optional, Union, cast
from pydantic import BaseModel
from langsmith._internal._beta_decorator import warn_beta
from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator
from langsmith.schemas import Example, Run
class CategoricalScoreConfig(BaseModel):
    """Configuration for a categorical score."""

    # Name of the score; used as the JSON schema title and as the key of the
    # resulting evaluation result.
    key: str
    # The allowed score values; emitted as the schema's string ``enum``.
    choices: list[str]
    # Description placed on the generated JSON schema.
    description: str
    # When true, the schema also requires an "explanation" string.
    include_explanation: bool = False
    # Description for the "explanation" property; a generic sentence is used
    # by the schema builder when this is None.
    explanation_description: Optional[str] = None
class ContinuousScoreConfig(BaseModel):
    """Configuration for a continuous score."""

    # Name of the score; used as the JSON schema title and as the key of the
    # resulting evaluation result.
    key: str
    # Inclusive lower bound, emitted as the schema's ``minimum``.
    min: float = 0
    # Inclusive upper bound, emitted as the schema's ``maximum``.
    max: float = 1
    # Description placed on the generated JSON schema.
    description: str
    # When true, the schema also requires an "explanation" string.
    include_explanation: bool = False
    # Description for the "explanation" property; a generic sentence is used
    # by the schema builder when this is None.
    explanation_description: Optional[str] = None
def _create_score_json_schema(
    score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
    """Build the JSON schema describing the judge model's structured answer.

    Args:
        score_config: Categorical or continuous score configuration.

    Returns:
        An object schema titled with the config's key. It always has a
        required ``score`` property (a string enum for categorical configs, a
        bounded number for continuous ones) and, when the config asks for it,
        a required ``explanation`` string property.

    Raises:
        ValueError: If ``score_config`` is neither supported config type.
    """
    score_property: dict[str, Any]
    if isinstance(score_config, CategoricalScoreConfig):
        allowed = ", ".join(score_config.choices)
        score_property = {
            "type": "string",
            "enum": score_config.choices,
            "description": f"The score for the evaluation, one of {allowed}.",
        }
    elif isinstance(score_config, ContinuousScoreConfig):
        lower, upper = score_config.min, score_config.max
        score_property = {
            "type": "number",
            "minimum": lower,
            "maximum": upper,
            "description": (
                f"The score for the evaluation, between {lower} and {upper}, "
                "inclusive."
            ),
        }
    else:
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

    properties: dict[str, Any] = {"score": score_property}
    required = ["score"]

    if score_config.include_explanation:
        explanation_text = score_config.explanation_description
        if explanation_text is None:
            explanation_text = "The explanation for the score."
        properties["explanation"] = {
            "type": "string",
            "description": explanation_text,
        }
        required.append("explanation")

    return {
        "title": score_config.key,
        "description": score_config.description,
        "type": "object",
        "properties": properties,
        "required": required,
    }
class LLMEvaluator(RunEvaluator):
    """A class for building LLM-as-a-judge evaluators.

    The evaluator fills a chat prompt from a run (and optionally its example),
    asks a chat model for structured output matching the score schema, and
    turns the answer into an `EvaluationResult`.

    .. deprecated:: 0.5.0
       LLMEvaluator is deprecated. Use openevals instead: https://github.com/langchain-ai/openevals
    """

    def __init__(
        self,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
        model_name: str = "gpt-4o",
        model_provider: str = "openai",
        **kwargs,
    ):
        """Initialize the `LLMEvaluator`.

        Args:
            prompt_template: The prompt template to use for the evaluation.
                If a string is provided, it is assumed to be a human / user
                message; otherwise a list of ``(role, template)`` tuples.
            score_config: The configuration for the score, either categorical
                or continuous.
            map_variables: A function that maps the run and example to the
                variables in the prompt. If `None`, it is assumed that the
                prompt only requires 'input', 'output', and 'expected'.
            model_name: The model to use for the evaluation.
            model_provider: The model provider to use for the evaluation.
            **kwargs: Extra keyword arguments forwarded to `init_chat_model`.

        Raises:
            ImportError: If langchain (or langchain-core) is not installed.
            ValueError: If the model or the prompt fails validation (see
                `_initialize`).
        """
        try:
            from langchain.chat_models import (  # type: ignore[import-not-found]
                init_chat_model,
            )
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain to be installed. "
                "Please install langchain by running `pip install langchain`."
            ) from e

        chat_model = init_chat_model(
            model=model_name, model_provider=model_provider, **kwargs
        )
        self._initialize(prompt_template, score_config, map_variables, chat_model)

    @classmethod
    def from_model(
        cls,
        model: Any,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
    ):
        """Create an `LLMEvaluator` instance from a `BaseChatModel` instance.

        Args:
            model: The chat model instance (a `BaseChatModel`) to use for the
                evaluation.
            prompt_template: The prompt template to use for the evaluation.
                If a string is provided, it is assumed to be a human / user
                message; otherwise a list of ``(role, template)`` tuples.
            score_config: The configuration for the score, either categorical
                or continuous.
            map_variables: A function that maps the run and example to the
                variables in the prompt. If `None`, it is assumed that the
                prompt only requires 'input', 'output', and 'expected'.

        Returns:
            LLMEvaluator: An instance of `LLMEvaluator`.

        Raises:
            ImportError: If langchain-core is not installed.
            ValueError: If the model or the prompt fails validation (see
                `_initialize`).
        """
        # Bypass __init__, which would build its own model via init_chat_model.
        instance = cls.__new__(cls)
        instance._initialize(prompt_template, score_config, map_variables, model)
        return instance

    def _initialize(
        self,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
        chat_model: Any,
    ):
        """Shared initialization code for `__init__` and `from_model`.

        Args:
            prompt_template: The prompt template.
            score_config: The score configuration.
            map_variables: Function to map variables.
            chat_model: The chat model instance (a `BaseChatModel`).

        Raises:
            ImportError: If langchain-core is not installed.
            ValueError: If ``chat_model`` is not a `BaseChatModel` supporting
                structured output, or if the prompt uses variables other than
                'input', 'output' and 'expected' without ``map_variables``.
        """
        try:
            from langchain_core.language_models.chat_models import BaseChatModel
            from langchain_core.prompts import ChatPromptTemplate
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain-core to be installed. "
                "Please install langchain-core by running `pip install langchain-core`."
            ) from e

        if not (
            isinstance(chat_model, BaseChatModel)
            and hasattr(chat_model, "with_structured_output")
        ):
            # The message names the class that is actually checked above.
            raise ValueError(
                "chat_model must be an instance of "
                "BaseChatModel and support structured output."
            )

        if isinstance(prompt_template, str):
            # A bare string is treated as a single human message.
            self.prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])
        else:
            self.prompt = ChatPromptTemplate.from_messages(prompt_template)

        extra_variables = set(self.prompt.input_variables) - {
            "input",
            "output",
            "expected",
        }
        if extra_variables and not map_variables:
            raise ValueError(
                "map_variables must be provided if the prompt template contains "
                "variables other than 'input', 'output', and 'expected'"
            )

        self.map_variables = map_variables
        self.score_config = score_config
        self.score_schema = _create_score_json_schema(self.score_config)

        chat_model = chat_model.with_structured_output(self.score_schema)
        self.runnable = self.prompt | chat_model

    @warn_beta
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run.

        Args:
            run: The run to grade.
            example: The reference example, if any.

        Returns:
            The evaluation result parsed from the model's structured output.
        """
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, self.runnable.invoke(variables))
        return self._parse_output(output)

    @warn_beta
    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Asynchronously evaluate a run.

        Args:
            run: The run to grade.
            example: The reference example, if any.

        Returns:
            The evaluation result parsed from the model's structured output.
        """
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, await self.runnable.ainvoke(variables))
        return self._parse_output(output)

    def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
        """Prepare variables for model invocation.

        When ``map_variables`` was supplied, its result is returned as is.
        Otherwise each of 'input', 'output' and 'expected' that the prompt
        uses is filled from the single value in ``run.inputs``,
        ``run.outputs`` and ``example.outputs`` respectively.

        Args:
            run: The run being graded.
            example: The reference example, if any.

        Returns:
            The mapping of prompt variable names to values.

        Raises:
            ValueError: If a needed source is missing or empty, or holds more
                than one key (in which case ``map_variables`` is required).
        """
        if self.map_variables:
            return self.map_variables(run, example)

        needed = self.prompt.input_variables
        variables = {}

        if "input" in needed:
            if not run.inputs:
                raise ValueError(
                    "No input keys are present in run.inputs but the prompt "
                    "requires 'input'."
                )
            if len(run.inputs) != 1:
                raise ValueError(
                    "Multiple input keys are present in run.inputs. Please provide "
                    "a map_variables function."
                )
            variables["input"] = next(iter(run.inputs.values()))

        if "output" in needed:
            # `not run.outputs` covers both None and an empty mapping.
            if not run.outputs:
                raise ValueError(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                )
            if len(run.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in run.outputs. Please "
                    "provide a map_variables function."
                )
            variables["output"] = next(iter(run.outputs.values()))

        if "expected" in needed:
            # Covers a missing example as well as None / empty outputs.
            if not example or not example.outputs:
                raise ValueError(
                    "No example or example outputs is provided but the prompt "
                    "requires 'expected'."
                )
            if len(example.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in example.outputs. Please "
                    "provide a map_variables function."
                )
            variables["expected"] = next(iter(example.outputs.values()))

        return variables

    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
        """Parse the model output into an evaluation result.

        Args:
            output: Structured model output with a "score" entry and an
                optional "explanation" entry.

        Returns:
            An `EvaluationResult` keyed by the config's key. Categorical
            scores are stored in ``value`` and continuous scores in ``score``;
            the explanation, if present, becomes the comment.

        Raises:
            ValueError: If the stored score config is neither supported type.
        """
        config = self.score_config
        if not isinstance(config, (CategoricalScoreConfig, ContinuousScoreConfig)):
            # Fail loudly instead of implicitly returning None.
            raise ValueError(
                "Invalid score type. Must be 'categorical' or 'continuous'"
            )

        verdict = output["score"]
        explanation = output.get("explanation", None)
        if isinstance(config, CategoricalScoreConfig):
            return EvaluationResult(key=config.key, value=verdict, comment=explanation)
        return EvaluationResult(key=config.key, score=verdict, comment=explanation)

View File

@@ -0,0 +1,47 @@
"""This module contains the StringEvaluator class."""
import uuid
from typing import Callable, Optional
from pydantic import BaseModel
from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run
class StringEvaluator(RunEvaluator, BaseModel):
    """Grades the run's string input, output, and optional answer.

    .. deprecated:: 0.5.0
       StringEvaluator is deprecated. Use openevals instead: https://github.com/langchain-ai/openevals
    """

    evaluation_name: Optional[str] = None
    """The name of the evaluation, such as `'Accuracy'` or `'Salience'`."""
    input_key: str = "input"
    """The key in the run inputs to extract the input string."""
    prediction_key: str = "output"
    """The key in the run outputs to extract the prediction string."""
    answer_key: Optional[str] = "output"
    """The key in the example outputs for the answer string."""
    grading_function: Callable[[str, str, Optional[str]], dict]
    """Function that grades the run output against the example output."""

    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> EvaluationResult:
        """Evaluate a single run.

        Args:
            run: The run whose input and prediction are graded.
            example: Optional reference example supplying the answer.
            evaluator_run_id: Accepted but not used by this method.

        Returns:
            An `EvaluationResult` built from `evaluation_name` as the key and
            the entries returned by `grading_function` (which take precedence
            on duplicate keys).

        Raises:
            ValueError: If ``run.outputs`` is None.
        """
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None.")

        # The answer stays None unless an example with outputs and an answer
        # key are all available.
        reference = None
        if example and example.outputs is not None and self.answer_key is not None:
            reference = example.outputs.get(self.answer_key)

        graded = self.grading_function(
            run.inputs[self.input_key],
            run.outputs[self.prediction_key],
            reference,
        )
        payload = {"key": self.evaluation_name, **graded}
        return EvaluationResult(**payload)