initial commit
This commit is contained in:
89
venv/Lib/site-packages/langsmith/evaluation/__init__.py
Normal file
89
venv/Lib/site-packages/langsmith/evaluation/__init__.py
Normal file
@@ -0,0 +1,89 @@
|
||||
"""Evaluation Helpers."""
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langsmith.evaluation._arunner import (
|
||||
aevaluate,
|
||||
aevaluate_existing,
|
||||
)
|
||||
from langsmith.evaluation._runner import (
|
||||
evaluate,
|
||||
evaluate_comparative,
|
||||
evaluate_existing,
|
||||
)
|
||||
from langsmith.evaluation.evaluator import (
|
||||
EvaluationResult,
|
||||
EvaluationResults,
|
||||
RunEvaluator,
|
||||
run_evaluator,
|
||||
)
|
||||
|
||||
|
||||
def __getattr__(
    name: str,
) -> Any:
    """Resolve the package's public evaluation helpers on attribute access.

    Each supported name is imported from its defining submodule inside this
    function, i.e. only at the moment the attribute is requested.

    .. deprecated:: 0.5.0

       Importing from langsmith.evaluation is deprecated. Use
       client.evaluate() instead.

    Args:
        name: The attribute being looked up on the module.

    Returns:
        The object exported under ``name``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    # Synchronous runner entry points.
    if name == "evaluate":
        from langsmith.evaluation._runner import evaluate

        return evaluate
    if name == "evaluate_existing":
        from langsmith.evaluation._runner import evaluate_existing

        return evaluate_existing
    if name == "evaluate_comparative":
        from langsmith.evaluation._runner import evaluate_comparative

        return evaluate_comparative

    # Asynchronous runner entry points.
    if name == "aevaluate":
        from langsmith.evaluation._arunner import aevaluate

        return aevaluate
    if name == "aevaluate_existing":
        from langsmith.evaluation._arunner import aevaluate_existing

        return aevaluate_existing

    # Evaluator types and decorators.
    if name == "EvaluationResult":
        from langsmith.evaluation.evaluator import EvaluationResult

        return EvaluationResult
    if name == "EvaluationResults":
        from langsmith.evaluation.evaluator import EvaluationResults

        return EvaluationResults
    if name == "RunEvaluator":
        from langsmith.evaluation.evaluator import RunEvaluator

        return RunEvaluator
    if name == "run_evaluator":
        from langsmith.evaluation.evaluator import run_evaluator

        return run_evaluator
    if name == "StringEvaluator":
        from langsmith.evaluation.string_evaluator import StringEvaluator

        return StringEvaluator

    # Anything else is not part of this package's lazy export surface.
    raise AttributeError(f"module {__name__} has no attribute {name}")
|
||||
|
||||
|
||||
# Public names of this package. None of them is bound at import time; each
# one is resolved on demand by the module-level ``__getattr__``.
__all__ = [
    "run_evaluator",
    "EvaluationResult",
    "EvaluationResults",
    "RunEvaluator",
    "StringEvaluator",
    "aevaluate",
    "aevaluate_existing",
    "evaluate",
    "evaluate_existing",
    "evaluate_comparative",
]


def __dir__() -> list[str]:
    """Return the package's public names so ``dir()`` lists the lazy exports.

    Returns:
        A new list holding the entries of ``__all__``.
    """
    # Hand back a copy so callers cannot mutate ``__all__`` through the result.
    return list(__all__)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1356
venv/Lib/site-packages/langsmith/evaluation/_arunner.py
Normal file
1356
venv/Lib/site-packages/langsmith/evaluation/_arunner.py
Normal file
File diff suppressed because it is too large
Load Diff
727
venv/Lib/site-packages/langsmith/evaluation/_name_generation.py
Normal file
727
venv/Lib/site-packages/langsmith/evaluation/_name_generation.py
Normal file
@@ -0,0 +1,727 @@
|
||||
import random
|
||||
|
||||
adjectives = [
|
||||
"abandoned",
|
||||
"aching",
|
||||
"advanced",
|
||||
"ample",
|
||||
"artistic",
|
||||
"back",
|
||||
"best",
|
||||
"bold",
|
||||
"brief",
|
||||
"clear",
|
||||
"cold",
|
||||
"complicated",
|
||||
"cooked",
|
||||
"crazy",
|
||||
"crushing",
|
||||
"damp",
|
||||
"dear",
|
||||
"definite",
|
||||
"dependable",
|
||||
"diligent",
|
||||
"drab",
|
||||
"earnest",
|
||||
"elderly",
|
||||
"enchanted",
|
||||
"essential",
|
||||
"excellent",
|
||||
"extraneous",
|
||||
"fixed",
|
||||
"flowery",
|
||||
"formal",
|
||||
"fresh",
|
||||
"frosty",
|
||||
"giving",
|
||||
"glossy",
|
||||
"healthy",
|
||||
"helpful",
|
||||
"impressionable",
|
||||
"kind",
|
||||
"large",
|
||||
"left",
|
||||
"long",
|
||||
"loyal",
|
||||
"mealy",
|
||||
"memorable",
|
||||
"monthly",
|
||||
"new",
|
||||
"notable",
|
||||
"only",
|
||||
"ordinary",
|
||||
"passionate",
|
||||
"perfect",
|
||||
"pertinent",
|
||||
"proper",
|
||||
"puzzled",
|
||||
"reflecting",
|
||||
"respectful",
|
||||
"roasted",
|
||||
"scholarly",
|
||||
"shiny",
|
||||
"slight",
|
||||
"sparkling",
|
||||
"spotless",
|
||||
"stupendous",
|
||||
"sunny",
|
||||
"tart",
|
||||
"terrific",
|
||||
"timely",
|
||||
"unique",
|
||||
"upbeat",
|
||||
"vacant",
|
||||
"virtual",
|
||||
"warm",
|
||||
"weary",
|
||||
"whispered",
|
||||
"worthwhile",
|
||||
"yellow",
|
||||
]
|
||||
|
||||
nouns = [
|
||||
"account",
|
||||
"acknowledgment",
|
||||
"address",
|
||||
"advertising",
|
||||
"airplane",
|
||||
"animal",
|
||||
"appointment",
|
||||
"arrival",
|
||||
"artist",
|
||||
"attachment",
|
||||
"attitude",
|
||||
"availability",
|
||||
"backpack",
|
||||
"bag",
|
||||
"balance",
|
||||
"bass",
|
||||
"bean",
|
||||
"beauty",
|
||||
"bibliography",
|
||||
"bill",
|
||||
"bite",
|
||||
"blossom",
|
||||
"boat",
|
||||
"book",
|
||||
"box",
|
||||
"boy",
|
||||
"bread",
|
||||
"bridge",
|
||||
"broccoli",
|
||||
"building",
|
||||
"butter",
|
||||
"button",
|
||||
"cabbage",
|
||||
"cake",
|
||||
"camera",
|
||||
"camp",
|
||||
"candle",
|
||||
"candy",
|
||||
"canvas",
|
||||
"car",
|
||||
"card",
|
||||
"carrot",
|
||||
"cart",
|
||||
"case",
|
||||
"cat",
|
||||
"chain",
|
||||
"chair",
|
||||
"chalk",
|
||||
"chance",
|
||||
"change",
|
||||
"channel",
|
||||
"character",
|
||||
"charge",
|
||||
"charm",
|
||||
"chart",
|
||||
"check",
|
||||
"cheek",
|
||||
"cheese",
|
||||
"chef",
|
||||
"cherry",
|
||||
"chicken",
|
||||
"child",
|
||||
"church",
|
||||
"circle",
|
||||
"class",
|
||||
"clay",
|
||||
"click",
|
||||
"clock",
|
||||
"cloth",
|
||||
"cloud",
|
||||
"clove",
|
||||
"club",
|
||||
"coach",
|
||||
"coal",
|
||||
"coast",
|
||||
"coat",
|
||||
"cod",
|
||||
"coffee",
|
||||
"collar",
|
||||
"color",
|
||||
"comb",
|
||||
"comfort",
|
||||
"comic",
|
||||
"committee",
|
||||
"community",
|
||||
"company",
|
||||
"comparison",
|
||||
"competition",
|
||||
"condition",
|
||||
"connection",
|
||||
"control",
|
||||
"cook",
|
||||
"copper",
|
||||
"copy",
|
||||
"corn",
|
||||
"cough",
|
||||
"country",
|
||||
"cover",
|
||||
"crate",
|
||||
"crayon",
|
||||
"cream",
|
||||
"creator",
|
||||
"crew",
|
||||
"crown",
|
||||
"current",
|
||||
"curtain",
|
||||
"curve",
|
||||
"cushion",
|
||||
"dad",
|
||||
"daughter",
|
||||
"day",
|
||||
"death",
|
||||
"debt",
|
||||
"decision",
|
||||
"deer",
|
||||
"degree",
|
||||
"design",
|
||||
"desire",
|
||||
"desk",
|
||||
"detail",
|
||||
"development",
|
||||
"digestion",
|
||||
"dime",
|
||||
"dinner",
|
||||
"direction",
|
||||
"dirt",
|
||||
"discovery",
|
||||
"discussion",
|
||||
"disease",
|
||||
"disgust",
|
||||
"distance",
|
||||
"distribution",
|
||||
"division",
|
||||
"doctor",
|
||||
"dog",
|
||||
"door",
|
||||
"drain",
|
||||
"drawer",
|
||||
"dress",
|
||||
"drink",
|
||||
"driving",
|
||||
"dust",
|
||||
"ear",
|
||||
"earth",
|
||||
"edge",
|
||||
"education",
|
||||
"effect",
|
||||
"egg",
|
||||
"end",
|
||||
"energy",
|
||||
"engine",
|
||||
"error",
|
||||
"event",
|
||||
"example",
|
||||
"exchange",
|
||||
"existence",
|
||||
"expansion",
|
||||
"experience",
|
||||
"expert",
|
||||
"eye",
|
||||
"face",
|
||||
"fact",
|
||||
"fall",
|
||||
"family",
|
||||
"farm",
|
||||
"father",
|
||||
"fear",
|
||||
"feeling",
|
||||
"field",
|
||||
"finger",
|
||||
"fire",
|
||||
"fish",
|
||||
"flag",
|
||||
"flight",
|
||||
"floor",
|
||||
"flower",
|
||||
"fold",
|
||||
"food",
|
||||
"football",
|
||||
"force",
|
||||
"form",
|
||||
"frame",
|
||||
"friend",
|
||||
"frog",
|
||||
"fruit",
|
||||
"fuel",
|
||||
"furniture",
|
||||
"game",
|
||||
"garden",
|
||||
"gate",
|
||||
"girl",
|
||||
"glass",
|
||||
"glove",
|
||||
"goat",
|
||||
"gold",
|
||||
"government",
|
||||
"grade",
|
||||
"grain",
|
||||
"grass",
|
||||
"green",
|
||||
"grip",
|
||||
"group",
|
||||
"growth",
|
||||
"guide",
|
||||
"guitar",
|
||||
"hair",
|
||||
"hall",
|
||||
"hand",
|
||||
"harbor",
|
||||
"harmony",
|
||||
"hat",
|
||||
"head",
|
||||
"health",
|
||||
"heart",
|
||||
"heat",
|
||||
"hill",
|
||||
"history",
|
||||
"hobbies",
|
||||
"hole",
|
||||
"hope",
|
||||
"horn",
|
||||
"horse",
|
||||
"hospital",
|
||||
"hour",
|
||||
"house",
|
||||
"humor",
|
||||
"idea",
|
||||
"impulse",
|
||||
"income",
|
||||
"increase",
|
||||
"industry",
|
||||
"ink",
|
||||
"insect",
|
||||
"instrument",
|
||||
"insurance",
|
||||
"interest",
|
||||
"invention",
|
||||
"iron",
|
||||
"island",
|
||||
"jelly",
|
||||
"jet",
|
||||
"jewel",
|
||||
"join",
|
||||
"judge",
|
||||
"juice",
|
||||
"jump",
|
||||
"kettle",
|
||||
"key",
|
||||
"kick",
|
||||
"kiss",
|
||||
"kitten",
|
||||
"knee",
|
||||
"knife",
|
||||
"knowledge",
|
||||
"land",
|
||||
"language",
|
||||
"laugh",
|
||||
"law",
|
||||
"lead",
|
||||
"learning",
|
||||
"leather",
|
||||
"leg",
|
||||
"lettuce",
|
||||
"level",
|
||||
"library",
|
||||
"lift",
|
||||
"light",
|
||||
"limit",
|
||||
"line",
|
||||
"linen",
|
||||
"lip",
|
||||
"liquid",
|
||||
"list",
|
||||
"look",
|
||||
"loss",
|
||||
"love",
|
||||
"lunch",
|
||||
"machine",
|
||||
"man",
|
||||
"manager",
|
||||
"map",
|
||||
"marble",
|
||||
"mark",
|
||||
"market",
|
||||
"mass",
|
||||
"match",
|
||||
"meal",
|
||||
"measure",
|
||||
"meat",
|
||||
"meeting",
|
||||
"memory",
|
||||
"metal",
|
||||
"middle",
|
||||
"milk",
|
||||
"mind",
|
||||
"mine",
|
||||
"minute",
|
||||
"mist",
|
||||
"mitten",
|
||||
"mom",
|
||||
"money",
|
||||
"monkey",
|
||||
"month",
|
||||
"moon",
|
||||
"morning",
|
||||
"mother",
|
||||
"motion",
|
||||
"mountain",
|
||||
"mouth",
|
||||
"muscle",
|
||||
"music",
|
||||
"nail",
|
||||
"name",
|
||||
"nation",
|
||||
"neck",
|
||||
"need",
|
||||
"news",
|
||||
"night",
|
||||
"noise",
|
||||
"note",
|
||||
"number",
|
||||
"nut",
|
||||
"observation",
|
||||
"offer",
|
||||
"oil",
|
||||
"operation",
|
||||
"opinion",
|
||||
"orange",
|
||||
"order",
|
||||
"organization",
|
||||
"ornament",
|
||||
"oven",
|
||||
"page",
|
||||
"pail",
|
||||
"pain",
|
||||
"paint",
|
||||
"pan",
|
||||
"pancake",
|
||||
"paper",
|
||||
"parcel",
|
||||
"parent",
|
||||
"part",
|
||||
"passenger",
|
||||
"paste",
|
||||
"payment",
|
||||
"peace",
|
||||
"pear",
|
||||
"pen",
|
||||
"pencil",
|
||||
"person",
|
||||
"pest",
|
||||
"pet",
|
||||
"picture",
|
||||
"pie",
|
||||
"pin",
|
||||
"pipe",
|
||||
"pizza",
|
||||
"place",
|
||||
"plane",
|
||||
"plant",
|
||||
"plastic",
|
||||
"plate",
|
||||
"play",
|
||||
"pleasure",
|
||||
"plot",
|
||||
"plough",
|
||||
"pocket",
|
||||
"point",
|
||||
"poison",
|
||||
"police",
|
||||
"pollution",
|
||||
"popcorn",
|
||||
"porter",
|
||||
"position",
|
||||
"pot",
|
||||
"potato",
|
||||
"powder",
|
||||
"power",
|
||||
"price",
|
||||
"print",
|
||||
"process",
|
||||
"produce",
|
||||
"product",
|
||||
"profit",
|
||||
"property",
|
||||
"prose",
|
||||
"protest",
|
||||
"pull",
|
||||
"pump",
|
||||
"punishment",
|
||||
"purpose",
|
||||
"push",
|
||||
"quarter",
|
||||
"question",
|
||||
"quiet",
|
||||
"quill",
|
||||
"quilt",
|
||||
"quince",
|
||||
"rabbit",
|
||||
"rail",
|
||||
"rain",
|
||||
"range",
|
||||
"rat",
|
||||
"rate",
|
||||
"ray",
|
||||
"reaction",
|
||||
"reading",
|
||||
"reason",
|
||||
"record",
|
||||
"regret",
|
||||
"relation",
|
||||
"religion",
|
||||
"representative",
|
||||
"request",
|
||||
"respect",
|
||||
"rest",
|
||||
"reward",
|
||||
"rhythm",
|
||||
"rice",
|
||||
"river",
|
||||
"road",
|
||||
"roll",
|
||||
"room",
|
||||
"root",
|
||||
"rose",
|
||||
"route",
|
||||
"rub",
|
||||
"rule",
|
||||
"run",
|
||||
"sack",
|
||||
"sail",
|
||||
"salt",
|
||||
"sand",
|
||||
"scale",
|
||||
"scarecrow",
|
||||
"scarf",
|
||||
"scene",
|
||||
"scent",
|
||||
"school",
|
||||
"science",
|
||||
"scissors",
|
||||
"screw",
|
||||
"sea",
|
||||
"seat",
|
||||
"secretary",
|
||||
"seed",
|
||||
"selection",
|
||||
"self",
|
||||
"sense",
|
||||
"servant",
|
||||
"shade",
|
||||
"shake",
|
||||
"shame",
|
||||
"shape",
|
||||
"sheep",
|
||||
"sheet",
|
||||
"shelf",
|
||||
"ship",
|
||||
"shirt",
|
||||
"shock",
|
||||
"shoe",
|
||||
"shop",
|
||||
"show",
|
||||
"side",
|
||||
"sign",
|
||||
"silk",
|
||||
"sink",
|
||||
"sister",
|
||||
"size",
|
||||
"sky",
|
||||
"sleep",
|
||||
"smash",
|
||||
"smell",
|
||||
"smile",
|
||||
"smoke",
|
||||
"snail",
|
||||
"snake",
|
||||
"sneeze",
|
||||
"snow",
|
||||
"soap",
|
||||
"society",
|
||||
"sock",
|
||||
"soda",
|
||||
"sofa",
|
||||
"son",
|
||||
"song",
|
||||
"sort",
|
||||
"sound",
|
||||
"soup",
|
||||
"space",
|
||||
"spark",
|
||||
"speed",
|
||||
"sponge",
|
||||
"spoon",
|
||||
"spray",
|
||||
"spring",
|
||||
"spy",
|
||||
"square",
|
||||
"stamp",
|
||||
"star",
|
||||
"start",
|
||||
"statement",
|
||||
"station",
|
||||
"steam",
|
||||
"steel",
|
||||
"stem",
|
||||
"step",
|
||||
"stew",
|
||||
"stick",
|
||||
"stitch",
|
||||
"stocking",
|
||||
"stomach",
|
||||
"stone",
|
||||
"stop",
|
||||
"store",
|
||||
"story",
|
||||
"stove",
|
||||
"stranger",
|
||||
"straw",
|
||||
"stream",
|
||||
"street",
|
||||
"stretch",
|
||||
"string",
|
||||
"structure",
|
||||
"substance",
|
||||
"sugar",
|
||||
"suggestion",
|
||||
"suit",
|
||||
"summer",
|
||||
"sun",
|
||||
"support",
|
||||
"surprise",
|
||||
"sweater",
|
||||
"swim",
|
||||
"system",
|
||||
"table",
|
||||
"tail",
|
||||
"talk",
|
||||
"tank",
|
||||
"taste",
|
||||
"tax",
|
||||
"tea",
|
||||
"teaching",
|
||||
"team",
|
||||
"tendency",
|
||||
"test",
|
||||
"texture",
|
||||
"theory",
|
||||
"thing",
|
||||
"thought",
|
||||
"thread",
|
||||
"throat",
|
||||
"thumb",
|
||||
"thunder",
|
||||
"ticket",
|
||||
"time",
|
||||
"tin",
|
||||
"title",
|
||||
"toad",
|
||||
"toe",
|
||||
"tooth",
|
||||
"toothpaste",
|
||||
"touch",
|
||||
"town",
|
||||
"toy",
|
||||
"trade",
|
||||
"train",
|
||||
"transport",
|
||||
"tray",
|
||||
"treatment",
|
||||
"tree",
|
||||
"trick",
|
||||
"trip",
|
||||
"trouble",
|
||||
"trousers",
|
||||
"truck",
|
||||
"tub",
|
||||
"turkey",
|
||||
"turn",
|
||||
"twist",
|
||||
"umbrella",
|
||||
"uncle",
|
||||
"underwear",
|
||||
"unit",
|
||||
"use",
|
||||
"vacation",
|
||||
"value",
|
||||
"van",
|
||||
"vase",
|
||||
"vegetable",
|
||||
"veil",
|
||||
"vein",
|
||||
"verse",
|
||||
"vessel",
|
||||
"view",
|
||||
"visitor",
|
||||
"voice",
|
||||
"volcano",
|
||||
"walk",
|
||||
"wall",
|
||||
"war",
|
||||
"wash",
|
||||
"waste",
|
||||
"watch",
|
||||
"water",
|
||||
"wave",
|
||||
"wax",
|
||||
"way",
|
||||
"wealth",
|
||||
"weather",
|
||||
"week",
|
||||
"weight",
|
||||
"wheel",
|
||||
"whip",
|
||||
"whistle",
|
||||
"window",
|
||||
"wine",
|
||||
"wing",
|
||||
"winter",
|
||||
"wire",
|
||||
"wish",
|
||||
"woman",
|
||||
"wood",
|
||||
"wool",
|
||||
"word",
|
||||
"work",
|
||||
"worm",
|
||||
"wound",
|
||||
"wrist",
|
||||
"writer",
|
||||
"yard",
|
||||
"yoke",
|
||||
"zebra",
|
||||
"zinc",
|
||||
"zipper",
|
||||
"zone",
|
||||
]
|
||||
|
||||
|
||||
def random_name() -> str:
    """Generate a random name of the form ``<adjective>-<noun>-<number>``.

    The adjective and noun are picked from the module-level ``adjectives``
    and ``nouns`` lists, and the number is an integer from 1 to 100 inclusive.
    Names are not guaranteed to be unique across calls.

    Returns:
        The three parts joined with hyphens, e.g. ``"bold-zebra-42"``.
    """
    adjective = random.choice(adjectives)
    noun = random.choice(nouns)
    number = random.randint(1, 100)  # both endpoints are included
    return f"{adjective}-{noun}-{number}"
|
||||
2278
venv/Lib/site-packages/langsmith/evaluation/_runner.py
Normal file
2278
venv/Lib/site-packages/langsmith/evaluation/_runner.py
Normal file
File diff suppressed because it is too large
Load Diff
1001
venv/Lib/site-packages/langsmith/evaluation/evaluator.py
Normal file
1001
venv/Lib/site-packages/langsmith/evaluation/evaluator.py
Normal file
File diff suppressed because it is too large
Load Diff
302
venv/Lib/site-packages/langsmith/evaluation/llm_evaluator.py
Normal file
302
venv/Lib/site-packages/langsmith/evaluation/llm_evaluator.py
Normal file
@@ -0,0 +1,302 @@
|
||||
"""Contains the `LLMEvaluator` class for building LLM-as-a-judge evaluators."""
|
||||
|
||||
from typing import Any, Callable, Optional, Union, cast
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from langsmith._internal._beta_decorator import warn_beta
|
||||
from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator
|
||||
from langsmith.schemas import Example, Run
|
||||
|
||||
|
||||
class CategoricalScoreConfig(BaseModel):
    """Configuration for a categorical score.

    The score is one label out of a fixed set of ``choices``.
    """

    # Name of the score; used as the schema title and as the result key.
    key: str
    # Allowed labels; emitted as the ``enum`` of the "score" property.
    choices: list[str]
    # Human-readable description; becomes the schema's description.
    description: str
    # When True, the schema also requires an "explanation" string.
    include_explanation: bool = False
    # Custom description for the "explanation" property; a generic sentence
    # is used when this is None.
    explanation_description: Optional[str] = None
|
||||
|
||||
|
||||
class ContinuousScoreConfig(BaseModel):
    """Configuration for a continuous score.

    The score is a number between ``min`` and ``max``.
    """

    # Name of the score; used as the schema title and as the result key.
    key: str
    # Lower bound; emitted as the "minimum" of the "score" property.
    min: float = 0
    # Upper bound; emitted as the "maximum" of the "score" property.
    max: float = 1
    # Human-readable description; becomes the schema's description.
    description: str
    # When True, the schema also requires an "explanation" string.
    include_explanation: bool = False
    # Custom description for the "explanation" property; a generic sentence
    # is used when this is None.
    explanation_description: Optional[str] = None
|
||||
|
||||
|
||||
def _create_score_json_schema(
    score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
    """Build the JSON schema describing the structured score output.

    Args:
        score_config: The categorical or continuous score configuration.

    Returns:
        A JSON-schema ``dict`` of type "object" with a required "score"
        property and, when ``score_config.include_explanation`` is set, a
        required "explanation" string property.

    Raises:
        ValueError: If ``score_config`` is neither a
            ``CategoricalScoreConfig`` nor a ``ContinuousScoreConfig``.
    """
    properties: dict[str, Any] = {}
    if isinstance(score_config, CategoricalScoreConfig):
        # Categorical scores are one string out of the configured choices.
        properties["score"] = {
            "type": "string",
            "enum": score_config.choices,
            "description": f"The score for the evaluation, one of "
            f"{', '.join(score_config.choices)}.",
        }
    elif isinstance(score_config, ContinuousScoreConfig):
        # Continuous scores are numbers bounded by the configured range.
        properties["score"] = {
            "type": "number",
            "minimum": score_config.min,
            "maximum": score_config.max,
            "description": f"The score for the evaluation, between "
            f"{score_config.min} and {score_config.max}, inclusive.",
        }
    else:
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

    if score_config.include_explanation:
        properties["explanation"] = {
            "type": "string",
            "description": (
                "The explanation for the score."
                if score_config.explanation_description is None
                else score_config.explanation_description
            ),
        }

    return {
        "title": score_config.key,
        "description": score_config.description,
        "type": "object",
        "properties": properties,
        "required": (
            ["score", "explanation"] if score_config.include_explanation else ["score"]
        ),
    }
|
||||
|
||||
|
||||
class LLMEvaluator(RunEvaluator):
    """A class for building LLM-as-a-judge evaluators.

    The evaluator renders a prompt from a run (and optionally an example),
    invokes a chat model constrained to a score schema, and converts the
    structured output into an ``EvaluationResult``.

    .. deprecated:: 0.5.0

       LLMEvaluator is deprecated. Use openevals instead: https://github.com/langchain-ai/openevals
    """

    def __init__(
        self,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
        model_name: str = "gpt-4o",
        model_provider: str = "openai",
        **kwargs,
    ):
        """Initialize the `LLMEvaluator`.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                A function that maps the run and example to the variables in the
                prompt.

                If `None`, it is assumed that the prompt only requires 'input',
                'output', and 'expected'.
            model_name (str): The model to use for the evaluation.
            model_provider (str): The model provider to use for the evaluation.
            **kwargs: Extra keyword arguments forwarded to `init_chat_model`.

        Raises:
            ImportError: If `langchain` (or `langchain-core`) is not installed.
            ValueError: If the model or prompt configuration is invalid.
        """
        try:
            from langchain.chat_models import (  # type: ignore[import-not-found]
                init_chat_model,
            )
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain to be installed. "
                "Please install langchain by running `pip install langchain`."
            ) from e

        chat_model = init_chat_model(
            model=model_name, model_provider=model_provider, **kwargs
        )

        self._initialize(prompt_template, score_config, map_variables, chat_model)

    @classmethod
    def from_model(
        cls,
        model: Any,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
    ):
        """Create an `LLMEvaluator` instance from a `BaseChatModel` instance.

        Args:
            model (BaseChatModel): The chat model instance to use for the evaluation.
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                A function that maps the run and example to the variables in the
                prompt.

                If `None`, it is assumed that the prompt only requires 'input',
                'output', and 'expected'.

        Returns:
            LLMEvaluator: An instance of `LLMEvaluator`.
        """
        # Bypass __init__, which would build its own model via init_chat_model.
        instance = cls.__new__(cls)
        instance._initialize(prompt_template, score_config, map_variables, model)
        return instance

    def _initialize(
        self,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
        chat_model: Any,
    ):
        """Shared initialization code for `__init__` and `from_model`.

        Sets `prompt`, `map_variables`, `score_config`, `score_schema` and
        `runnable` on the instance.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt template.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The score configuration.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                Function to map variables.
            chat_model (BaseChatModel): The chat model instance.

        Raises:
            ImportError: If `langchain-core` is not installed.
            ValueError: If `chat_model` is not a `BaseChatModel` supporting
                structured output, or if the prompt uses variables other than
                'input', 'output' and 'expected' without `map_variables`.
        """
        try:
            from langchain_core.language_models.chat_models import BaseChatModel
            from langchain_core.prompts import ChatPromptTemplate
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain-core to be installed. "
                "Please install langchain-core by running `pip install langchain-core`."
            ) from e

        if not (
            isinstance(chat_model, BaseChatModel)
            and hasattr(chat_model, "with_structured_output")
        ):
            raise ValueError(
                "chat_model must be an instance of "
                "BaseChatModel and support structured output."
            )

        if isinstance(prompt_template, str):
            # A bare string is treated as a single human message.
            self.prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])
        else:
            self.prompt = ChatPromptTemplate.from_messages(prompt_template)

        # Only 'input', 'output' and 'expected' can be filled in automatically;
        # any other prompt variable needs a user-supplied mapping function.
        if set(self.prompt.input_variables) - {"input", "output", "expected"}:
            if not map_variables:
                raise ValueError(
                    "map_variables must be provided if the prompt template contains "
                    "variables other than 'input', 'output', and 'expected'"
                )
        # Always stored (possibly as None): _prepare_variables reads it
        # unconditionally.
        self.map_variables = map_variables

        self.score_config = score_config
        self.score_schema = _create_score_json_schema(self.score_config)

        chat_model = chat_model.with_structured_output(self.score_schema)
        self.runnable = self.prompt | chat_model

    @warn_beta
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run.

        Args:
            run: The run to score.
            example: The reference example, if any.

        Returns:
            The evaluation result parsed from the model's structured output.
        """
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, self.runnable.invoke(variables))
        return self._parse_output(output)

    @warn_beta
    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Asynchronously evaluate a run.

        Args:
            run: The run to score.
            example: The reference example, if any.

        Returns:
            The evaluation result parsed from the model's structured output.
        """
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, await self.runnable.ainvoke(variables))
        return self._parse_output(output)

    def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
        """Prepare variables for model invocation.

        When `map_variables` was supplied its result is returned as-is.
        Otherwise 'input', 'output' and 'expected' are filled from the single
        value of `run.inputs`, `run.outputs` and `example.outputs`
        respectively, for whichever of them the prompt uses.

        Args:
            run: The run being evaluated.
            example: The reference example, if any.

        Returns:
            The mapping of prompt variable names to values.

        Raises:
            ValueError: If a required source is missing or empty, or holds
                more than one key.
        """
        if self.map_variables:
            return self.map_variables(run, example)

        variables: dict[str, Any] = {}
        if "input" in self.prompt.input_variables:
            if not run.inputs:
                raise ValueError(
                    "No input keys are present in run.inputs but the prompt "
                    "requires 'input'."
                )
            if len(run.inputs) != 1:
                raise ValueError(
                    "Multiple input keys are present in run.inputs. Please provide "
                    "a map_variables function."
                )
            variables["input"] = next(iter(run.inputs.values()))

        if "output" in self.prompt.input_variables:
            # Covers both a missing (None) and an empty outputs mapping.
            if not run.outputs:
                raise ValueError(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                )
            if len(run.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in run.outputs. Please "
                    "provide a map_variables function."
                )
            variables["output"] = next(iter(run.outputs.values()))

        if "expected" in self.prompt.input_variables:
            # Covers a missing example as well as missing or empty outputs.
            if not example or not example.outputs:
                raise ValueError(
                    "No example or example outputs is provided but the prompt "
                    "requires 'expected'."
                )
            if len(example.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in example.outputs. Please "
                    "provide a map_variables function."
                )
            variables["expected"] = next(iter(example.outputs.values()))

        return variables

    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
        """Parse the model output into an evaluation result.

        Categorical scores are reported through `value`, continuous scores
        through `score`; the optional explanation becomes the comment.

        Args:
            output: The structured output returned by the model.

        Returns:
            The evaluation result keyed by `score_config.key`.

        Raises:
            KeyError: If `output` has no "score" entry.
            ValueError: If `score_config` is of an unsupported type.
        """
        if isinstance(self.score_config, CategoricalScoreConfig):
            value = output["score"]
            explanation = output.get("explanation", None)
            return EvaluationResult(
                key=self.score_config.key, value=value, comment=explanation
            )
        if isinstance(self.score_config, ContinuousScoreConfig):
            score = output["score"]
            explanation = output.get("explanation", None)
            return EvaluationResult(
                key=self.score_config.key, score=score, comment=explanation
            )
        # Not expected in practice: _initialize already rejects other config
        # types when it builds the schema. Raise rather than return None.
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")
|
||||
@@ -0,0 +1,47 @@
|
||||
"""This module contains the StringEvaluator class."""
|
||||
|
||||
import uuid
|
||||
from typing import Callable, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator
|
||||
from langsmith.schemas import Example, Run
|
||||
|
||||
|
||||
class StringEvaluator(RunEvaluator, BaseModel):
    """Grades the run's string input, output, and optional answer.

    .. deprecated:: 0.5.0

       StringEvaluator is deprecated. Use openevals instead: https://github.com/langchain-ai/openevals
    """

    evaluation_name: Optional[str] = None
    """The name of the evaluation, such as `'Accuracy'` or `'Salience'`."""
    input_key: str = "input"
    """The key in the run inputs to extract the input string."""
    prediction_key: str = "output"
    """The key in the run outputs to extract the prediction string."""
    answer_key: Optional[str] = "output"
    """The key in the example outputs for the answer string."""
    grading_function: Callable[[str, str, Optional[str]], dict]
    """Function that grades the run output against the example output."""

    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> EvaluationResult:
        """Evaluate a single run.

        Args:
            run: The run whose input and output are graded.
            example: The reference example supplying the answer, if any.
            evaluator_run_id: Accepted as part of the signature but not used
                by this implementation.

        Returns:
            An `EvaluationResult` built from the grading function's result,
            with `key` defaulting to `evaluation_name` unless the grading
            result supplies its own.

        Raises:
            ValueError: If `run.outputs` is None.
            KeyError: If `input_key` or `prediction_key` is missing from the
                run's inputs or outputs.
        """
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None.")
        # The answer is optional: without an example, example outputs, or an
        # answer key, the grading function receives None.
        if not example or example.outputs is None or self.answer_key is None:
            answer = None
        else:
            answer = example.outputs.get(self.answer_key)
        run_input = run.inputs[self.input_key]
        run_output = run.outputs[self.prediction_key]
        grading_results = self.grading_function(run_input, run_output, answer)
        # Keys in grading_results override the default "key" entry.
        return EvaluationResult(**{"key": self.evaluation_name, **grading_results})
|
||||
Reference in New Issue
Block a user