initial commit

2026-05-11 12:36:20 +05:30
commit 384cbe8019
15377 changed files with 2360544 additions and 0 deletions
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/init.py
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/init.py
@@ -0,0 +1,72 @@
+"""LangSmith evaluation utilities.
+
+This module provides utilities for evaluating Chains and other language model
+applications using LangChain evaluators and LangSmith.
+
+For more information on the LangSmith API, see the
+[LangSmith API documentation](https://docs.langchain.com/langsmith/home).
+
+**Example**
+
+```python
+from langsmith import Client
+from langchain_openai import ChatOpenAI
+from langchain_classic.chains import LLMChain
+from langchain_classic.smith import EvaluatorType, RunEvalConfig, run_on_dataset
+
+
+def construct_chain():
+    model = ChatOpenAI(temperature=0)
+    chain = LLMChain.from_string(model, "What's the answer to {your_input_key}")
+    return chain
+
+
+evaluation_config = RunEvalConfig(
+    evaluators=[
+        EvaluatorType.QA,  # "Correctness" against a reference answer
+        EvaluatorType.EMBEDDING_DISTANCE,
+        RunEvalConfig.Criteria("helpfulness"),
+        RunEvalConfig.Criteria(
+            {
+                "fifth-grader-score": "Do you have to be smarter than a fifth "
+                "grader to answer this question?"
+            }
+        ),
+    ]
+)
+
+client = Client()
+run_on_dataset(
+    client, "<my_dataset_name>", construct_chain, evaluation=evaluation_config
+)
+```
+
+**Attributes**
+
+- `arun_on_dataset`: Asynchronous function to evaluate a chain or other LangChain
+    component over a dataset.
+- `run_on_dataset`: Function to evaluate a chain or other LangChain component over a
+    dataset.
+- `RunEvalConfig`: Class representing the configuration for running evaluation.
+- `StringRunEvaluatorChain`: Class representing a string run evaluator chain.
+- `InputFormatError`: Exception raised when the input format is incorrect.
+
+"""
+
+from langchain_classic.smith.evaluation.config import RunEvalConfig
+from langchain_classic.smith.evaluation.runner_utils import (
+    InputFormatError,
+    arun_on_dataset,
+    run_on_dataset,
+)
+from langchain_classic.smith.evaluation.string_run_evaluator import (
+    StringRunEvaluatorChain,
+)
+
+__all__ = [
+    "InputFormatError",
+    "RunEvalConfig",
+    "StringRunEvaluatorChain",
+    "arun_on_dataset",
+    "run_on_dataset",
+]
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/init.cpython-311.pyc
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/init.cpython-311.pyc
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/config.cpython-311.pyc
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/config.cpython-311.pyc
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/name_generation.cpython-311.pyc
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/name_generation.cpython-311.pyc
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/progress.cpython-311.pyc
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/progress.cpython-311.pyc
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/runner_utils.cpython-311.pyc
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/runner_utils.cpython-311.pyc
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/string_run_evaluator.cpython-311.pyc
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/pycache/string_run_evaluator.cpython-311.pyc
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/config.py
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/config.py
@@ -0,0 +1,273 @@
+"""Configuration for run evaluators."""
+
+from collections.abc import Callable, Sequence
+from typing import Any
+
+from langchain_core.embeddings import Embeddings
+from langchain_core.language_models import BaseLanguageModel
+from langchain_core.prompts import BasePromptTemplate
+from langsmith import RunEvaluator
+from langsmith.evaluation.evaluator import EvaluationResult, EvaluationResults
+from langsmith.schemas import Example, Run
+from pydantic import BaseModel, ConfigDict, Field
+from typing_extensions import override
+
+from langchain_classic.evaluation.criteria.eval_chain import CRITERIA_TYPE
+from langchain_classic.evaluation.embedding_distance.base import (
+    EmbeddingDistance as EmbeddingDistanceEnum,
+)
+from langchain_classic.evaluation.schema import EvaluatorType, StringEvaluator
+from langchain_classic.evaluation.string_distance.base import (
+    StringDistance as StringDistanceEnum,
+)
+
+# Callable form of a per-run evaluator: it receives a `Run` and an optional
+# `Example` and returns an `EvaluationResult`, `EvaluationResults`, or a dict.
+RUN_EVALUATOR_LIKE = Callable[
+    [Run, Example | None],
+    EvaluationResult | EvaluationResults | dict,
+]
+# Callable form of a batch evaluator: it receives a sequence of `Run` objects
+# and an optional sequence of `Example` objects, with the same return types.
+BATCH_EVALUATOR_LIKE = Callable[
+    [Sequence[Run], Sequence[Example] | None],
+    EvaluationResult | EvaluationResults | dict,
+]
+
+
+class EvalConfig(BaseModel):
+    """Base configuration shared by every run evaluator.
+
+    Attributes:
+        evaluator_type: The type of evaluator to use.
+    """
+
+    evaluator_type: EvaluatorType
+
+    def get_kwargs(self) -> dict[str, Any]:
+        """Collect the keyword arguments for the `load_evaluator` call.
+
+        Every field of the model is forwarded except `evaluator_type` itself
+        and any field whose value is `None`.
+
+        Returns:
+            The keyword arguments for the `load_evaluator` call.
+        """
+        return {
+            name: value
+            for name, value in self
+            if name != "evaluator_type" and value is not None
+        }
+
+
+class SingleKeyEvalConfig(EvalConfig):
+    """Configuration for a run evaluator that only requires a single key."""
+
+    reference_key: str | None = None
+    """The key in the dataset run to use as the reference string.
+    If not provided, we will attempt to infer automatically."""
+    prediction_key: str | None = None
+    """The key from the traced run's outputs dictionary to use to
+    represent the prediction. If not provided, it will be inferred
+    automatically."""
+    input_key: str | None = None
+    """The key from the traced run's inputs dictionary to use to represent the
+    input. If not provided, it will be inferred automatically."""
+
+    @override
+    def get_kwargs(self) -> dict[str, Any]:
+        inherited = super().get_kwargs()
+        # Filter out the keys that are not needed for the evaluator.
+        key_fields = ("reference_key", "prediction_key", "input_key")
+        return {
+            name: value for name, value in inherited.items() if name not in key_fields
+        }
+
+
+# A custom evaluator may be a plain callable, a `RunEvaluator`, or a
+# `StringEvaluator`.
+CUSTOM_EVALUATOR_TYPE = RUN_EVALUATOR_LIKE | RunEvaluator | StringEvaluator
+# A single evaluator selection may be an `EvaluatorType`, a string, or an
+# `EvalConfig` instance.
+SINGLE_EVAL_CONFIG_TYPE = EvaluatorType | str | EvalConfig
+
+
+class RunEvalConfig(BaseModel):
+    """Configuration for a run evaluation.
+
+    Besides the top-level fields, this class namespaces one nested config class
+    per evaluator (for example `RunEvalConfig.QA` or `RunEvalConfig.Criteria`).
+    Each nested class pins its `evaluator_type` default and declares the extra
+    fields that evaluator accepts.
+    """
+
+    evaluators: list[SINGLE_EVAL_CONFIG_TYPE | CUSTOM_EVALUATOR_TYPE] = Field(
+        default_factory=list
+    )
+    """Configurations for which evaluators to apply to the dataset run.
+    Each can be the string of an
+    `EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
+    as `EvaluatorType.QA`, the evaluator type string ("qa"), or a configuration for a
+    given evaluator
+    (e.g.,
+    `RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`)."""
+    custom_evaluators: list[CUSTOM_EVALUATOR_TYPE] | None = None
+    """Custom evaluators to apply to the dataset run."""
+    batch_evaluators: list[BATCH_EVALUATOR_LIKE] | None = None
+    """Evaluators that run on an aggregate/batch level.
+
+    These generate one or more metrics that are assigned to the full test run.
+    As a result, they are not associated with individual traces.
+    """
+
+    reference_key: str | None = None
+    """The key in the dataset run to use as the reference string.
+    If not provided, we will attempt to infer automatically."""
+    prediction_key: str | None = None
+    """The key from the traced run's outputs dictionary to use to
+    represent the prediction. If not provided, it will be inferred
+    automatically."""
+    input_key: str | None = None
+    """The key from the traced run's inputs dictionary to use to represent the
+    input. If not provided, it will be inferred automatically."""
+    eval_llm: BaseLanguageModel | None = None
+    """The language model to pass to any evaluators that require one."""
+
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+    )
+
+    class Criteria(SingleKeyEvalConfig):
+        """Configuration for a reference-free criteria evaluator.
+
+        Attributes:
+            criteria: The criteria to evaluate.
+            llm: The language model to use for the evaluation chain.
+        """
+
+        criteria: CRITERIA_TYPE | None = None
+        llm: BaseLanguageModel | None = None
+        evaluator_type: EvaluatorType = EvaluatorType.CRITERIA
+
+    class LabeledCriteria(SingleKeyEvalConfig):
+        """Configuration for a labeled (with references) criteria evaluator.
+
+        Attributes:
+            criteria: The criteria to evaluate.
+            llm: The language model to use for the evaluation chain.
+        """
+
+        criteria: CRITERIA_TYPE | None = None
+        llm: BaseLanguageModel | None = None
+        evaluator_type: EvaluatorType = EvaluatorType.LABELED_CRITERIA
+
+    class EmbeddingDistance(SingleKeyEvalConfig):
+        """Configuration for an embedding distance evaluator.
+
+        Attributes:
+            embeddings: The embeddings to use for computing the distance.
+            distance_metric: The distance metric to use for computing the distance.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.EMBEDDING_DISTANCE
+        embeddings: Embeddings | None = None
+        distance_metric: EmbeddingDistanceEnum | None = None
+
+        model_config = ConfigDict(
+            arbitrary_types_allowed=True,
+        )
+
+    class StringDistance(SingleKeyEvalConfig):
+        """Configuration for a string distance evaluator.
+
+        Attributes:
+            distance: The string distance metric to use (`damerau_levenshtein`,
+                `levenshtein`, `jaro`, or `jaro_winkler`).
+            normalize_score: Whether to normalize the distance to between 0 and 1.
+                Applies only to the Levenshtein and Damerau-Levenshtein distances.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.STRING_DISTANCE
+        distance: StringDistanceEnum | None = None
+        normalize_score: bool = True
+
+    class QA(SingleKeyEvalConfig):
+        """Configuration for a QA evaluator.
+
+        Attributes:
+            prompt: The prompt template to use for generating the question.
+            llm: The language model to use for the evaluation chain.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.QA
+        llm: BaseLanguageModel | None = None
+        prompt: BasePromptTemplate | None = None
+
+    class ContextQA(SingleKeyEvalConfig):
+        """Configuration for a context-based QA evaluator.
+
+        Attributes:
+            prompt: The prompt template to use for generating the question.
+            llm: The language model to use for the evaluation chain.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.CONTEXT_QA
+        llm: BaseLanguageModel | None = None
+        prompt: BasePromptTemplate | None = None
+
+    class CoTQA(SingleKeyEvalConfig):
+        """Configuration for a chain-of-thought (CoT) QA evaluator.
+
+        Attributes:
+            prompt: The prompt template to use for generating the question.
+            llm: The language model to use for the evaluation chain.
+        """
+
+        # Must select the CoT evaluator; `CONTEXT_QA` here would make this
+        # config indistinguishable from `ContextQA`.
+        evaluator_type: EvaluatorType = EvaluatorType.COT_QA
+        llm: BaseLanguageModel | None = None
+        prompt: BasePromptTemplate | None = None
+
+    class JsonValidity(SingleKeyEvalConfig):
+        """Configuration for a json validity evaluator."""
+
+        evaluator_type: EvaluatorType = EvaluatorType.JSON_VALIDITY
+
+    class JsonEqualityEvaluator(EvalConfig):
+        """Configuration for a json equality evaluator."""
+
+        evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY
+
+    class ExactMatch(SingleKeyEvalConfig):
+        """Configuration for an exact match string evaluator.
+
+        Attributes:
+            ignore_case: Whether to ignore case when comparing strings.
+            ignore_punctuation: Whether to ignore punctuation when comparing strings.
+            ignore_numbers: Whether to ignore numbers when comparing strings.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.EXACT_MATCH
+        ignore_case: bool = False
+        ignore_punctuation: bool = False
+        ignore_numbers: bool = False
+
+    class RegexMatch(SingleKeyEvalConfig):
+        """Configuration for a regex match string evaluator.
+
+        Attributes:
+            flags: The flags to pass to the regex. Example: `re.IGNORECASE`.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
+        flags: int = 0
+
+    class ScoreString(SingleKeyEvalConfig):
+        """Configuration for a score string evaluator.
+
+        This is like the criteria evaluator but it is configured by
+        default to return a score on the scale from 1-10.
+
+        It is recommended to normalize these scores
+        by setting `normalize_by` to 10.
+
+        Attributes:
+            criteria: The criteria to evaluate.
+            llm: The language model to use for the evaluation chain.
+            normalize_by: If you want to normalize the score, the denominator to use.
+                If not provided, the score will be between 1 and 10.
+            prompt: The prompt template to use for evaluation.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.SCORE_STRING
+        criteria: CRITERIA_TYPE | None = None
+        llm: BaseLanguageModel | None = None
+        normalize_by: float | None = None
+        prompt: BasePromptTemplate | None = None
+
+    class LabeledScoreString(ScoreString):
+        """Configuration for a labeled score string evaluator."""
+
+        evaluator_type: EvaluatorType = EvaluatorType.LABELED_SCORE_STRING
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/name_generation.py
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/name_generation.py
@@ -0,0 +1,727 @@
+import random
+
+adjectives = [
+    "abandoned",
+    "aching",
+    "advanced",
+    "ample",
+    "artistic",
+    "back",
+    "best",
+    "bold",
+    "brief",
+    "clear",
+    "cold",
+    "complicated",
+    "cooked",
+    "crazy",
+    "crushing",
+    "damp",
+    "dear",
+    "definite",
+    "dependable",
+    "diligent",
+    "drab",
+    "earnest",
+    "elderly",
+    "enchanted",
+    "essential",
+    "excellent",
+    "extraneous",
+    "fixed",
+    "flowery",
+    "formal",
+    "fresh",
+    "frosty",
+    "giving",
+    "glossy",
+    "healthy",
+    "helpful",
+    "impressionable",
+    "kind",
+    "large",
+    "left",
+    "long",
+    "loyal",
+    "mealy",
+    "memorable",
+    "monthly",
+    "new",
+    "notable",
+    "only",
+    "ordinary",
+    "passionate",
+    "perfect",
+    "pertinent",
+    "proper",
+    "puzzled",
+    "reflecting",
+    "respectful",
+    "roasted",
+    "scholarly",
+    "shiny",
+    "slight",
+    "sparkling",
+    "spotless",
+    "stupendous",
+    "sunny",
+    "tart",
+    "terrific",
+    "timely",
+    "unique",
+    "upbeat",
+    "vacant",
+    "virtual",
+    "warm",
+    "weary",
+    "whispered",
+    "worthwhile",
+    "yellow",
+]
+
+nouns = [
+    "account",
+    "acknowledgment",
+    "address",
+    "advertising",
+    "airplane",
+    "animal",
+    "appointment",
+    "arrival",
+    "artist",
+    "attachment",
+    "attitude",
+    "availability",
+    "backpack",
+    "bag",
+    "balance",
+    "bass",
+    "bean",
+    "beauty",
+    "bibliography",
+    "bill",
+    "bite",
+    "blossom",
+    "boat",
+    "book",
+    "box",
+    "boy",
+    "bread",
+    "bridge",
+    "broccoli",
+    "building",
+    "butter",
+    "button",
+    "cabbage",
+    "cake",
+    "camera",
+    "camp",
+    "candle",
+    "candy",
+    "canvas",
+    "car",
+    "card",
+    "carrot",
+    "cart",
+    "case",
+    "cat",
+    "chain",
+    "chair",
+    "chalk",
+    "chance",
+    "change",
+    "channel",
+    "character",
+    "charge",
+    "charm",
+    "chart",
+    "check",
+    "cheek",
+    "cheese",
+    "chef",
+    "cherry",
+    "chicken",
+    "child",
+    "church",
+    "circle",
+    "class",
+    "clay",
+    "click",
+    "clock",
+    "cloth",
+    "cloud",
+    "clove",
+    "club",
+    "coach",
+    "coal",
+    "coast",
+    "coat",
+    "cod",
+    "coffee",
+    "collar",
+    "color",
+    "comb",
+    "comfort",
+    "comic",
+    "committee",
+    "community",
+    "company",
+    "comparison",
+    "competition",
+    "condition",
+    "connection",
+    "control",
+    "cook",
+    "copper",
+    "copy",
+    "corn",
+    "cough",
+    "country",
+    "cover",
+    "crate",
+    "crayon",
+    "cream",
+    "creator",
+    "crew",
+    "crown",
+    "current",
+    "curtain",
+    "curve",
+    "cushion",
+    "dad",
+    "daughter",
+    "day",
+    "death",
+    "debt",
+    "decision",
+    "deer",
+    "degree",
+    "design",
+    "desire",
+    "desk",
+    "detail",
+    "development",
+    "digestion",
+    "dime",
+    "dinner",
+    "direction",
+    "dirt",
+    "discovery",
+    "discussion",
+    "disease",
+    "disgust",
+    "distance",
+    "distribution",
+    "division",
+    "doctor",
+    "dog",
+    "door",
+    "drain",
+    "drawer",
+    "dress",
+    "drink",
+    "driving",
+    "dust",
+    "ear",
+    "earth",
+    "edge",
+    "education",
+    "effect",
+    "egg",
+    "end",
+    "energy",
+    "engine",
+    "error",
+    "event",
+    "example",
+    "exchange",
+    "existence",
+    "expansion",
+    "experience",
+    "expert",
+    "eye",
+    "face",
+    "fact",
+    "fall",
+    "family",
+    "farm",
+    "father",
+    "fear",
+    "feeling",
+    "field",
+    "finger",
+    "fire",
+    "fish",
+    "flag",
+    "flight",
+    "floor",
+    "flower",
+    "fold",
+    "food",
+    "football",
+    "force",
+    "form",
+    "frame",
+    "friend",
+    "frog",
+    "fruit",
+    "fuel",
+    "furniture",
+    "game",
+    "garden",
+    "gate",
+    "girl",
+    "glass",
+    "glove",
+    "goat",
+    "gold",
+    "government",
+    "grade",
+    "grain",
+    "grass",
+    "green",
+    "grip",
+    "group",
+    "growth",
+    "guide",
+    "guitar",
+    "hair",
+    "hall",
+    "hand",
+    "harbor",
+    "harmony",
+    "hat",
+    "head",
+    "health",
+    "heart",
+    "heat",
+    "hill",
+    "history",
+    "hobbies",
+    "hole",
+    "hope",
+    "horn",
+    "horse",
+    "hospital",
+    "hour",
+    "house",
+    "humor",
+    "idea",
+    "impulse",
+    "income",
+    "increase",
+    "industry",
+    "ink",
+    "insect",
+    "instrument",
+    "insurance",
+    "interest",
+    "invention",
+    "iron",
+    "island",
+    "jelly",
+    "jet",
+    "jewel",
+    "join",
+    "judge",
+    "juice",
+    "jump",
+    "kettle",
+    "key",
+    "kick",
+    "kiss",
+    "kitten",
+    "knee",
+    "knife",
+    "knowledge",
+    "land",
+    "language",
+    "laugh",
+    "law",
+    "lead",
+    "learning",
+    "leather",
+    "leg",
+    "lettuce",
+    "level",
+    "library",
+    "lift",
+    "light",
+    "limit",
+    "line",
+    "linen",
+    "lip",
+    "liquid",
+    "list",
+    "look",
+    "loss",
+    "love",
+    "lunch",
+    "machine",
+    "man",
+    "manager",
+    "map",
+    "marble",
+    "mark",
+    "market",
+    "mass",
+    "match",
+    "meal",
+    "measure",
+    "meat",
+    "meeting",
+    "memory",
+    "metal",
+    "middle",
+    "milk",
+    "mind",
+    "mine",
+    "minute",
+    "mist",
+    "mitten",
+    "mom",
+    "money",
+    "monkey",
+    "month",
+    "moon",
+    "morning",
+    "mother",
+    "motion",
+    "mountain",
+    "mouth",
+    "muscle",
+    "music",
+    "nail",
+    "name",
+    "nation",
+    "neck",
+    "need",
+    "news",
+    "night",
+    "noise",
+    "note",
+    "number",
+    "nut",
+    "observation",
+    "offer",
+    "oil",
+    "operation",
+    "opinion",
+    "orange",
+    "order",
+    "organization",
+    "ornament",
+    "oven",
+    "page",
+    "pail",
+    "pain",
+    "paint",
+    "pan",
+    "pancake",
+    "paper",
+    "parcel",
+    "parent",
+    "part",
+    "passenger",
+    "paste",
+    "payment",
+    "peace",
+    "pear",
+    "pen",
+    "pencil",
+    "person",
+    "pest",
+    "pet",
+    "picture",
+    "pie",
+    "pin",
+    "pipe",
+    "pizza",
+    "place",
+    "plane",
+    "plant",
+    "plastic",
+    "plate",
+    "play",
+    "pleasure",
+    "plot",
+    "plough",
+    "pocket",
+    "point",
+    "poison",
+    "police",
+    "pollution",
+    "popcorn",
+    "porter",
+    "position",
+    "pot",
+    "potato",
+    "powder",
+    "power",
+    "price",
+    "print",
+    "process",
+    "produce",
+    "product",
+    "profit",
+    "property",
+    "prose",
+    "protest",
+    "pull",
+    "pump",
+    "punishment",
+    "purpose",
+    "push",
+    "quarter",
+    "question",
+    "quiet",
+    "quill",
+    "quilt",
+    "quince",
+    "rabbit",
+    "rail",
+    "rain",
+    "range",
+    "rat",
+    "rate",
+    "ray",
+    "reaction",
+    "reading",
+    "reason",
+    "record",
+    "regret",
+    "relation",
+    "religion",
+    "representative",
+    "request",
+    "respect",
+    "rest",
+    "reward",
+    "rhythm",
+    "rice",
+    "river",
+    "road",
+    "roll",
+    "room",
+    "root",
+    "rose",
+    "route",
+    "rub",
+    "rule",
+    "run",
+    "sack",
+    "sail",
+    "salt",
+    "sand",
+    "scale",
+    "scarecrow",
+    "scarf",
+    "scene",
+    "scent",
+    "school",
+    "science",
+    "scissors",
+    "screw",
+    "sea",
+    "seat",
+    "secretary",
+    "seed",
+    "selection",
+    "self",
+    "sense",
+    "servant",
+    "shade",
+    "shake",
+    "shame",
+    "shape",
+    "sheep",
+    "sheet",
+    "shelf",
+    "ship",
+    "shirt",
+    "shock",
+    "shoe",
+    "shop",
+    "show",
+    "side",
+    "sign",
+    "silk",
+    "sink",
+    "sister",
+    "size",
+    "sky",
+    "sleep",
+    "smash",
+    "smell",
+    "smile",
+    "smoke",
+    "snail",
+    "snake",
+    "sneeze",
+    "snow",
+    "soap",
+    "society",
+    "sock",
+    "soda",
+    "sofa",
+    "son",
+    "song",
+    "sort",
+    "sound",
+    "soup",
+    "space",
+    "spark",
+    "speed",
+    "sponge",
+    "spoon",
+    "spray",
+    "spring",
+    "spy",
+    "square",
+    "stamp",
+    "star",
+    "start",
+    "statement",
+    "station",
+    "steam",
+    "steel",
+    "stem",
+    "step",
+    "stew",
+    "stick",
+    "stitch",
+    "stocking",
+    "stomach",
+    "stone",
+    "stop",
+    "store",
+    "story",
+    "stove",
+    "stranger",
+    "straw",
+    "stream",
+    "street",
+    "stretch",
+    "string",
+    "structure",
+    "substance",
+    "sugar",
+    "suggestion",
+    "suit",
+    "summer",
+    "sun",
+    "support",
+    "surprise",
+    "sweater",
+    "swim",
+    "system",
+    "table",
+    "tail",
+    "talk",
+    "tank",
+    "taste",
+    "tax",
+    "tea",
+    "teaching",
+    "team",
+    "tendency",
+    "test",
+    "texture",
+    "theory",
+    "thing",
+    "thought",
+    "thread",
+    "throat",
+    "thumb",
+    "thunder",
+    "ticket",
+    "time",
+    "tin",
+    "title",
+    "toad",
+    "toe",
+    "tooth",
+    "toothpaste",
+    "touch",
+    "town",
+    "toy",
+    "trade",
+    "train",
+    "transport",
+    "tray",
+    "treatment",
+    "tree",
+    "trick",
+    "trip",
+    "trouble",
+    "trousers",
+    "truck",
+    "tub",
+    "turkey",
+    "turn",
+    "twist",
+    "umbrella",
+    "uncle",
+    "underwear",
+    "unit",
+    "use",
+    "vacation",
+    "value",
+    "van",
+    "vase",
+    "vegetable",
+    "veil",
+    "vein",
+    "verse",
+    "vessel",
+    "view",
+    "visitor",
+    "voice",
+    "volcano",
+    "walk",
+    "wall",
+    "war",
+    "wash",
+    "waste",
+    "watch",
+    "water",
+    "wave",
+    "wax",
+    "way",
+    "wealth",
+    "weather",
+    "week",
+    "weight",
+    "wheel",
+    "whip",
+    "whistle",
+    "window",
+    "wine",
+    "wing",
+    "winter",
+    "wire",
+    "wish",
+    "woman",
+    "wood",
+    "wool",
+    "word",
+    "work",
+    "worm",
+    "wound",
+    "wrist",
+    "writer",
+    "yard",
+    "yoke",
+    "zebra",
+    "zinc",
+    "zipper",
+    "zone",
+]
+
+
+def random_name() -> str:
+    """Build a random `adjective-noun-number` name.
+
+    The adjective and noun are drawn from the module-level word lists and the
+    number is an integer from 1 to 100 inclusive.
+    """
+    parts = (
+        random.choice(adjectives),  # noqa: S311
+        random.choice(nouns),  # noqa: S311
+        str(random.randint(1, 100)),  # noqa: S311
+    )
+    return "-".join(parts)
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/progress.py
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/progress.py
@@ -0,0 +1,145 @@
+"""A simple progress bar for the console."""
+
+import threading
+from collections.abc import Sequence
+from typing import Any
+from uuid import UUID
+
+from langchain_core.callbacks import base as base_callbacks
+from langchain_core.documents import Document
+from langchain_core.outputs import LLMResult
+from typing_extensions import override
+
+
+class ProgressBarCallback(base_callbacks.BaseCallbackHandler):
+    """A simple progress bar for the console.
+
+    The bar advances by one each time a chain, retriever, LLM, or tool run
+    without a parent run ends or errors.
+    """
+
+    def __init__(
+        self,
+        total: int,
+        ncols: int = 50,
+        end_with: str = "\n",
+    ):
+        """Initialize the progress bar.
+
+        Args:
+            total: The total number of items to be processed.
+            ncols: The character width of the progress bar.
+            end_with: Last string to print after progress bar reaches end.
+        """
+        self.total = total
+        self.ncols = ncols
+        self.end_with = end_with
+        self.counter = 0
+        self.lock = threading.Lock()
+        self._print_bar()
+
+    def increment(self) -> None:
+        """Increment the counter and update the progress bar."""
+        # The lock keeps the counter update and the redraw together.
+        with self.lock:
+            self.counter += 1
+            self._print_bar()
+
+    def _print_bar(self) -> None:
+        """Print the progress bar to the console."""
+        # With nothing to process there is no ratio to compute; draw a full bar
+        # instead of raising ZeroDivisionError from the constructor.
+        progress = self.counter / self.total if self.total else 1.0
+        arrow = "-" * int(round(progress * self.ncols) - 1) + ">"
+        spaces = " " * (self.ncols - len(arrow))
+        # Stay on the same line (via "\r") until the final item is counted.
+        end = "" if self.counter < self.total else self.end_with
+        print(f"\r[{arrow + spaces}] {self.counter}/{self.total}", end=end)  # noqa: T201
+
+    @override
+    def on_chain_error(
+        self,
+        error: BaseException,
+        *,
+        run_id: UUID,
+        parent_run_id: UUID | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Advance the bar when a chain run with no parent errors."""
+        if parent_run_id is None:
+            self.increment()
+
+    @override
+    def on_chain_end(
+        self,
+        outputs: dict[str, Any],
+        *,
+        run_id: UUID,
+        parent_run_id: UUID | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Advance the bar when a chain run with no parent ends."""
+        if parent_run_id is None:
+            self.increment()
+
+    @override
+    def on_retriever_error(
+        self,
+        error: BaseException,
+        *,
+        run_id: UUID,
+        parent_run_id: UUID | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Advance the bar when a retriever run with no parent errors."""
+        if parent_run_id is None:
+            self.increment()
+
+    @override
+    def on_retriever_end(
+        self,
+        documents: Sequence[Document],
+        *,
+        run_id: UUID,
+        parent_run_id: UUID | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Advance the bar when a retriever run with no parent ends."""
+        if parent_run_id is None:
+            self.increment()
+
+    @override
+    def on_llm_error(
+        self,
+        error: BaseException,
+        *,
+        run_id: UUID,
+        parent_run_id: UUID | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Advance the bar when an LLM run with no parent errors."""
+        if parent_run_id is None:
+            self.increment()
+
+    @override
+    def on_llm_end(
+        self,
+        response: LLMResult,
+        *,
+        run_id: UUID,
+        parent_run_id: UUID | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Advance the bar when an LLM run with no parent ends."""
+        if parent_run_id is None:
+            self.increment()
+
+    @override
+    def on_tool_error(
+        self,
+        error: BaseException,
+        *,
+        run_id: UUID,
+        parent_run_id: UUID | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Advance the bar when a tool run with no parent errors."""
+        if parent_run_id is None:
+            self.increment()
+
+    @override
+    def on_tool_end(
+        self,
+        output: str,
+        *,
+        run_id: UUID,
+        parent_run_id: UUID | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Advance the bar when a tool run with no parent ends."""
+        if parent_run_id is None:
+            self.increment()
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/runner_utils.py
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/runner_utils.py
--- a/venv/Lib/site-packages/langchain_classic/smith/evaluation/string_run_evaluator.py
+++ b/venv/Lib/site-packages/langchain_classic/smith/evaluation/string_run_evaluator.py
@@ -0,0 +1,477 @@
+"""Run evaluator wrapper for string evaluators."""
+
+from __future__ import annotations
+
+import logging
+import uuid
+from abc import abstractmethod
+from typing import Any, cast
+
+from langchain_core.callbacks.manager import (
+    AsyncCallbackManagerForChainRun,
+    CallbackManagerForChainRun,
+)
+from langchain_core.load.dump import dumpd
+from langchain_core.load.load import load
+from langchain_core.load.serializable import Serializable
+from langchain_core.messages import BaseMessage, get_buffer_string, messages_from_dict
+from langsmith import EvaluationResult, RunEvaluator
+from langsmith.schemas import DataType, Example, Run
+from typing_extensions import override
+
+from langchain_classic.chains.base import Chain
+from langchain_classic.evaluation.schema import StringEvaluator
+from langchain_classic.schema import RUN_KEY
+
+_logger = logging.getLogger(__name__)
+
+
+def _get_messages_from_run_dict(messages: list[dict]) -> list[BaseMessage]:
+    """Convert message dicts taken from a run into message objects.
+
+    The first entry decides how the whole list is decoded: when it contains an
+    `"lc"` key every entry goes through `load(dumpd(...))`, otherwise the list is
+    handed to `messages_from_dict`. An empty input yields an empty list.
+    """
+    if not messages:
+        return []
+    if "lc" not in messages[0]:
+        return messages_from_dict(messages)
+    return [load(dumpd(entry)) for entry in messages]
+
+
+class StringRunMapper(Serializable):
+    """Base mapper that extracts the items to evaluate from a run.
+
+    Subclasses implement `map`. Calling an instance first checks that the run has
+    outputs and then delegates to `map`.
+    """
+
+    @property
+    def output_keys(self) -> list[str]:
+        """The keys to extract from the run."""
+        return ["prediction", "input"]
+
+    @abstractmethod
+    def map(self, run: Run) -> dict[str, str]:
+        """Map the run to a dictionary; implemented by subclasses."""
+
+    def __call__(self, run: Run) -> dict[str, str]:
+        """Map the run to a dictionary via `map`.
+
+        Raises:
+            ValueError: If the run has no outputs.
+        """
+        if run.outputs:
+            return self.map(run)
+        msg = f"Run {run.id} has no outputs to evaluate."
+        raise ValueError(msg)
+
+
+class LLMStringRunMapper(StringRunMapper):
+    """Extract the input and prediction strings from an LLM run."""
+
+    def serialize_chat_messages(self, messages: list[dict] | list[list[dict]]) -> str:
+        """Render serialized chat messages as a single string.
+
+        Args:
+            messages: A non-empty list of message dicts, or a list whose first
+                element is such a list (only that first inner list is used).
+
+        Returns:
+            The messages formatted with `get_buffer_string`.
+
+        Raises:
+            ValueError: If `messages` is not a non-empty list, or if its first
+                element is neither a dict nor a list.
+        """
+        if isinstance(messages, list) and messages:
+            if isinstance(messages[0], dict):
+                chat_messages = _get_messages_from_run_dict(
+                    cast("list[dict]", messages)
+                )
+            elif isinstance(messages[0], list):
+                # Runs from Tracer have messages as a list of lists of dicts
+                chat_messages = _get_messages_from_run_dict(messages[0])
+            else:
+                msg = f"Could not extract messages to evaluate {messages}"  # type: ignore[unreachable]
+                raise ValueError(msg)
+            return get_buffer_string(chat_messages)
+        msg = f"Could not extract messages to evaluate {messages}"
+        raise ValueError(msg)
+
+    def serialize_inputs(self, inputs: dict) -> str:
+        """Serialize inputs.
+
+        The keys are checked in this order: `prompts` (joined with blank lines),
+        `prompt` (returned as is), then `messages` (serialized as chat messages).
+
+        Args:
+            inputs: The inputs from the run, expected to contain prompts or messages.
+
+        Returns:
+            The serialized input text from the prompts or messages.
+
+        Raises:
+            ValueError: If neither prompts nor messages are found in the inputs.
+        """
+        if "prompts" in inputs:  # Should we even accept this?
+            input_ = "\n\n".join(inputs["prompts"])
+        elif "prompt" in inputs:
+            input_ = inputs["prompt"]
+        elif "messages" in inputs:
+            input_ = self.serialize_chat_messages(inputs["messages"])
+        else:
+            msg = "LLM Run must have either messages or prompts as inputs."
+            raise ValueError(msg)
+        return input_
+
+    def serialize_outputs(self, outputs: dict) -> str:
+        """Serialize outputs.
+
+        Args:
+            outputs: The outputs from the run, expected to contain generations.
+
+        Returns:
+            The serialized output text from the first generation.
+
+        Raises:
+            ValueError: If no generations are found in the outputs or if the generations
+                are empty.
+        """
+        generations = outputs.get("generations")
+        if not generations:
+            # Covers both a missing key and an empty list.
+            msg = "Cannot evaluate LLM Run without generations."
+            raise ValueError(msg)
+        first_generation: dict | list[dict] = generations[0]
+        if isinstance(first_generation, list):
+            # Runs from Tracer have generations as a list of lists of dicts
+            # Whereas Runs from the API have a list of dicts
+            if not first_generation:
+                # Fail with the documented ValueError rather than an IndexError.
+                msg = "Cannot evaluate LLM run with empty generations."
+                raise ValueError(msg)
+            first_generation = first_generation[0]
+        if "message" in first_generation:
+            output_ = self.serialize_chat_messages([first_generation["message"]])
+        else:
+            output_ = first_generation["text"]
+        return output_
+
+    def map(self, run: Run) -> dict[str, str]:
+        """Maps the Run to a dictionary.
+
+        Returns:
+            A dict with the serialized `input` and `prediction` of the run.
+
+        Raises:
+            ValueError: If the run is not an LLM run, has no outputs, or its inputs
+                or outputs cannot be serialized.
+        """
+        if run.run_type != "llm":
+            msg = "LLM RunMapper only supports LLM runs."
+            raise ValueError(msg)
+        if not run.outputs:
+            if run.error:
+                msg = f"Cannot evaluate errored LLM run {run.id}: {run.error}"
+                raise ValueError(msg)
+            msg = f"Run {run.id} has no outputs. Cannot evaluate this run."
+            raise ValueError(msg)
+        # Any failure while serializing is re-raised as a ValueError that shows the
+        # offending payload, with the original exception chained as the cause.
+        try:
+            inputs = self.serialize_inputs(run.inputs)
+        except Exception as e:
+            msg = f"Could not parse LM input from run inputs {run.inputs}"
+            raise ValueError(msg) from e
+        try:
+            output_ = self.serialize_outputs(run.outputs)
+        except Exception as e:
+            msg = f"Could not parse LM prediction from run outputs {run.outputs}"
+            raise ValueError(msg) from e
+        return {"input": inputs, "prediction": output_}
+
+
+class ChainStringRunMapper(StringRunMapper):
+    """Pull the evaluation input and prediction out of a chain run."""
+
+    input_key: str | None = None
+    """The key from the model Run's inputs to use as the eval input.
+    If not provided, will use the only input key or raise an
+    error if there are multiple."""
+    prediction_key: str | None = None
+    """The key from the model Run's outputs to use as the eval prediction.
+    If not provided, will use the only output key or raise an error
+    if there are multiple."""
+
+    def _get_key(self, source: dict, key: str | None, which: str) -> str:
+        """Pick one value out of `source`.
+
+        An explicit `key` is looked up directly. Without one, `source` must hold
+        exactly one entry, whose value is returned; otherwise a `ValueError` that
+        names `which` is raised.
+        """
+        if key is None:
+            if len(source) != 1:
+                msg = (
+                    f"Could not map run {which} with multiple keys: "
+                    f"{source}\nPlease manually specify a {which}_key"
+                )
+                raise ValueError(msg)
+            return next(iter(source.values()))
+        return source[key]
+
+    def map(self, run: Run) -> dict[str, str]:
+        """Build the `input`/`prediction` mapping for a chain run.
+
+        Raises:
+            ValueError: If the run has no outputs, if a configured key is missing
+                from the run, or if no key is configured and the corresponding
+                dict does not hold exactly one entry.
+        """
+        run_outputs = run.outputs
+        if not run_outputs:
+            msg = (
+                f"Run with ID {run.id} lacks outputs required for evaluation."
+                " Ensure the Run has valid outputs."
+            )
+            raise ValueError(msg)
+
+        run_inputs = run.inputs
+        input_key = self.input_key
+        if not (input_key is None or input_key in run_inputs):
+            msg = (
+                f"Run with ID {run.id} is missing the expected input key"
+                f" '{input_key}'.\nAvailable input keys in this Run"
+                f"  are: {run_inputs.keys()}.\nAdjust the evaluator's"
+                f" input_key or ensure your input data includes key"
+                f" '{input_key}'."
+            )
+            raise ValueError(msg)
+
+        prediction_key = self.prediction_key
+        if not (prediction_key is None or prediction_key in run_outputs):
+            available_keys = ", ".join(run_outputs.keys())
+            msg = (
+                f"Run with ID {run.id} doesn't have the expected prediction key"
+                f" '{prediction_key}'. Available prediction keys in this Run are:"
+                f" {available_keys}. Adjust the evaluator's prediction_key or"
+                " ensure the Run object's outputs the expected key."
+            )
+            raise ValueError(msg)
+
+        return {
+            "input": self._get_key(run_inputs, input_key, "input"),
+            "prediction": self._get_key(run_outputs, prediction_key, "prediction"),
+        }
+
+
+class ToolStringRunMapper(StringRunMapper):
+    """Map a tool run to the strings used for evaluation."""
+
+    @override
+    def map(self, run: Run) -> dict[str, str]:
+        """Return the run's `input` entry and its `output` entry.
+
+        Raises:
+            ValueError: If the run has no outputs.
+        """
+        tool_outputs = run.outputs
+        if tool_outputs:
+            return {
+                "input": run.inputs["input"],
+                "prediction": tool_outputs["output"],
+            }
+        msg = f"Run {run.id} has no outputs to evaluate."
+        raise ValueError(msg)
+
+
+class StringExampleMapper(Serializable):
+    """Map an example, or row in the dataset, to the inputs of an evaluation."""
+
+    reference_key: str | None = None
+
+    @property
+    def output_keys(self) -> list[str]:
+        """The keys produced from the example."""
+        return ["reference"]
+
+    def serialize_chat_messages(self, messages: list[dict]) -> str:
+        """Render serialized chat messages from the example as a single string."""
+        chat_messages = _get_messages_from_run_dict(messages)
+        return get_buffer_string(chat_messages)
+
+    def map(self, example: Example) -> dict[str, str]:
+        """Maps the Example, or dataset row to a dictionary.
+
+        Without a `reference_key` the example must have exactly one output, which
+        is used as the reference. With one, that key is read from the outputs.
+
+        Returns:
+            A dict with a single `reference` entry.
+
+        Raises:
+            ValueError: If the example has no outputs, has several outputs while no
+                `reference_key` is set, or lacks the configured `reference_key`.
+        """
+        if not example.outputs:
+            msg = f"Example {example.id} has no outputs to use as a reference."
+            raise ValueError(msg)
+        if self.reference_key is None:
+            if len(example.outputs) > 1:
+                msg = (
+                    f"Example {example.id} has multiple outputs, so you must"
+                    " specify a reference_key."
+                )
+                raise ValueError(msg)
+            output = next(iter(example.outputs.values()))
+        elif self.reference_key not in example.outputs:
+            msg = (
+                f"Example {example.id} does not have reference key"
+                f" {self.reference_key}."
+            )
+            raise ValueError(msg)
+        else:
+            output = example.outputs[self.reference_key]
+        # A dict with truthy "type" and "data" entries is treated as one serialized
+        # chat message and rendered to text; anything else is passed through.
+        return {
+            "reference": self.serialize_chat_messages([output])
+            if isinstance(output, dict) and output.get("type") and output.get("data")
+            else output,
+        }
+
+    def __call__(self, example: Example) -> dict[str, str]:
+        """Maps the Example to a dictionary via `map`.
+
+        Raises:
+            ValueError: If the example has no outputs.
+        """
+        if not example.outputs:
+            msg = f"Example {example.id} has no outputs to use as a reference label."
+            raise ValueError(msg)
+        return self.map(example)
+
+
+class StringRunEvaluatorChain(Chain, RunEvaluator):
+    """Evaluate Run and optional examples."""
+
+    run_mapper: StringRunMapper
+    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
+    example_mapper: StringExampleMapper | None = None
+    """Maps the Example (dataset row) to a dictionary
+    with a 'reference' string."""
+    name: str
+    """The name of the evaluation metric."""
+    string_evaluator: StringEvaluator
+    """The evaluation chain."""
+
+    @property
+    @override
+    def input_keys(self) -> list[str]:
+        """The chain's input keys: `run` and `example`."""
+        return ["run", "example"]
+
+    @property
+    @override
+    def output_keys(self) -> list[str]:
+        """The chain's single output key: `feedback`."""
+        return ["feedback"]
+
+    def _prepare_input(self, inputs: dict[str, Any]) -> dict[str, str]:
+        """Build the keyword arguments for the string evaluator.
+
+        Args:
+            inputs: The chain inputs: a `run` and, optionally, an `example`.
+
+        Returns:
+            The mapped run values, without `input` when the evaluator does not
+            require it, plus the mapped example values when the evaluator
+            requires a reference.
+
+        Raises:
+            ValueError: If the evaluator requires a reference but no example or no
+                example mapper is available.
+        """
+        run: Run = inputs["run"]
+        example: Example | None = inputs.get("example")
+        evaluate_strings_inputs = self.run_mapper(run)
+        if not self.string_evaluator.requires_input:
+            # Hide warning about unused input
+            evaluate_strings_inputs.pop("input", None)
+        if example and self.example_mapper and self.string_evaluator.requires_reference:
+            evaluate_strings_inputs.update(self.example_mapper(example))
+        elif self.string_evaluator.requires_reference:
+            msg = (
+                f"Evaluator {self.name} requires a reference"
+                " example from the dataset,"
+                f" but none was provided for run {run.id}."
+            )
+            raise ValueError(msg)
+        return evaluate_strings_inputs
+
+    def _prepare_output(self, output: dict[str, Any]) -> dict[str, Any]:
+        """Wrap the evaluator output in an `EvaluationResult`.
+
+        The result is keyed by `self.name`, takes its comment from the `reasoning`
+        entry, and receives every entry of `output` as a keyword argument. It is
+        returned under the `feedback` key.
+        """
+        evaluation_result = EvaluationResult(
+            key=self.name,
+            comment=output.get("reasoning"),
+            **output,
+        )
+        if RUN_KEY in output:
+            # TODO: Not currently surfaced. Update
+            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
+        return {"feedback": evaluation_result}
+
+    def _call(
+        self,
+        inputs: dict[str, str],
+        run_manager: CallbackManagerForChainRun | None = None,
+    ) -> dict[str, Any]:
+        """Call the evaluation chain."""
+        evaluate_strings_inputs = self._prepare_input(inputs)
+        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
+        callbacks = _run_manager.get_child()
+        chain_output = self.string_evaluator.evaluate_strings(
+            **evaluate_strings_inputs,
+            callbacks=callbacks,
+            include_run_info=True,
+        )
+        return self._prepare_output(chain_output)
+
+    async def _acall(
+        self,
+        inputs: dict[str, str],
+        run_manager: AsyncCallbackManagerForChainRun | None = None,
+    ) -> dict[str, Any]:
+        """Call the evaluation chain asynchronously."""
+        evaluate_strings_inputs = self._prepare_input(inputs)
+        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
+        callbacks = _run_manager.get_child()
+        chain_output = await self.string_evaluator.aevaluate_strings(
+            **evaluate_strings_inputs,
+            callbacks=callbacks,
+            include_run_info=True,
+        )
+        return self._prepare_output(chain_output)
+
+    def _prepare_evaluator_output(self, output: dict[str, Any]) -> EvaluationResult:
+        """Return the `feedback` result, filling in its run info if it is missing."""
+        feedback: EvaluationResult = output["feedback"]
+        if RUN_KEY not in feedback.evaluator_info:
+            feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
+        return feedback
+
+    @override
+    def evaluate_run(
+        self,
+        run: Run,
+        example: Example | None = None,
+        evaluator_run_id: uuid.UUID | None = None,
+    ) -> EvaluationResult:
+        """Evaluate an example.
+
+        Any exception raised while evaluating is logged and reported through the
+        returned result's comment instead of being propagated.
+        """
+        try:
+            result = self({"run": run, "example": example}, include_run_info=True)
+            return self._prepare_evaluator_output(result)
+        except Exception as e:
+            _logger.exception("Error evaluating run %s", run.id)
+            return EvaluationResult(
+                key=self.string_evaluator.evaluation_name,
+                comment=f"Error evaluating run {run.id}: {e}",
+                # TODO: Add run ID once we can declare it via callbacks
+            )
+
+    @override
+    async def aevaluate_run(
+        self,
+        run: Run,
+        example: Example | None = None,
+        evaluator_run_id: uuid.UUID | None = None,
+    ) -> EvaluationResult:
+        """Evaluate an example asynchronously.
+
+        Any exception raised while evaluating is logged and reported through the
+        returned result's comment instead of being propagated.
+        """
+        try:
+            result = await self.acall(
+                {"run": run, "example": example},
+                include_run_info=True,
+            )
+            return self._prepare_evaluator_output(result)
+        except Exception as e:
+            _logger.exception("Error evaluating run %s", run.id)
+            return EvaluationResult(
+                key=self.string_evaluator.evaluation_name,
+                comment=f"Error evaluating run {run.id}: {e}",
+            )
+
+    @classmethod
+    def from_run_and_data_type(
+        cls,
+        evaluator: StringEvaluator,
+        run_type: str,
+        data_type: DataType,
+        input_key: str | None = None,
+        prediction_key: str | None = None,
+        reference_key: str | None = None,
+        tags: list[str] | None = None,
+    ) -> StringRunEvaluatorChain:
+        """Create a StringRunEvaluatorChain.
+
+        Create a StringRunEvaluatorChain from an evaluator and the run and dataset
+        types.
+
+        This method provides an easy way to instantiate a StringRunEvaluatorChain, by
+        taking an evaluator and information about the type of run and the data.
+        The method supports LLM and chain runs.
+
+        An example mapper is configured when a reference key is given, when the
+        dataset is of type `llm` or `chat`, or when the evaluator requires a
+        reference. Otherwise no example mapper is set.
+
+        Args:
+            evaluator: The string evaluator to use.
+            run_type: The type of run being evaluated.
+                Supported types are LLM and Chain.
+            data_type: The type of dataset used in the run.
+            input_key: The key used to map the input from the run.
+            prediction_key: The key used to map the prediction from the run.
+            reference_key: The key used to map the reference from the dataset.
+            tags: List of tags to attach to the evaluation chain.
+
+        Returns:
+            The instantiated evaluation chain.
+
+        Raises:
+            ValueError: If the run type is not supported.
+
+        """
+        # Configure how run inputs/predictions are passed to the evaluator
+        if run_type == "llm":
+            run_mapper: StringRunMapper = LLMStringRunMapper()
+        elif run_type == "chain":
+            run_mapper = ChainStringRunMapper(
+                input_key=input_key,
+                prediction_key=prediction_key,
+            )
+        else:
+            msg = f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
+            raise ValueError(msg)
+
+        # Configure how example rows are fed as a reference string to the evaluator
+        example_mapper: StringExampleMapper | None
+        if (
+            reference_key is not None
+            or data_type in (DataType.llm, DataType.chat)
+            or evaluator.requires_reference
+        ):
+            example_mapper = StringExampleMapper(reference_key=reference_key)
+        else:
+            example_mapper = None
+        return cls(
+            name=evaluator.evaluation_name,
+            run_mapper=run_mapper,
+            example_mapper=example_mapper,
+            string_evaluator=evaluator,
+            tags=tags,
+        )