initial commit

This commit is contained in:
2026-05-11 12:36:20 +05:30
commit 384cbe8019
15377 changed files with 2360544 additions and 0 deletions

View File

@@ -0,0 +1,454 @@
"""**Embedding models** are wrappers around embedding models
from different APIs and services.
**Embedding models** can be LLMs or not.
**Class hierarchy:**
.. code-block::
Embeddings --> <name>Embeddings # Examples: OpenAIEmbeddings, HuggingFaceEmbeddings
"""
import importlib
import logging
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from langchain_community.embeddings.aleph_alpha import (
AlephAlphaAsymmetricSemanticEmbedding,
AlephAlphaSymmetricSemanticEmbedding,
)
from langchain_community.embeddings.anyscale import (
AnyscaleEmbeddings,
)
from langchain_community.embeddings.ascend import (
AscendEmbeddings,
)
from langchain_community.embeddings.awa import (
AwaEmbeddings,
)
from langchain_community.embeddings.azure_openai import (
AzureOpenAIEmbeddings,
)
from langchain_community.embeddings.baichuan import (
BaichuanTextEmbeddings,
)
from langchain_community.embeddings.baidu_qianfan_endpoint import (
QianfanEmbeddingsEndpoint,
)
from langchain_community.embeddings.bedrock import (
BedrockEmbeddings,
)
from langchain_community.embeddings.bookend import (
BookendEmbeddings,
)
from langchain_community.embeddings.clarifai import (
ClarifaiEmbeddings,
)
from langchain_community.embeddings.clova import (
ClovaEmbeddings,
)
from langchain_community.embeddings.cohere import (
CohereEmbeddings,
)
from langchain_community.embeddings.dashscope import (
DashScopeEmbeddings,
)
from langchain_community.embeddings.databricks import (
DatabricksEmbeddings,
)
from langchain_community.embeddings.deepinfra import (
DeepInfraEmbeddings,
)
from langchain_community.embeddings.edenai import (
EdenAiEmbeddings,
)
from langchain_community.embeddings.elasticsearch import (
ElasticsearchEmbeddings,
)
from langchain_community.embeddings.embaas import (
EmbaasEmbeddings,
)
from langchain_community.embeddings.ernie import (
ErnieEmbeddings,
)
from langchain_community.embeddings.fake import (
DeterministicFakeEmbedding,
FakeEmbeddings,
)
from langchain_community.embeddings.fastembed import (
FastEmbedEmbeddings,
)
from langchain_community.embeddings.gigachat import (
GigaChatEmbeddings,
)
from langchain_community.embeddings.google_palm import (
GooglePalmEmbeddings,
)
from langchain_community.embeddings.gpt4all import (
GPT4AllEmbeddings,
)
from langchain_community.embeddings.gradient_ai import (
GradientEmbeddings,
)
from langchain_community.embeddings.huggingface import (
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInferenceAPIEmbeddings,
HuggingFaceInstructEmbeddings,
)
from langchain_community.embeddings.huggingface_hub import (
HuggingFaceHubEmbeddings,
)
from langchain_community.embeddings.hunyuan import (
HunyuanEmbeddings,
)
from langchain_community.embeddings.infinity import (
InfinityEmbeddings,
)
from langchain_community.embeddings.infinity_local import (
InfinityEmbeddingsLocal,
)
from langchain_community.embeddings.ipex_llm import IpexLLMBgeEmbeddings
from langchain_community.embeddings.itrex import (
QuantizedBgeEmbeddings,
)
from langchain_community.embeddings.javelin_ai_gateway import (
JavelinAIGatewayEmbeddings,
)
from langchain_community.embeddings.jina import (
JinaEmbeddings,
)
from langchain_community.embeddings.johnsnowlabs import (
JohnSnowLabsEmbeddings,
)
from langchain_community.embeddings.laser import (
LaserEmbeddings,
)
from langchain_community.embeddings.llamacpp import (
LlamaCppEmbeddings,
)
from langchain_community.embeddings.llamafile import (
LlamafileEmbeddings,
)
from langchain_community.embeddings.llm_rails import (
LLMRailsEmbeddings,
)
from langchain_community.embeddings.localai import (
LocalAIEmbeddings,
)
from langchain_community.embeddings.minimax import (
MiniMaxEmbeddings,
)
from langchain_community.embeddings.mlflow import (
MlflowCohereEmbeddings,
MlflowEmbeddings,
)
from langchain_community.embeddings.mlflow_gateway import (
MlflowAIGatewayEmbeddings,
)
from langchain_community.embeddings.model2vec import (
Model2vecEmbeddings,
)
from langchain_community.embeddings.modelscope_hub import (
ModelScopeEmbeddings,
)
from langchain_community.embeddings.mosaicml import (
MosaicMLInstructorEmbeddings,
)
from langchain_community.embeddings.naver import (
ClovaXEmbeddings,
)
from langchain_community.embeddings.nemo import (
NeMoEmbeddings,
)
from langchain_community.embeddings.nlpcloud import (
NLPCloudEmbeddings,
)
from langchain_community.embeddings.oci_generative_ai import (
OCIGenAIEmbeddings,
)
from langchain_community.embeddings.octoai_embeddings import (
OctoAIEmbeddings,
)
from langchain_community.embeddings.ollama import (
OllamaEmbeddings,
)
from langchain_community.embeddings.openai import (
OpenAIEmbeddings,
)
from langchain_community.embeddings.openvino import (
OpenVINOBgeEmbeddings,
OpenVINOEmbeddings,
)
from langchain_community.embeddings.optimum_intel import (
QuantizedBiEncoderEmbeddings,
)
from langchain_community.embeddings.oracleai import (
OracleEmbeddings,
)
from langchain_community.embeddings.ovhcloud import (
OVHCloudEmbeddings,
)
from langchain_community.embeddings.premai import (
PremAIEmbeddings,
)
from langchain_community.embeddings.sagemaker_endpoint import (
SagemakerEndpointEmbeddings,
)
from langchain_community.embeddings.sambanova import (
SambaStudioEmbeddings,
)
from langchain_community.embeddings.self_hosted import (
SelfHostedEmbeddings,
)
from langchain_community.embeddings.self_hosted_hugging_face import (
SelfHostedHuggingFaceEmbeddings,
SelfHostedHuggingFaceInstructEmbeddings,
)
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.embeddings.solar import (
SolarEmbeddings,
)
from langchain_community.embeddings.spacy_embeddings import (
SpacyEmbeddings,
)
from langchain_community.embeddings.sparkllm import (
SparkLLMTextEmbeddings,
)
from langchain_community.embeddings.tensorflow_hub import (
TensorflowHubEmbeddings,
)
from langchain_community.embeddings.textembed import (
TextEmbedEmbeddings,
)
from langchain_community.embeddings.titan_takeoff import (
TitanTakeoffEmbed,
)
from langchain_community.embeddings.vertexai import (
VertexAIEmbeddings,
)
from langchain_community.embeddings.volcengine import (
VolcanoEmbeddings,
)
from langchain_community.embeddings.voyageai import (
VoyageEmbeddings,
)
from langchain_community.embeddings.xinference import (
XinferenceEmbeddings,
)
from langchain_community.embeddings.yandex import (
YandexGPTEmbeddings,
)
from langchain_community.embeddings.zhipuai import (
ZhipuAIEmbeddings,
)
# Public API of this package: the names exported by a star-import.
__all__ = [
    "AlephAlphaAsymmetricSemanticEmbedding",
    "AlephAlphaSymmetricSemanticEmbedding",
    "AnyscaleEmbeddings",
    "AscendEmbeddings",
    "AwaEmbeddings",
    "AzureOpenAIEmbeddings",
    "BaichuanTextEmbeddings",
    "BedrockEmbeddings",
    "BookendEmbeddings",
    "ClarifaiEmbeddings",
    "ClovaEmbeddings",
    "ClovaXEmbeddings",
    "CohereEmbeddings",
    "DashScopeEmbeddings",
    "DatabricksEmbeddings",
    "DeepInfraEmbeddings",
    "DeterministicFakeEmbedding",
    "EdenAiEmbeddings",
    "ElasticsearchEmbeddings",
    "EmbaasEmbeddings",
    "ErnieEmbeddings",
    "FakeEmbeddings",
    "FastEmbedEmbeddings",
    "GPT4AllEmbeddings",
    "GigaChatEmbeddings",
    "GooglePalmEmbeddings",
    "GradientEmbeddings",
    "HuggingFaceBgeEmbeddings",
    "HuggingFaceEmbeddings",
    "HuggingFaceHubEmbeddings",
    "HuggingFaceInferenceAPIEmbeddings",
    "HuggingFaceInstructEmbeddings",
    "InfinityEmbeddings",
    "InfinityEmbeddingsLocal",
    "IpexLLMBgeEmbeddings",
    "JavelinAIGatewayEmbeddings",
    "JinaEmbeddings",
    "JohnSnowLabsEmbeddings",
    "LLMRailsEmbeddings",
    "LaserEmbeddings",
    "LlamaCppEmbeddings",
    "LlamafileEmbeddings",
    "LocalAIEmbeddings",
    "MiniMaxEmbeddings",
    "MlflowAIGatewayEmbeddings",
    "MlflowCohereEmbeddings",
    "MlflowEmbeddings",
    "Model2vecEmbeddings",
    "ModelScopeEmbeddings",
    "MosaicMLInstructorEmbeddings",
    "NLPCloudEmbeddings",
    "NeMoEmbeddings",
    "OCIGenAIEmbeddings",
    "OctoAIEmbeddings",
    "OllamaEmbeddings",
    "OpenAIEmbeddings",
    "OpenVINOBgeEmbeddings",
    "OpenVINOEmbeddings",
    "OracleEmbeddings",
    "OVHCloudEmbeddings",
    "PremAIEmbeddings",
    "QianfanEmbeddingsEndpoint",
    "QuantizedBgeEmbeddings",
    "QuantizedBiEncoderEmbeddings",
    "SagemakerEndpointEmbeddings",
    "SambaStudioEmbeddings",
    "SelfHostedEmbeddings",
    "SelfHostedHuggingFaceEmbeddings",
    "SelfHostedHuggingFaceInstructEmbeddings",
    "SentenceTransformerEmbeddings",
    "SolarEmbeddings",
    "SpacyEmbeddings",
    "SparkLLMTextEmbeddings",
    "TensorflowHubEmbeddings",
    "TextEmbedEmbeddings",
    "TitanTakeoffEmbed",
    "VertexAIEmbeddings",
    "VolcanoEmbeddings",
    "VoyageEmbeddings",
    "XinferenceEmbeddings",
    "YandexGPTEmbeddings",
    "ZhipuAIEmbeddings",
    "HunyuanEmbeddings",
]
# Maps each lazily-exported class name to the dotted path of the module that
# defines it. The module-level `__getattr__` consults this table to import a
# class only when it is first accessed.
_module_lookup = {
    "AlephAlphaAsymmetricSemanticEmbedding": "langchain_community.embeddings.aleph_alpha",  # noqa: E501
    "AlephAlphaSymmetricSemanticEmbedding": "langchain_community.embeddings.aleph_alpha",  # noqa: E501
    "AnyscaleEmbeddings": "langchain_community.embeddings.anyscale",
    "AwaEmbeddings": "langchain_community.embeddings.awa",
    "AzureOpenAIEmbeddings": "langchain_community.embeddings.azure_openai",
    "BaichuanTextEmbeddings": "langchain_community.embeddings.baichuan",
    "BedrockEmbeddings": "langchain_community.embeddings.bedrock",
    "BookendEmbeddings": "langchain_community.embeddings.bookend",
    "ClarifaiEmbeddings": "langchain_community.embeddings.clarifai",
    "ClovaEmbeddings": "langchain_community.embeddings.clova",
    "ClovaXEmbeddings": "langchain_community.embeddings.naver",
    "CohereEmbeddings": "langchain_community.embeddings.cohere",
    "DashScopeEmbeddings": "langchain_community.embeddings.dashscope",
    "DatabricksEmbeddings": "langchain_community.embeddings.databricks",
    "DeepInfraEmbeddings": "langchain_community.embeddings.deepinfra",
    "DeterministicFakeEmbedding": "langchain_community.embeddings.fake",
    "EdenAiEmbeddings": "langchain_community.embeddings.edenai",
    "ElasticsearchEmbeddings": "langchain_community.embeddings.elasticsearch",
    "EmbaasEmbeddings": "langchain_community.embeddings.embaas",
    "ErnieEmbeddings": "langchain_community.embeddings.ernie",
    "FakeEmbeddings": "langchain_community.embeddings.fake",
    "FastEmbedEmbeddings": "langchain_community.embeddings.fastembed",
    "GPT4AllEmbeddings": "langchain_community.embeddings.gpt4all",
    "GooglePalmEmbeddings": "langchain_community.embeddings.google_palm",
    "GradientEmbeddings": "langchain_community.embeddings.gradient_ai",
    "GigaChatEmbeddings": "langchain_community.embeddings.gigachat",
    "HuggingFaceBgeEmbeddings": "langchain_community.embeddings.huggingface",
    "HuggingFaceEmbeddings": "langchain_community.embeddings.huggingface",
    "HuggingFaceHubEmbeddings": "langchain_community.embeddings.huggingface_hub",
    "HuggingFaceInferenceAPIEmbeddings": "langchain_community.embeddings.huggingface",
    "HuggingFaceInstructEmbeddings": "langchain_community.embeddings.huggingface",
    "InfinityEmbeddings": "langchain_community.embeddings.infinity",
    "InfinityEmbeddingsLocal": "langchain_community.embeddings.infinity_local",
    "IpexLLMBgeEmbeddings": "langchain_community.embeddings.ipex_llm",
    "JavelinAIGatewayEmbeddings": "langchain_community.embeddings.javelin_ai_gateway",
    "JinaEmbeddings": "langchain_community.embeddings.jina",
    "JohnSnowLabsEmbeddings": "langchain_community.embeddings.johnsnowlabs",
    "LLMRailsEmbeddings": "langchain_community.embeddings.llm_rails",
    "LaserEmbeddings": "langchain_community.embeddings.laser",
    "LlamaCppEmbeddings": "langchain_community.embeddings.llamacpp",
    "LlamafileEmbeddings": "langchain_community.embeddings.llamafile",
    "LocalAIEmbeddings": "langchain_community.embeddings.localai",
    "MiniMaxEmbeddings": "langchain_community.embeddings.minimax",
    "MlflowAIGatewayEmbeddings": "langchain_community.embeddings.mlflow_gateway",
    "MlflowCohereEmbeddings": "langchain_community.embeddings.mlflow",
    "MlflowEmbeddings": "langchain_community.embeddings.mlflow",
    "Model2vecEmbeddings": "langchain_community.embeddings.model2vec",
    "ModelScopeEmbeddings": "langchain_community.embeddings.modelscope_hub",
    "MosaicMLInstructorEmbeddings": "langchain_community.embeddings.mosaicml",
    "NLPCloudEmbeddings": "langchain_community.embeddings.nlpcloud",
    "NeMoEmbeddings": "langchain_community.embeddings.nemo",
    "OCIGenAIEmbeddings": "langchain_community.embeddings.oci_generative_ai",
    "OctoAIEmbeddings": "langchain_community.embeddings.octoai_embeddings",
    "OllamaEmbeddings": "langchain_community.embeddings.ollama",
    "OpenAIEmbeddings": "langchain_community.embeddings.openai",
    "OpenVINOEmbeddings": "langchain_community.embeddings.openvino",
    "OpenVINOBgeEmbeddings": "langchain_community.embeddings.openvino",
    "QianfanEmbeddingsEndpoint": "langchain_community.embeddings.baidu_qianfan_endpoint",  # noqa: E501
    "QuantizedBgeEmbeddings": "langchain_community.embeddings.itrex",
    "QuantizedBiEncoderEmbeddings": "langchain_community.embeddings.optimum_intel",
    "OracleEmbeddings": "langchain_community.embeddings.oracleai",
    "OVHCloudEmbeddings": "langchain_community.embeddings.ovhcloud",
    "SagemakerEndpointEmbeddings": "langchain_community.embeddings.sagemaker_endpoint",
    "SambaStudioEmbeddings": "langchain_community.embeddings.sambanova",
    "SelfHostedEmbeddings": "langchain_community.embeddings.self_hosted",
    "SelfHostedHuggingFaceEmbeddings": "langchain_community.embeddings.self_hosted_hugging_face",  # noqa: E501
    "SelfHostedHuggingFaceInstructEmbeddings": "langchain_community.embeddings.self_hosted_hugging_face",  # noqa: E501
    "SentenceTransformerEmbeddings": "langchain_community.embeddings.sentence_transformer",  # noqa: E501
    "SolarEmbeddings": "langchain_community.embeddings.solar",
    "SpacyEmbeddings": "langchain_community.embeddings.spacy_embeddings",
    "SparkLLMTextEmbeddings": "langchain_community.embeddings.sparkllm",
    "TensorflowHubEmbeddings": "langchain_community.embeddings.tensorflow_hub",
    "VertexAIEmbeddings": "langchain_community.embeddings.vertexai",
    "VolcanoEmbeddings": "langchain_community.embeddings.volcengine",
    "VoyageEmbeddings": "langchain_community.embeddings.voyageai",
    "XinferenceEmbeddings": "langchain_community.embeddings.xinference",
    "TextEmbedEmbeddings": "langchain_community.embeddings.textembed",
    "TitanTakeoffEmbed": "langchain_community.embeddings.titan_takeoff",
    "PremAIEmbeddings": "langchain_community.embeddings.premai",
    "YandexGPTEmbeddings": "langchain_community.embeddings.yandex",
    "AscendEmbeddings": "langchain_community.embeddings.ascend",
    "ZhipuAIEmbeddings": "langchain_community.embeddings.zhipuai",
    "HunyuanEmbeddings": "langchain_community.embeddings.hunyuan",
}
def __getattr__(name: str) -> Any:
    """Resolve a lazily-exported attribute of this module.

    Looks ``name`` up in ``_module_lookup``, imports the module recorded
    there and returns the attribute of the same name from it.

    Args:
        name: The attribute being accessed on this module.

    Returns:
        The object called ``name`` in the module that defines it.

    Raises:
        AttributeError: If ``name`` has no entry in ``_module_lookup``.
    """
    module_path = _module_lookup.get(name)
    # Guard clause: anything not in the table is not an attribute of ours.
    if module_path is None:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    return getattr(importlib.import_module(module_path), name)
logger = logging.getLogger(__name__)
# TODO: this is in here to maintain backwards compatibility
class HypotheticalDocumentEmbedder:
    """Deprecated shim that forwards to the ``langchain_classic`` class.

    Both constructing this class and calling :meth:`from_llm` log a
    deprecation warning and then delegate to
    ``langchain_classic.chains.hyde.base.HypotheticalDocumentEmbedder``.
    """

    # The forwarding happens in ``__new__`` rather than ``__init__``: Python
    # raises ``TypeError`` when ``__init__`` returns anything other than None,
    # so an ``__init__`` that returns the replacement object can never work.
    def __new__(cls, *args: Any, **kwargs: Any) -> Any:
        """Build and return the ``langchain_classic`` embedder.

        Args:
            *args: Positional arguments passed through unchanged.
            **kwargs: Keyword arguments passed through unchanged.

        Returns:
            An instance of the ``langchain_classic`` implementation.
        """
        logger.warning(
            "Using a deprecated class. Please use "
            "`from langchain_classic.chains import HypotheticalDocumentEmbedder` "
            "instead"
        )
        # Imported lazily so the dependency is only needed when actually used.
        from langchain_classic.chains.hyde.base import (
            HypotheticalDocumentEmbedder as H,
        )

        return H(*args, **kwargs)

    @classmethod
    def from_llm(cls, *args: Any, **kwargs: Any) -> Any:
        """Forward to ``from_llm`` of the ``langchain_classic`` embedder.

        Args:
            *args: Positional arguments passed through unchanged.
            **kwargs: Keyword arguments passed through unchanged.

        Returns:
            Whatever the ``langchain_classic`` ``from_llm`` returns.
        """
        logger.warning(
            "Using a deprecated class. Please use "
            "`from langchain_classic.chains import HypotheticalDocumentEmbedder` "
            "instead"
        )
        from langchain_classic.chains.hyde.base import (
            HypotheticalDocumentEmbedder as H,
        )

        return H.from_llm(*args, **kwargs)

View File

@@ -0,0 +1,256 @@
from typing import Any, Dict, List, Optional
from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, model_validator
class AlephAlphaAsymmetricSemanticEmbedding(BaseModel, Embeddings):
    """Aleph Alpha's asymmetric semantic embedding.

    AA provides you with an endpoint to embed a document and a query.
    The models were optimized to make the embeddings of documents and
    the query for a document as similar as possible.
    To learn more, check out: https://docs.aleph-alpha.com/docs/tasks/semantic_embed/

    Example:
        .. code-block:: python

            from langchain_community.embeddings import (
                AlephAlphaAsymmetricSemanticEmbedding,
            )

            embeddings = AlephAlphaAsymmetricSemanticEmbedding(
                normalize=True, compress_to_size=128
            )

            document = "This is a content of the document"
            query = "What is the content of the document?"

            doc_result = embeddings.embed_documents([document])
            query_result = embeddings.embed_query(query)
    """

    client: Any  #: :meta private:

    # Embedding params
    model: str = "luminous-base"
    """Model name to use."""
    compress_to_size: Optional[int] = None
    """Should the returned embeddings come back as an original 5120-dim vector,
    or should it be compressed to 128-dim."""
    normalize: bool = False
    """Should returned embeddings be normalized"""
    contextual_control_threshold: Optional[int] = None
    """Attention control parameters only apply to those tokens that have
    explicitly been set in the request."""
    control_log_additive: bool = True
    """Apply controls on prompt items by adding the log(control_factor)
    to attention scores."""

    # Client params
    aleph_alpha_api_key: Optional[str] = None
    """API key for Aleph Alpha API."""
    host: str = "https://api.aleph-alpha.com"
    """The hostname of the API host.
    The default one is "https://api.aleph-alpha.com")"""
    hosting: Optional[str] = None
    """Determines in which datacenters the request may be processed.
    You can either set the parameter to "aleph-alpha" or omit it (defaulting to None).
    Not setting this value, or setting it to None, gives us maximal flexibility
    in processing your request in our
    own datacenters and on servers hosted with other providers.
    Choose this option for maximal availability.
    Setting it to "aleph-alpha" allows us to only process the request
    in our own datacenters.
    Choose this option for maximal data privacy."""
    request_timeout_seconds: int = 305
    """Client timeout that will be set for HTTP requests in the
    `requests` library's API calls.
    Server will close all requests after 300 seconds with an internal server error."""
    total_retries: int = 8
    """The number of retries made in case requests fail with certain retryable
    status codes. If the last
    retry fails a corresponding exception is raised. Note, that between retries
    an exponential backoff
    is applied, starting with 0.5 s after the first retry and doubling for each
    retry made. So with the
    default setting of 8 retries a total wait time of 63.5 s is added between
    the retries."""
    nice: bool = False
    """Setting this to True, will signal to the API that you intend to be
    nice to other users
    by de-prioritizing your request below concurrent ones."""

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that api key and python package exists in environment.

        The validator runs in ``before`` mode, so ``values`` holds only what
        the caller supplied. Client options that were omitted therefore fall
        back to the declared field defaults instead of raising ``KeyError``.

        Args:
            values: The raw keyword arguments passed to the constructor.

        Returns:
            ``values`` with a configured ``client`` entry added.

        Raises:
            ImportError: If ``aleph_alpha_client`` is not installed.
        """
        aleph_alpha_api_key = get_from_dict_or_env(
            values, "aleph_alpha_api_key", "ALEPH_ALPHA_API_KEY"
        )
        try:
            from aleph_alpha_client import Client
        except ImportError as e:
            raise ImportError(
                "Could not import aleph_alpha_client python package. "
                "Please install it with `pip install aleph_alpha_client`."
            ) from e

        # Defaults have not been applied yet at this point, so read them from
        # the field definitions when the caller did not pass a value.
        client_options = {
            option: values.get(option, cls.model_fields[option].default)
            for option in (
                "host",
                "hosting",
                "request_timeout_seconds",
                "total_retries",
                "nice",
            )
        }
        values["client"] = Client(token=aleph_alpha_api_key, **client_options)
        return values

    def _semantic_embed(self, text: str, representation_name: str) -> List[float]:
        """Embed one text using the named ``SemanticRepresentation`` member.

        Args:
            text: The text to embed.
            representation_name: Attribute name on ``SemanticRepresentation``
                (``"Document"`` or ``"Query"`` for this class).

        Returns:
            The ``embedding`` attribute of the client's response.

        Raises:
            ImportError: If ``aleph_alpha_client`` is not installed.
        """
        try:
            from aleph_alpha_client import (
                Prompt,
                SemanticEmbeddingRequest,
                SemanticRepresentation,
            )
        except ImportError as e:
            raise ImportError(
                "Could not import aleph_alpha_client python package. "
                "Please install it with `pip install aleph_alpha_client`."
            ) from e

        request = SemanticEmbeddingRequest(
            prompt=Prompt.from_text(text),
            representation=getattr(SemanticRepresentation, representation_name),
            compress_to_size=self.compress_to_size,
            normalize=self.normalize,
            contextual_control_threshold=self.contextual_control_threshold,
            control_log_additive=self.control_log_additive,
        )
        response = self.client.semantic_embed(request=request, model=self.model)
        return response.embedding

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to Aleph Alpha's asymmetric Document endpoint.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return [self._semantic_embed(text, "Document") for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Call out to Aleph Alpha's asymmetric, query embedding endpoint

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self._semantic_embed(text, "Query")
class AlephAlphaSymmetricSemanticEmbedding(AlephAlphaAsymmetricSemanticEmbedding):
    """Symmetric version of the Aleph Alpha's semantic embeddings.

    The main difference is that here, both the documents and
    queries are embedded with a SemanticRepresentation.Symmetric

    Example:
        .. code-block:: python

            from langchain_community.embeddings import (
                AlephAlphaSymmetricSemanticEmbedding,
            )

            embeddings = AlephAlphaSymmetricSemanticEmbedding(
                normalize=True, compress_to_size=128
            )
            text = "This is a test text"

            doc_result = embeddings.embed_documents([text])
            query_result = embeddings.embed_query(text)
    """

    def _embed(self, text: str) -> List[float]:
        """Embed one text with ``SemanticRepresentation.Symmetric``.

        Args:
            text: The text to embed.

        Returns:
            The ``embedding`` attribute of the client's response.

        Raises:
            ImportError: If ``aleph_alpha_client`` is not installed.
        """
        try:
            from aleph_alpha_client import (
                Prompt,
                SemanticEmbeddingRequest,
                SemanticRepresentation,
            )
        except ImportError as e:
            raise ImportError(
                "Could not import aleph_alpha_client python package. "
                "Please install it with `pip install aleph_alpha_client`."
            ) from e

        query_params = {
            "prompt": Prompt.from_text(text),
            "representation": SemanticRepresentation.Symmetric,
            "compress_to_size": self.compress_to_size,
            "normalize": self.normalize,
            "contextual_control_threshold": self.contextual_control_threshold,
            "control_log_additive": self.control_log_additive,
        }
        query_request = SemanticEmbeddingRequest(**query_params)
        query_response = self.client.semantic_embed(
            request=query_request, model=self.model
        )
        return query_response.embedding

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text with the symmetric representation.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a query with the symmetric representation.

        Queries and documents go through the same ``_embed`` call here.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self._embed(text)

View File

@@ -0,0 +1,76 @@
"""Anyscale embeddings wrapper."""
from __future__ import annotations
from typing import Dict, Optional
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
from pydantic import Field, SecretStr
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.utils.openai import is_openai_v1
DEFAULT_API_BASE = "https://api.endpoints.anyscale.com/v1"
DEFAULT_MODEL = "thenlper/gte-large"
class AnyscaleEmbeddings(OpenAIEmbeddings):
    """`Anyscale` Embeddings API, built on top of ``OpenAIEmbeddings``."""

    anyscale_api_key: Optional[SecretStr] = Field(default=None)
    """Key for AnyScale Endpoints; read from `ANYSCALE_API_KEY` if not passed."""
    model: str = Field(default=DEFAULT_MODEL)
    """Name of the embedding model."""
    anyscale_api_base: str = Field(default=DEFAULT_API_BASE)
    """Base URL for API requests; read from `ANYSCALE_API_BASE` if not passed."""
    tiktoken_enabled: bool = False
    """Keep this False for non-OpenAI implementations of the embeddings API."""
    embedding_ctx_length: int = 500
    """Upper bound on the number of tokens embedded at once."""

    @property
    def lc_secrets(self) -> Dict[str, str]:
        """Map secret field names to the environment variables holding them."""
        return {"anyscale_api_key": "ANYSCALE_API_KEY"}

    @pre_init
    def validate_environment(cls, values: dict) -> dict:
        """Resolve credentials and attach the matching ``openai`` client.

        Args:
            values: Field values collected for the model.

        Returns:
            ``values`` with the key, base URL and ``client`` filled in.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        api_key = convert_to_secret_str(
            get_from_dict_or_env(values, "anyscale_api_key", "ANYSCALE_API_KEY")
        )
        values["anyscale_api_key"] = api_key

        api_base = get_from_dict_or_env(
            values,
            "anyscale_api_base",
            "ANYSCALE_API_BASE",
            default=DEFAULT_API_BASE,
        )
        values["anyscale_api_base"] = api_base

        try:
            import openai
        except ImportError:
            raise ImportError(
                "Could not import openai python package. "
                "Please install it with `pip install openai`."
            )

        if not is_openai_v1():
            # Pre-v1 SDK: expose the settings under the OpenAI field names and
            # use the module-level Embedding resource.
            values["openai_api_base"] = api_base
            values["openai_api_key"] = api_key.get_secret_value()
            values["client"] = openai.Embedding
            return values

        # v1 SDK: build a client object and keep its embeddings resource.
        values["client"] = openai.OpenAI(
            api_key=api_key.get_secret_value(),
            base_url=api_base,
        ).embeddings
        return values

    @property
    def _llm_type(self) -> str:
        """Identifier string for this integration."""
        return "anyscale-embedding"

View File

@@ -0,0 +1,137 @@
import os
from typing import Any, Dict, List, Optional
from langchain_core.embeddings import Embeddings
from pydantic import BaseModel, ConfigDict, model_validator
class AscendEmbeddings(Embeddings, BaseModel):
    """Ascend NPU accelerated embedding model.

    Please ensure that you have installed CANN and torch_npu.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import AscendEmbeddings

            model = AscendEmbeddings(
                model_path=<path_to_model>,
                device_id=0,
                query_instruction=(
                    "Represent this sentence for searching relevant passages: "
                ),
            )
    """

    model_path: str
    """Path of the model to load; it must exist on disk."""
    device_id: int = 0
    """Ascend NPU device id."""
    query_instruction: str = ""
    """Instruction prepended to the text when embedding a query."""
    document_instruction: str = ""
    """Instruction prepended to each text when embedding documents."""
    use_fp16: bool = True
    """Whether to convert the loaded model to half precision."""
    pooling_method: Optional[str] = "cls"
    """Pooling strategy: ``"cls"`` or ``"mean"``."""
    batch_size: int = 32
    """Number of texts encoded per forward pass in ``embed_documents``."""
    # Both objects are created in ``__init__`` after validation, so they need
    # a default; without one pydantic v2 treats them as required fields.
    model: Any = None
    tokenizer: Any = None

    model_config = ConfigDict(protected_namespaces=())

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Validate the fields, load model and tokenizer, then warm up.

        Raises:
            ImportError: If ``transformers`` is not installed.
            Exception: If the model or tokenizer cannot be loaded.
        """
        super().__init__(*args, **kwargs)
        try:
            from transformers import AutoModel, AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "Unable to import transformers, please install with "
                "`pip install -U transformers`."
            ) from e
        try:
            self.model = AutoModel.from_pretrained(self.model_path).npu().eval()
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        except Exception as e:
            raise Exception(
                f"Failed to load model [{self.model_path}], due to following error:{e}"
            ) from e

        if self.use_fp16:
            self.model.half()
        # Run one throwaway batch so the first real call is not the first
        # inference on the device.
        self.encode([f"warmup {i} times" for i in range(10)])

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Check the model path and select the NPU device.

        Runs in ``before`` mode, so ``values`` holds only what the caller
        passed; ``device_id`` falls back to the field default when omitted.

        Args:
            values: The raw keyword arguments passed to the constructor.

        Returns:
            ``values`` unchanged.

        Raises:
            ValueError: If ``model_path`` was not supplied.
            FileNotFoundError: If ``model_path`` does not exist.
            ModuleNotFoundError: If ``torch_npu`` is not installed.
            Exception: If selecting the device fails.
        """
        if "model_path" not in values:
            raise ValueError("model_path is required")
        if not os.access(values["model_path"], os.F_OK):
            raise FileNotFoundError(
                f"Unable to find valid model path in [{values['model_path']}]"
            )
        try:
            import torch_npu
        except ImportError as e:
            raise ModuleNotFoundError(
                "torch_npu not found, please install torch_npu"
            ) from e

        device_id = values.get("device_id", cls.model_fields["device_id"].default)
        try:
            torch_npu.npu.set_device(device_id)
        except Exception as e:
            raise Exception(f"set device failed due to {e}") from e
        return values

    def encode(self, sentences: Any) -> Any:
        """Tokenize ``sentences``, run the model and return normalized vectors.

        Args:
            sentences: Input accepted by the tokenizer (a list of strings in
                this class's own calls).

        Returns:
            The pooled, L2-normalized embeddings as a NumPy array.

        Raises:
            ImportError: If ``torch`` is not installed.
        """
        inputs = self.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install -U torch`."
            ) from e
        last_hidden_state = self.model(
            inputs.input_ids.npu(), inputs.attention_mask.npu(), return_dict=True
        ).last_hidden_state
        tmp = self.pooling(last_hidden_state, inputs["attention_mask"].npu())
        embeddings = torch.nn.functional.normalize(tmp, dim=-1)
        return embeddings.cpu().detach().numpy()

    def pooling(self, last_hidden_state: Any, attention_mask: Any = None) -> Any:
        """Reduce per-token hidden states to one vector per sequence.

        Args:
            last_hidden_state: Model output indexed as (batch, token, hidden).
            attention_mask: Mask indexed as (batch, token); used by ``"mean"``.

        Returns:
            The first token's state for ``"cls"``, or the mask-weighted
            average over tokens for ``"mean"``.

        Raises:
            ImportError: If ``torch`` is not installed.
            NotImplementedError: For any other ``pooling_method``.
        """
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install -U torch`."
            ) from e
        if self.pooling_method == "cls":
            return last_hidden_state[:, 0]
        elif self.pooling_method == "mean":
            # Sum over the token axis (dim=1), not the hidden axis, so the
            # result keeps one hidden-size vector per sequence.
            s = torch.sum(
                last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1
            )
            d = attention_mask.sum(dim=1, keepdim=True).float()
            return s / d
        else:
            raise NotImplementedError(
                f"Pooling method [{self.pooling_method}] not implemented"
            )

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed documents in batches of ``batch_size``.

        Args:
            texts: The texts to embed; ``document_instruction`` is prepended
                to each one.

        Returns:
            The concatenated batch embeddings, one row per text, or an empty
            list when ``texts`` is empty.

        Raises:
            ImportError: If ``numpy`` is not installed.
        """
        try:
            import numpy as np
        except ImportError as e:
            raise ImportError(
                "Unable to import numpy, please install with `pip install -U numpy`."
            ) from e
        # np.concatenate rejects an empty sequence, so answer that case here.
        if not texts:
            return []
        embedding_list = [
            self.encode(
                [
                    self.document_instruction + text
                    for text in texts[start : start + self.batch_size]
                ]
            )
            for start in range(0, len(texts), self.batch_size)
        ]
        return np.concatenate(embedding_list)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query, prefixed with ``query_instruction``.

        Args:
            text: The query text.

        Returns:
            The embedding of the query.
        """
        return self.encode([self.query_instruction + text])[0]

View File

@@ -0,0 +1,64 @@
from typing import Any, Dict, List
from langchain_core.embeddings import Embeddings
from pydantic import BaseModel, model_validator
class AwaEmbeddings(BaseModel, Embeddings):
    """Embeddings for documents and queries backed by Awa DB.

    Attributes:
        client: The AwaEmbedding client.
        model: The name of the model used for embedding.
            Default is "all-mpnet-base-v2".
    """

    client: Any  #: :meta private:
    model: str = "all-mpnet-base-v2"

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Ensure awadb is importable and attach a fresh client.

        Args:
            values: The raw keyword arguments passed to the constructor.

        Returns:
            ``values`` with ``client`` set to a new ``AwaEmbedding``.

        Raises:
            ImportError: If the ``awadb`` package is not installed.
        """
        try:
            from awadb import AwaEmbedding
        except ImportError as import_error:
            message = (
                "Could not import awadb library. "
                "Please install it with `pip install awadb`"
            )
            raise ImportError(message) from import_error

        embedding_client = AwaEmbedding()
        values["client"] = embedding_client
        return values

    def set_model(self, model_name: str) -> None:
        """Switch the embedding model.

        The model used by default is all-mpnet-base-v2.

        Args:
            model_name: Name of the model to use from now on.
        """
        # Record the name on this object first, then on the client.
        self.model = model_name
        self.client.model_name = model_name

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed several documents through the client's batch call.

        Args:
            texts: The texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        batch_result = self.client.EmbeddingBatch(texts)
        return batch_result

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text through the client.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        query_result = self.client.Embedding(text)
        return query_result

View File

@@ -0,0 +1,187 @@
"""Azure OpenAI embeddings wrapper."""
from __future__ import annotations
import os
import warnings
from typing import Any, Awaitable, Callable, Dict, Optional, Union
from langchain_core._api.deprecation import deprecated
from langchain_core.utils import get_from_dict_or_env
from pydantic import Field, model_validator
from typing_extensions import Self
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.utils.openai import is_openai_v1
@deprecated(
    since="0.0.9",
    removal="1.0",
    alternative_import="langchain_openai.AzureOpenAIEmbeddings",
)
class AzureOpenAIEmbeddings(OpenAIEmbeddings):
    """`Azure OpenAI` Embeddings API."""

    azure_endpoint: Union[str, None] = None
    """Your Azure endpoint, including the resource.

        Automatically inferred from env var `AZURE_OPENAI_ENDPOINT` if not provided.

        Example: `https://example-resource.azure.openai.com/`
    """
    deployment: Optional[str] = Field(default=None, alias="azure_deployment")
    """A model deployment.

        If given sets the base client URL to include `/deployments/{azure_deployment}`.
        Note: this means you won't be able to use non-deployment endpoints.
    """
    openai_api_key: Union[str, None] = Field(default=None, alias="api_key")
    """Automatically inferred from env var `AZURE_OPENAI_API_KEY` if not provided."""
    azure_ad_token: Union[str, None] = None
    """Your Azure Active Directory token.

        Automatically inferred from env var `AZURE_OPENAI_AD_TOKEN` if not provided.

        For more:
        https://www.microsoft.com/en-us/security/business/identity-access/microsoft-entra-id.
    """
    azure_ad_token_provider: Union[Callable[[], str], None] = None
    """A function that returns an Azure Active Directory token.

        Will be invoked on every sync request. For async requests,
        will be invoked if `azure_ad_async_token_provider` is not provided.
    """
    azure_ad_async_token_provider: Union[Callable[[], Awaitable[str]], None] = None
    """A function that returns an Azure Active Directory token.

        Will be invoked on every async request.
    """
    openai_api_version: Optional[str] = Field(default=None, alias="api_version")
    """Automatically inferred from env var `OPENAI_API_VERSION` if not provided."""
    validate_base_url: bool = True
    """Whether `validate_environment` may rewrite `openai_api_base` (openai>=1.0)."""

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Fill settings from the environment and normalise a legacy base URL.

        This validator runs in ``mode="before"``, so ``values`` contains only
        what the caller passed in; field defaults have not been applied yet.
        Optional keys are therefore read with ``.get`` instead of indexing.

        Args:
            values: Raw constructor keyword arguments.

        Returns:
            ``values`` with credentials/endpoint settings resolved from the
            environment and, for openai>=1.0, ``openai_api_base`` rewritten to
            include ``/openai`` and ``/deployments/<deployment>`` when needed.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        # Check OPENAI_KEY for backwards compatibility.
        # TODO: Remove OPENAI_API_KEY support to avoid possible conflict when using
        # other forms of azure credentials.
        values["openai_api_key"] = (
            values.get("openai_api_key")
            or os.getenv("AZURE_OPENAI_API_KEY")
            or os.getenv("OPENAI_API_KEY")
        )
        values["openai_api_base"] = values.get("openai_api_base") or os.getenv(
            "OPENAI_API_BASE"
        )
        values["openai_api_version"] = values.get("openai_api_version") or os.getenv(
            "OPENAI_API_VERSION", default="2023-05-15"
        )
        values["openai_api_type"] = get_from_dict_or_env(
            values, "openai_api_type", "OPENAI_API_TYPE", default="azure"
        )
        values["openai_organization"] = (
            values.get("openai_organization")
            or os.getenv("OPENAI_ORG_ID")
            or os.getenv("OPENAI_ORGANIZATION")
        )
        values["openai_proxy"] = get_from_dict_or_env(
            values,
            "openai_proxy",
            "OPENAI_PROXY",
            default="",
        )
        values["azure_endpoint"] = values.get("azure_endpoint") or os.getenv(
            "AZURE_OPENAI_ENDPOINT"
        )
        values["azure_ad_token"] = values.get("azure_ad_token") or os.getenv(
            "AZURE_OPENAI_AD_TOKEN"
        )
        # Azure OpenAI embedding models allow a maximum of 2048 texts
        # at a time in each batch
        # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
        # Clamp only a caller-supplied value; when the key is absent the
        # inherited field default is left to apply (indexing here used to
        # raise KeyError for every caller that did not pass `chunk_size`).
        if values.get("chunk_size") is not None:
            values["chunk_size"] = min(values["chunk_size"], 2048)
        try:
            import openai  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "Could not import openai python package. "
                "Please install it with `pip install openai`."
            ) from exc
        if is_openai_v1():
            # For backwards compatibility. Before openai v1, no distinction was made
            # between azure_endpoint and base_url (openai_api_base).
            openai_api_base = values["openai_api_base"]
            # `validate_base_url` defaults to True on the field; mirror that here
            # because defaults are not yet present in `values`.
            if openai_api_base and values.get("validate_base_url", True):
                if "/openai" not in openai_api_base:
                    values["openai_api_base"] += "/openai"
                    warnings.warn(
                        "As of openai>=1.0.0, Azure endpoints should be specified via "
                        f"the `azure_endpoint` param not `openai_api_base` "
                        f"(or alias `base_url`). Updating `openai_api_base` from "
                        f"{openai_api_base} to {values['openai_api_base']}."
                    )
                # The deployment may arrive under its field name or its alias.
                deployment = values.get("deployment") or values.get(
                    "azure_deployment"
                )
                if deployment:
                    warnings.warn(
                        "As of openai>=1.0.0, if `deployment` (or alias "
                        "`azure_deployment`) is specified then "
                        "`openai_api_base` (or alias `base_url`) should not be. "
                        "Instead use `deployment` (or alias `azure_deployment`) "
                        "and `azure_endpoint`."
                    )
                    if deployment not in values["openai_api_base"]:
                        # Update first so the warning reports the final URL.
                        values["openai_api_base"] += "/deployments/" + deployment
                        warnings.warn(
                            "As of openai>=1.0.0, if `openai_api_base` "
                            "(or alias `base_url`) is specified it is expected to be "
                            "of the form "
                            "https://example-resource.azure.openai.com/openai/deployments/example-deployment. "  # noqa: E501
                            f"Updating {openai_api_base} to "
                            f"{values['openai_api_base']}."
                        )
                    # The deployment now lives in the base URL; clear it under
                    # both spellings so it is not applied a second time.
                    values["deployment"] = None
                    if "azure_deployment" in values:
                        values["azure_deployment"] = None
        return values

    @model_validator(mode="after")
    def post_init_validator(self) -> Self:
        """Build the sync and async embeddings clients from the validated fields.

        For openai>=1.0 this creates ``openai.AzureOpenAI`` and
        ``openai.AsyncAzureOpenAI`` and stores their ``embeddings`` resources
        on ``client`` / ``async_client``. For older versions ``client`` is set
        to ``openai.Embedding`` and ``async_client`` is left untouched.
        """
        import openai

        if is_openai_v1():
            client_params = {
                "api_version": self.openai_api_version,
                "azure_endpoint": self.azure_endpoint,
                "azure_deployment": self.deployment,
                "api_key": self.openai_api_key,
                "azure_ad_token": self.azure_ad_token,
                "azure_ad_token_provider": self.azure_ad_token_provider,
                "organization": self.openai_organization,
                "base_url": self.openai_api_base,
                "timeout": self.request_timeout,
                "max_retries": self.max_retries,
                "default_headers": {
                    **(self.default_headers or {}),
                    "User-Agent": "langchain-comm-python-azure-openai",
                },
                "default_query": self.default_query,
                "http_client": self.http_client,
            }
            self.client = openai.AzureOpenAI(**client_params).embeddings

            # The async client prefers the async token provider when one is set;
            # otherwise it reuses the sync provider configured above.
            if self.azure_ad_async_token_provider:
                client_params["azure_ad_token_provider"] = (
                    self.azure_ad_async_token_provider
                )

            self.async_client = openai.AsyncAzureOpenAI(**client_params).embeddings
        else:
            self.client = openai.Embedding
        return self

    @property
    def _llm_type(self) -> str:
        """Return the fixed type identifier string for this wrapper."""
        return "azure-openai-chat"

View File

@@ -0,0 +1,150 @@
from typing import Any, List, Optional
import requests
from langchain_core.embeddings import Embeddings
from langchain_core.utils import (
secret_from_env,
)
from pydantic import (
BaseModel,
ConfigDict,
Field,
SecretStr,
model_validator,
)
from requests import RequestException
from typing_extensions import Self
BAICHUAN_API_URL: str = "https://api.baichuan-ai.com/v1/embeddings"
# BaichuanTextEmbeddings is an embedding model provided by Baichuan Inc. (https://www.baichuan-ai.com/home).
# As of today (Jan 25th, 2024) BaichuanTextEmbeddings ranks #1 in C-MTEB
# (Chinese Multi-Task Embedding Benchmark) leaderboard.
# Leaderboard (Under Overall -> Chinese section): https://huggingface.co/spaces/mteb/leaderboard
# Official Website: https://platform.baichuan-ai.com/docs/text-Embedding
# An API-key is required to use this embedding model. You can get one by registering
# at https://platform.baichuan-ai.com/docs/text-Embedding.
# BaichuanTextEmbeddings supports a 512-token window and produces vectors with
# 1024 dimensions.
# NOTE!! BaichuanTextEmbeddings only supports Chinese text embedding.
# Multi-language support is coming soon.
class BaichuanTextEmbeddings(BaseModel, Embeddings):
    """Baichuan Text Embedding models.

    Setup:
        To use, you should set the environment variable ``BAICHUAN_API_KEY`` to
        your API key or pass it as a named parameter to the constructor.

        .. code-block:: bash

            export BAICHUAN_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_community.embeddings import BaichuanTextEmbeddings

            embeddings = BaichuanTextEmbeddings()

    Embed:
        .. code-block:: python

            # embed the documents
            vectors = embeddings.embed_documents([text1, text2, ...])

            # embed the query
            vectors = embeddings.embed_query(text)
    """  # noqa: E501

    session: Any = None  #: :meta private:
    model_name: str = Field(default="Baichuan-Text-Embedding", alias="model")
    """The model used to embed the documents."""
    baichuan_api_key: SecretStr = Field(
        alias="api_key",
        default_factory=secret_from_env(["BAICHUAN_API_KEY", "BAICHUAN_AUTH_TOKEN"]),
    )
    """Automatically inferred from env var `BAICHUAN_API_KEY` if not provided."""
    chunk_size: int = 16
    """Chunk size when multiple texts are input"""

    model_config = ConfigDict(populate_by_name=True, protected_namespaces=())

    @model_validator(mode="after")
    def validate_environment(self) -> Self:
        """Create the ``requests`` session that carries the API key header."""
        auth_headers = {
            "Authorization": f"Bearer {self.baichuan_api_key.get_secret_value()}",
            "Accept-Encoding": "identity",
            "Content-type": "application/json",
        }
        http_session = requests.Session()
        http_session.headers.update(auth_headers)
        self.session = http_session
        return self

    def _embed(self, texts: List[str]) -> Optional[List[List[float]]]:
        """POST ``texts`` to the Baichuan API in chunks and collect the vectors.

        Args:
            texts: A list of texts to embed.

        Returns:
            The embeddings of all chunks, concatenated in request order.

        Raises:
            requests.HTTPError: Via ``raise_for_status`` on a 4xx/5xx reply.
            RequestException: If the reply status is anything other than 200.
        """
        step = self.chunk_size
        vectors = []
        for start in range(0, len(texts), step):
            batch = texts[start : start + step]
            response = self.session.post(
                BAICHUAN_API_URL, json={"input": batch, "model": self.model_name}
            )
            # 4xx / 5xx replies raise here.
            response.raise_for_status()
            # Anything that survived the check above but is not a plain 200
            # is still treated as a failure.
            if response.status_code != 200:
                raise RequestException(
                    f"Error: Received status code {response.status_code} from "
                    "`BaichuanEmbedding` API"
                )
            items = response.json().get("data", [])
            # The API tags each item with its position; restore that order.
            ordered = sorted(items, key=lambda e: e.get("index", 0))
            for item in ordered:
                vectors.append(item.get("embedding", []))
        return vectors

    def embed_documents(self, texts: List[str]) -> Optional[List[List[float]]]:  # type: ignore[override]
        """Embed a list of documents.

        Args:
            texts: The list of texts to embed.

        Returns:
            The result of ``_embed(texts)``; request failures are raised
            by ``_embed`` rather than returned.
        """
        return self._embed(texts)

    def embed_query(self, text: str) -> Optional[List[float]]:  # type: ignore[override]
        """Embed a single query text.

        Args:
            text: The text to embed.

        Returns:
            The first vector returned by ``_embed([text])``, or None should
            ``_embed`` return None.
        """
        vectors = self._embed([text])
        if vectors is None:
            return None
        return vectors[0]

View File

@@ -0,0 +1,186 @@
from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional
from langchain_core.embeddings import Embeddings
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
from pydantic import BaseModel, ConfigDict, Field, SecretStr
logger = logging.getLogger(__name__)
class QianfanEmbeddingsEndpoint(BaseModel, Embeddings):
    """Baidu Qianfan Embeddings embedding models.

    Setup:
        To use, you should have the ``qianfan`` python package installed, and set
        environment variables ``QIANFAN_AK``, ``QIANFAN_SK``.

        .. code-block:: bash

            pip install qianfan
            export QIANFAN_AK="your-api-key"
            export QIANFAN_SK="your-secret_key"

    Instantiate:
        .. code-block:: python

            from langchain_community.embeddings import QianfanEmbeddingsEndpoint

            embeddings = QianfanEmbeddingsEndpoint()

    Embed:
        .. code-block:: python

            # embed the documents
            vectors = embeddings.embed_documents([text1, text2, ...])

            # embed the query
            vectors = embeddings.embed_query(text)

            # embed the documents with async
            vectors = await embeddings.aembed_documents([text1, text2, ...])

            # embed the query with async
            vectors = await embeddings.aembed_query(text)
    """  # noqa: E501

    qianfan_ak: Optional[SecretStr] = Field(default=None, alias="api_key")
    """Qianfan application apikey"""

    qianfan_sk: Optional[SecretStr] = Field(default=None, alias="secret_key")
    """Qianfan application secretkey"""

    chunk_size: int = 16
    """Chunk size when multiple texts are input"""

    model: Optional[str] = Field(default=None)
    """Model name.

    The list of models is at
    https://cloud.baidu.com/doc/WENXINWORKSHOP/s/Nlks5zkzu.
    Preset models currently supported:
        - Embedding-V1 (default model)
        - bge-large-en
        - bge-large-zh

    Preset models map to an endpoint.
    `model` will be ignored if `endpoint` is set.
    """

    endpoint: str = ""
    """Endpoint of the Qianfan Embedding, required if custom model used."""

    client: Any = None
    """Qianfan client"""

    init_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """init kwargs for qianfan client init, such as `query_per_second` which is
        associated with qianfan resource object to limit QPS"""

    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """extra params for model invoke using with `do`."""

    model_config = ConfigDict(protected_namespaces=())

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the credentials and build the ``qianfan.Embedding`` client.

        ``qianfan_ak`` / ``qianfan_sk`` are taken from ``values`` or from the
        ``QIANFAN_AK`` / ``QIANFAN_SK`` environment variables (empty string
        when neither is set) and stored as secrets. The client is created
        from ``init_kwargs`` plus ``model``, and receives ``ak``, ``sk`` and
        ``endpoint`` only when those are non-empty.

        Args:
            values: The configuration values for the model.

        Returns:
            ``values`` with ``qianfan_ak``, ``qianfan_sk`` and ``client`` set.

        Raises:
            ImportError: If the ``qianfan`` package is not installed.
        """
        values["qianfan_ak"] = convert_to_secret_str(
            get_from_dict_or_env(
                values,
                "qianfan_ak",
                "QIANFAN_AK",
                default="",
            )
        )
        values["qianfan_sk"] = convert_to_secret_str(
            get_from_dict_or_env(
                values,
                "qianfan_sk",
                "QIANFAN_SK",
                default="",
            )
        )

        # Keep the try body to the import alone so that an error raised while
        # constructing the client is not misreported as a missing package.
        try:
            import qianfan
        except ImportError as exc:
            raise ImportError(
                "qianfan package not found, please install it with "
                "`pip install qianfan`"
            ) from exc

        params = {
            **values.get("init_kwargs", {}),
            "model": values["model"],
        }
        access_key = values["qianfan_ak"].get_secret_value()
        if access_key != "":
            params["ak"] = access_key
        secret_key = values["qianfan_sk"].get_secret_value()
        if secret_key != "":
            params["sk"] = secret_key
        if values["endpoint"] is not None and values["endpoint"] != "":
            params["endpoint"] = values["endpoint"]
        values["client"] = qianfan.Embedding(**params)
        return values

    def embed_query(self, text: str) -> List[float]:
        """Embed a single text by delegating to ``embed_documents``."""
        resp = self.embed_documents([text])
        return resp[0]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts, sending ``chunk_size`` texts per request.

        Args:
            texts (List[str]): A list of text documents to embed.

        Returns:
            List[List[float]]: The ``"embedding"`` entries of every response,
            concatenated in request order.
        """
        text_in_chunks = [
            texts[i : i + self.chunk_size]
            for i in range(0, len(texts), self.chunk_size)
        ]
        lst = []
        for chunk in text_in_chunks:
            resp = self.client.do(texts=chunk, **self.model_kwargs)
            lst.extend(res["embedding"] for res in resp["data"])
        return lst

    async def aembed_query(self, text: str) -> List[float]:
        """Async counterpart of ``embed_query``."""
        embeddings = await self.aembed_documents([text])
        return embeddings[0]

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Async counterpart of ``embed_documents`` (uses ``client.ado``).

        Chunks are awaited one after another, so results keep input order.
        """
        text_in_chunks = [
            texts[i : i + self.chunk_size]
            for i in range(0, len(texts), self.chunk_size)
        ]
        lst = []
        for chunk in text_in_chunks:
            resp = await self.client.ado(texts=chunk, **self.model_kwargs)
            lst.extend(res["embedding"] for res in resp["data"])
        return lst

View File

@@ -0,0 +1,222 @@
import asyncio
import json
import os
from typing import Any, Dict, List, Optional
import numpy as np
from langchain_core._api.deprecation import deprecated
from langchain_core.embeddings import Embeddings
from langchain_core.runnables.config import run_in_executor
from pydantic import BaseModel, ConfigDict, model_validator
from typing_extensions import Self
@deprecated(
    since="0.2.11",
    removal="1.0",
    alternative_import="langchain_aws.BedrockEmbeddings",
)
class BedrockEmbeddings(BaseModel, Embeddings):
    """Bedrock embedding models.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Bedrock service.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import BedrockEmbeddings

            region_name = "us-east-1"
            credentials_profile_name = "default"
            model_id = "amazon.titan-embed-text-v1"

            be = BedrockEmbeddings(
                credentials_profile_name=credentials_profile_name,
                region_name=region_name,
                model_id=model_id
            )
    """

    client: Any = None  #: :meta private:
    """Bedrock client."""
    region_name: Optional[str] = None
    """The aws region e.g., `us-west-2`. Falls back to AWS_DEFAULT_REGION env variable
    or region specified in ~/.aws/config in case it is not provided here.
    """

    credentials_profile_name: Optional[str] = None
    """The name of the profile in the ~/.aws/credentials or ~/.aws/config files, which
    has either access keys or role information specified.
    If not specified, the default credential profile or, if on an EC2 instance,
    credentials from IMDS will be used.
    See: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
    """

    model_id: str = "amazon.titan-embed-text-v1"
    """Id of the model to call, e.g., amazon.titan-embed-text-v1, this is
    equivalent to the modelId property in the list-foundation-models api"""

    model_kwargs: Optional[Dict] = None
    """Keyword arguments to pass to the model."""

    endpoint_url: Optional[str] = None
    """Needed if you don't want to default to us-east-1 endpoint"""

    normalize: bool = False
    """Whether the embeddings should be normalized to unit vectors"""

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    @model_validator(mode="after")
    def validate_environment(self) -> Self:
        """Create a ``bedrock-runtime`` client unless one was supplied.

        Raises:
            ImportError: If ``boto3`` is not installed.
            ValueError: If creating the session or client fails for any
                other reason (the original error is chained as the cause).
        """
        if self.client is not None:
            return self

        try:
            import boto3

            if self.credentials_profile_name is not None:
                session = boto3.Session(profile_name=self.credentials_profile_name)
            else:
                # use default credentials
                session = boto3.Session()

            client_params = {}
            if self.region_name:
                client_params["region_name"] = self.region_name

            if self.endpoint_url:
                client_params["endpoint_url"] = self.endpoint_url

            self.client = session.client("bedrock-runtime", **client_params)

        except ImportError as e:
            raise ImportError(
                "Could not import boto3 python package. "
                "Please install it with `pip install boto3`."
            ) from e
        except Exception as e:
            raise ValueError(
                "Could not load credentials to authenticate with AWS client. "
                "Please check that credentials in the specified "
                f"profile name are valid. Bedrock error: {e}"
            ) from e

        return self

    def _embedding_func(self, text: str) -> List[float]:
        """Call out to Bedrock embedding endpoint.

        The request body is ``model_kwargs`` plus the text, shaped according
        to the provider prefix of ``model_id`` (the part before the first dot).

        Raises:
            ValueError: If invoking the model or decoding its response fails;
                the underlying exception is chained as the cause.
        """
        # replace newlines, which can negatively affect performance.
        text = text.replace(os.linesep, " ")

        # format input body for provider
        provider = self.model_id.split(".")[0]
        input_body = {**(self.model_kwargs or {})}
        if provider == "cohere":
            # Respect a caller-supplied input_type; otherwise use the default.
            input_body.setdefault("input_type", "search_document")
            input_body["texts"] = [text]
        else:
            # includes common provider == "amazon"
            input_body["inputText"] = text
        body = json.dumps(input_body)

        try:
            # invoke bedrock API
            response = self.client.invoke_model(
                body=body,
                modelId=self.model_id,
                accept="application/json",
                contentType="application/json",
            )

            # format output based on provider
            response_body = json.loads(response.get("body").read())
            if provider == "cohere":
                return response_body.get("embeddings")[0]
            else:
                # includes common provider == "amazon"
                return response_body.get("embedding")
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Error raised by inference endpoint: {e}") from e

    def _normalize_vector(self, embeddings: List[float]) -> List[float]:
        """Normalize the embedding to a unit vector.

        NOTE: an all-zero vector has norm 0, so the division below would
        produce NaNs; no guard is applied here.
        """
        emb = np.array(embeddings)
        norm_emb = emb / np.linalg.norm(emb)
        return norm_emb.tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a Bedrock model.

        Texts are embedded one request at a time.

        Args:
            texts: The list of texts to embed

        Returns:
            List of embeddings, one for each text.
        """
        results = []
        for text in texts:
            response = self._embedding_func(text)

            if self.normalize:
                response = self._normalize_vector(response)

            results.append(response)

        return results

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a Bedrock model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        embedding = self._embedding_func(text)

        if self.normalize:
            return self._normalize_vector(embedding)

        return embedding

    async def aembed_query(self, text: str) -> List[float]:
        """Asynchronous compute query embeddings using a Bedrock model.

        Runs the synchronous ``embed_query`` through ``run_in_executor``.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return await run_in_executor(None, self.embed_query, text)

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Asynchronous compute doc embeddings using a Bedrock model.

        Schedules one ``aembed_query`` per text and gathers the results,
        which ``asyncio.gather`` returns in input order.

        Args:
            texts: The list of texts to embed

        Returns:
            List of embeddings, one for each text.
        """
        result = await asyncio.gather(*[self.aembed_query(text) for text in texts])

        return list(result)

View File

@@ -0,0 +1,97 @@
"""Wrapper around Bookend AI embedding models."""
import json
from typing import Any, List
import requests
from langchain_core.embeddings import Embeddings
from pydantic import BaseModel, ConfigDict, Field
API_URL = "https://api.bookend.ai/"
DEFAULT_TASK = "embeddings"
PATH = "/models/predict"
class BookendEmbeddings(BaseModel, Embeddings):
    """Bookend AI sentence_transformers embedding models.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import BookendEmbeddings

            bookend = BookendEmbeddings(
                domain={domain},
                api_token={api_token},
                model_id={model_id},
            )
            bookend.embed_documents([
                "Please put on these earmuffs because I can't hear you.",
                "Baby wipes are made of chocolate stardust.",
            ])
            bookend.embed_query(
                "She only paints with bold colors; she does not like pastels."
            )
    """

    domain: str
    """Request for a domain at https://bookend.ai/ to use this embeddings module."""
    api_token: str
    """Request for an API token at https://bookend.ai/ to use this embeddings module."""
    model_id: str
    """Embeddings model ID to use."""
    auth_header: dict = Field(default_factory=dict)

    model_config = ConfigDict(protected_namespaces=())

    def __init__(self, **kwargs: Any):
        """Validate the fields, then derive ``auth_header`` from ``api_token``.

        Any ``auth_header`` passed by the caller is overwritten.
        """
        super().__init__(**kwargs)
        self.auth_header = {"Authorization": f"Basic {self.api_token}"}

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed documents using a Bookend deployed embeddings model.

        One POST request is issued per text.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        # Build per-call headers on a copy: writing Content-Type into
        # ``self.auth_header`` directly would permanently alter instance state.
        headers = {
            **self.auth_header,
            "Content-Type": "application/json; charset=utf-8",
        }
        params = {
            "model_id": self.model_id,
            "task": DEFAULT_TASK,
        }
        url = API_URL + self.domain + PATH

        result = []
        for text in texts:
            data = json.dumps(
                {
                    "text": text,
                    "question": None,
                    "context": None,
                    "instruction": None,
                }
            )
            r = requests.request(
                "POST",
                url,
                headers=headers,
                params=params,
                data=data,
            )
            # The embedding is read from the "data" key of the first item
            # in the JSON response.
            result.append(r.json()[0]["data"])

        return result

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using a Bookend deployed embeddings model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]

View File

@@ -0,0 +1,139 @@
import logging
from typing import Any, Dict, List, Optional
from langchain_core.embeddings import Embeddings
from pydantic import BaseModel, ConfigDict, Field, model_validator
logger = logging.getLogger(__name__)
class ClarifaiEmbeddings(BaseModel, Embeddings):
    """Clarifai embedding models.

    To use, you should have the ``clarifai`` python package installed, and the
    environment variable ``CLARIFAI_PAT`` set with your personal access token or pass it
    as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import ClarifaiEmbeddings
            clarifai = ClarifaiEmbeddings(user_id=USER_ID,
                                          app_id=APP_ID,
                                          model_id=MODEL_ID)
                             (or)
            EXAMPLE_URL = "https://clarifai.com/clarifai/main/models/BAAI-bge-base-en-v15"
            clarifai = ClarifaiEmbeddings(model_url=EXAMPLE_URL)
    """

    model_url: Optional[str] = None
    """Model url to use."""
    model_id: Optional[str] = None
    """Model id to use."""
    model_version_id: Optional[str] = None
    """Model version id to use."""
    app_id: Optional[str] = None
    """Clarifai application id to use."""
    user_id: Optional[str] = None
    """Clarifai user id to use."""
    pat: Optional[str] = Field(default=None, exclude=True)
    """Clarifai personal access token to use."""
    token: Optional[str] = Field(default=None, exclude=True)
    """Clarifai session token to use."""
    model: Any = Field(default=None, exclude=True)  #: :meta private:
    api_base: str = "https://api.clarifai.com"

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Import ``clarifai`` and build the ``Model`` client from ``values``.

        Raises:
            ImportError: If the ``clarifai`` package is not installed.
        """
        try:
            from clarifai.client.model import Model
        except ImportError as exc:
            raise ImportError(
                "Could not import clarifai python package. "
                "Please install it with `pip install clarifai`."
            ) from exc

        # This validator runs before field defaults are applied, so an omitted
        # `api_base` is simply missing here. Fall back to the declared field
        # default instead of handing `None` to the client as its base URL.
        api_base = values.get("api_base") or cls.model_fields["api_base"].default

        values["model"] = Model(
            url=values.get("model_url"),
            app_id=values.get("app_id"),
            user_id=values.get("user_id"),
            model_version=dict(id=values.get("model_version_id")),
            pat=values.get("pat"),
            token=values.get("token"),
            model_id=values.get("model_id"),
            base_url=api_base,
        )

        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to Clarifai's embedding models.

        Texts are sent in batches of 32. This method is best-effort: if a
        prediction fails, the error is logged and the embeddings collected so
        far are returned, so the result may be shorter than ``texts``.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each successfully embedded text.
        """
        from clarifai.client.input import Inputs

        input_obj = Inputs.from_auth_helper(self.model.auth_helper)
        batch_size = 32
        embeddings = []

        try:
            for i in range(0, len(texts), batch_size):
                batch = texts[i : i + batch_size]
                # Input ids are the positions within the current batch.
                input_batch = [
                    input_obj.get_text_input(input_id=str(idx), raw_text=inp)
                    for idx, inp in enumerate(batch)
                ]
                predict_response = self.model.predict(input_batch)
                embeddings.extend(
                    [
                        list(output.data.embeddings[0].vector)
                        for output in predict_response.outputs
                    ]
                )

        except Exception as e:
            logger.error(f"Predict failed, exception: {e}")

        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Call out to Clarifai's embedding models.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.

        Raises:
            Exception: Whatever the prediction call raised. It is logged and
                then re-raised, because there is no partial result to return.
        """
        try:
            predict_response = self.model.predict_by_bytes(
                bytes(text, "utf-8"), input_type="text"
            )
            embeddings = [
                list(op.data.embeddings[0].vector) for op in predict_response.outputs
            ]
        except Exception as e:
            logger.error(f"Predict failed, exception: {e}")
            # Swallowing the error here would leave `embeddings` unbound and
            # surface as a confusing UnboundLocalError on the return below.
            raise

        return embeddings[0]

View File

@@ -0,0 +1,97 @@
from typing import Any, Dict, List
import requests
from langchain_core._api.deprecation import deprecated
from langchain_core.embeddings import Embeddings
from pydantic import BaseModel, ConfigDict
DEFAULT_MODEL_NAME = "@cf/baai/bge-base-en-v1.5"
@deprecated(
    since="0.3.23",
    removal="1.0",
    alternative_import="langchain_cloudflare.CloudflareWorkersAIEmbeddings",
)
class CloudflareWorkersAIEmbeddings(BaseModel, Embeddings):
    """Cloudflare Workers AI embedding model.

    To use, you need to provide an API token and
    account ID to access Cloudflare Workers AI.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import CloudflareWorkersAIEmbeddings

            account_id = "my_account_id"
            api_token = "my_secret_api_token"
            model_name =  "@cf/baai/bge-small-en-v1.5"

            cf = CloudflareWorkersAIEmbeddings(
                account_id=account_id,
                api_token=api_token,
                model_name=model_name
            )
    """

    api_base_url: str = "https://api.cloudflare.com/client/v4/accounts"
    account_id: str
    api_token: str
    model_name: str = DEFAULT_MODEL_NAME
    batch_size: int = 50
    strip_new_lines: bool = True
    headers: Dict[str, str] = {"Authorization": "Bearer "}

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    def __init__(self, **kwargs: Any):
        """Validate the fields and derive the bearer header from ``api_token``."""
        super().__init__(**kwargs)

        # Whatever ``headers`` held after validation is replaced here.
        self.headers = {"Authorization": f"Bearer {self.api_token}"}

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` with Cloudflare Workers AI, ``batch_size`` per request.

        Args:
            texts: The list of texts to embed.

        Returns:
            The ``result.data`` entries of every response, in request order.
        """
        if self.strip_new_lines:
            texts = [text.replace("\n", " ") for text in texts]

        collected = []
        for start in range(0, len(texts), self.batch_size):
            payload = {"text": texts[start : start + self.batch_size]}
            run_url = f"{self.api_base_url}/{self.account_id}/ai/run/{self.model_name}"
            reply = requests.post(run_url, headers=self.headers, json=payload)
            collected.extend(reply.json()["result"]["data"])

        return collected

    def embed_query(self, text: str) -> List[float]:
        """Embed one query text with Cloudflare Workers AI.

        Args:
            text: The text to embed.

        Returns:
            The first ``result.data`` entry of the response.
        """
        if self.strip_new_lines:
            text = text.replace("\n", " ")

        run_url = f"{self.api_base_url}/{self.account_id}/ai/run/{self.model_name}"
        reply = requests.post(run_url, headers=self.headers, json={"text": [text]})
        return reply.json()["result"]["data"][0]

View File

@@ -0,0 +1,142 @@
from __future__ import annotations
from typing import Any, Dict, List, Optional, cast
import requests
from langchain_core._api.deprecation import deprecated
from langchain_core.embeddings import Embeddings
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
from pydantic import BaseModel, ConfigDict, SecretStr, model_validator
@deprecated(
since="0.3.4",
removal="1.0.0",
alternative_import="langchain_community.ClovaXEmbeddings",
)
class ClovaEmbeddings(BaseModel, Embeddings):
"""
Clova's embedding service.
To use this service,
you should have the following environment variables
set with your API tokens and application ID,
or pass them as named parameters to the constructor:
- ``CLOVA_EMB_API_KEY``: API key for accessing Clova's embedding service.
- ``CLOVA_EMB_APIGW_API_KEY``: API gateway key for enhanced security.
- ``CLOVA_EMB_APP_ID``: Application ID for identifying your application.
Example:
.. code-block:: python
from langchain_community.embeddings import ClovaEmbeddings
embeddings = ClovaEmbeddings(
clova_emb_api_key='your_clova_emb_api_key',
clova_emb_apigw_api_key='your_clova_emb_apigw_api_key',
app_id='your_app_id'
)
query_text = "This is a test query."
query_result = embeddings.embed_query(query_text)
document_text = "This is a test document."
document_result = embeddings.embed_documents([document_text])
"""
endpoint_url: str = (
"https://clovastudio.apigw.ntruss.com/testapp/v1/api-tools/embedding"
)
"""Endpoint URL to use."""
model: str = "clir-emb-dolphin"
"""Embedding model name to use."""
clova_emb_api_key: Optional[SecretStr] = None
"""API key for accessing Clova's embedding service."""
clova_emb_apigw_api_key: Optional[SecretStr] = None
"""API gateway key for enhanced security."""
app_id: Optional[SecretStr] = None
"""Application ID for identifying your application."""
model_config = ConfigDict(
extra="forbid",
)
@model_validator(mode="before")
@classmethod
def validate_environment(cls, values: Dict) -> Any:
"""Validate api key exists in environment."""
values["clova_emb_api_key"] = convert_to_secret_str(
get_from_dict_or_env(values, "clova_emb_api_key", "CLOVA_EMB_API_KEY")
)
values["clova_emb_apigw_api_key"] = convert_to_secret_str(
get_from_dict_or_env(
values, "clova_emb_apigw_api_key", "CLOVA_EMB_APIGW_API_KEY"
)
)
values["app_id"] = convert_to_secret_str(
get_from_dict_or_env(values, "app_id", "CLOVA_EMB_APP_ID")
)
return values
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""
Embed a list of texts and return their embeddings.
Args:
texts: The list of texts to embed.
Returns:
List of embeddings, one for each text.
"""
embeddings = []
for text in texts:
embeddings.append(self._embed_text(text))
return embeddings
def embed_query(self, text: str) -> List[float]:
    """
    Embed one query string by delegating to ``_embed_text``.

    Args:
        text: The text to embed.

    Returns:
        Embeddings for the text.
    """
    query_vector = self._embed_text(text)
    return query_vector
def _embed_text(self, text: str) -> List[float]:
    """
    Call the embedding API for a single text and extract the vector.

    POSTs ``{"text": text}`` to ``{endpoint_url}/{model}/{app_id}`` with the
    two API keys sent as headers.

    Args:
        text: The text to embed.

    Returns:
        The value found at ``result.embedding`` in the JSON reply.

    Raises:
        ValueError: If the status is not 200, or the 200 reply has no
            ``result.embedding`` entry.
    """
    # Unwrap the secrets up front (same order as they are sent).
    api_key = cast(SecretStr, self.clova_emb_api_key).get_secret_value()
    gateway_key = cast(SecretStr, self.clova_emb_apigw_api_key).get_secret_value()
    app_id = cast(SecretStr, self.app_id).get_secret_value()

    request_headers = {
        "X-NCP-CLOVASTUDIO-API-KEY": api_key,
        "X-NCP-APIGW-API-KEY": gateway_key,
        "Content-Type": "application/json",
    }
    target_url = f"{self.endpoint_url}/{self.model}/{app_id}"
    response = requests.post(
        target_url,
        headers=request_headers,
        json={"text": text},
    )

    if response.status_code == 200:
        body = response.json()
        if "result" in body and "embedding" in body["result"]:
            return body["result"]["embedding"]

    # Reached for non-200 replies and for 200 replies missing the embedding.
    raise ValueError(
        f"API request failed with status {response.status_code}: {response.text}"
    )

View File

@@ -0,0 +1,172 @@
from typing import Any, Dict, List, Optional
from langchain_core._api.deprecation import deprecated
from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, ConfigDict, model_validator
from langchain_community.llms.cohere import _create_retry_decorator
@deprecated(
    since="0.0.30",
    removal="1.0",
    alternative_import="langchain_cohere.CohereEmbeddings",
)
class CohereEmbeddings(BaseModel, Embeddings):
    """Cohere embedding models.

    To use, you should have the ``cohere`` python package installed, and the
    environment variable ``COHERE_API_KEY`` set with your API key or pass it
    as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import CohereEmbeddings
            cohere = CohereEmbeddings(
                model="embed-english-light-v3.0",
                cohere_api_key="my-api-key"
            )
    """

    client: Any = None  #: :meta private:
    """Cohere client."""
    async_client: Any = None  #: :meta private:
    """Cohere async client."""
    model: str = "embed-english-v2.0"
    """Model name to use."""

    truncate: Optional[str] = None
    """Truncate embeddings that are too long from start or end ("NONE"|"START"|"END")"""

    cohere_api_key: Optional[str] = None
    """Cohere API key; read from ``COHERE_API_KEY`` when not passed."""

    max_retries: int = 3
    """Maximum number of retries to make when generating."""
    request_timeout: Optional[float] = None
    """Timeout in seconds for the Cohere API request."""
    user_agent: str = "langchain"
    """Identifier for the application making the request."""

    model_config = ConfigDict(
        extra="forbid",
    )

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that api key and python package exists in environment.

        Builds the sync and async Cohere clients and stores them in
        ``values["client"]`` and ``values["async_client"]``.

        Raises:
            ImportError: If the ``cohere`` package is not installed.
        """
        cohere_api_key = get_from_dict_or_env(
            values, "cohere_api_key", "COHERE_API_KEY"
        )
        request_timeout = values.get("request_timeout")
        # A "before" validator only sees what the caller passed, so field
        # defaults are not filled in yet. Fall back to the declared default
        # instead of indexing, which raised KeyError when ``user_agent`` was
        # omitted.
        client_name = values.get(
            "user_agent", cls.model_fields["user_agent"].default
        )

        try:
            import cohere

            values["client"] = cohere.Client(
                cohere_api_key,
                timeout=request_timeout,
                client_name=client_name,
            )
            values["async_client"] = cohere.AsyncClient(
                cohere_api_key,
                timeout=request_timeout,
                client_name=client_name,
            )
        except ImportError:
            raise ImportError(
                "Could not import cohere python package. "
                "Please install it with `pip install cohere`."
            )
        return values

    def embed_with_retry(self, **kwargs: Any) -> Any:
        """Use tenacity to retry the embed call."""
        retry_decorator = _create_retry_decorator(self.max_retries)

        @retry_decorator
        def _embed_with_retry(**kwargs: Any) -> Any:
            return self.client.embed(**kwargs)

        return _embed_with_retry(**kwargs)

    def aembed_with_retry(self, **kwargs: Any) -> Any:
        """Use tenacity to retry the async embed call.

        Returns the coroutine produced by the decorated inner function; the
        caller is expected to ``await`` it.
        """
        retry_decorator = _create_retry_decorator(self.max_retries)

        @retry_decorator
        async def _embed_with_retry(**kwargs: Any) -> Any:
            return await self.async_client.embed(**kwargs)

        return _embed_with_retry(**kwargs)

    def embed(
        self, texts: List[str], *, input_type: Optional[str] = None
    ) -> List[List[float]]:
        """Embed ``texts`` with the sync client and coerce values to ``float``.

        Args:
            texts: The list of texts to embed.
            input_type: Forwarded as-is to the Cohere ``embed`` call.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings = self.embed_with_retry(
            model=self.model,
            texts=texts,
            input_type=input_type,
            truncate=self.truncate,
        ).embeddings
        return [list(map(float, e)) for e in embeddings]

    async def aembed(
        self, texts: List[str], *, input_type: Optional[str] = None
    ) -> List[List[float]]:
        """Embed ``texts`` with the async client and coerce values to ``float``.

        Args:
            texts: The list of texts to embed.
            input_type: Forwarded as-is to the Cohere ``embed`` call.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings = (
            await self.aembed_with_retry(
                model=self.model,
                texts=texts,
                input_type=input_type,
                truncate=self.truncate,
            )
        ).embeddings
        return [list(map(float, e)) for e in embeddings]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of document texts.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return self.embed(texts, input_type="search_document")

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Async call out to Cohere's embedding endpoint.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return await self.aembed(texts, input_type="search_document")

    def embed_query(self, text: str) -> List[float]:
        """Call out to Cohere's embedding endpoint.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed([text], input_type="search_query")[0]

    async def aembed_query(self, text: str) -> List[float]:
        """Async call out to Cohere's embedding endpoint.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return (await self.aembed([text], input_type="search_query"))[0]

View File

@@ -0,0 +1,173 @@
from __future__ import annotations
import logging
from typing import (
Any,
Callable,
Dict,
List,
Optional,
)
from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, ConfigDict, model_validator
from requests.exceptions import HTTPError
from tenacity import (
before_sleep_log,
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
logger = logging.getLogger(__name__)
# Number of inputs sent per client call, keyed by model name.
# embed_with_retry falls back to 25 for models not listed here.
BATCH_SIZE = {
    "text-embedding-v1": 25,
    "text-embedding-v2": 25,
    "text-embedding-v3": 10,
    "text-embedding-v4": 10,
}
def _create_retry_decorator(embeddings: DashScopeEmbeddings) -> Callable[[Any], Any]:
    """Build the tenacity ``retry`` decorator used for DashScope calls.

    Only ``HTTPError`` triggers a retry. The attempt limit comes from
    ``embeddings.max_retries``, and the final exception is re-raised.

    Args:
        embeddings: The embeddings object whose ``max_retries`` is used.

    Returns:
        A decorator that adds the retry policy to a callable.
    """
    # Exponential backoff with multiplier 1, clamped between 1 and 4 seconds.
    backoff = wait_exponential(1, min=1, max=4)
    attempt_limit = stop_after_attempt(embeddings.max_retries)
    retry_condition = retry_if_exception_type(HTTPError)
    # Log a warning before each sleep.
    sleep_logger = before_sleep_log(logger, logging.WARNING)
    return retry(
        reraise=True,
        stop=attempt_limit,
        wait=backoff,
        retry=retry_condition,
        before_sleep=sleep_logger,
    )
def embed_with_retry(embeddings: DashScopeEmbeddings, **kwargs: Any) -> Any:
    """Call the DashScope client in batches, retrying on ``HTTPError``.

    A list ``input`` is split into slices of the per-model batch size
    (``BATCH_SIZE``, default 25). Any other ``input`` is sent once, as-is.
    The whole batching pass is what gets retried, so a retry starts again
    from the first batch.

    Args:
        embeddings: Object providing ``client`` and ``max_retries``.
        **kwargs: Passed to ``embeddings.client.call``; must contain
            ``input`` and ``model``.

    Returns:
        The concatenated ``output["embeddings"]`` entries of all batches.

    Raises:
        ValueError: On a 400 or 401 status.
        HTTPError: On any other non-200 status.
    """

    @_create_retry_decorator(embeddings)
    def _call_in_batches(**call_kwargs: Any) -> Any:
        payload = call_kwargs["input"]
        step = BATCH_SIZE.get(call_kwargs["model"], 25)
        if isinstance(payload, list):
            batches = [
                payload[start : start + step]
                for start in range(0, len(payload), step)
            ]
        else:
            # Non-list input is forwarded unchanged in a single call.
            batches = [payload]

        collected = []
        for batch in batches:
            call_kwargs["input"] = batch
            resp = embeddings.client.call(**call_kwargs)
            if resp.status_code in [400, 401]:
                raise ValueError(
                    f"status_code: {resp.status_code} \n "
                    f"code: {resp.code} \n message: {resp.message}"
                )
            if resp.status_code != 200:
                raise HTTPError(
                    f"HTTP error occurred: status_code: {resp.status_code} \n "
                    f"code: {resp.code} \n message: {resp.message}",
                    response=resp,
                )
            collected.extend(resp.output["embeddings"])
        return collected

    return _call_in_batches(**kwargs)
class DashScopeEmbeddings(BaseModel, Embeddings):
    """DashScope embedding models.

    To use, you should have the ``dashscope`` python package installed, and the
    environment variable ``DASHSCOPE_API_KEY`` set with your API key or pass it
    as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import DashScopeEmbeddings
            embeddings = DashScopeEmbeddings(dashscope_api_key="my-api-key")

    Example:
        .. code-block:: python

            import os
            os.environ["DASHSCOPE_API_KEY"] = "your DashScope API KEY"

            from langchain_community.embeddings.dashscope import DashScopeEmbeddings
            embeddings = DashScopeEmbeddings(
                model="text-embedding-v1",
            )
            text = "This is a test query."
            query_result = embeddings.embed_query(text)

    """

    client: Any = None  #: :meta private:
    """The DashScope client."""
    model: str = "text-embedding-v1"
    """Model name to use."""
    dashscope_api_key: Optional[str] = None
    """DashScope API key; read from ``DASHSCOPE_API_KEY`` when not passed."""
    max_retries: int = 5
    """Maximum number of retries to make when generating."""

    model_config = ConfigDict(
        extra="forbid",
    )

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that api key and python package exists in environment.

        Sets ``dashscope.api_key`` at module level and stores
        ``dashscope.TextEmbedding`` in ``values["client"]``.

        Raises:
            ImportError: If the ``dashscope`` package is not installed.
        """
        # Import only inside the try: an import placed before it fails first
        # and makes the install hint below unreachable.
        try:
            import dashscope
        except ImportError:
            raise ImportError(
                "Could not import dashscope python package. "
                "Please install it with `pip install dashscope`."
            )

        values["dashscope_api_key"] = get_from_dict_or_env(
            values, "dashscope_api_key", "DASHSCOPE_API_KEY"
        )
        dashscope.api_key = values["dashscope_api_key"]
        values["client"] = dashscope.TextEmbedding
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to DashScope's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings = embed_with_retry(
            self, input=texts, text_type="document", model=self.model
        )
        embedding_list = [item["embedding"] for item in embeddings]
        return embedding_list

    def embed_query(self, text: str) -> List[float]:
        """Call out to DashScope's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        embedding = embed_with_retry(
            self, input=text, text_type="query", model=self.model
        )[0]["embedding"]
        return embedding

View File

@@ -0,0 +1,52 @@
from __future__ import annotations
from typing import Iterator, List
from urllib.parse import urlparse
from langchain_core._api import deprecated
from langchain_community.embeddings.mlflow import MlflowEmbeddings
def _chunk(texts: List[str], size: int) -> Iterator[List[str]]:
for i in range(0, len(texts), size):
yield texts[i : i + size]
@deprecated(
    since="0.3.3",
    removal="1.0",
    alternative_import="databricks_langchain.DatabricksEmbeddings",
)
class DatabricksEmbeddings(MlflowEmbeddings):
    """Databricks embeddings.

    To use, you should have the ``mlflow`` python package installed.
    For more information, see https://mlflow.org/docs/latest/llms/deployments.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import DatabricksEmbeddings

            embeddings = DatabricksEmbeddings(
                target_uri="databricks",
                endpoint="embeddings",
            )
    """

    target_uri: str = "databricks"
    """The target URI to use. Defaults to ``databricks``."""

    @property
    def _mlflow_extras(self) -> str:
        """Return an empty string for this subclass."""
        return ""

    def _validate_uri(self) -> None:
        """Accept the literal ``databricks`` or any URI with that scheme.

        Raises:
            ValueError: If ``target_uri`` is neither of those.
        """
        uri = self.target_uri
        is_bare_name = uri == "databricks"
        if not is_bare_name and urlparse(uri).scheme != "databricks":
            raise ValueError(
                "Invalid target URI. The target URI must be a valid databricks URI."
            )

View File

@@ -0,0 +1,140 @@
from typing import Any, Dict, List, Mapping, Optional
import requests
from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env, pre_init
from pydantic import BaseModel, ConfigDict
# Default value of DeepInfraEmbeddings.model_id.
DEFAULT_MODEL_ID = "sentence-transformers/clip-ViT-B-32"
# Default value of DeepInfraEmbeddings.batch_size (texts per request).
MAX_BATCH_SIZE = 1024
class DeepInfraEmbeddings(BaseModel, Embeddings):
    """Deep Infra's embedding inference service.

    To use, you should have the
    environment variable ``DEEPINFRA_API_TOKEN`` set with your API token, or pass
    it as a named parameter to the constructor.
    There are multiple embeddings models available,
    see https://deepinfra.com/models?type=embeddings.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import DeepInfraEmbeddings
            deepinfra_emb = DeepInfraEmbeddings(
                model_id="sentence-transformers/clip-ViT-B-32",
                deepinfra_api_token="my-api-key"
            )
            r1 = deepinfra_emb.embed_documents(
                [
                    "Alpha is the first letter of Greek alphabet",
                    "Beta is the second letter of Greek alphabet",
                ]
            )
            r2 = deepinfra_emb.embed_query(
                "What is the second letter of Greek alphabet"
            )

    """

    model_id: str = DEFAULT_MODEL_ID
    """Embeddings model to use."""
    normalize: bool = False
    """whether to normalize the computed embeddings"""
    embed_instruction: str = "passage: "
    """Instruction used to embed documents."""
    query_instruction: str = "query: "
    """Instruction used to embed the query."""
    model_kwargs: Optional[dict] = None
    """Other model keyword args"""
    deepinfra_api_token: Optional[str] = None
    """API token for Deep Infra. If not provided, the token is
    fetched from the environment variable 'DEEPINFRA_API_TOKEN'."""
    batch_size: int = MAX_BATCH_SIZE
    """Batch size for embedding requests."""

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the API token from ``values`` or ``DEEPINFRA_API_TOKEN``."""
        values["deepinfra_api_token"] = get_from_dict_or_env(
            values, "deepinfra_api_token", "DEEPINFRA_API_TOKEN"
        )
        return values

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"model_id": self.model_id}

    def _embed(self, input: List[str]) -> List[List[float]]:
        """Send one inference request and return its ``embeddings`` field.

        Args:
            input: The already-prefixed texts for this request.

        Returns:
            The ``embeddings`` entry of the JSON reply.

        Raises:
            ValueError: On a transport error, a non-200 status, or a reply
                body that cannot be decoded as JSON.
        """
        extra_kwargs = self.model_kwargs or {}
        auth_headers = {
            "Authorization": f"bearer {self.deepinfra_api_token}",
            "Content-Type": "application/json",
        }
        # Entries in model_kwargs override "inputs"/"normalize" on key clash.
        request_body = {"inputs": input, "normalize": self.normalize, **extra_kwargs}
        endpoint = f"https://api.deepinfra.com/v1/inference/{self.model_id}"

        try:
            res = requests.post(endpoint, headers=auth_headers, json=request_body)
        except requests.exceptions.RequestException as e:
            raise ValueError(f"Error raised by inference endpoint: {e}")

        if res.status_code != 200:
            raise ValueError(
                "Error raised by inference API HTTP code: %s, %s"
                % (res.status_code, res.text)
            )

        try:
            return res.json()["embeddings"]
        except requests.exceptions.JSONDecodeError as e:
            raise ValueError(
                f"Error raised by inference API: {e}.\nResponse: {res.text}"
            )

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed documents using a Deep Infra deployed embedding model.
        For larger batches, the input list of texts is chunked into smaller
        batches to avoid exceeding the maximum request size.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        # Every document is prefixed with the document instruction first.
        prefixed = [f"{self.embed_instruction}{text}" for text in texts]
        vectors = []
        for start in range(0, len(prefixed), self.batch_size):
            batch = prefixed[start : start + self.batch_size]
            vectors.extend(self._embed(batch))
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using a Deep Infra deployed embedding model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        prefixed_query = f"{self.query_instruction}{text}"
        return self._embed([prefixed_query])[0]

View File

@@ -0,0 +1,114 @@
from typing import Any, Dict, List, Optional
from langchain_core.embeddings import Embeddings
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
from pydantic import (
BaseModel,
ConfigDict,
Field,
SecretStr,
)
from langchain_community.utilities.requests import Requests
class EdenAiEmbeddings(BaseModel, Embeddings):
    """EdenAI embedding.

    To use, you should have the environment variable ``EDENAI_API_KEY``
    set with your API key, or pass it as a named parameter.
    """

    edenai_api_key: Optional[SecretStr] = Field(None, description="EdenAI API Token")

    provider: str = "openai"
    """embedding provider to use (eg: openai,google etc.)"""

    model: Optional[str] = None
    """
    model name for above provider (eg: 'gpt-3.5-turbo-instruct' for openai)
    available models are shown on https://docs.edenai.co/ under 'available providers'
    """

    model_config = ConfigDict(
        extra="forbid",
    )

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the API key from ``values`` or ``EDENAI_API_KEY`` as a secret."""
        raw_key = get_from_dict_or_env(values, "edenai_api_key", "EDENAI_API_KEY")
        values["edenai_api_key"] = convert_to_secret_str(raw_key)
        return values

    @staticmethod
    def get_user_agent() -> str:
        """Return the ``langchain/<version>`` string sent as ``User-Agent``."""
        from langchain_community import __version__

        return f"langchain/{__version__}"

    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Compute embeddings using EdenAi api.

        Args:
            texts: The texts to embed in one request.

        Returns:
            The ``embedding`` of each item in the provider's reply.

        Raises:
            ValueError: On a 4xx status.
            Exception: On a 5xx status, any other non-200 status, or when the
                provider section of the reply has status ``fail``.
        """
        url = "https://api.edenai.run/v2/text/embeddings"

        headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "authorization": f"Bearer {self.edenai_api_key.get_secret_value()}",  # type: ignore[union-attr]
            "User-Agent": self.get_user_agent(),
        }

        payload: Dict[str, Any] = {"texts": texts, "providers": self.provider}
        if self.model is not None:
            payload["settings"] = {self.provider: self.model}

        response = Requests(headers=headers).post(url=url, data=payload)

        # Checked from most to least severe; each branch raises.
        if response.status_code >= 500:
            raise Exception(f"EdenAI Server: Error {response.status_code}")
        if response.status_code >= 400:
            raise ValueError(f"EdenAI received an invalid payload: {response.text}")
        if response.status_code != 200:
            raise Exception(
                f"EdenAI returned an unexpected response with status "
                f"{response.status_code}: {response.text}"
            )

        provider_response = response.json()[self.provider]
        if provider_response.get("status") == "fail":
            err_msg = provider_response.get("error", {}).get("message")
            raise Exception(err_msg)

        return [item["embedding"] for item in provider_response["items"]]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using EdenAI.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return self._generate_embeddings(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using EdenAI.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self._generate_embeddings([text])[0]

View File

@@ -0,0 +1,226 @@
from __future__ import annotations
from typing import TYPE_CHECKING, List, Optional
from langchain_core._api import deprecated
from langchain_core.utils import get_from_env
if TYPE_CHECKING:
from elasticsearch import Elasticsearch
from elasticsearch.client import MlClient
from langchain_core.embeddings import Embeddings
@deprecated(
    "0.1.11", alternative="Use class in langchain-elasticsearch package", pending=True
)
class ElasticsearchEmbeddings(Embeddings):
    """Elasticsearch embedding models.

    This class provides an interface to generate embeddings using a model deployed
    in an Elasticsearch cluster. It requires an Elasticsearch connection object
    and the model_id of the model deployed in the cluster.

    In Elasticsearch you need to have an embedding model loaded and deployed.
    - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-trained-model.html
    - https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-deploy-models.html
    """

    def __init__(
        self,
        client: MlClient,
        model_id: str,
        *,
        input_field: str = "text_field",
    ):
        """
        Initialize the ElasticsearchEmbeddings instance.

        Args:
            client (MlClient): An Elasticsearch ML client object.
            model_id (str): The model_id of the model deployed in the Elasticsearch
                cluster.
            input_field (str): The name of the key for the input text field in the
                document. Defaults to 'text_field'.
        """
        self.client = client
        self.model_id = model_id
        self.input_field = input_field

    @classmethod
    def from_credentials(
        cls,
        model_id: str,
        *,
        es_cloud_id: Optional[str] = None,
        es_user: Optional[str] = None,
        es_password: Optional[str] = None,
        input_field: str = "text_field",
    ) -> ElasticsearchEmbeddings:
        """Instantiate embeddings from Elasticsearch credentials.

        Args:
            model_id (str): The model_id of the model deployed in the Elasticsearch
                cluster.
            input_field (str): The name of the key for the input text field in the
                document. Defaults to 'text_field'.
            es_cloud_id: (str, optional): The Elasticsearch cloud ID to connect to.
            es_user: (str, optional): Elasticsearch username.
            es_password: (str, optional): Elasticsearch password.

        Raises:
            ImportError: If the ``elasticsearch`` package is not installed.

        Example:
            .. code-block:: python

                from langchain_community.embeddings import ElasticsearchEmbeddings

                # Define the model ID and input field name (if different from default)
                model_id = "your_model_id"
                # Optional, only if different from 'text_field'
                input_field = "your_input_field"

                # Credentials can be passed in two ways. Either set the env vars
                # ES_CLOUD_ID, ES_USER, ES_PASSWORD and they will be automatically
                # pulled in, or pass them in directly as kwargs.
                embeddings = ElasticsearchEmbeddings.from_credentials(
                    model_id,
                    input_field=input_field,
                    # es_cloud_id="foo",
                    # es_user="bar",
                    # es_password="baz",
                )

                documents = [
                    "This is an example document.",
                    "Another example document to generate embeddings for.",
                ]
                embeddings.embed_documents(documents)
        """
        try:
            from elasticsearch import Elasticsearch
            from elasticsearch.client import MlClient
        except ImportError:
            raise ImportError(
                "elasticsearch package not found, please install with 'pip install "
                "elasticsearch'"
            )

        # Explicit arguments win; the environment is consulted only for the
        # ones that are missing or empty.
        if not es_cloud_id:
            es_cloud_id = get_from_env("es_cloud_id", "ES_CLOUD_ID")
        if not es_user:
            es_user = get_from_env("es_user", "ES_USER")
        if not es_password:
            es_password = get_from_env("es_password", "ES_PASSWORD")

        connection = Elasticsearch(
            cloud_id=es_cloud_id, basic_auth=(es_user, es_password)
        )
        ml_client = MlClient(connection)
        return cls(ml_client, model_id, input_field=input_field)

    @classmethod
    def from_es_connection(
        cls,
        model_id: str,
        es_connection: Elasticsearch,
        input_field: str = "text_field",
    ) -> ElasticsearchEmbeddings:
        """
        Instantiate embeddings from an existing Elasticsearch connection.

        This method provides a way to create an instance of the ElasticsearchEmbeddings
        class using an existing Elasticsearch connection. The connection object is used
        to create an MlClient, which is then used to initialize the
        ElasticsearchEmbeddings instance.

        Args:
            model_id (str): The model_id of the model deployed in the Elasticsearch
                cluster.
            es_connection (elasticsearch.Elasticsearch): An existing Elasticsearch
                connection object.
            input_field (str, optional): The name of the key for the input text
                field in the document. Defaults to 'text_field'.

        Returns:
            ElasticsearchEmbeddings: An instance of the ElasticsearchEmbeddings class.

        Example:
            .. code-block:: python

                from elasticsearch import Elasticsearch

                from langchain_community.embeddings import ElasticsearchEmbeddings

                # Define the model ID and input field name (if different from default)
                model_id = "your_model_id"
                # Optional, only if different from 'text_field'
                input_field = "your_input_field"

                # Create Elasticsearch connection
                es_connection = Elasticsearch(
                    hosts=["localhost:9200"], http_auth=("user", "password")
                )

                # Instantiate ElasticsearchEmbeddings using the existing connection
                embeddings = ElasticsearchEmbeddings.from_es_connection(
                    model_id,
                    es_connection,
                    input_field=input_field,
                )

                documents = [
                    "This is an example document.",
                    "Another example document to generate embeddings for.",
                ]
                embeddings.embed_documents(documents)
        """
        # Imported lazily so the dependency is only needed when this
        # constructor is actually used.
        from elasticsearch.client import MlClient

        return cls(MlClient(es_connection), model_id, input_field=input_field)

    def _embedding_func(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for the given texts using the Elasticsearch model.

        Args:
            texts (List[str]): A list of text strings to generate embeddings for.

        Returns:
            List[List[float]]: A list of embeddings, one for each text in the input
                list.
        """
        # One single-field document per text, keyed by the configured field.
        docs = [{self.input_field: text} for text in texts]
        response = self.client.infer_trained_model(
            model_id=self.model_id, docs=docs
        )
        return [result["predicted_value"] for result in response["inference_results"]]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of documents.

        Args:
            texts (List[str]): A list of document text strings to generate embeddings
                for.

        Returns:
            List[List[float]]: A list of embeddings, one for each document in the input
                list.
        """
        return self._embedding_func(texts)

    def embed_query(self, text: str) -> List[float]:
        """
        Generate an embedding for a single query text.

        Args:
            text (str): The query text to generate an embedding for.

        Returns:
            List[float]: The embedding for the input query text.
        """
        only_result = self._embedding_func([text])[0]
        return only_result

View File

@@ -0,0 +1,155 @@
from typing import Any, Dict, List, Mapping, Optional
import requests
from langchain_core.embeddings import Embeddings
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
from pydantic import BaseModel, ConfigDict, SecretStr
from requests.adapters import HTTPAdapter, Retry
from typing_extensions import NotRequired, TypedDict
# Currently supported maximum batch size for embedding requests
MAX_BATCH_SIZE = 256
# Default value of EmbaasEmbeddings.api_url.
EMBAAS_API_URL = "https://api.embaas.io/v1/embeddings/"
class EmbaasEmbeddingsPayload(TypedDict):
    """Payload for the Embaas embeddings API."""

    # Name of the embedding model to use.
    model: str
    # Texts to embed in this request.
    texts: List[str]
    # Optional instruction; may be left out of the payload.
    instruction: NotRequired[str]
class EmbaasEmbeddings(BaseModel, Embeddings):
    """Embaas's embedding service.

    To use, you should have the
    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
    it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            # initialize with default model and instruction
            from langchain_community.embeddings import EmbaasEmbeddings
            emb = EmbaasEmbeddings()

            # initialize with custom model and instruction
            from langchain_community.embeddings import EmbaasEmbeddings
            emb_model = "instructor-large"
            emb_inst = "Represent the Wikipedia document for retrieval"
            emb = EmbaasEmbeddings(
                model=emb_model,
                instruction=emb_inst
            )
    """

    model: str = "e5-large-v2"
    """The model used for embeddings."""
    instruction: Optional[str] = None
    """Instruction used for domain-specific embeddings."""
    api_url: str = EMBAAS_API_URL
    """The URL for the embaas embeddings API."""
    embaas_api_key: Optional[SecretStr] = None
    """API key; read from ``EMBAAS_API_KEY`` when not passed."""
    max_retries: Optional[int] = 3
    """max number of retries for requests"""
    timeout: Optional[int] = 30
    """request timeout in seconds"""

    model_config = ConfigDict(
        extra="forbid",
    )

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment."""
        embaas_api_key = convert_to_secret_str(
            get_from_dict_or_env(values, "embaas_api_key", "EMBAAS_API_KEY")
        )
        values["embaas_api_key"] = embaas_api_key
        return values

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying params."""
        return {"model": self.model, "instruction": self.instruction}

    def _generate_payload(self, texts: List[str]) -> EmbaasEmbeddingsPayload:
        """Generates payload for the API request."""
        payload = EmbaasEmbeddingsPayload(texts=texts, model=self.model)
        # "instruction" is only included when one is configured.
        if self.instruction:
            payload["instruction"] = self.instruction
        return payload

    def _handle_request(self, payload: EmbaasEmbeddingsPayload) -> List[List[float]]:
        """Sends a request to the Embaas API and handles the response.

        Args:
            payload: The JSON body to POST to ``api_url``.

        Returns:
            The ``embedding`` of every item in the reply's ``data`` list.

        Raises:
            requests.exceptions.RequestException: On transport failures and,
                via ``raise_for_status``, on HTTP error statuses.
        """
        headers = {
            "Authorization": f"Bearer {self.embaas_api_key.get_secret_value()}",  # type: ignore[union-attr]
            "Content-Type": "application/json",
        }

        retries = Retry(
            total=self.max_retries,
            backoff_factor=0.5,
            allowed_methods=["POST"],
            raise_on_status=True,
        )

        # The context manager closes the session (and its pooled
        # connections) even if the request raises.
        with requests.Session() as session:
            session.mount("http://", HTTPAdapter(max_retries=retries))
            session.mount("https://", HTTPAdapter(max_retries=retries))
            response = session.post(
                self.api_url,
                headers=headers,
                json=payload,
                timeout=self.timeout,
            )
            # Raise on HTTP error statuses so the caller's RequestException
            # handler can translate the API's error body; without this an
            # error reply fell through to the "data" lookup below.
            response.raise_for_status()
            parsed_response = response.json()

        embeddings = [item["embedding"] for item in parsed_response["data"]]
        return embeddings

    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings using the Embaas API.

        Args:
            texts: The texts to embed in a single request.

        Returns:
            List of embeddings, one for each text.

        Raises:
            ValueError: If the request failed without a usable response body,
                or the error body carries a ``message`` entry.
            requests.exceptions.RequestException: Re-raised when the error
                body is JSON without a ``message`` entry.
        """
        payload = self._generate_payload(texts)
        try:
            return self._handle_request(payload)
        except requests.exceptions.RequestException as e:
            if e.response is None or not e.response.text:
                raise ValueError(f"Error raised by embaas embeddings API: {e}")

            try:
                parsed_response = e.response.json()
            except ValueError:
                # The error body is not JSON; report the original failure
                # rather than a confusing decode error.
                raise ValueError(f"Error raised by embaas embeddings API: {e}")

            if "message" in parsed_response:
                raise ValueError(
                    "Validation Error raised by embaas embeddings API:"
                    f"{parsed_response['message']}"
                )
            raise

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Get embeddings for a list of texts.

        Args:
            texts: The list of texts to get embeddings for.

        Returns:
            List of embeddings, one for each text.
        """
        batches = [
            texts[i : i + MAX_BATCH_SIZE] for i in range(0, len(texts), MAX_BATCH_SIZE)
        ]
        embeddings = [self._generate_embeddings(batch) for batch in batches]
        # flatten the list of lists into a single list
        return [embedding for batch in embeddings for embedding in batch]

    def embed_query(self, text: str) -> List[float]:
        """Get embeddings for a single text.

        Args:
            text: The text to get embeddings for.

        Returns:
            List of embeddings.
        """
        return self.embed_documents([text])[0]

View File

@@ -0,0 +1,158 @@
import asyncio
import logging
import threading
from typing import Dict, List, Optional
import requests
from langchain_core._api.deprecation import deprecated
from langchain_core.embeddings import Embeddings
from langchain_core.runnables.config import run_in_executor
from langchain_core.utils import get_from_dict_or_env, pre_init
from pydantic import BaseModel, ConfigDict
logger = logging.getLogger(__name__)
@deprecated(
since="0.0.13",
alternative="langchain_community.embeddings.QianfanEmbeddingsEndpoint",
)
class ErnieEmbeddings(BaseModel, Embeddings):
"""`Ernie Embeddings V1` embedding models."""
ernie_api_base: Optional[str] = None
ernie_client_id: Optional[str] = None
ernie_client_secret: Optional[str] = None
access_token: Optional[str] = None
chunk_size: int = 16
model_name: str = "ErnieBot-Embedding-V1"
_lock = threading.Lock()
model_config = ConfigDict(protected_namespaces=())
@pre_init
def validate_environment(cls, values: Dict) -> Dict:
values["ernie_api_base"] = get_from_dict_or_env(
values, "ernie_api_base", "ERNIE_API_BASE", "https://aip.baidubce.com"
)
values["ernie_client_id"] = get_from_dict_or_env(
values,
"ernie_client_id",
"ERNIE_CLIENT_ID",
)
values["ernie_client_secret"] = get_from_dict_or_env(
values,
"ernie_client_secret",
"ERNIE_CLIENT_SECRET",
)
return values
def _embedding(self, json: object) -> dict:
base_url = (
f"{self.ernie_api_base}/rpc/2.0/ai_custom/v1/wenxinworkshop/embeddings"
)
resp = requests.post(
f"{base_url}/embedding-v1",
headers={
"Content-Type": "application/json",
},
params={"access_token": self.access_token},
json=json,
)
return resp.json()
def _refresh_access_token_with_lock(self) -> None:
with self._lock:
logger.debug("Refreshing access token")
base_url: str = f"{self.ernie_api_base}/oauth/2.0/token"
resp = requests.post(
base_url,
headers={
"Content-Type": "application/json",
"Accept": "application/json",
},
params={
"grant_type": "client_credentials",
"client_id": self.ernie_client_id,
"client_secret": self.ernie_client_secret,
},
)
self.access_token = str(resp.json().get("access_token"))
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Embed search docs.
Args:
texts: The list of texts to embed
Returns:
List[List[float]]: List of embeddings, one for each text.
"""
if not self.access_token:
self._refresh_access_token_with_lock()
text_in_chunks = [
texts[i : i + self.chunk_size]
for i in range(0, len(texts), self.chunk_size)
]
lst = []
for chunk in text_in_chunks:
resp = self._embedding({"input": [text for text in chunk]})
if resp.get("error_code"):
if resp.get("error_code") == 111:
self._refresh_access_token_with_lock()
resp = self._embedding({"input": [text for text in chunk]})
else:
raise ValueError(f"Error from Ernie: {resp}")
lst.extend([i["embedding"] for i in resp["data"]])
return lst
def embed_query(self, text: str) -> List[float]:
    """Embed query text.

    An access token is requested first when none is set. A request answered
    with error code 111 is retried once after refreshing the token.

    Args:
        text: The text to embed.

    Returns:
        List[float]: Embeddings for the text.

    Raises:
        ValueError: If Ernie reports an error, including when the retry after
            a token refresh still fails.
    """
    if not self.access_token:
        self._refresh_access_token_with_lock()

    payload = {"input": [text]}
    resp = self._embedding(payload)
    if resp.get("error_code") == 111:
        # Error code 111 triggers one token refresh and one retry.
        self._refresh_access_token_with_lock()
        resp = self._embedding(payload)
    # Checked after the retry too, so a second failure is reported as an
    # Ernie error rather than a KeyError on the missing "data" key.
    if resp.get("error_code"):
        raise ValueError(f"Error from Ernie: {resp}")
    return resp["data"][0]["embedding"]
async def aembed_query(self, text: str) -> List[float]:
    """Asynchronous Embed query text.

    Hands the synchronous ``embed_query`` to ``run_in_executor`` and awaits
    its result.

    Args:
        text: The text to embed.

    Returns:
        List[float]: Embeddings for the text.
    """
    embedding = await run_in_executor(None, self.embed_query, text)
    return embedding
async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
    """Asynchronous Embed search docs.

    Starts one ``aembed_query`` call per text and gathers the results in
    input order.

    Args:
        texts: The list of texts to embed

    Returns:
        List[List[float]]: List of embeddings, one for each text.
    """
    pending = map(self.aembed_query, texts)
    embeddings = await asyncio.gather(*pending)
    return list(embeddings)

View File

@@ -0,0 +1,50 @@
import hashlib
from typing import List
import numpy as np
from langchain_core.embeddings import Embeddings
from pydantic import BaseModel
class FakeEmbeddings(Embeddings, BaseModel):
    """Fake embedding model.

    Every call returns a fresh vector of ``size`` samples drawn from NumPy's
    global normal generator; the input text is ignored.
    """

    size: int
    """The size of the embedding vector."""

    def _get_embedding(self) -> List[float]:
        """Return ``size`` normally distributed samples as a list."""
        sample = np.random.normal(size=self.size)
        return list(sample)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one random vector per entry in ``texts``."""
        vectors = []
        for _text in texts:
            vectors.append(self._get_embedding())
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Return a single random vector; ``text`` is not used."""
        vector = self._get_embedding()
        return vector
class DeterministicFakeEmbedding(Embeddings, BaseModel):
    """
    Fake embedding model that always returns
    the same embedding vector for the same text.

    The vector is drawn from a normal distribution using a generator seeded
    from the SHA-256 hash of the text.
    """

    size: int
    """The size of the embedding vector."""

    def _get_embedding(self, seed: int) -> List[float]:
        """Return ``size`` normally distributed samples for ``seed``.

        Args:
            seed: Seed for the random generator.

        Returns:
            List[float]: The same vector every time for a given seed.
        """
        # A private RandomState produces the same stream as seeding the global
        # generator, but leaves NumPy's global random state untouched, so
        # embedding a text no longer resets the caller's RNG.
        rng = np.random.RandomState(seed)
        return list(rng.normal(size=self.size))

    @staticmethod
    def _get_seed(text: str) -> int:
        """
        Get a seed for the random generator, using the hash of the text.

        The SHA-256 digest of the UTF-8 encoded text is reduced modulo 10**8.
        """
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        return int(digest, 16) % 10**8

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return the deterministic vector for each entry in ``texts``."""
        return [self._get_embedding(seed=self._get_seed(text)) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Return the deterministic vector for ``text``."""
        return self._get_embedding(seed=self._get_seed(text))

Some files were not shown because too many files have changed in this diff Show More