initial commit
This commit is contained in:
@@ -0,0 +1,454 @@
|
||||
"""**Embedding models** are wrappers around embedding models
|
||||
from different APIs and services.
|
||||
|
||||
**Embedding models** can be LLMs or not.
|
||||
|
||||
**Class hierarchy:**
|
||||
|
||||
.. code-block::
|
||||
|
||||
Embeddings --> <name>Embeddings # Examples: OpenAIEmbeddings, HuggingFaceEmbeddings
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_community.embeddings.aleph_alpha import (
|
||||
AlephAlphaAsymmetricSemanticEmbedding,
|
||||
AlephAlphaSymmetricSemanticEmbedding,
|
||||
)
|
||||
from langchain_community.embeddings.anyscale import (
|
||||
AnyscaleEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.ascend import (
|
||||
AscendEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.awa import (
|
||||
AwaEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.azure_openai import (
|
||||
AzureOpenAIEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.baichuan import (
|
||||
BaichuanTextEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.baidu_qianfan_endpoint import (
|
||||
QianfanEmbeddingsEndpoint,
|
||||
)
|
||||
from langchain_community.embeddings.bedrock import (
|
||||
BedrockEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.bookend import (
|
||||
BookendEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.clarifai import (
|
||||
ClarifaiEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.clova import (
|
||||
ClovaEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.cohere import (
|
||||
CohereEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.dashscope import (
|
||||
DashScopeEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.databricks import (
|
||||
DatabricksEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.deepinfra import (
|
||||
DeepInfraEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.edenai import (
|
||||
EdenAiEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.elasticsearch import (
|
||||
ElasticsearchEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.embaas import (
|
||||
EmbaasEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.ernie import (
|
||||
ErnieEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.fake import (
|
||||
DeterministicFakeEmbedding,
|
||||
FakeEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.fastembed import (
|
||||
FastEmbedEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.gigachat import (
|
||||
GigaChatEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.google_palm import (
|
||||
GooglePalmEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.gpt4all import (
|
||||
GPT4AllEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.gradient_ai import (
|
||||
GradientEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.huggingface import (
|
||||
HuggingFaceBgeEmbeddings,
|
||||
HuggingFaceEmbeddings,
|
||||
HuggingFaceInferenceAPIEmbeddings,
|
||||
HuggingFaceInstructEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.huggingface_hub import (
|
||||
HuggingFaceHubEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.hunyuan import (
|
||||
HunyuanEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.infinity import (
|
||||
InfinityEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.infinity_local import (
|
||||
InfinityEmbeddingsLocal,
|
||||
)
|
||||
from langchain_community.embeddings.ipex_llm import IpexLLMBgeEmbeddings
|
||||
from langchain_community.embeddings.itrex import (
|
||||
QuantizedBgeEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.javelin_ai_gateway import (
|
||||
JavelinAIGatewayEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.jina import (
|
||||
JinaEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.johnsnowlabs import (
|
||||
JohnSnowLabsEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.laser import (
|
||||
LaserEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.llamacpp import (
|
||||
LlamaCppEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.llamafile import (
|
||||
LlamafileEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.llm_rails import (
|
||||
LLMRailsEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.localai import (
|
||||
LocalAIEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.minimax import (
|
||||
MiniMaxEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.mlflow import (
|
||||
MlflowCohereEmbeddings,
|
||||
MlflowEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.mlflow_gateway import (
|
||||
MlflowAIGatewayEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.model2vec import (
|
||||
Model2vecEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.modelscope_hub import (
|
||||
ModelScopeEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.mosaicml import (
|
||||
MosaicMLInstructorEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.naver import (
|
||||
ClovaXEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.nemo import (
|
||||
NeMoEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.nlpcloud import (
|
||||
NLPCloudEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.oci_generative_ai import (
|
||||
OCIGenAIEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.octoai_embeddings import (
|
||||
OctoAIEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.ollama import (
|
||||
OllamaEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.openai import (
|
||||
OpenAIEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.openvino import (
|
||||
OpenVINOBgeEmbeddings,
|
||||
OpenVINOEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.optimum_intel import (
|
||||
QuantizedBiEncoderEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.oracleai import (
|
||||
OracleEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.ovhcloud import (
|
||||
OVHCloudEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.premai import (
|
||||
PremAIEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.sagemaker_endpoint import (
|
||||
SagemakerEndpointEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.sambanova import (
|
||||
SambaStudioEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.self_hosted import (
|
||||
SelfHostedEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.self_hosted_hugging_face import (
|
||||
SelfHostedHuggingFaceEmbeddings,
|
||||
SelfHostedHuggingFaceInstructEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.sentence_transformer import (
|
||||
SentenceTransformerEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.solar import (
|
||||
SolarEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.spacy_embeddings import (
|
||||
SpacyEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.sparkllm import (
|
||||
SparkLLMTextEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.tensorflow_hub import (
|
||||
TensorflowHubEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.textembed import (
|
||||
TextEmbedEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.titan_takeoff import (
|
||||
TitanTakeoffEmbed,
|
||||
)
|
||||
from langchain_community.embeddings.vertexai import (
|
||||
VertexAIEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.volcengine import (
|
||||
VolcanoEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.voyageai import (
|
||||
VoyageEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.xinference import (
|
||||
XinferenceEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.yandex import (
|
||||
YandexGPTEmbeddings,
|
||||
)
|
||||
from langchain_community.embeddings.zhipuai import (
|
||||
ZhipuAIEmbeddings,
|
||||
)
|
||||
|
||||
# Public names exported by this package. Each one is resolved lazily through
# the module-level ``__getattr__`` using the ``_module_lookup`` table.
__all__ = [
    "AlephAlphaAsymmetricSemanticEmbedding",
    "AlephAlphaSymmetricSemanticEmbedding",
    "AnyscaleEmbeddings",
    "AscendEmbeddings",
    "AwaEmbeddings",
    "AzureOpenAIEmbeddings",
    "BaichuanTextEmbeddings",
    "BedrockEmbeddings",
    "BookendEmbeddings",
    "ClarifaiEmbeddings",
    "ClovaEmbeddings",
    "ClovaXEmbeddings",
    "CohereEmbeddings",
    "DashScopeEmbeddings",
    "DatabricksEmbeddings",
    "DeepInfraEmbeddings",
    "DeterministicFakeEmbedding",
    "EdenAiEmbeddings",
    "ElasticsearchEmbeddings",
    "EmbaasEmbeddings",
    "ErnieEmbeddings",
    "FakeEmbeddings",
    "FastEmbedEmbeddings",
    "GPT4AllEmbeddings",
    "GigaChatEmbeddings",
    "GooglePalmEmbeddings",
    "GradientEmbeddings",
    "HuggingFaceBgeEmbeddings",
    "HuggingFaceEmbeddings",
    "HuggingFaceHubEmbeddings",
    "HuggingFaceInferenceAPIEmbeddings",
    "HuggingFaceInstructEmbeddings",
    "InfinityEmbeddings",
    "InfinityEmbeddingsLocal",
    "IpexLLMBgeEmbeddings",
    "JavelinAIGatewayEmbeddings",
    "JinaEmbeddings",
    "JohnSnowLabsEmbeddings",
    "LLMRailsEmbeddings",
    "LaserEmbeddings",
    "LlamaCppEmbeddings",
    "LlamafileEmbeddings",
    "LocalAIEmbeddings",
    "MiniMaxEmbeddings",
    "MlflowAIGatewayEmbeddings",
    "MlflowCohereEmbeddings",
    "MlflowEmbeddings",
    "Model2vecEmbeddings",
    "ModelScopeEmbeddings",
    "MosaicMLInstructorEmbeddings",
    "NLPCloudEmbeddings",
    "NeMoEmbeddings",
    "OCIGenAIEmbeddings",
    "OctoAIEmbeddings",
    "OllamaEmbeddings",
    "OpenAIEmbeddings",
    "OpenVINOBgeEmbeddings",
    "OpenVINOEmbeddings",
    "OracleEmbeddings",
    "OVHCloudEmbeddings",
    "PremAIEmbeddings",
    "QianfanEmbeddingsEndpoint",
    "QuantizedBgeEmbeddings",
    "QuantizedBiEncoderEmbeddings",
    "SagemakerEndpointEmbeddings",
    "SambaStudioEmbeddings",
    "SelfHostedEmbeddings",
    "SelfHostedHuggingFaceEmbeddings",
    "SelfHostedHuggingFaceInstructEmbeddings",
    "SentenceTransformerEmbeddings",
    "SolarEmbeddings",
    "SpacyEmbeddings",
    "SparkLLMTextEmbeddings",
    "TensorflowHubEmbeddings",
    "TextEmbedEmbeddings",
    "TitanTakeoffEmbed",
    "VertexAIEmbeddings",
    "VolcanoEmbeddings",
    "VoyageEmbeddings",
    "XinferenceEmbeddings",
    "YandexGPTEmbeddings",
    "ZhipuAIEmbeddings",
    "HunyuanEmbeddings",
]
|
||||
|
||||
# Maps each lazily exported class name to the dotted path of the submodule
# that defines it; consulted by the module-level ``__getattr__``.
_module_lookup = {
    "AlephAlphaAsymmetricSemanticEmbedding": "langchain_community.embeddings.aleph_alpha",  # noqa: E501
    "AlephAlphaSymmetricSemanticEmbedding": "langchain_community.embeddings.aleph_alpha",  # noqa: E501
    "AnyscaleEmbeddings": "langchain_community.embeddings.anyscale",
    "AwaEmbeddings": "langchain_community.embeddings.awa",
    "AzureOpenAIEmbeddings": "langchain_community.embeddings.azure_openai",
    "BaichuanTextEmbeddings": "langchain_community.embeddings.baichuan",
    "BedrockEmbeddings": "langchain_community.embeddings.bedrock",
    "BookendEmbeddings": "langchain_community.embeddings.bookend",
    "ClarifaiEmbeddings": "langchain_community.embeddings.clarifai",
    "ClovaEmbeddings": "langchain_community.embeddings.clova",
    "ClovaXEmbeddings": "langchain_community.embeddings.naver",
    "CohereEmbeddings": "langchain_community.embeddings.cohere",
    "DashScopeEmbeddings": "langchain_community.embeddings.dashscope",
    "DatabricksEmbeddings": "langchain_community.embeddings.databricks",
    "DeepInfraEmbeddings": "langchain_community.embeddings.deepinfra",
    "DeterministicFakeEmbedding": "langchain_community.embeddings.fake",
    "EdenAiEmbeddings": "langchain_community.embeddings.edenai",
    "ElasticsearchEmbeddings": "langchain_community.embeddings.elasticsearch",
    "EmbaasEmbeddings": "langchain_community.embeddings.embaas",
    "ErnieEmbeddings": "langchain_community.embeddings.ernie",
    "FakeEmbeddings": "langchain_community.embeddings.fake",
    "FastEmbedEmbeddings": "langchain_community.embeddings.fastembed",
    "GPT4AllEmbeddings": "langchain_community.embeddings.gpt4all",
    "GooglePalmEmbeddings": "langchain_community.embeddings.google_palm",
    "GradientEmbeddings": "langchain_community.embeddings.gradient_ai",
    "GigaChatEmbeddings": "langchain_community.embeddings.gigachat",
    "HuggingFaceBgeEmbeddings": "langchain_community.embeddings.huggingface",
    "HuggingFaceEmbeddings": "langchain_community.embeddings.huggingface",
    "HuggingFaceHubEmbeddings": "langchain_community.embeddings.huggingface_hub",
    "HuggingFaceInferenceAPIEmbeddings": "langchain_community.embeddings.huggingface",
    "HuggingFaceInstructEmbeddings": "langchain_community.embeddings.huggingface",
    "InfinityEmbeddings": "langchain_community.embeddings.infinity",
    "InfinityEmbeddingsLocal": "langchain_community.embeddings.infinity_local",
    "IpexLLMBgeEmbeddings": "langchain_community.embeddings.ipex_llm",
    "JavelinAIGatewayEmbeddings": "langchain_community.embeddings.javelin_ai_gateway",
    "JinaEmbeddings": "langchain_community.embeddings.jina",
    "JohnSnowLabsEmbeddings": "langchain_community.embeddings.johnsnowlabs",
    "LLMRailsEmbeddings": "langchain_community.embeddings.llm_rails",
    "LaserEmbeddings": "langchain_community.embeddings.laser",
    "LlamaCppEmbeddings": "langchain_community.embeddings.llamacpp",
    "LlamafileEmbeddings": "langchain_community.embeddings.llamafile",
    "LocalAIEmbeddings": "langchain_community.embeddings.localai",
    "MiniMaxEmbeddings": "langchain_community.embeddings.minimax",
    "MlflowAIGatewayEmbeddings": "langchain_community.embeddings.mlflow_gateway",
    "MlflowCohereEmbeddings": "langchain_community.embeddings.mlflow",
    "MlflowEmbeddings": "langchain_community.embeddings.mlflow",
    "Model2vecEmbeddings": "langchain_community.embeddings.model2vec",
    "ModelScopeEmbeddings": "langchain_community.embeddings.modelscope_hub",
    "MosaicMLInstructorEmbeddings": "langchain_community.embeddings.mosaicml",
    "NLPCloudEmbeddings": "langchain_community.embeddings.nlpcloud",
    "NeMoEmbeddings": "langchain_community.embeddings.nemo",
    "OCIGenAIEmbeddings": "langchain_community.embeddings.oci_generative_ai",
    "OctoAIEmbeddings": "langchain_community.embeddings.octoai_embeddings",
    "OllamaEmbeddings": "langchain_community.embeddings.ollama",
    "OpenAIEmbeddings": "langchain_community.embeddings.openai",
    "OpenVINOEmbeddings": "langchain_community.embeddings.openvino",
    "OpenVINOBgeEmbeddings": "langchain_community.embeddings.openvino",
    "QianfanEmbeddingsEndpoint": "langchain_community.embeddings.baidu_qianfan_endpoint",  # noqa: E501
    "QuantizedBgeEmbeddings": "langchain_community.embeddings.itrex",
    "QuantizedBiEncoderEmbeddings": "langchain_community.embeddings.optimum_intel",
    "OracleEmbeddings": "langchain_community.embeddings.oracleai",
    "OVHCloudEmbeddings": "langchain_community.embeddings.ovhcloud",
    "SagemakerEndpointEmbeddings": "langchain_community.embeddings.sagemaker_endpoint",
    "SambaStudioEmbeddings": "langchain_community.embeddings.sambanova",
    "SelfHostedEmbeddings": "langchain_community.embeddings.self_hosted",
    "SelfHostedHuggingFaceEmbeddings": "langchain_community.embeddings.self_hosted_hugging_face",  # noqa: E501
    "SelfHostedHuggingFaceInstructEmbeddings": "langchain_community.embeddings.self_hosted_hugging_face",  # noqa: E501
    "SentenceTransformerEmbeddings": "langchain_community.embeddings.sentence_transformer",  # noqa: E501
    "SolarEmbeddings": "langchain_community.embeddings.solar",
    "SpacyEmbeddings": "langchain_community.embeddings.spacy_embeddings",
    "SparkLLMTextEmbeddings": "langchain_community.embeddings.sparkllm",
    "TensorflowHubEmbeddings": "langchain_community.embeddings.tensorflow_hub",
    "VertexAIEmbeddings": "langchain_community.embeddings.vertexai",
    "VolcanoEmbeddings": "langchain_community.embeddings.volcengine",
    "VoyageEmbeddings": "langchain_community.embeddings.voyageai",
    "XinferenceEmbeddings": "langchain_community.embeddings.xinference",
    "TextEmbedEmbeddings": "langchain_community.embeddings.textembed",
    "TitanTakeoffEmbed": "langchain_community.embeddings.titan_takeoff",
    "PremAIEmbeddings": "langchain_community.embeddings.premai",
    "YandexGPTEmbeddings": "langchain_community.embeddings.yandex",
    "AscendEmbeddings": "langchain_community.embeddings.ascend",
    "ZhipuAIEmbeddings": "langchain_community.embeddings.zhipuai",
    "HunyuanEmbeddings": "langchain_community.embeddings.hunyuan",
}
|
||||
|
||||
|
||||
def __getattr__(name: str) -> Any:
    """Resolve a lazily exported embedding class on first attribute access.

    Args:
        name: Attribute being looked up on this module.

    Returns:
        The attribute called ``name`` taken from the submodule that
        ``_module_lookup`` maps it to.

    Raises:
        AttributeError: If ``name`` has no entry in ``_module_lookup``.
    """
    module_path = _module_lookup.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    # Import only now, so optional provider dependencies are not loaded
    # when the package itself is imported.
    provider_module = importlib.import_module(module_path)
    return getattr(provider_module, name)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)


# TODO: this is in here to maintain backwards compatibility
class HypotheticalDocumentEmbedder:
    """Deprecated shim that forwards to the ``langchain_classic`` implementation.

    Both direct construction and ``from_llm`` log a deprecation warning and
    then delegate to
    ``langchain_classic.chains.hyde.base.HypotheticalDocumentEmbedder``.
    """

    def __new__(cls, *args: Any, **kwargs: Any) -> Any:
        """Build and return the ``langchain_classic`` embedder.

        Delegation happens in ``__new__`` rather than ``__init__`` because
        Python raises ``TypeError`` when ``__init__`` returns anything other
        than ``None``. Since the returned object is not an instance of this
        shim class, Python does not run ``__init__`` on it afterwards.

        Args:
            *args: Positional arguments forwarded to the real constructor.
            **kwargs: Keyword arguments forwarded to the real constructor.

        Returns:
            An instance of the ``langchain_classic`` class.
        """
        logger.warning(
            "Using a deprecated class. Please use "
            "`from langchain_classic.chains import HypotheticalDocumentEmbedder` "
            "instead"
        )
        # Imported lazily so this package does not require langchain_classic
        # unless the deprecated shim is actually used.
        from langchain_classic.chains.hyde.base import HypotheticalDocumentEmbedder as H

        return H(*args, **kwargs)

    @classmethod
    def from_llm(cls, *args: Any, **kwargs: Any) -> Any:
        """Forward to ``from_llm`` on the ``langchain_classic`` embedder.

        Args:
            *args: Positional arguments forwarded unchanged.
            **kwargs: Keyword arguments forwarded unchanged.

        Returns:
            Whatever the ``langchain_classic`` ``from_llm`` returns.
        """
        logger.warning(
            "Using a deprecated class. Please use "
            "`from langchain_classic.chains import HypotheticalDocumentEmbedder` "
            "instead"
        )
        from langchain_classic.chains.hyde.base import HypotheticalDocumentEmbedder as H

        return H.from_llm(*args, **kwargs)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,256 @@
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import get_from_dict_or_env
|
||||
from pydantic import BaseModel, model_validator
|
||||
|
||||
|
||||
class AlephAlphaAsymmetricSemanticEmbedding(BaseModel, Embeddings):
    """Aleph Alpha's asymmetric semantic embedding.

    AA provides you with an endpoint to embed a document and a query.
    The models were optimized to make the embeddings of documents and
    the query for a document as similar as possible.
    To learn more, check out: https://docs.aleph-alpha.com/docs/tasks/semantic_embed/

    Example:
        .. code-block:: python

            from langchain_community.embeddings import (
                AlephAlphaAsymmetricSemanticEmbedding,
            )

            embeddings = AlephAlphaAsymmetricSemanticEmbedding(
                normalize=True, compress_to_size=128
            )

            document = "This is a content of the document"
            query = "What is the content of the document?"

            doc_result = embeddings.embed_documents([document])
            query_result = embeddings.embed_query(query)

    """

    client: Any  #: :meta private:

    # Embedding params
    model: str = "luminous-base"
    """Model name to use."""
    compress_to_size: Optional[int] = None
    """Should the returned embeddings come back as an original 5120-dim vector,
    or should it be compressed to 128-dim."""
    normalize: bool = False
    """Should returned embeddings be normalized"""
    contextual_control_threshold: Optional[int] = None
    """Attention control parameters only apply to those tokens that have
    explicitly been set in the request."""
    control_log_additive: bool = True
    """Apply controls on prompt items by adding the log(control_factor)
    to attention scores."""

    # Client params
    aleph_alpha_api_key: Optional[str] = None
    """API key for Aleph Alpha API."""
    host: str = "https://api.aleph-alpha.com"
    """The hostname of the API host.
    The default one is "https://api.aleph-alpha.com")"""
    hosting: Optional[str] = None
    """Determines in which datacenters the request may be processed.
    You can either set the parameter to "aleph-alpha" or omit it (defaulting to None).
    Not setting this value, or setting it to None, gives us maximal flexibility
    in processing your request in our
    own datacenters and on servers hosted with other providers.
    Choose this option for maximal availability.
    Setting it to "aleph-alpha" allows us to only process the request
    in our own datacenters.
    Choose this option for maximal data privacy."""
    request_timeout_seconds: int = 305
    """Client timeout that will be set for HTTP requests in the
    `requests` library's API calls.
    Server will close all requests after 300 seconds with an internal server error."""
    total_retries: int = 8
    """The number of retries made in case requests fail with certain retryable
    status codes. If the last
    retry fails a corresponding exception is raised. Note, that between retries
    an exponential backoff
    is applied, starting with 0.5 s after the first retry and doubling for each
    retry made. So with the
    default setting of 8 retries a total wait time of 63.5 s is added between
    the retries."""
    nice: bool = False
    """Setting this to True, will signal to the API that you intend to be
    nice to other users
    by de-prioritizing your request below concurrent ones."""

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that api key and python package exists in environment.

        Builds an ``aleph_alpha_client.Client`` from the client parameters
        and stores it under ``values["client"]``.

        Args:
            values: Raw constructor input. This validator runs in
                ``"before"`` mode, so field defaults have not been applied
                and any parameter the caller omitted is absent.

        Returns:
            The same mapping with ``"client"`` populated.

        Raises:
            ImportError: If ``aleph_alpha_client`` is not installed.
        """
        aleph_alpha_api_key = get_from_dict_or_env(
            values, "aleph_alpha_api_key", "ALEPH_ALPHA_API_KEY"
        )
        # Only the import is guarded, so an ImportError raised while the
        # client is being constructed is not reported as a missing package.
        try:
            from aleph_alpha_client import Client
        except ImportError as e:
            raise ImportError(
                "Could not import aleph_alpha_client python package. "
                "Please install it with `pip install aleph_alpha_client`."
            ) from e

        def client_param(name: str) -> Any:
            # Indexing ``values`` directly raises KeyError for every omitted
            # parameter, so fall back to the default declared on the field.
            if name in values:
                return values[name]
            return cls.model_fields[name].default

        values["client"] = Client(
            token=aleph_alpha_api_key,
            host=client_param("host"),
            hosting=client_param("hosting"),
            request_timeout_seconds=client_param("request_timeout_seconds"),
            total_retries=client_param("total_retries"),
            nice=client_param("nice"),
        )

        return values

    def _embed_as(self, text: str, representation: str) -> List[float]:
        """Embed one text using the named ``SemanticRepresentation`` member.

        Args:
            text: The text to embed.
            representation: Attribute name on ``SemanticRepresentation``
                (``"Document"`` or ``"Query"`` for this class).

        Returns:
            The embedding returned by the client for ``text``.

        Raises:
            ImportError: If ``aleph_alpha_client`` is not installed.
        """
        try:
            from aleph_alpha_client import (
                Prompt,
                SemanticEmbeddingRequest,
                SemanticRepresentation,
            )
        except ImportError as e:
            raise ImportError(
                "Could not import aleph_alpha_client python package. "
                "Please install it with `pip install aleph_alpha_client`."
            ) from e

        request = SemanticEmbeddingRequest(
            prompt=Prompt.from_text(text),
            representation=getattr(SemanticRepresentation, representation),
            compress_to_size=self.compress_to_size,
            normalize=self.normalize,
            contextual_control_threshold=self.contextual_control_threshold,
            control_log_additive=self.control_log_additive,
        )
        response = self.client.semantic_embed(request=request, model=self.model)
        return response.embedding

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to Aleph Alpha's asymmetric Document endpoint.

        One request is sent per text, each using
        ``SemanticRepresentation.Document``.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return [self._embed_as(text, "Document") for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Call out to Aleph Alpha's asymmetric, query embedding endpoint.

        The request uses ``SemanticRepresentation.Query``.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self._embed_as(text, "Query")
|
||||
|
||||
|
||||
class AlephAlphaSymmetricSemanticEmbedding(AlephAlphaAsymmetricSemanticEmbedding):
    """Symmetric version of the Aleph Alpha's semantic embeddings.

    The main difference is that here, both the documents and
    queries are embedded with a SemanticRepresentation.Symmetric

    Example:
        .. code-block:: python

            from langchain_community.embeddings import (
                AlephAlphaSymmetricSemanticEmbedding,
            )

            embeddings = AlephAlphaSymmetricSemanticEmbedding(
                normalize=True, compress_to_size=128
            )
            text = "This is a test text"

            doc_result = embeddings.embed_documents([text])
            query_result = embeddings.embed_query(text)
    """

    def _embed(self, text: str) -> List[float]:
        """Embed one text with ``SemanticRepresentation.Symmetric``.

        Args:
            text: The text to embed.

        Returns:
            The embedding returned by the client for ``text``.

        Raises:
            ImportError: If ``aleph_alpha_client`` is not installed.
        """
        try:
            from aleph_alpha_client import (
                Prompt,
                SemanticEmbeddingRequest,
                SemanticRepresentation,
            )
        except ImportError as e:
            raise ImportError(
                "Could not import aleph_alpha_client python package. "
                "Please install it with `pip install aleph_alpha_client`."
            ) from e

        request = SemanticEmbeddingRequest(
            prompt=Prompt.from_text(text),
            representation=SemanticRepresentation.Symmetric,
            compress_to_size=self.compress_to_size,
            normalize=self.normalize,
            contextual_control_threshold=self.contextual_control_threshold,
            control_log_additive=self.control_log_additive,
        )
        response = self.client.semantic_embed(request=request, model=self.model)
        return response.embedding

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to Aleph Alpha's symmetric embedding endpoint for documents.

        One request is sent per text.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Call out to Aleph Alpha's symmetric embedding endpoint for a query.

        Queries use the same symmetric representation as documents.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self._embed(text)
|
||||
@@ -0,0 +1,76 @@
|
||||
"""Anyscale embeddings wrapper."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Dict, Optional
|
||||
|
||||
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
|
||||
from pydantic import Field, SecretStr
|
||||
|
||||
from langchain_community.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain_community.utils.openai import is_openai_v1
|
||||
|
||||
# Base URL used when neither the ``anyscale_api_base`` argument nor the
# ANYSCALE_API_BASE environment variable provides one.
DEFAULT_API_BASE = "https://api.endpoints.anyscale.com/v1"
# Model name used when the caller does not pass ``model``.
DEFAULT_MODEL = "thenlper/gte-large"
|
||||
|
||||
|
||||
class AnyscaleEmbeddings(OpenAIEmbeddings):
    """`Anyscale` Embeddings API.

    Subclass of ``OpenAIEmbeddings`` whose client is pointed at the Anyscale
    base URL and authenticated with the Anyscale API key.
    """

    anyscale_api_key: Optional[SecretStr] = Field(default=None)
    """AnyScale Endpoints API keys."""
    model: str = Field(default=DEFAULT_MODEL)
    """Model name to use."""
    anyscale_api_base: str = Field(default=DEFAULT_API_BASE)
    """Base URL path for API requests."""
    tiktoken_enabled: bool = False
    """Set this to False for non-OpenAI implementations of the embeddings API"""
    embedding_ctx_length: int = 500
    """The maximum number of tokens to embed at once."""

    @property
    def lc_secrets(self) -> Dict[str, str]:
        """Map the secret field name to the environment variable holding it."""
        return {"anyscale_api_key": "ANYSCALE_API_KEY"}

    @pre_init
    def validate_environment(cls, values: dict) -> dict:
        """Resolve Anyscale credentials and attach an embeddings client.

        Args:
            values: Field values for the model being built.

        Returns:
            ``values`` with ``anyscale_api_key``, ``anyscale_api_base`` and
            ``client`` set. With a pre-v1 ``openai`` package,
            ``openai_api_base`` and ``openai_api_key`` are set as well.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        raw_key = get_from_dict_or_env(
            values,
            "anyscale_api_key",
            "ANYSCALE_API_KEY",
        )
        values["anyscale_api_key"] = convert_to_secret_str(raw_key)
        values["anyscale_api_base"] = get_from_dict_or_env(
            values,
            "anyscale_api_base",
            "ANYSCALE_API_BASE",
            default=DEFAULT_API_BASE,
        )

        try:
            import openai
        except ImportError:
            raise ImportError(
                "Could not import openai python package. "
                "Please install it with `pip install openai`."
            )

        api_key = values["anyscale_api_key"].get_secret_value()
        api_base = values["anyscale_api_base"]

        if not is_openai_v1():
            # Pre-v1 openai: pass the credentials through the OpenAI fields
            # and use the module-level Embedding resource as the client.
            values["openai_api_base"] = api_base
            values["openai_api_key"] = api_key
            values["client"] = openai.Embedding
            return values

        # openai >= 1.0: build a dedicated client bound to the Anyscale URL.
        values["client"] = openai.OpenAI(api_key=api_key, base_url=api_base).embeddings
        return values

    @property
    def _llm_type(self) -> str:
        """Return the identifier string for this integration."""
        return "anyscale-embedding"
|
||||
137
venv/Lib/site-packages/langchain_community/embeddings/ascend.py
Normal file
137
venv/Lib/site-packages/langchain_community/embeddings/ascend.py
Normal file
@@ -0,0 +1,137 @@
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pydantic import BaseModel, ConfigDict, model_validator
|
||||
|
||||
|
||||
class AscendEmbeddings(Embeddings, BaseModel):
    """Embedding model accelerated on an Ascend NPU.

    Please ensure that you have installed CANN and torch_npu.

    Example:

        from langchain_community.embeddings import AscendEmbeddings
        model = AscendEmbeddings(model_path=<path_to_model>,
            device_id=0,
            query_instruction="Represent this sentence for searching relevant passages: "
        )
    """

    model_path: str
    """Path passed to ``from_pretrained`` for both the model and the tokenizer."""
    device_id: int = 0
    """Ascend NPU device id."""
    query_instruction: str = ""
    """Instruction prepended to the text when embedding a query."""
    document_instruction: str = ""
    """Instruction prepended to each text when embedding documents."""
    use_fp16: bool = True
    """Whether to convert the loaded model to half precision."""
    pooling_method: Optional[str] = "cls"
    """Pooling applied to the last hidden state: ``"cls"`` or ``"mean"``."""
    batch_size: int = 32
    """Number of texts encoded per forward pass in ``embed_documents``."""
    # Both are populated in ``__init__``; the explicit ``None`` default keeps
    # them optional at validation time (pydantic v2 gives ``Any`` fields no
    # implicit default).
    model: Any = None
    tokenizer: Any = None

    model_config = ConfigDict(protected_namespaces=())

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Validate the fields, load model and tokenizer, and warm up the NPU.

        Raises:
            ImportError: If ``transformers`` is not installed.
            Exception: If the model or tokenizer cannot be loaded.
        """
        super().__init__(*args, **kwargs)
        try:
            from transformers import AutoModel, AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "Unable to import transformers, please install with "
                "`pip install -U transformers`."
            ) from e
        try:
            self.model = AutoModel.from_pretrained(self.model_path).npu().eval()
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        except Exception as e:
            # The path is now interpolated; previously the literal text
            # "self.model_path" was printed.
            raise Exception(
                f"Failed to load model [{self.model_path}], due to following error:{e}"
            ) from e

        if self.use_fp16:
            self.model.half()
        # Run one dummy batch so the first real call does not pay start-up cost.
        self.encode([f"warmup {i} times" for i in range(10)])

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Check the model path, import ``torch_npu`` and select the NPU device.

        Raises:
            ValueError: If ``model_path`` is missing.
            FileNotFoundError: If ``model_path`` does not exist.
            ModuleNotFoundError: If ``torch_npu`` is not installed.
            Exception: If the device cannot be selected.
        """
        if "model_path" not in values:
            raise ValueError("model_path is required")
        if not os.access(values["model_path"], os.F_OK):
            raise FileNotFoundError(
                f"Unable to find valid model path in [{values['model_path']}]"
            )
        try:
            import torch_npu
        except ImportError as e:
            raise ModuleNotFoundError(
                "torch_npu not found, please install torch_npu"
            ) from e
        try:
            # This validator sees the raw input, so field defaults have not
            # been applied yet; fall back to the declared default of 0.
            torch_npu.npu.set_device(values.get("device_id", 0))
        except Exception as e:
            raise Exception(f"set device failed due to {e}") from e
        return values

    def encode(self, sentences: Any) -> Any:
        """Tokenize ``sentences``, run the model and return normalized embeddings.

        Inputs are padded and truncated to at most 512 tokens. The pooled
        output is L2-normalized along the last axis and returned as a numpy
        array on the CPU.

        Raises:
            ImportError: If ``torch`` is not installed.
        """
        inputs = self.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install -U torch`."
            ) from e
        last_hidden_state = self.model(
            inputs.input_ids.npu(), inputs.attention_mask.npu(), return_dict=True
        ).last_hidden_state
        tmp = self.pooling(last_hidden_state, inputs["attention_mask"].npu())
        embeddings = torch.nn.functional.normalize(tmp, dim=-1)
        return embeddings.cpu().detach().numpy()

    def pooling(self, last_hidden_state: Any, attention_mask: Any = None) -> Any:
        """Reduce per-token hidden states to one vector per input.

        Args:
            last_hidden_state: Model output indexed as (batch, token, hidden).
            attention_mask: Mask indexed as (batch, token); required for
                ``"mean"`` pooling.

        Raises:
            ImportError: If ``torch`` is not installed.
            ValueError: If ``"mean"`` pooling is requested without a mask.
            NotImplementedError: For any other ``pooling_method``.
        """
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install -U torch`."
            ) from e
        if self.pooling_method == "cls":
            return last_hidden_state[:, 0]
        elif self.pooling_method == "mean":
            if attention_mask is None:
                raise ValueError("attention_mask is required for mean pooling")
            # Sum over the token axis (dim=1) so the result keeps the hidden
            # dimension; summing over dim=-1 collapsed the hidden axis instead.
            s = torch.sum(
                last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1
            )
            d = attention_mask.sum(dim=1, keepdim=True).float()
            return s / d
        else:
            raise NotImplementedError(
                f"Pooling method [{self.pooling_method}] not implemented"
            )

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in batches of ``batch_size``.

        ``document_instruction`` is prepended to every text. An empty input
        yields an empty list.

        Raises:
            ImportError: If ``numpy`` is not installed.
        """
        try:
            import numpy as np
        except ImportError as e:
            raise ImportError(
                "Unable to import numpy, please install with `pip install -U numpy`."
            ) from e
        if not texts:
            # np.concatenate raises on an empty sequence of arrays.
            return []
        embedding_list = []
        for i in range(0, len(texts), self.batch_size):
            texts_ = texts[i : i + self.batch_size]
            emb = self.encode([self.document_instruction + text for text in texts_])
            embedding_list.append(emb)
        return np.concatenate(embedding_list)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query, prefixed with ``query_instruction``."""
        return self.encode([self.query_instruction + text])[0]
|
||||
64
venv/Lib/site-packages/langchain_community/embeddings/awa.py
Normal file
64
venv/Lib/site-packages/langchain_community/embeddings/awa.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pydantic import BaseModel, model_validator
|
||||
|
||||
|
||||
class AwaEmbeddings(BaseModel, Embeddings):
    """Embed documents and queries through Awa DB.

    Attributes:
        client: The ``AwaEmbedding`` client created during validation.
        model: Name of the embedding model.
            Default is "all-mpnet-base-v2".
    """

    client: Any  #: :meta private:
    model: str = "all-mpnet-base-v2"

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Ensure ``awadb`` is importable and attach a fresh client.

        Raises:
            ImportError: If the ``awadb`` package is not installed.
        """
        try:
            from awadb import AwaEmbedding
        except ImportError as exc:
            raise ImportError(
                "Could not import awadb library. "
                "Please install it with `pip install awadb`"
            ) from exc

        awa_client = AwaEmbedding()
        values["client"] = awa_client
        return values

    def set_model(self, model_name: str) -> None:
        """Switch the embedding model on both this wrapper and its client.

        Args:
            model_name: Name of the model to use from now on.
        """
        self.model = model_name
        self.client.model_name = model_name

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed several documents in one client call.

        Args:
            texts: The texts to embed.

        Returns:
            Whatever the client's ``EmbeddingBatch`` returns for ``texts``.
        """
        batch_result = self.client.EmbeddingBatch(texts)
        return batch_result

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text.

        Args:
            text: The text to embed.

        Returns:
            Whatever the client's ``Embedding`` returns for ``text``.
        """
        query_result = self.client.Embedding(text)
        return query_result
|
||||
@@ -0,0 +1,187 @@
|
||||
"""Azure OpenAI embeddings wrapper."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import warnings
|
||||
from typing import Any, Awaitable, Callable, Dict, Optional, Union
|
||||
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.utils import get_from_dict_or_env
|
||||
from pydantic import Field, model_validator
|
||||
from typing_extensions import Self
|
||||
|
||||
from langchain_community.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain_community.utils.openai import is_openai_v1
|
||||
|
||||
|
||||
@deprecated(
    since="0.0.9",
    removal="1.0",
    alternative_import="langchain_openai.AzureOpenAIEmbeddings",
)
class AzureOpenAIEmbeddings(OpenAIEmbeddings):
    """`Azure OpenAI` Embeddings API."""

    azure_endpoint: Union[str, None] = None
    """Your Azure endpoint, including the resource.

        Automatically inferred from env var `AZURE_OPENAI_ENDPOINT` if not provided.

        Example: `https://example-resource.azure.openai.com/`
    """
    deployment: Optional[str] = Field(default=None, alias="azure_deployment")
    """A model deployment.

        If given sets the base client URL to include `/deployments/{azure_deployment}`.
        Note: this means you won't be able to use non-deployment endpoints.
    """
    openai_api_key: Union[str, None] = Field(default=None, alias="api_key")
    """Automatically inferred from env var `AZURE_OPENAI_API_KEY` if not provided."""
    azure_ad_token: Union[str, None] = None
    """Your Azure Active Directory token.

        Automatically inferred from env var `AZURE_OPENAI_AD_TOKEN` if not provided.

        For more:
        https://www.microsoft.com/en-us/security/business/identity-access/microsoft-entra-id.
    """
    azure_ad_token_provider: Union[Callable[[], str], None] = None
    """A function that returns an Azure Active Directory token.

        Will be invoked on every sync request. For async requests,
        will be invoked if `azure_ad_async_token_provider` is not provided.
    """
    azure_ad_async_token_provider: Union[Callable[[], Awaitable[str]], None] = None
    """A function that returns an Azure Active Directory token.

        Will be invoked on every async request.
    """
    openai_api_version: Optional[str] = Field(default=None, alias="api_version")
    """Automatically inferred from env var `OPENAI_API_VERSION` if not provided."""
    validate_base_url: bool = True
    """Whether `openai_api_base` is checked and rewritten for openai>=1.0."""

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Fill configuration from the environment and normalize the base URL.

        This runs on the raw constructor input, so keys the caller did not
        pass are absent; every optional key is therefore read with a fallback
        rather than indexed directly.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        # Check OPENAI_KEY for backwards compatibility.
        # TODO: Remove OPENAI_API_KEY support to avoid possible conflict when using
        # other forms of azure credentials.
        values["openai_api_key"] = (
            values.get("openai_api_key")
            or os.getenv("AZURE_OPENAI_API_KEY")
            or os.getenv("OPENAI_API_KEY")
        )
        values["openai_api_base"] = values.get("openai_api_base") or os.getenv(
            "OPENAI_API_BASE"
        )
        values["openai_api_version"] = values.get("openai_api_version") or os.getenv(
            "OPENAI_API_VERSION", default="2023-05-15"
        )
        values["openai_api_type"] = get_from_dict_or_env(
            values, "openai_api_type", "OPENAI_API_TYPE", default="azure"
        )
        values["openai_organization"] = (
            values.get("openai_organization")
            or os.getenv("OPENAI_ORG_ID")
            or os.getenv("OPENAI_ORGANIZATION")
        )
        values["openai_proxy"] = get_from_dict_or_env(
            values,
            "openai_proxy",
            "OPENAI_PROXY",
            default="",
        )
        values["azure_endpoint"] = values.get("azure_endpoint") or os.getenv(
            "AZURE_OPENAI_ENDPOINT"
        )
        values["azure_ad_token"] = values.get("azure_ad_token") or os.getenv(
            "AZURE_OPENAI_AD_TOKEN"
        )
        # Azure OpenAI embedding models allow a maximum of 2048 texts
        # at a time in each batch
        # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
        # Only clamp an explicitly supplied value; when the caller omits
        # `chunk_size` the inherited field default applies.
        # NOTE(review): assumes that inherited default is <= 2048 -- confirm
        # against OpenAIEmbeddings.
        if "chunk_size" in values:
            values["chunk_size"] = min(values["chunk_size"], 2048)
        try:
            import openai  # noqa: F401
        except ImportError as e:
            raise ImportError(
                "Could not import openai python package. "
                "Please install it with `pip install openai`."
            ) from e
        if is_openai_v1():
            # For backwards compatibility. Before openai v1, no distinction was made
            # between azure_endpoint and base_url (openai_api_base).
            openai_api_base = values["openai_api_base"]
            if openai_api_base and values.get("validate_base_url", True):
                if "/openai" not in openai_api_base:
                    values["openai_api_base"] += "/openai"
                    warnings.warn(
                        "As of openai>=1.0.0, Azure endpoints should be specified via "
                        f"the `azure_endpoint` param not `openai_api_base` "
                        f"(or alias `base_url`). Updating `openai_api_base` from "
                        f"{openai_api_base} to {values['openai_api_base']}."
                    )
                deployment = values.get("deployment")
                if deployment:
                    warnings.warn(
                        "As of openai>=1.0.0, if `deployment` (or alias "
                        "`azure_deployment`) is specified then "
                        "`openai_api_base` (or alias `base_url`) should not be. "
                        "Instead use `deployment` (or alias `azure_deployment`) "
                        "and `azure_endpoint`."
                    )
                    if deployment not in values["openai_api_base"]:
                        # Append first so the warning reports the URL that is
                        # actually used from here on.
                        values["openai_api_base"] += "/deployments/" + deployment
                        warnings.warn(
                            "As of openai>=1.0.0, if `openai_api_base` "
                            "(or alias `base_url`) is specified it is expected to be "
                            "of the form "
                            "https://example-resource.azure.openai.com/openai/deployments/example-deployment. "  # noqa: E501
                            f"Updating {openai_api_base} to "
                            f"{values['openai_api_base']}."
                        )
                    # The deployment now lives in the base URL; clear it so it
                    # is not passed to the client as well.
                    values["deployment"] = None
        return values

    @model_validator(mode="after")
    def post_init_validator(self) -> Self:
        """Build the sync and async embeddings clients from the validated fields."""
        import openai

        if is_openai_v1():
            client_params = {
                "api_version": self.openai_api_version,
                "azure_endpoint": self.azure_endpoint,
                "azure_deployment": self.deployment,
                "api_key": self.openai_api_key,
                "azure_ad_token": self.azure_ad_token,
                "azure_ad_token_provider": self.azure_ad_token_provider,
                "organization": self.openai_organization,
                "base_url": self.openai_api_base,
                "timeout": self.request_timeout,
                "max_retries": self.max_retries,
                "default_headers": {
                    **(self.default_headers or {}),
                    "User-Agent": "langchain-comm-python-azure-openai",
                },
                "default_query": self.default_query,
                "http_client": self.http_client,
            }
            self.client = openai.AzureOpenAI(**client_params).embeddings

            # The async client prefers the async token provider when one is set;
            # otherwise it reuses the sync provider already in client_params.
            if self.azure_ad_async_token_provider:
                client_params["azure_ad_token_provider"] = (
                    self.azure_ad_async_token_provider
                )

            self.async_client = openai.AsyncAzureOpenAI(**client_params).embeddings
        else:
            self.client = openai.Embedding
        return self

    @property
    def _llm_type(self) -> str:
        """Identifier string for this integration."""
        return "azure-openai-chat"
|
||||
@@ -0,0 +1,150 @@
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import requests
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import (
|
||||
secret_from_env,
|
||||
)
|
||||
from pydantic import (
|
||||
BaseModel,
|
||||
ConfigDict,
|
||||
Field,
|
||||
SecretStr,
|
||||
model_validator,
|
||||
)
|
||||
from requests import RequestException
|
||||
from typing_extensions import Self
|
||||
|
||||
# Endpoint that BaichuanTextEmbeddings._embed POSTs its embedding requests to.
BAICHUAN_API_URL: str = "https://api.baichuan-ai.com/v1/embeddings"
|
||||
|
||||
# BaichuanTextEmbeddings is an embedding model provided by Baichuan Inc. (https://www.baichuan-ai.com/home).
|
||||
# As of today (Jan 25th, 2024) BaichuanTextEmbeddings ranks #1 in C-MTEB
|
||||
# (Chinese Multi-Task Embedding Benchmark) leaderboard.
|
||||
# Leaderboard (Under Overall -> Chinese section): https://huggingface.co/spaces/mteb/leaderboard
|
||||
|
||||
# Official Website: https://platform.baichuan-ai.com/docs/text-Embedding
|
||||
# An API-key is required to use this embedding model. You can get one by registering
|
||||
# at https://platform.baichuan-ai.com/docs/text-Embedding.
|
||||
# BaichuanTextEmbeddings support 512 token window and produces vectors with
|
||||
# 1024 dimensions.
|
||||
|
||||
|
||||
# NOTE!! BaichuanTextEmbeddings only supports Chinese text embedding.
|
||||
# Multi-language support is coming soon.
|
||||
class BaichuanTextEmbeddings(BaseModel, Embeddings):
    """Baichuan Text Embedding models.

    Setup:
        Set the environment variable ``BAICHUAN_API_KEY`` to your API key,
        or pass the key to the constructor as a named parameter.

        .. code-block:: bash

            export BAICHUAN_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_community.embeddings import BaichuanTextEmbeddings

            embeddings = BaichuanTextEmbeddings()

    Embed:
        .. code-block:: python

            # embed the documents
            vectors = embeddings.embed_documents([text1, text2, ...])

            # embed the query
            vectors = embeddings.embed_query(text)
    """  # noqa: E501

    session: Any = None  #: :meta private:
    model_name: str = Field(default="Baichuan-Text-Embedding", alias="model")
    """Name of the model sent with every embedding request."""
    baichuan_api_key: SecretStr = Field(
        alias="api_key",
        default_factory=secret_from_env(["BAICHUAN_API_KEY", "BAICHUAN_AUTH_TOKEN"]),
    )
    """Automatically inferred from env var `BAICHUAN_API_KEY` if not provided."""
    chunk_size: int = 16
    """Number of texts sent per request when several texts are embedded."""

    model_config = ConfigDict(populate_by_name=True, protected_namespaces=())

    @model_validator(mode="after")
    def validate_environment(self) -> Self:
        """Create the HTTP session carrying the authorization headers."""
        request_headers = {
            "Authorization": f"Bearer {self.baichuan_api_key.get_secret_value()}",
            "Accept-Encoding": "identity",
            "Content-type": "application/json",
        }
        http_session = requests.Session()
        http_session.headers.update(request_headers)
        self.session = http_session
        return self

    def _embed(self, texts: List[str]) -> Optional[List[List[float]]]:
        """Call the Baichuan embedding API, ``chunk_size`` texts per request.

        Args:
            texts: A list of texts to embed.

        Returns:
            One embedding per returned item, ordered by the ``index`` field
            within each chunk.

        Raises:
            requests.HTTPError: For 4xx/5xx responses.
            RequestException: For any other non-200 status code.
        """
        collected = []
        for start in range(0, len(texts), self.chunk_size):
            batch = texts[start : start + self.chunk_size]
            response = self.session.post(
                BAICHUAN_API_URL, json={"input": batch, "model": self.model_name}
            )
            # 4xx/5xx statuses raise here.
            response.raise_for_status()
            # Anything left that is not a plain 200 is still treated as a failure.
            if response.status_code != 200:
                raise RequestException(
                    f"Error: Received status code {response.status_code} from "
                    "`BaichuanEmbedding` API"
                )
            payload = response.json()
            ordered_items = sorted(
                payload.get("data", []), key=lambda item: item.get("index", 0)
            )
            for item in ordered_items:
                collected.append(item.get("embedding", []))
        return collected

    def embed_documents(self, texts: List[str]) -> Optional[List[List[float]]]:  # type: ignore[override]
        """Embed a list of documents.

        Args:
            texts: The list of texts to embed.

        Returns:
            The embeddings produced by ``_embed`` for ``texts``.
        """
        return self._embed(texts)

    def embed_query(self, text: str) -> Optional[List[float]]:  # type: ignore[override]
        """Embed a single query text.

        Args:
            text: The text to embed.

        Returns:
            The first embedding returned for ``text``, or None if ``_embed``
            returned None.
        """
        vectors = self._embed([text])
        if vectors is None:
            return None
        return vectors[0]
|
||||
@@ -0,0 +1,186 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
|
||||
from pydantic import BaseModel, ConfigDict, Field, SecretStr
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class QianfanEmbeddingsEndpoint(BaseModel, Embeddings):
    """Baidu Qianfan Embeddings embedding models.

    Setup:
        Install the ``qianfan`` python package and set the environment
        variables ``QIANFAN_AK`` and ``QIANFAN_SK``.

        .. code-block:: bash

            pip install qianfan
            export QIANFAN_AK="your-api-key"
            export QIANFAN_SK="your-secret_key"

    Instantiate:
        .. code-block:: python

            from langchain_community.embeddings import QianfanEmbeddingsEndpoint

            embeddings = QianfanEmbeddingsEndpoint()

    Embed:
        .. code-block:: python

            # embed the documents
            vectors = embeddings.embed_documents([text1, text2, ...])

            # embed the query
            vectors = embeddings.embed_query(text)

            # embed the documents with async
            vectors = await embeddings.aembed_documents([text1, text2, ...])

            # embed the query with async
            vectors = await embeddings.aembed_query(text)
    """  # noqa: E501

    qianfan_ak: Optional[SecretStr] = Field(default=None, alias="api_key")
    """Qianfan application API key."""

    qianfan_sk: Optional[SecretStr] = Field(default=None, alias="secret_key")
    """Qianfan application secret key."""

    chunk_size: int = 16
    """Number of texts sent per request when several texts are embedded."""

    model: Optional[str] = Field(default=None)
    """Model name.

    See https://cloud.baidu.com/doc/WENXINWORKSHOP/s/Nlks5zkzu for the list.
    Currently supported presets:

    - Embedding-V1 (default model)
    - bge-large-en
    - bge-large-zh

    Preset models are mapped to an endpoint.
    `model` will be ignored if `endpoint` is set.
    """

    endpoint: str = ""
    """Endpoint of the Qianfan Embedding, required if custom model used."""

    client: Any = None
    """Qianfan client, created during validation."""

    init_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Extra kwargs for the qianfan client constructor, such as
    `query_per_second`, which limits the QPS of the qianfan resource object."""

    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Extra params forwarded to the client's `do` / `ado` calls."""

    model_config = ConfigDict(protected_namespaces=())

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the credentials and create the qianfan embedding client.

        ``qianfan_ak`` and ``qianfan_sk`` are taken from ``values`` or from the
        ``QIANFAN_AK`` / ``QIANFAN_SK`` environment variables, defaulting to
        an empty secret. Empty credentials and an empty endpoint are left out
        of the client arguments.

        Args:
            values: The configuration values being validated.

        Returns:
            ``values`` with ``qianfan_ak``, ``qianfan_sk`` and ``client`` set.

        Raises:
            ImportError: If the ``qianfan`` package cannot be imported.
        """
        values["qianfan_ak"] = convert_to_secret_str(
            get_from_dict_or_env(values, "qianfan_ak", "QIANFAN_AK", default="")
        )
        values["qianfan_sk"] = convert_to_secret_str(
            get_from_dict_or_env(values, "qianfan_sk", "QIANFAN_SK", default="")
        )

        try:
            import qianfan

            client_args = {
                **values.get("init_kwargs", {}),
                "model": values["model"],
            }
            access_key = values["qianfan_ak"].get_secret_value()
            if access_key != "":
                client_args["ak"] = access_key
            secret_key = values["qianfan_sk"].get_secret_value()
            if secret_key != "":
                client_args["sk"] = secret_key
            custom_endpoint = values["endpoint"]
            if not (custom_endpoint is None or custom_endpoint == ""):
                client_args["endpoint"] = custom_endpoint
            values["client"] = qianfan.Embedding(**client_args)
        except ImportError:
            raise ImportError(
                "qianfan package not found, please install it with "
                "`pip install qianfan`"
            )
        return values

    def embed_query(self, text: str) -> List[float]:
        """Embed one query by delegating to ``embed_documents``."""
        return self.embed_documents([text])[0]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts, ``chunk_size`` texts per client call.

        Args:
            texts (List[str]): The texts to embed.

        Returns:
            List[List[float]]: The ``embedding`` entry of every item in each
            response's ``data``, in response order.
        """
        vectors = []
        for start in range(0, len(texts), self.chunk_size):
            batch = texts[start : start + self.chunk_size]
            resp = self.client.do(texts=batch, **self.model_kwargs)
            for item in resp["data"]:
                vectors.append(item["embedding"])
        return vectors

    async def aembed_query(self, text: str) -> List[float]:
        """Asynchronously embed one query via ``aembed_documents``."""
        vectors = await self.aembed_documents([text])
        return vectors[0]

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Asynchronously embed a list of texts, one chunk at a time.

        Chunks are awaited sequentially, ``chunk_size`` texts per call.
        """
        vectors = []
        for start in range(0, len(texts), self.chunk_size):
            batch = texts[start : start + self.chunk_size]
            resp = await self.client.ado(texts=batch, **self.model_kwargs)
            vectors.extend(item["embedding"] for item in resp["data"])
        return vectors
|
||||
222
venv/Lib/site-packages/langchain_community/embeddings/bedrock.py
Normal file
222
venv/Lib/site-packages/langchain_community/embeddings/bedrock.py
Normal file
@@ -0,0 +1,222 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.runnables.config import run_in_executor
|
||||
from pydantic import BaseModel, ConfigDict, model_validator
|
||||
from typing_extensions import Self
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.11",
|
||||
removal="1.0",
|
||||
alternative_import="langchain_aws.BedrockEmbeddings",
|
||||
)
|
||||
class BedrockEmbeddings(BaseModel, Embeddings):
|
||||
"""Bedrock embedding models.
|
||||
|
||||
To authenticate, the AWS client uses the following methods to
|
||||
automatically load credentials:
|
||||
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
|
||||
|
||||
If a specific credential profile should be used, you must pass
|
||||
the name of the profile from the ~/.aws/credentials file that is to be used.
|
||||
|
||||
Make sure the credentials / roles used have the required policies to
|
||||
access the Bedrock service.
|
||||
"""
|
||||
|
||||
"""
|
||||
Example:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.bedrock_embeddings import BedrockEmbeddings
|
||||
|
||||
region_name ="us-east-1"
|
||||
credentials_profile_name = "default"
|
||||
model_id = "amazon.titan-embed-text-v1"
|
||||
|
||||
be = BedrockEmbeddings(
|
||||
credentials_profile_name=credentials_profile_name,
|
||||
region_name=region_name,
|
||||
model_id=model_id
|
||||
)
|
||||
"""
|
||||
|
||||
client: Any = None #: :meta private:
|
||||
"""Bedrock client."""
|
||||
region_name: Optional[str] = None
|
||||
"""The aws region e.g., `us-west-2`. Fallsback to AWS_DEFAULT_REGION env variable
|
||||
or region specified in ~/.aws/config in case it is not provided here.
|
||||
"""
|
||||
|
||||
credentials_profile_name: Optional[str] = None
|
||||
"""The name of the profile in the ~/.aws/credentials or ~/.aws/config files, which
|
||||
has either access keys or role information specified.
|
||||
If not specified, the default credential profile or, if on an EC2 instance,
|
||||
credentials from IMDS will be used.
|
||||
See: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
|
||||
"""
|
||||
|
||||
model_id: str = "amazon.titan-embed-text-v1"
|
||||
"""Id of the model to call, e.g., amazon.titan-embed-text-v1, this is
|
||||
equivalent to the modelId property in the list-foundation-models api"""
|
||||
|
||||
model_kwargs: Optional[Dict] = None
|
||||
"""Keyword arguments to pass to the model."""
|
||||
|
||||
endpoint_url: Optional[str] = None
|
||||
"""Needed if you don't want to default to us-east-1 endpoint"""
|
||||
|
||||
normalize: bool = False
|
||||
"""Whether the embeddings should be normalized to unit vectors"""
|
||||
|
||||
model_config = ConfigDict(extra="forbid", protected_namespaces=())
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_environment(self) -> Self:
|
||||
"""Validate that AWS credentials to and python package exists in environment."""
|
||||
|
||||
if self.client is not None:
|
||||
return self
|
||||
|
||||
try:
|
||||
import boto3
|
||||
|
||||
if self.credentials_profile_name is not None:
|
||||
session = boto3.Session(profile_name=self.credentials_profile_name)
|
||||
else:
|
||||
# use default credentials
|
||||
session = boto3.Session()
|
||||
|
||||
client_params = {}
|
||||
if self.region_name:
|
||||
client_params["region_name"] = self.region_name
|
||||
|
||||
if self.endpoint_url:
|
||||
client_params["endpoint_url"] = self.endpoint_url
|
||||
|
||||
self.client = session.client("bedrock-runtime", **client_params)
|
||||
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Could not import boto3 python package. "
|
||||
"Please install it with `pip install boto3`."
|
||||
)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
"Could not load credentials to authenticate with AWS client. "
|
||||
"Please check that credentials in the specified "
|
||||
f"profile name are valid. Bedrock error: {e}"
|
||||
) from e
|
||||
|
||||
return self
|
||||
|
||||
def _embedding_func(self, text: str) -> List[float]:
|
||||
"""Call out to Bedrock embedding endpoint."""
|
||||
# replace newlines, which can negatively affect performance.
|
||||
text = text.replace(os.linesep, " ")
|
||||
|
||||
# format input body for provider
|
||||
provider = self.model_id.split(".")[0]
|
||||
_model_kwargs = self.model_kwargs or {}
|
||||
input_body = {**_model_kwargs}
|
||||
if provider == "cohere":
|
||||
if "input_type" not in input_body.keys():
|
||||
input_body["input_type"] = "search_document"
|
||||
input_body["texts"] = [text]
|
||||
else:
|
||||
# includes common provider == "amazon"
|
||||
input_body["inputText"] = text
|
||||
body = json.dumps(input_body)
|
||||
|
||||
try:
|
||||
# invoke bedrock API
|
||||
response = self.client.invoke_model(
|
||||
body=body,
|
||||
modelId=self.model_id,
|
||||
accept="application/json",
|
||||
contentType="application/json",
|
||||
)
|
||||
|
||||
# format output based on provider
|
||||
response_body = json.loads(response.get("body").read())
|
||||
if provider == "cohere":
|
||||
return response_body.get("embeddings")[0]
|
||||
else:
|
||||
# includes common provider == "amazon"
|
||||
return response_body.get("embedding")
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error raised by inference endpoint: {e}")
|
||||
|
||||
def _normalize_vector(self, embeddings: List[float]) -> List[float]:
|
||||
"""Normalize the embedding to a unit vector."""
|
||||
emb = np.array(embeddings)
|
||||
norm_emb = emb / np.linalg.norm(emb)
|
||||
return norm_emb.tolist()
|
||||
|
||||
def embed_documents(self, texts: List[str]) -> List[List[float]]:
    """Embed each document text with the Bedrock model.

    Args:
        texts: The list of texts to embed

    Returns:
        One embedding per input text, normalized when ``self.normalize``
        is set.
    """
    # ``map`` is lazy, so each text is embedded and (optionally) normalized
    # before the next one is requested.
    return [
        self._normalize_vector(vector) if self.normalize else vector
        for vector in map(self._embedding_func, texts)
    ]
|
||||
|
||||
def embed_query(self, text: str) -> List[float]:
    """Embed a single query text with the Bedrock model.

    Args:
        text: The text to embed.

    Returns:
        The embedding for the text, normalized when ``self.normalize``
        is set.
    """
    vector = self._embedding_func(text)
    return self._normalize_vector(vector) if self.normalize else vector
|
||||
|
||||
async def aembed_query(self, text: str) -> List[float]:
    """Embed a query text without blocking the caller's coroutine.

    The synchronous ``embed_query`` is handed to ``run_in_executor`` and its
    result is awaited.

    Args:
        text: The text to embed.

    Returns:
        Embeddings for the text.
    """
    embedding = await run_in_executor(None, self.embed_query, text)
    return embedding
|
||||
|
||||
async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
    """Embed several document texts concurrently.

    One ``aembed_query`` call is scheduled per text and all of them are
    awaited together.

    Args:
        texts: The list of texts to embed

    Returns:
        List of embeddings, one for each text, in input order.
    """
    pending = (self.aembed_query(text) for text in texts)
    gathered = await asyncio.gather(*pending)
    return list(gathered)
|
||||
@@ -0,0 +1,97 @@
|
||||
"""Wrapper around Bookend AI embedding models."""
|
||||
|
||||
import json
|
||||
from typing import Any, List
|
||||
|
||||
import requests
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
# Base URL of the Bookend API; the caller's domain is appended directly after it.
API_URL = "https://api.bookend.ai/"
# Value sent as the ``task`` query parameter on every prediction request.
DEFAULT_TASK = "embeddings"
# Path appended after the domain to reach the prediction endpoint.
PATH = "/models/predict"
|
||||
|
||||
|
||||
class BookendEmbeddings(BaseModel, Embeddings):
    """Bookend AI sentence_transformers embedding models.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import BookendEmbeddings

            bookend = BookendEmbeddings(
                domain={domain},
                api_token={api_token},
                model_id={model_id},
            )
            bookend.embed_documents([
                "Please put on these earmuffs because I can't hear you.",
                "Baby wipes are made of chocolate stardust.",
            ])
            bookend.embed_query(
                "She only paints with bold colors; she does not like pastels."
            )
    """

    domain: str
    """Request for a domain at https://bookend.ai/ to use this embeddings module."""
    api_token: str
    """Request for an API token at https://bookend.ai/ to use this embeddings module."""
    model_id: str
    """Embeddings model ID to use."""
    auth_header: dict = Field(default_factory=dict)

    model_config = ConfigDict(protected_namespaces=())

    def __init__(self, **kwargs: Any):
        """Validate the fields, then derive the authorization header from the token."""
        super().__init__(**kwargs)
        self.auth_header = {"Authorization": "Basic {}".format(self.api_token)}

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed documents using a Bookend deployed embeddings model.

        One POST request is sent per text.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        # Build a fresh header dict per call: writing the content type into
        # ``self.auth_header`` itself would silently mutate instance state.
        headers = {
            **self.auth_header,
            "Content-Type": "application/json; charset=utf-8",
        }
        params = {
            "model_id": self.model_id,
            "task": DEFAULT_TASK,
        }
        url = API_URL + self.domain + PATH

        result = []
        for text in texts:
            data = json.dumps(
                {
                    "text": text,
                    "question": None,
                    "context": None,
                    "instruction": None,
                }
            )
            r = requests.request(
                "POST",
                url,
                headers=headers,
                params=params,
                data=data,
            )
            # The embedding is read from the ``data`` field of the first
            # element of the JSON response.
            result.append(r.json()[0]["data"])

        return result

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using a Bookend deployed embeddings model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]
|
||||
@@ -0,0 +1,139 @@
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ClarifaiEmbeddings(BaseModel, Embeddings):
    """Clarifai embedding models.

    To use, you should have the ``clarifai`` python package installed, and the
    environment variable ``CLARIFAI_PAT`` set with your personal access token or pass it
    as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import ClarifaiEmbeddings
            clarifai = ClarifaiEmbeddings(user_id=USER_ID,
                                          app_id=APP_ID,
                                          model_id=MODEL_ID)
                             (or)
            EXAMPLE_URL = "https://clarifai.com/clarifai/main/models/BAAI-bge-base-en-v15"
            clarifai = ClarifaiEmbeddings(model_url=EXAMPLE_URL)
    """

    model_url: Optional[str] = None
    """Model url to use."""
    model_id: Optional[str] = None
    """Model id to use."""
    model_version_id: Optional[str] = None
    """Model version id to use."""
    app_id: Optional[str] = None
    """Clarifai application id to use."""
    user_id: Optional[str] = None
    """Clarifai user id to use."""
    pat: Optional[str] = Field(default=None, exclude=True)
    """Clarifai personal access token to use."""
    token: Optional[str] = Field(default=None, exclude=True)
    """Clarifai session token to use."""
    model: Any = Field(default=None, exclude=True)  #: :meta private:
    api_base: str = "https://api.clarifai.com"

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that we have all required info to access Clarifai
        platform and python package exists in environment.

        Builds the Clarifai ``Model`` client from the supplied values and
        stores it under ``values["model"]``.

        Raises:
            ImportError: If the ``clarifai`` package is not installed.
        """
        try:
            from clarifai.client.model import Model
        except ImportError:
            raise ImportError(
                "Could not import clarifai python package. "
                "Please install it with `pip install clarifai`."
            )
        user_id = values.get("user_id")
        app_id = values.get("app_id")
        model_id = values.get("model_id")
        model_version_id = values.get("model_version_id")
        model_url = values.get("model_url")
        api_base = values.get("api_base")
        pat = values.get("pat")
        token = values.get("token")

        values["model"] = Model(
            url=model_url,
            app_id=app_id,
            user_id=user_id,
            model_version=dict(id=model_version_id),
            pat=pat,
            token=token,
            model_id=model_id,
            base_url=api_base,
        )

        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to Clarifai's embedding models.

        Texts are sent in batches of 32. A failing batch is logged and stops
        processing; the embeddings collected so far are still returned, so
        the result may be shorter than ``texts``.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text that was processed.
        """
        from clarifai.client.input import Inputs

        input_obj = Inputs.from_auth_helper(self.model.auth_helper)
        batch_size = 32
        embeddings = []

        try:
            for i in range(0, len(texts), batch_size):
                batch = texts[i : i + batch_size]
                # Input ids are the positions within the batch, as strings.
                input_batch = [
                    input_obj.get_text_input(input_id=str(idx), raw_text=inp)
                    for idx, inp in enumerate(batch)
                ]
                predict_response = self.model.predict(input_batch)
                embeddings.extend(
                    [
                        list(output.data.embeddings[0].vector)
                        for output in predict_response.outputs
                    ]
                )

        except Exception as e:
            logger.error(f"Predict failed, exception: {e}")

        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Call out to Clarifai's embedding models.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.

        Raises:
            Exception: Whatever the prediction call raised, after logging it.
        """
        try:
            predict_response = self.model.predict_by_bytes(
                bytes(text, "utf-8"), input_type="text"
            )
            embeddings = [
                list(op.data.embeddings[0].vector) for op in predict_response.outputs
            ]

        except Exception as e:
            logger.error(f"Predict failed, exception: {e}")
            # Re-raise: there is no embedding to return, and falling through
            # would only fail on the unbound ``embeddings`` local instead.
            raise

        return embeddings[0]
|
||||
@@ -0,0 +1,97 @@
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import requests
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
# Model used when the caller does not pass ``model_name``.
DEFAULT_MODEL_NAME = "@cf/baai/bge-base-en-v1.5"
|
||||
|
||||
|
||||
@deprecated(
    since="0.3.23",
    removal="1.0",
    alternative_import="langchain_cloudflare.CloudflareWorkersAIEmbeddings",
)
class CloudflareWorkersAIEmbeddings(BaseModel, Embeddings):
    """Embeddings served by Cloudflare Workers AI.

    An API token and an account ID are required to reach the service.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import CloudflareWorkersAIEmbeddings

            account_id = "my_account_id"
            api_token = "my_secret_api_token"
            model_name = "@cf/baai/bge-small-en-v1.5"

            cf = CloudflareWorkersAIEmbeddings(
                account_id=account_id,
                api_token=api_token,
                model_name=model_name
            )
    """

    api_base_url: str = "https://api.cloudflare.com/client/v4/accounts"
    account_id: str
    api_token: str
    model_name: str = DEFAULT_MODEL_NAME
    batch_size: int = 50
    strip_new_lines: bool = True
    headers: Dict[str, str] = {"Authorization": "Bearer "}

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    def __init__(self, **kwargs: Any):
        """Validate the fields and build the bearer-token header."""
        super().__init__(**kwargs)
        self.headers = {"Authorization": f"Bearer {self.api_token}"}

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed documents with Cloudflare Workers AI, ``batch_size`` texts per request.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        if self.strip_new_lines:
            texts = [entry.replace("\n", " ") for entry in texts]

        # Slice all batches up front, then send them one request at a time.
        step = self.batch_size
        pending = [texts[start : start + step] for start in range(0, len(texts), step)]

        collected = []
        for chunk in pending:
            reply = requests.post(
                f"{self.api_base_url}/{self.account_id}/ai/run/{self.model_name}",
                headers=self.headers,
                json={"text": chunk},
            )
            collected.extend(reply.json()["result"]["data"])
        return collected

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text with Cloudflare Workers AI.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        if self.strip_new_lines:
            text = text.replace("\n", " ")
        reply = requests.post(
            f"{self.api_base_url}/{self.account_id}/ai/run/{self.model_name}",
            headers=self.headers,
            json={"text": [text]},
        )
        payload = reply.json()
        return payload["result"]["data"][0]
|
||||
142
venv/Lib/site-packages/langchain_community/embeddings/clova.py
Normal file
142
venv/Lib/site-packages/langchain_community/embeddings/clova.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional, cast
|
||||
|
||||
import requests
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
|
||||
from pydantic import BaseModel, ConfigDict, SecretStr, model_validator
|
||||
|
||||
|
||||
@deprecated(
    since="0.3.4",
    removal="1.0.0",
    alternative_import="langchain_community.ClovaXEmbeddings",
)
class ClovaEmbeddings(BaseModel, Embeddings):
    """
    Client for Clova's embedding service.

    Credentials are taken from constructor arguments or, when those are
    absent, from the following environment variables:

    - ``CLOVA_EMB_API_KEY``: API key for accessing Clova's embedding service.
    - ``CLOVA_EMB_APIGW_API_KEY``: API gateway key for enhanced security.
    - ``CLOVA_EMB_APP_ID``: Application ID for identifying your application.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import ClovaEmbeddings
            embeddings = ClovaEmbeddings(
                clova_emb_api_key='your_clova_emb_api_key',
                clova_emb_apigw_api_key='your_clova_emb_apigw_api_key',
                app_id='your_app_id'
            )

            query_text = "This is a test query."
            query_result = embeddings.embed_query(query_text)

            document_text = "This is a test document."
            document_result = embeddings.embed_documents([document_text])

    """

    endpoint_url: str = (
        "https://clovastudio.apigw.ntruss.com/testapp/v1/api-tools/embedding"
    )
    """Endpoint URL to use."""
    model: str = "clir-emb-dolphin"
    """Embedding model name to use."""
    clova_emb_api_key: Optional[SecretStr] = None
    """API key for accessing Clova's embedding service."""
    clova_emb_apigw_api_key: Optional[SecretStr] = None
    """API gateway key for enhanced security."""
    app_id: Optional[SecretStr] = None
    """Application ID for identifying your application."""

    model_config = ConfigDict(extra="forbid")

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Resolve each credential from the input or the environment and wrap it as a secret."""
        credential_sources = (
            ("clova_emb_api_key", "CLOVA_EMB_API_KEY"),
            ("clova_emb_apigw_api_key", "CLOVA_EMB_APIGW_API_KEY"),
            ("app_id", "CLOVA_EMB_APP_ID"),
        )
        for field_name, env_name in credential_sources:
            raw = get_from_dict_or_env(values, field_name, env_name)
            values[field_name] = convert_to_secret_str(raw)
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embed every text in turn, one API request per text.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return [self._embed_text(entry) for entry in texts]

    def embed_query(self, text: str) -> List[float]:
        """
        Embed one query text.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self._embed_text(text)

    def _embed_text(self, text: str) -> List[float]:
        """
        POST the text to the embedding endpoint and extract the embedding.

        Raises:
            ValueError: If the status is not 200 or the response carries no
                ``result.embedding`` entry.
        """
        # Authorization headers; the secrets are unwrapped only here.
        api_key = cast(SecretStr, self.clova_emb_api_key).get_secret_value()
        gateway_key = cast(SecretStr, self.clova_emb_apigw_api_key).get_secret_value()
        headers = {
            "X-NCP-CLOVASTUDIO-API-KEY": api_key,
            "X-NCP-APIGW-API-KEY": gateway_key,
            "Content-Type": "application/json",
        }

        app_id = cast(SecretStr, self.app_id).get_secret_value()
        response = requests.post(
            f"{self.endpoint_url}/{self.model}/{app_id}",
            headers=headers,
            json={"text": text},
        )

        if response.status_code == 200:
            body = response.json()
            if "result" in body and "embedding" in body["result"]:
                return body["result"]["embedding"]

        # Reached for a non-200 status and for a 200 without an embedding.
        raise ValueError(
            f"API request failed with status {response.status_code}: {response.text}"
        )
|
||||
172
venv/Lib/site-packages/langchain_community/embeddings/cohere.py
Normal file
172
venv/Lib/site-packages/langchain_community/embeddings/cohere.py
Normal file
@@ -0,0 +1,172 @@
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import get_from_dict_or_env
|
||||
from pydantic import BaseModel, ConfigDict, model_validator
|
||||
|
||||
from langchain_community.llms.cohere import _create_retry_decorator
|
||||
|
||||
|
||||
@deprecated(
    since="0.0.30",
    removal="1.0",
    alternative_import="langchain_cohere.CohereEmbeddings",
)
class CohereEmbeddings(BaseModel, Embeddings):
    """Cohere embedding models.

    To use, you should have the ``cohere`` python package installed, and the
    environment variable ``COHERE_API_KEY`` set with your API key or pass it
    as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import CohereEmbeddings
            cohere = CohereEmbeddings(
                model="embed-english-light-v3.0",
                cohere_api_key="my-api-key"
            )
    """

    client: Any = None  #: :meta private:
    """Cohere client."""
    async_client: Any = None  #: :meta private:
    """Cohere async client."""
    model: str = "embed-english-v2.0"
    """Model name to use."""

    truncate: Optional[str] = None
    """Truncate embeddings that are too long from start or end ("NONE"|"START"|"END")"""

    cohere_api_key: Optional[str] = None

    max_retries: int = 3
    """Maximum number of retries to make when generating."""
    request_timeout: Optional[float] = None
    """Timeout in seconds for the Cohere API request."""
    user_agent: str = "langchain"
    """Identifier for the application making the request."""

    model_config = ConfigDict(
        extra="forbid",
    )

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that api key and python package exists in environment.

        Builds the sync and async Cohere clients and stores them under
        ``values["client"]`` and ``values["async_client"]``.

        Raises:
            ImportError: If the ``cohere`` package is not installed.
        """
        cohere_api_key = get_from_dict_or_env(
            values, "cohere_api_key", "COHERE_API_KEY"
        )
        request_timeout = values.get("request_timeout")
        # This validator runs before field defaults are applied, so
        # ``user_agent`` is absent unless the caller passed it; fall back to
        # the same default the field declares instead of raising KeyError.
        client_name = values.get("user_agent", "langchain")

        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Could not import cohere python package. "
                "Please install it with `pip install cohere`."
            )

        values["client"] = cohere.Client(
            cohere_api_key,
            timeout=request_timeout,
            client_name=client_name,
        )
        values["async_client"] = cohere.AsyncClient(
            cohere_api_key,
            timeout=request_timeout,
            client_name=client_name,
        )
        return values

    def embed_with_retry(self, **kwargs: Any) -> Any:
        """Use tenacity to retry the embed call."""
        retry_decorator = _create_retry_decorator(self.max_retries)

        @retry_decorator
        def _embed_with_retry(**kwargs: Any) -> Any:
            return self.client.embed(**kwargs)

        return _embed_with_retry(**kwargs)

    def aembed_with_retry(self, **kwargs: Any) -> Any:
        """Use tenacity to retry the async embed call.

        Returns the awaitable produced by the decorated coroutine function;
        the caller is expected to await it.
        """
        retry_decorator = _create_retry_decorator(self.max_retries)

        @retry_decorator
        async def _embed_with_retry(**kwargs: Any) -> Any:
            return await self.async_client.embed(**kwargs)

        return _embed_with_retry(**kwargs)

    def embed(
        self, texts: List[str], *, input_type: Optional[str] = None
    ) -> List[List[float]]:
        """Embed texts with the sync client and coerce every value to ``float``.

        Args:
            texts: The list of texts to embed.
            input_type: Value forwarded as the ``input_type`` request argument.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings = self.embed_with_retry(
            model=self.model,
            texts=texts,
            input_type=input_type,
            truncate=self.truncate,
        ).embeddings
        return [list(map(float, e)) for e in embeddings]

    async def aembed(
        self, texts: List[str], *, input_type: Optional[str] = None
    ) -> List[List[float]]:
        """Embed texts with the async client and coerce every value to ``float``.

        Args:
            texts: The list of texts to embed.
            input_type: Value forwarded as the ``input_type`` request argument.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings = (
            await self.aembed_with_retry(
                model=self.model,
                texts=texts,
                input_type=input_type,
                truncate=self.truncate,
            )
        ).embeddings
        return [list(map(float, e)) for e in embeddings]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of document texts.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return self.embed(texts, input_type="search_document")

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Async call out to Cohere's embedding endpoint.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return await self.aembed(texts, input_type="search_document")

    def embed_query(self, text: str) -> List[float]:
        """Call out to Cohere's embedding endpoint.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed([text], input_type="search_query")[0]

    async def aembed_query(self, text: str) -> List[float]:
        """Async call out to Cohere's embedding endpoint.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return (await self.aembed([text], input_type="search_query"))[0]
|
||||
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
)
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import get_from_dict_or_env
|
||||
from pydantic import BaseModel, ConfigDict, model_validator
|
||||
from requests.exceptions import HTTPError
|
||||
from tenacity import (
|
||||
before_sleep_log,
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
)
|
||||
|
||||
# Module-level logger; also handed to tenacity to log before each retry sleep.
logger = logging.getLogger(__name__)

# Number of inputs sent per client call, keyed by model name.
# Models not listed here fall back to 25 at the lookup site.
BATCH_SIZE = {
    "text-embedding-v1": 25,
    "text-embedding-v2": 25,
    "text-embedding-v3": 10,
    "text-embedding-v4": 10,
}
|
||||
|
||||
|
||||
def _create_retry_decorator(embeddings: DashScopeEmbeddings) -> Callable[[Any], Any]:
    """Build the tenacity decorator used for DashScope embedding calls.

    Only ``HTTPError`` triggers a retry; the attempt limit comes from
    ``embeddings.max_retries`` and the last error is re-raised unchanged.
    """
    # Exponential backoff: 2^x * 1 second per retry, kept between
    # 1 and 4 seconds.
    backoff = wait_exponential(multiplier=1, min=1, max=4)
    return retry(
        reraise=True,
        stop=stop_after_attempt(embeddings.max_retries),
        wait=backoff,
        retry=retry_if_exception_type(HTTPError),
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
|
||||
|
||||
|
||||
def embed_with_retry(embeddings: DashScopeEmbeddings, **kwargs: Any) -> Any:
    """Call the DashScope client in batches, retrying the whole run on HTTP errors.

    ``kwargs["input"]`` may be a list (sent in per-model batch sizes) or a
    single value (sent as-is in one call). Returns the concatenated
    ``embeddings`` entries of every successful response.

    Raises:
        ValueError: For a 400 or 401 response (not retried).
        HTTPError: For any other non-200 response once retries are exhausted.
    """

    @_create_retry_decorator(embeddings)
    def _embed_with_retry(**kwargs: Any) -> Any:
        payload = kwargs["input"]
        batched = isinstance(payload, list)
        total = len(payload) if batched else 1
        step = BATCH_SIZE.get(kwargs["model"], 25)

        collected = []
        for start in range(0, total, step):
            # A non-list input is forwarded untouched in a single call.
            kwargs["input"] = payload[start : start + step] if batched else payload
            resp = embeddings.client.call(**kwargs)
            if resp.status_code == 200:
                collected.extend(resp.output["embeddings"])
                continue
            if resp.status_code in [400, 401]:
                raise ValueError(
                    f"status_code: {resp.status_code} \n "
                    f"code: {resp.code} \n message: {resp.message}"
                )
            raise HTTPError(
                f"HTTP error occurred: status_code: {resp.status_code} \n "
                f"code: {resp.code} \n message: {resp.message}",
                response=resp,
            )
        return collected

    return _embed_with_retry(**kwargs)
|
||||
|
||||
|
||||
class DashScopeEmbeddings(BaseModel, Embeddings):
    """DashScope embedding models.

    To use, you should have the ``dashscope`` python package installed, and the
    environment variable ``DASHSCOPE_API_KEY`` set with your API key or pass it
    as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import DashScopeEmbeddings
            embeddings = DashScopeEmbeddings(dashscope_api_key="my-api-key")

    Example:
        .. code-block:: python

            import os
            os.environ["DASHSCOPE_API_KEY"] = "your DashScope API KEY"

            from langchain_community.embeddings.dashscope import DashScopeEmbeddings
            embeddings = DashScopeEmbeddings(
                model="text-embedding-v1",
            )
            text = "This is a test query."
            query_result = embeddings.embed_query(text)

    """

    client: Any = None  #: :meta private:
    """The DashScope client."""
    model: str = "text-embedding-v1"
    dashscope_api_key: Optional[str] = None
    max_retries: int = 5
    """Maximum number of retries to make when generating."""

    model_config = ConfigDict(
        extra="forbid",
    )

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that api key and python package exists in environment.

        Resolves the API key from the input or ``DASHSCOPE_API_KEY``, assigns
        it to ``dashscope.api_key`` and stores ``dashscope.TextEmbedding``
        under ``values["client"]``.

        Raises:
            ImportError: If the ``dashscope`` package is not installed.
        """
        # Import under the guard first: an unguarded import ahead of the
        # ``try`` would make the install hint below unreachable.
        try:
            import dashscope
        except ImportError:
            raise ImportError(
                "Could not import dashscope python package. "
                "Please install it with `pip install dashscope`."
            )

        values["dashscope_api_key"] = get_from_dict_or_env(
            values, "dashscope_api_key", "DASHSCOPE_API_KEY"
        )
        dashscope.api_key = values["dashscope_api_key"]
        values["client"] = dashscope.TextEmbedding
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to DashScope's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings = embed_with_retry(
            self, input=texts, text_type="document", model=self.model
        )
        embedding_list = [item["embedding"] for item in embeddings]
        return embedding_list

    def embed_query(self, text: str) -> List[float]:
        """Call out to DashScope's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        embedding = embed_with_retry(
            self, input=text, text_type="query", model=self.model
        )[0]["embedding"]
        return embedding
|
||||
@@ -0,0 +1,52 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterator, List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
|
||||
from langchain_community.embeddings.mlflow import MlflowEmbeddings
|
||||
|
||||
|
||||
def _chunk(texts: List[str], size: int) -> Iterator[List[str]]:
|
||||
for i in range(0, len(texts), size):
|
||||
yield texts[i : i + size]
|
||||
|
||||
|
||||
@deprecated(
    since="0.3.3",
    removal="1.0",
    alternative_import="databricks_langchain.DatabricksEmbeddings",
)
class DatabricksEmbeddings(MlflowEmbeddings):
    """Embeddings served through Databricks.

    Requires the ``mlflow`` python package.
    For more information, see https://mlflow.org/docs/latest/llms/deployments.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import DatabricksEmbeddings

            embeddings = DatabricksEmbeddings(
                target_uri="databricks",
                endpoint="embeddings",
            )
    """

    target_uri: str = "databricks"
    """The target URI to use. Defaults to ``databricks``."""

    @property
    def _mlflow_extras(self) -> str:
        """Return an empty extras string for this subclass."""
        return ""

    def _validate_uri(self) -> None:
        """Accept the bare ``databricks`` URI or any URI with the ``databricks`` scheme.

        Raises:
            ValueError: If the target URI is neither of those.
        """
        uri = self.target_uri
        if uri != "databricks" and urlparse(uri).scheme != "databricks":
            raise ValueError(
                "Invalid target URI. The target URI must be a valid databricks URI."
            )
|
||||
@@ -0,0 +1,140 @@
|
||||
from typing import Any, Dict, List, Mapping, Optional
|
||||
|
||||
import requests
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import get_from_dict_or_env, pre_init
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
# Default value of the ``model_id`` field.
DEFAULT_MODEL_ID = "sentence-transformers/clip-ViT-B-32"
# Default value of the ``batch_size`` field.
MAX_BATCH_SIZE = 1024
|
||||
|
||||
|
||||
class DeepInfraEmbeddings(BaseModel, Embeddings):
    """Deep Infra's embedding inference service.

    To use, you should have the
    environment variable ``DEEPINFRA_API_TOKEN`` set with your API token, or pass
    it as a named parameter to the constructor.
    There are multiple embeddings models available,
    see https://deepinfra.com/models?type=embeddings.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import DeepInfraEmbeddings
            deepinfra_emb = DeepInfraEmbeddings(
                model_id="sentence-transformers/clip-ViT-B-32",
                deepinfra_api_token="my-api-key"
            )
            r1 = deepinfra_emb.embed_documents(
                [
                    "Alpha is the first letter of Greek alphabet",
                    "Beta is the second letter of Greek alphabet",
                ]
            )
            r2 = deepinfra_emb.embed_query(
                "What is the second letter of Greek alphabet"
            )

    """

    model_id: str = DEFAULT_MODEL_ID
    """Embeddings model to use."""
    normalize: bool = False
    """whether to normalize the computed embeddings"""
    embed_instruction: str = "passage: "
    """Instruction used to embed documents."""
    query_instruction: str = "query: "
    """Instruction used to embed the query."""
    model_kwargs: Optional[dict] = None
    """Other model keyword args"""
    deepinfra_api_token: Optional[str] = None
    """API token for Deep Infra. If not provided, the token is
    fetched from the environment variable 'DEEPINFRA_API_TOKEN'."""
    batch_size: int = MAX_BATCH_SIZE
    """Batch size for embedding requests."""

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the API token from the given values or the environment.

        The token is looked up under ``deepinfra_api_token`` in ``values`` and
        falls back to the ``DEEPINFRA_API_TOKEN`` environment variable.
        """
        deepinfra_api_token = get_from_dict_or_env(
            values, "deepinfra_api_token", "DEEPINFRA_API_TOKEN"
        )
        values["deepinfra_api_token"] = deepinfra_api_token
        return values

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"model_id": self.model_id}

    def _embed(self, input: List[str]) -> List[List[float]]:
        """Send one inference request and return the embeddings it yields.

        Args:
            input: The already-prefixed texts to embed in a single request.

        Returns:
            The value stored under ``"embeddings"`` in the JSON response.

        Raises:
            ValueError: If the request fails, the HTTP status is not 200, or
                the response body is not JSON containing an ``"embeddings"``
                entry.
        """
        _model_kwargs = self.model_kwargs or {}
        # HTTP headers for authorization
        headers = {
            "Authorization": f"bearer {self.deepinfra_api_token}",
            "Content-Type": "application/json",
        }
        # send request
        try:
            res = requests.post(
                f"https://api.deepinfra.com/v1/inference/{self.model_id}",
                headers=headers,
                json={"inputs": input, "normalize": self.normalize, **_model_kwargs},
            )
        except requests.exceptions.RequestException as e:
            raise ValueError(f"Error raised by inference endpoint: {e}") from e

        if res.status_code != 200:
            raise ValueError(
                "Error raised by inference API HTTP code: %s, %s"
                % (res.status_code, res.text)
            )
        try:
            t = res.json()
            embeddings = t["embeddings"]
        # ValueError covers JSON decode failures; KeyError/TypeError cover a
        # well-formed body that lacks the "embeddings" entry or is not a dict.
        except (ValueError, KeyError, TypeError) as e:
            raise ValueError(
                f"Error raised by inference API: {e}.\nResponse: {res.text}"
            ) from e

        return embeddings

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed documents using a Deep Infra deployed embedding model.
        For larger batches, the input list of texts is chunked into smaller
        batches to avoid exceeding the maximum request size.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """

        embeddings: List[List[float]] = []
        # Every document is prefixed with the document instruction.
        instruction_pairs = [f"{self.embed_instruction}{text}" for text in texts]

        # One request per slice of at most ``batch_size`` texts.
        for start in range(0, len(instruction_pairs), self.batch_size):
            chunk = instruction_pairs[start : start + self.batch_size]
            embeddings += self._embed(chunk)

        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using a Deep Infra deployed embedding model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        instruction_pair = f"{self.query_instruction}{text}"
        embedding = self._embed([instruction_pair])[0]
        return embedding
|
||||
114
venv/Lib/site-packages/langchain_community/embeddings/edenai.py
Normal file
114
venv/Lib/site-packages/langchain_community/embeddings/edenai.py
Normal file
@@ -0,0 +1,114 @@
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
|
||||
from pydantic import (
|
||||
BaseModel,
|
||||
ConfigDict,
|
||||
Field,
|
||||
SecretStr,
|
||||
)
|
||||
|
||||
from langchain_community.utilities.requests import Requests
|
||||
|
||||
|
||||
class EdenAiEmbeddings(BaseModel, Embeddings):
    """EdenAI embedding.

    To use, you should have the environment variable ``EDENAI_API_KEY`` set
    with your API key, or pass it as a named parameter.
    """

    edenai_api_key: Optional[SecretStr] = Field(None, description="EdenAI API Token")

    provider: str = "openai"
    """Embedding provider to use (e.g. openai, google)."""

    model: Optional[str] = None
    """
    Model name for the provider above (eg: 'gpt-3.5-turbo-instruct' for openai).
    Available models are shown on https://docs.edenai.co/ under 'available providers'.
    """

    model_config = ConfigDict(
        extra="forbid",
    )

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Load the API key from ``values`` or the environment as a secret."""
        raw_key = get_from_dict_or_env(values, "edenai_api_key", "EDENAI_API_KEY")
        values["edenai_api_key"] = convert_to_secret_str(raw_key)
        return values

    @staticmethod
    def get_user_agent() -> str:
        """Return the User-Agent string sent with EdenAI requests."""
        from langchain_community import __version__

        return f"langchain/{__version__}"

    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Request embeddings for ``texts`` from the EdenAI API.

        Raises:
            ValueError: On a 4xx response.
            Exception: On a 5xx response, any other non-200 status, or when
                the provider entry of the response reports ``status == "fail"``.
        """
        url = "https://api.edenai.run/v2/text/embeddings"

        headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "authorization": f"Bearer {self.edenai_api_key.get_secret_value()}",  # type: ignore[union-attr]
            "User-Agent": self.get_user_agent(),
        }

        payload: Dict[str, Any] = {"texts": texts, "providers": self.provider}
        # The model is sent as a per-provider setting only when one was chosen.
        if self.model is not None:
            payload["settings"] = {self.provider: self.model}

        response = Requests(headers=headers).post(url=url, data=payload)

        # Guard clauses, checked from the most severe status range downwards.
        if response.status_code >= 500:
            raise Exception(f"EdenAI Server: Error {response.status_code}")
        if response.status_code >= 400:
            raise ValueError(f"EdenAI received an invalid payload: {response.text}")
        if response.status_code != 200:
            raise Exception(
                f"EdenAI returned an unexpected response with status "
                f"{response.status_code}: {response.text}"
            )

        provider_response = response.json()[self.provider]
        if provider_response.get("status") == "fail":
            err_msg = provider_response.get("error", {}).get("message")
            raise Exception(err_msg)

        return [entry["embedding"] for entry in provider_response["items"]]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using EdenAI.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        return self._generate_embeddings(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using EdenAI.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        vectors = self._generate_embeddings([text])
        return vectors[0]
|
||||
@@ -0,0 +1,226 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
from langchain_core.utils import get_from_env
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch.client import MlClient
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
|
||||
|
||||
@deprecated(
    "0.1.11", alternative="Use class in langchain-elasticsearch package", pending=True
)
class ElasticsearchEmbeddings(Embeddings):
    """Elasticsearch embedding models.

    This class provides an interface to generate embeddings using a model deployed
    in an Elasticsearch cluster. It requires an Elasticsearch ML client object
    and the model_id of the model deployed in the cluster.

    In Elasticsearch you need to have an embedding model loaded and deployed.
    - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-trained-model.html
    - https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-deploy-models.html
    """

    def __init__(
        self,
        client: MlClient,
        model_id: str,
        *,
        input_field: str = "text_field",
    ):
        """
        Initialize the ElasticsearchEmbeddings instance.

        Args:
            client (MlClient): An Elasticsearch ML client object.
            model_id (str): The model_id of the model deployed in the Elasticsearch
                cluster.
            input_field (str): The name of the key for the input text field in the
                document. Defaults to 'text_field'.
        """
        self.client = client
        self.model_id = model_id
        self.input_field = input_field

    @classmethod
    def from_credentials(
        cls,
        model_id: str,
        *,
        es_cloud_id: Optional[str] = None,
        es_user: Optional[str] = None,
        es_password: Optional[str] = None,
        input_field: str = "text_field",
    ) -> ElasticsearchEmbeddings:
        """Instantiate embeddings from Elasticsearch credentials.

        Args:
            model_id (str): The model_id of the model deployed in the Elasticsearch
                cluster.
            input_field (str): The name of the key for the input text field in the
                document. Defaults to 'text_field'.
            es_cloud_id: (str, optional): The Elasticsearch cloud ID to connect to.
            es_user: (str, optional): Elasticsearch username.
            es_password: (str, optional): Elasticsearch password.

        Returns:
            ElasticsearchEmbeddings: An instance of the ElasticsearchEmbeddings class.

        Raises:
            ImportError: If the ``elasticsearch`` package is not installed.

        Example:
            .. code-block:: python

                from langchain_community.embeddings import ElasticsearchEmbeddings

                # Define the model ID and input field name (if different from default)
                model_id = "your_model_id"
                # Optional, only if different from 'text_field'
                input_field = "your_input_field"

                # Credentials can be passed in two ways. Either set the env vars
                # ES_CLOUD_ID, ES_USER, ES_PASSWORD and they will be automatically
                # pulled in, or pass them in directly as kwargs.
                embeddings = ElasticsearchEmbeddings.from_credentials(
                    model_id,
                    input_field=input_field,
                    # es_cloud_id="foo",
                    # es_user="bar",
                    # es_password="baz",
                )

                documents = [
                    "This is an example document.",
                    "Another example document to generate embeddings for.",
                ]
                embeddings.embed_documents(documents)
        """
        try:
            from elasticsearch import Elasticsearch
            from elasticsearch.client import MlClient
        except ImportError as e:
            raise ImportError(
                "elasticsearch package not found, please install with 'pip install "
                "elasticsearch'"
            ) from e

        # Explicit arguments win; otherwise fall back to the environment.
        es_cloud_id = es_cloud_id or get_from_env("es_cloud_id", "ES_CLOUD_ID")
        es_user = es_user or get_from_env("es_user", "ES_USER")
        es_password = es_password or get_from_env("es_password", "ES_PASSWORD")

        # Connect to Elasticsearch
        es_connection = Elasticsearch(
            cloud_id=es_cloud_id, basic_auth=(es_user, es_password)
        )
        client = MlClient(es_connection)
        return cls(client, model_id, input_field=input_field)

    @classmethod
    def from_es_connection(
        cls,
        model_id: str,
        es_connection: Elasticsearch,
        input_field: str = "text_field",
    ) -> ElasticsearchEmbeddings:
        """
        Instantiate embeddings from an existing Elasticsearch connection.

        This method provides a way to create an instance of the ElasticsearchEmbeddings
        class using an existing Elasticsearch connection. The connection object is used
        to create an MlClient, which is then used to initialize the
        ElasticsearchEmbeddings instance.

        Args:
            model_id (str): The model_id of the model deployed in the Elasticsearch
                cluster.
            es_connection (elasticsearch.Elasticsearch): An existing Elasticsearch
                connection object.
            input_field (str, optional): The name of the key for the input text
                field in the document. Defaults to 'text_field'.

        Returns:
            ElasticsearchEmbeddings: An instance of the ElasticsearchEmbeddings class.

        Raises:
            ImportError: If the ``elasticsearch`` package is not installed.

        Example:
            .. code-block:: python

                from elasticsearch import Elasticsearch

                from langchain_community.embeddings import ElasticsearchEmbeddings

                # Define the model ID and input field name (if different from default)
                model_id = "your_model_id"
                # Optional, only if different from 'text_field'
                input_field = "your_input_field"

                # Create Elasticsearch connection
                es_connection = Elasticsearch(
                    hosts=["http://localhost:9200"], basic_auth=("user", "password")
                )

                # Instantiate ElasticsearchEmbeddings using the existing connection
                embeddings = ElasticsearchEmbeddings.from_es_connection(
                    model_id,
                    es_connection,
                    input_field=input_field,
                )

                documents = [
                    "This is an example document.",
                    "Another example document to generate embeddings for.",
                ]
                embeddings.embed_documents(documents)
        """
        # Importing MlClient from elasticsearch.client within the method to
        # avoid unnecessary import if the method is not used. A missing package
        # is reported with the same hint that from_credentials gives.
        try:
            from elasticsearch.client import MlClient
        except ImportError as e:
            raise ImportError(
                "elasticsearch package not found, please install with 'pip install "
                "elasticsearch'"
            ) from e

        # Create an MlClient from the given Elasticsearch connection
        client = MlClient(es_connection)

        # Return a new instance of the ElasticsearchEmbeddings class with
        # the MlClient, model_id, and input_field
        return cls(client, model_id, input_field=input_field)

    def _embedding_func(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for the given texts using the Elasticsearch model.

        Args:
            texts (List[str]): A list of text strings to generate embeddings for.

        Returns:
            List[List[float]]: A list of embeddings, one for each text in the input
                list.
        """
        # Each text becomes one document keyed by the configured input field.
        response = self.client.infer_trained_model(
            model_id=self.model_id, docs=[{self.input_field: text} for text in texts]
        )

        embeddings = [doc["predicted_value"] for doc in response["inference_results"]]
        return embeddings

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of documents.

        Args:
            texts (List[str]): A list of document text strings to generate embeddings
                for.

        Returns:
            List[List[float]]: A list of embeddings, one for each document in the input
                list.
        """
        return self._embedding_func(texts)

    def embed_query(self, text: str) -> List[float]:
        """
        Generate an embedding for a single query text.

        Args:
            text (str): The query text to generate an embedding for.

        Returns:
            List[float]: The embedding for the input query text.
        """
        return self._embedding_func([text])[0]
|
||||
155
venv/Lib/site-packages/langchain_community/embeddings/embaas.py
Normal file
155
venv/Lib/site-packages/langchain_community/embeddings/embaas.py
Normal file
@@ -0,0 +1,155 @@
|
||||
from typing import Any, Dict, List, Mapping, Optional
|
||||
|
||||
import requests
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
|
||||
from pydantic import BaseModel, ConfigDict, SecretStr
|
||||
from requests.adapters import HTTPAdapter, Retry
|
||||
from typing_extensions import NotRequired, TypedDict
|
||||
|
||||
# Largest number of texts sent in a single embeddings request
# (the currently supported maximum batch size).
MAX_BATCH_SIZE = 256
# Default endpoint of the embaas embeddings API.
EMBAAS_API_URL = "https://api.embaas.io/v1/embeddings/"
|
||||
|
||||
|
||||
class EmbaasEmbeddingsPayload(TypedDict):
    """Typed shape of the JSON body posted to the Embaas embeddings API."""

    # Name of the embedding model.
    model: str
    # Texts to embed in this request.
    texts: List[str]
    # Optional instruction; the key may be left out entirely.
    instruction: NotRequired[str]
|
||||
|
||||
|
||||
class EmbaasEmbeddings(BaseModel, Embeddings):
    """Embaas's embedding service.

    To use, you should have the
    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
    it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            # initialize with default model and instruction
            from langchain_community.embeddings import EmbaasEmbeddings
            emb = EmbaasEmbeddings()

            # initialize with custom model and instruction
            from langchain_community.embeddings import EmbaasEmbeddings
            emb_model = "instructor-large"
            emb_inst = "Represent the Wikipedia document for retrieval"
            emb = EmbaasEmbeddings(
                model=emb_model,
                instruction=emb_inst
            )
    """

    model: str = "e5-large-v2"
    """The model used for embeddings."""
    instruction: Optional[str] = None
    """Instruction used for domain-specific embeddings."""
    api_url: str = EMBAAS_API_URL
    """The URL for the embaas embeddings API."""
    embaas_api_key: Optional[SecretStr] = None
    """API key for embaas. If not provided, it is read from the
    environment variable ``EMBAAS_API_KEY``."""
    max_retries: Optional[int] = 3
    """max number of retries for requests"""
    timeout: Optional[int] = 30
    """request timeout in seconds"""

    model_config = ConfigDict(
        extra="forbid",
    )

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Load the API key from ``values`` or the environment as a secret."""
        embaas_api_key = convert_to_secret_str(
            get_from_dict_or_env(values, "embaas_api_key", "EMBAAS_API_KEY")
        )
        values["embaas_api_key"] = embaas_api_key
        return values

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying params."""
        return {"model": self.model, "instruction": self.instruction}

    def _generate_payload(self, texts: List[str]) -> EmbaasEmbeddingsPayload:
        """Generates payload for the API request."""
        payload = EmbaasEmbeddingsPayload(texts=texts, model=self.model)
        # The instruction key is only sent when an instruction is configured.
        if self.instruction:
            payload["instruction"] = self.instruction
        return payload

    def _handle_request(self, payload: EmbaasEmbeddingsPayload) -> List[List[float]]:
        """Sends a request to the Embaas API and handles the response.

        Args:
            payload: The request body to post as JSON.

        Returns:
            The ``"embedding"`` value of every item under ``"data"`` in the
            response.

        Raises:
            requests.exceptions.RequestException: If the request fails or the
                response carries an HTTP error status.
        """
        headers = {
            "Authorization": f"Bearer {self.embaas_api_key.get_secret_value()}",  # type: ignore[union-attr]
            "Content-Type": "application/json",
        }

        retries = Retry(
            total=self.max_retries,
            backoff_factor=0.5,
            allowed_methods=["POST"],
            raise_on_status=True,
        )

        # The context manager closes the session (and its adapters) once the
        # request is done instead of leaking it.
        with requests.Session() as session:
            session.mount("http://", HTTPAdapter(max_retries=retries))
            session.mount("https://", HTTPAdapter(max_retries=retries))
            response = session.post(
                self.api_url,
                headers=headers,
                json=payload,
                timeout=self.timeout,
            )

        # Surface HTTP error statuses as HTTPError (carrying the response) so
        # the caller's error handling can read the API's error message;
        # otherwise an error body would fail later on the missing "data" key.
        response.raise_for_status()

        parsed_response = response.json()
        embeddings = [item["embedding"] for item in parsed_response["data"]]

        return embeddings

    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings using the Embaas API.

        Raises:
            ValueError: If the request fails without a usable response body,
                the error body is not JSON, or the API's error response
                contains a ``"message"``.
            requests.exceptions.RequestException: For any other request error.
        """
        payload = self._generate_payload(texts)
        try:
            return self._handle_request(payload)
        except requests.exceptions.RequestException as e:
            if e.response is None or not e.response.text:
                raise ValueError(f"Error raised by embaas embeddings API: {e}") from e

            try:
                parsed_response = e.response.json()
            except ValueError:
                # The error body is not JSON; report the original failure.
                raise ValueError(f"Error raised by embaas embeddings API: {e}") from e

            if "message" in parsed_response:
                raise ValueError(
                    "Validation Error raised by embaas embeddings API:"
                    f"{parsed_response['message']}"
                ) from e
            raise

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Get embeddings for a list of texts.

        The texts are sent in batches of at most ``MAX_BATCH_SIZE``.

        Args:
            texts: The list of texts to get embeddings for.

        Returns:
            List of embeddings, one for each text.
        """
        batches = [
            texts[i : i + MAX_BATCH_SIZE] for i in range(0, len(texts), MAX_BATCH_SIZE)
        ]
        embeddings = [self._generate_embeddings(batch) for batch in batches]
        # flatten the list of lists into a single list
        return [embedding for batch in embeddings for embedding in batch]

    def embed_query(self, text: str) -> List[float]:
        """Get embeddings for a single text.

        Args:
            text: The text to get embeddings for.

        Returns:
            The embedding for the text.
        """
        return self.embed_documents([text])[0]
|
||||
158
venv/Lib/site-packages/langchain_community/embeddings/ernie.py
Normal file
158
venv/Lib/site-packages/langchain_community/embeddings/ernie.py
Normal file
@@ -0,0 +1,158 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import requests
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.runnables.config import run_in_executor
|
||||
from langchain_core.utils import get_from_dict_or_env, pre_init
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@deprecated(
    since="0.0.13",
    alternative="langchain_community.embeddings.QianfanEmbeddingsEndpoint",
)
class ErnieEmbeddings(BaseModel, Embeddings):
    """`Ernie Embeddings V1` embedding models."""

    ernie_api_base: Optional[str] = None
    ernie_client_id: Optional[str] = None
    ernie_client_secret: Optional[str] = None
    access_token: Optional[str] = None

    chunk_size: int = 16

    model_name: str = "ErnieBot-Embedding-V1"

    _lock = threading.Lock()

    model_config = ConfigDict(protected_namespaces=())

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Fill the API base, client id and client secret from values or env.

        The API base falls back to ``https://aip.baidubce.com``; the client id
        and secret are read from ``ERNIE_CLIENT_ID`` / ``ERNIE_CLIENT_SECRET``.
        """
        values["ernie_api_base"] = get_from_dict_or_env(
            values, "ernie_api_base", "ERNIE_API_BASE", "https://aip.baidubce.com"
        )
        values["ernie_client_id"] = get_from_dict_or_env(
            values,
            "ernie_client_id",
            "ERNIE_CLIENT_ID",
        )
        values["ernie_client_secret"] = get_from_dict_or_env(
            values,
            "ernie_client_secret",
            "ERNIE_CLIENT_SECRET",
        )
        return values

    def _embedding(self, json: object) -> dict:
        """Post ``json`` to the embedding-v1 endpoint and return the parsed body."""
        base_url = (
            f"{self.ernie_api_base}/rpc/2.0/ai_custom/v1/wenxinworkshop/embeddings"
        )
        resp = requests.post(
            f"{base_url}/embedding-v1",
            headers={
                "Content-Type": "application/json",
            },
            params={"access_token": self.access_token},
            json=json,
        )
        return resp.json()

    def _refresh_access_token_with_lock(self) -> None:
        """Fetch a new access token with the client credentials, under the lock."""
        with self._lock:
            logger.debug("Refreshing access token")
            base_url: str = f"{self.ernie_api_base}/oauth/2.0/token"
            resp = requests.post(
                base_url,
                headers={
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                },
                params={
                    "grant_type": "client_credentials",
                    "client_id": self.ernie_client_id,
                    "client_secret": self.ernie_client_secret,
                },
            )
            self.access_token = str(resp.json().get("access_token"))

    def _embed_batch(self, texts: List[str]) -> dict:
        """Embed one batch, refreshing the access token and retrying once if needed.

        Args:
            texts: The texts to send in a single request.

        Returns:
            The parsed response of the successful request.

        Raises:
            ValueError: If the response (after the optional retry) still
                carries an ``error_code``.
        """
        if not self.access_token:
            self._refresh_access_token_with_lock()
        resp = self._embedding({"input": texts})
        # Error code 111 triggers one token refresh followed by a single retry.
        if resp.get("error_code") == 111:
            self._refresh_access_token_with_lock()
            resp = self._embedding({"input": texts})
        # Checked after the retry as well, so a second failure is reported as
        # an API error instead of failing later on the missing "data" key.
        if resp.get("error_code"):
            raise ValueError(f"Error from Ernie: {resp}")
        return resp

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs.

        The texts are sent in batches of ``chunk_size``.

        Args:
            texts: The list of texts to embed

        Returns:
            List[List[float]]: List of embeddings, one for each text.

        Raises:
            ValueError: If the API responds with an error.
        """
        lst: List[List[float]] = []
        for start in range(0, len(texts), self.chunk_size):
            resp = self._embed_batch(texts[start : start + self.chunk_size])
            lst.extend([i["embedding"] for i in resp["data"]])
        return lst

    def embed_query(self, text: str) -> List[float]:
        """Embed query text.

        Args:
            text: The text to embed.

        Returns:
            List[float]: Embeddings for the text.

        Raises:
            ValueError: If the API responds with an error.
        """
        resp = self._embed_batch([text])
        return resp["data"][0]["embedding"]

    async def aembed_query(self, text: str) -> List[float]:
        """Asynchronous Embed query text.

        Args:
            text: The text to embed.

        Returns:
            List[float]: Embeddings for the text.
        """

        return await run_in_executor(None, self.embed_query, text)

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Asynchronous Embed search docs.

        Args:
            texts: The list of texts to embed

        Returns:
            List[List[float]]: List of embeddings, one for each text.
        """

        # Each text is embedded through its own aembed_query call.
        result = await asyncio.gather(*[self.aembed_query(text) for text in texts])

        return list(result)
|
||||
@@ -0,0 +1,50 @@
|
||||
import hashlib
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class FakeEmbeddings(Embeddings, BaseModel):
    """Fake embedding model that returns randomly drawn vectors."""

    size: int
    """The size of the embedding vector."""

    def _get_embedding(self) -> List[float]:
        """Draw one vector of ``size`` values from a normal distribution."""
        sample = np.random.normal(size=self.size)
        return list(sample)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one freshly drawn vector per input text."""
        return [self._get_embedding() for _unused in texts]

    def embed_query(self, text: str) -> List[float]:
        """Return a freshly drawn vector; the text itself is not used."""
        return self._get_embedding()
|
||||
|
||||
|
||||
class DeterministicFakeEmbedding(Embeddings, BaseModel):
    """
    Fake embedding model that always returns
    the same embedding vector for the same text.
    """

    size: int
    """The size of the embedding vector."""

    def _get_embedding(self, seed: int) -> List[float]:
        """Draw the vector of ``size`` normal values determined by ``seed``."""
        # A dedicated generator seeded per call yields the same values as
        # seeding the global NumPy RNG, without clobbering that global state
        # for the rest of the program.
        rng = np.random.RandomState(seed)
        return list(rng.normal(size=self.size))

    @staticmethod
    def _get_seed(text: str) -> int:
        """
        Get a seed for the random generator, using the hash of the text.
        """
        return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return the deterministic vector of every text, in input order."""
        return [self._get_embedding(seed=self._get_seed(text)) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Return the deterministic vector for ``text``."""
        return self._get_embedding(seed=self._get_seed(text))
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user