initial commit
This commit is contained in:
73
venv/Lib/site-packages/langchain_text_splitters/spacy.py
Normal file
73
venv/Lib/site-packages/langchain_text_splitters/spacy.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Spacy text splitter."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from typing_extensions import override
|
||||
|
||||
from langchain_text_splitters.base import TextSplitter
|
||||
|
||||
try:
|
||||
# Type ignores needed as long as spacy doesn't support Python 3.14.
|
||||
import spacy # type: ignore[import-not-found, unused-ignore]
|
||||
from spacy.lang.en import English # type: ignore[import-not-found, unused-ignore]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from spacy.language import ( # type: ignore[import-not-found, unused-ignore]
|
||||
Language,
|
||||
)
|
||||
|
||||
_HAS_SPACY = True
|
||||
except ImportError:
|
||||
_HAS_SPACY = False
|
||||
|
||||
|
||||
class SpacyTextSplitter(TextSplitter):
    """Sentence-based text splitter backed by a spaCy pipeline.

    The input is segmented into sentences by a spaCy pipeline; the sentences
    are then handed to `_merge_splits` together with the configured separator.

    By default the `en_core_web_sm` model is used with a `max_length` of
    1_000_000 characters; pass a larger `max_length` for bigger inputs. For
    faster, but potentially less accurate splitting, use
    `pipeline='sentencizer'`.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        *,
        strip_whitespace: bool = True,
        **kwargs: Any,
    ) -> None:
        """Create the splitter and build its spaCy pipeline.

        Args:
            separator: String passed to `_merge_splits` alongside the
                sentences.
            pipeline: Name of the spaCy pipeline to load, or `'sentencizer'`.
            max_length: Forwarded to the pipeline factory.
            strip_whitespace: When true, each sentence is read from its `text`
                attribute; otherwise from `text_with_ws`.
            **kwargs: Forwarded unchanged to `TextSplitter.__init__`.
        """
        super().__init__(**kwargs)
        nlp = _make_spacy_pipeline_for_splitting(pipeline, max_length=max_length)
        self._tokenizer = nlp
        self._separator = separator
        self._strip_whitespace = strip_whitespace

    @override
    def split_text(self, text: str) -> list[str]:
        """Segment `text` into sentences and merge them via `_merge_splits`."""
        sentences = self._tokenizer(text).sents
        # Pick the sentence attribute once; the pieces stay lazily generated.
        if self._strip_whitespace:
            pieces = (sentence.text for sentence in sentences)
        else:
            pieces = (sentence.text_with_ws for sentence in sentences)
        return self._merge_splits(pieces, self._separator)
|
||||
|
||||
|
||||
def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Language:
    """Build the spaCy pipeline used for sentence segmentation.

    Args:
        pipeline: `"sentencizer"` to get a blank `English` pipeline carrying
            only the `sentencizer` component; any other value is handed to
            `spacy.load` with the `ner` and `tagger` components excluded.
        max_length: Value assigned to the pipeline's `max_length` attribute.

    Returns:
        The configured spaCy `Language` object.

    Raises:
        ImportError: If spaCy could not be imported.
    """
    if not _HAS_SPACY:
        msg = "Spacy is not installed, please install it with `pip install spacy`."
        raise ImportError(msg)

    if pipeline == "sentencizer":
        nlp: Language = English()
        nlp.add_pipe("sentencizer")
    else:
        nlp = spacy.load(pipeline, exclude=["ner", "tagger"])

    # Set the limit on both paths so that the rule-based sentencizer also
    # honours a caller-supplied `max_length`.
    nlp.max_length = max_length
    return nlp
|
||||
Reference in New Issue
Block a user