initial commit
This commit is contained in:
@@ -0,0 +1,83 @@
|
||||
"""**Chat Loaders** load chat messages from common communications platforms.
|
||||
|
||||
Load chat messages from various
|
||||
communications platforms such as Facebook Messenger, Telegram, and
|
||||
WhatsApp. The loaded chat messages can be used for fine-tuning models.
|
||||
|
||||
**Class hierarchy:**
|
||||
|
||||
.. code-block::
|
||||
|
||||
BaseChatLoader --> <name>ChatLoader # Examples: WhatsAppChatLoader, IMessageChatLoader
|
||||
|
||||
**Main helpers:**
|
||||
|
||||
.. code-block::
|
||||
|
||||
ChatSession
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
import importlib
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_community.chat_loaders.base import (
|
||||
BaseChatLoader,
|
||||
)
|
||||
from langchain_community.chat_loaders.facebook_messenger import (
|
||||
FolderFacebookMessengerChatLoader,
|
||||
SingleFileFacebookMessengerChatLoader,
|
||||
)
|
||||
from langchain_community.chat_loaders.gmail import (
|
||||
GMailLoader,
|
||||
)
|
||||
from langchain_community.chat_loaders.imessage import (
|
||||
IMessageChatLoader,
|
||||
)
|
||||
from langchain_community.chat_loaders.langsmith import (
|
||||
LangSmithDatasetChatLoader,
|
||||
LangSmithRunChatLoader,
|
||||
)
|
||||
from langchain_community.chat_loaders.slack import (
|
||||
SlackChatLoader,
|
||||
)
|
||||
from langchain_community.chat_loaders.telegram import (
|
||||
TelegramChatLoader,
|
||||
)
|
||||
from langchain_community.chat_loaders.whatsapp import (
|
||||
WhatsAppChatLoader,
|
||||
)
|
||||
|
||||
# Public names of this package. They are resolved on demand by the
# module-level ``__getattr__`` instead of being imported eagerly.
__all__ = [
    "BaseChatLoader",
    "FolderFacebookMessengerChatLoader",
    "GMailLoader",
    "IMessageChatLoader",
    "LangSmithDatasetChatLoader",
    "LangSmithRunChatLoader",
    "SingleFileFacebookMessengerChatLoader",
    "SlackChatLoader",
    "TelegramChatLoader",
    "WhatsAppChatLoader",
]

# Maps each exported name to the dotted path of the module that is imported
# to provide it.
_module_lookup = {
    "BaseChatLoader": "langchain_core.chat_loaders",
    "FolderFacebookMessengerChatLoader": "langchain_community.chat_loaders.facebook_messenger",  # noqa: E501
    "GMailLoader": "langchain_community.chat_loaders.gmail",
    "IMessageChatLoader": "langchain_community.chat_loaders.imessage",
    "LangSmithDatasetChatLoader": "langchain_community.chat_loaders.langsmith",
    "LangSmithRunChatLoader": "langchain_community.chat_loaders.langsmith",
    "SingleFileFacebookMessengerChatLoader": "langchain_community.chat_loaders.facebook_messenger",  # noqa: E501
    "SlackChatLoader": "langchain_community.chat_loaders.slack",
    "TelegramChatLoader": "langchain_community.chat_loaders.telegram",
    "WhatsAppChatLoader": "langchain_community.chat_loaders.whatsapp",
}
|
||||
|
||||
|
||||
def __getattr__(name: str) -> Any:
    """Resolve a lazily exported attribute of this package.

    Looks ``name`` up in ``_module_lookup``, imports the module registered
    for it and returns the attribute of the same name from that module.

    Args:
        name: The attribute being requested from this module.

    Returns:
        The object called ``name`` in the registered module.

    Raises:
        AttributeError: If ``name`` is not a key of ``_module_lookup``.
    """
    module_path = _module_lookup.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    provider = importlib.import_module(module_path)
    return getattr(provider, name)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,3 @@
|
||||
from langchain_core.chat_loaders import BaseChatLoader
|
||||
|
||||
__all__ = ["BaseChatLoader"]
|
||||
@@ -0,0 +1,78 @@
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Union
|
||||
|
||||
from langchain_core.chat_loaders import BaseChatLoader
|
||||
from langchain_core.chat_sessions import ChatSession
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
# Name the logger after the module rather than the file path so that it takes
# part in the dotted logger hierarchy and can be configured via its package.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SingleFileFacebookMessengerChatLoader(BaseChatLoader):
    """Load `Facebook Messenger` chat data from a single file.

    Args:
        path (Union[Path, str]): The path to the chat file.

    """

    def __init__(self, path: Union[Path, str]) -> None:
        super().__init__()
        self.file_path = path if isinstance(path, Path) else Path(path)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy loads the chat data from the file.

        The file is parsed as JSON, its ``"messages"`` entries are ordered by
        ``"timestamp_ms"``, and every entry that has a ``"content"`` key
        becomes a ``HumanMessage`` whose ``additional_kwargs["sender"]`` is
        the entry's ``"sender_name"``. Entries without ``"content"`` are
        skipped and reported at INFO level.

        Yields:
            ChatSession: A chat session containing the loaded messages.

        """
        # Decode explicitly as UTF-8 (the JSON interchange encoding) instead
        # of depending on the platform's locale default.
        with open(self.file_path, encoding="utf-8") as f:
            data = json.load(f)
        sorted_data = sorted(data["messages"], key=lambda x: x["timestamp_ms"])
        messages = []
        for index, m in enumerate(sorted_data):
            if "content" not in m:
                # ``index`` counts positions in the sorted list, so skipped
                # entries are numbered as well.
                logger.info(
                    "Skipping Message No. %d as no content is present "
                    "in the message",
                    index + 1,
                )
                continue
            messages.append(
                HumanMessage(
                    content=m["content"], additional_kwargs={"sender": m["sender_name"]}
                )
            )
        yield ChatSession(messages=messages)
|
||||
|
||||
|
||||
class FolderFacebookMessengerChatLoader(BaseChatLoader):
    """Load `Facebook Messenger` chat data from a folder.

    Args:
        path (Union[str, Path]): The path to the directory
            containing the chat files.

    """

    def __init__(self, path: Union[str, Path]) -> None:
        super().__init__()
        if isinstance(path, str):
            path = Path(path)
        self.directory_path = path

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy loads the chat data from the folder.

        Walks every sub-directory of ``<directory_path>/inbox`` and hands each
        file with a ``.json`` suffix (case-insensitive) to
        ``SingleFileFacebookMessengerChatLoader``.

        Yields:
            ChatSession: A chat session containing the loaded messages.

        """
        inbox_path = self.directory_path / "inbox"
        for conversation_dir in inbox_path.iterdir():
            if not conversation_dir.is_dir():
                continue
            for candidate in conversation_dir.iterdir():
                if candidate.suffix.lower() != ".json":
                    continue
                single_loader = SingleFileFacebookMessengerChatLoader(path=candidate)
                yield from single_loader.lazy_load()
|
||||
117
venv/Lib/site-packages/langchain_community/chat_loaders/gmail.py
Normal file
117
venv/Lib/site-packages/langchain_community/chat_loaders/gmail.py
Normal file
@@ -0,0 +1,117 @@
|
||||
import base64
|
||||
import re
|
||||
from typing import Any, Iterator
|
||||
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.chat_loaders import BaseChatLoader
|
||||
from langchain_core.chat_sessions import ChatSession
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
|
||||
def _extract_email_content(msg: Any) -> HumanMessage:
    """Build a ``HumanMessage`` from the plain-text body of an email message.

    Args:
        msg: A message mapping where ``msg["payload"]["headers"]`` is a list
            of ``{"name": ..., "value": ...}`` entries and
            ``msg["payload"]["parts"]`` is a list of MIME parts, each with a
            ``"mimeType"`` and a URL-safe base64 ``["body"]["data"]``.

    Returns:
        HumanMessage: The text of the first ``text/plain`` part, cut off at
        the first ``On ... wrote:`` reply marker, with the ``From`` header
        stored under ``additional_kwargs["sender"]``.

    Raises:
        ValueError: If the message has no ``From`` header or no
            ``text/plain`` part.
    """
    from_email = None
    for values in msg["payload"]["headers"]:
        # No early exit: if the header is repeated, the last value wins.
        if values["name"] == "From":
            from_email = values["value"]
    if from_email is None:
        raise ValueError("Email message has no 'From' header")

    # Marks the start of quoted history: a line that starts with "On ..."
    # and ends in "wrote:". Compiled once, outside the loop over the parts.
    pattern = re.compile(r"\r\nOn .+(\r\n)*wrote:\r\n")

    # A payload without "parts" is treated like one without a text/plain
    # part, so it ends in the ValueError below instead of a KeyError.
    for part in msg["payload"].get("parts") or []:
        if part["mimeType"] == "text/plain":
            data = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
            # Keep only the text in front of the first reply marker.
            newest_response = pattern.split(data)[0]
            return HumanMessage(
                content=newest_response, additional_kwargs={"sender": from_email}
            )
    raise ValueError("Email message has no 'text/plain' part")
|
||||
|
||||
|
||||
def _get_message_data(service: Any, message: Any) -> ChatSession:
    """Pair an email with the email it replies to as a two-message session.

    Args:
        service: Client object exposing ``users().messages().get(...)`` and
            ``users().threads().get(...)``, each returning a request with an
            ``execute()`` method.
        message: Mapping with the ``"id"`` of the message to fetch.

    Returns:
        ChatSession: Session whose messages are the replied-to email followed
        by the fetched email.

    Raises:
        ValueError: If the fetched message has no ``In-Reply-To`` header, or
            no message in its thread has a ``Message-ID`` equal to it.
    """
    msg = service.users().messages().get(userId="me", id=message["id"]).execute()
    message_content = _extract_email_content(msg)

    in_reply_to = None
    for header in msg["payload"]["headers"]:
        if header["name"] == "In-Reply-To":
            in_reply_to = header["value"]
    if in_reply_to is None:
        raise ValueError("Email message has no 'In-Reply-To' header")

    thread_id = msg["threadId"]
    thread = service.users().threads().get(userId="me", id=thread_id).execute()

    response_email = None
    for thread_message in thread["messages"]:
        # Reset for every message: otherwise a message without a Message-ID
        # header would reuse the previous message's id (or hit an unbound
        # local on the first iteration).
        message_id = None
        for header in thread_message["payload"]["headers"]:
            if header["name"] == "Message-ID":
                message_id = header["value"]
        if message_id == in_reply_to:
            response_email = thread_message
    if response_email is None:
        raise ValueError(
            f"No message with Message-ID {in_reply_to} found in thread {thread_id}"
        )

    starter_content = _extract_email_content(response_email)
    return ChatSession(messages=[starter_content, message_content])
|
||||
|
||||
|
||||
@deprecated(
    since="0.0.32",
    removal="1.0",
    alternative_import="langchain_google_community.GMailLoader",
)
class GMailLoader(BaseChatLoader):
    """Load data from `GMail`.

    There are many ways you could want to load data from GMail.
    This loader is currently fairly opinionated in how to do so.
    It first lists messages that you have sent, then keeps those that
    respond to a previous email. For each of them it fetches that previous
    email and creates a training example made of the previous email
    followed by your email.

    Note that there are clear limitations here. For example,
    all examples created are only looking at the previous email for context.

    To use:

    - Set up a Google Developer Account:
        Go to the Google Developer Console, create a project,
        and enable the Gmail API for that project.
        This will give you a credentials.json file that you'll need later.

    Args:
        creds: Credentials object handed to ``build`` as ``credentials``.
        n: Value sent as ``maxResults`` when listing sent messages.
        raise_error: When true, an exception raised while processing a
            message is re-raised; otherwise that message is skipped.
    """

    def __init__(self, creds: Any, n: int = 100, raise_error: bool = False) -> None:
        super().__init__()
        self.creds = creds
        self.n = n
        self.raise_error = raise_error

    def lazy_load(self) -> Iterator[ChatSession]:
        """Yield one chat session per sent message that could be processed.

        Yields:
            ChatSession: The replied-to email followed by the sent reply.
        """
        from googleapiclient.discovery import build

        service = build("gmail", "v1", credentials=self.creds)
        listing = service.users().messages().list(
            userId="me", labelIds=["SENT"], maxResults=self.n
        )
        results = listing.execute()
        for message in results.get("messages", []):
            try:
                yield _get_message_data(service, message)
            except Exception:
                # TODO: handle errors better
                if self.raise_error:
                    raise
|
||||
@@ -0,0 +1,221 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterator, List, Optional, Union
|
||||
|
||||
from langchain_core.chat_loaders import BaseChatLoader
|
||||
from langchain_core.chat_sessions import ChatSession
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import sqlite3
|
||||
|
||||
|
||||
def nanoseconds_from_2001_to_datetime(nanoseconds: int) -> datetime:
    """Translate a nanosecond offset from January 1, 2001 into a datetime.

    Args:
        nanoseconds (int): Nanoseconds elapsed since January 1, 2001.

    Returns:
        datetime: Naive datetime lying ``nanoseconds`` after the reference
        date.
    """
    # The naive reference date is read as local time by ``timestamp()`` and
    # ``fromtimestamp()`` converts the sum back to local time.
    reference_epoch = datetime(2001, 1, 1).timestamp()
    # 1 second = 1e9 nanoseconds
    elapsed_seconds = nanoseconds / 1e9
    return datetime.fromtimestamp(reference_epoch + elapsed_seconds)
|
||||
|
||||
|
||||
class IMessageChatLoader(BaseChatLoader):
    """Load chat sessions from the `iMessage` chat.db SQLite file.

    It only works on macOS when you have iMessage enabled and have the chat.db file.

    The chat.db file is likely located at ~/Library/Messages/chat.db. However, your
    terminal may not have permission to access this file. To resolve this, you can
    copy the file to a different location, change the permissions of the file, or
    grant full disk access for your terminal emulator
    in System Settings > Security and Privacy > Full Disk Access.
    """

    def __init__(self, path: Optional[Union[str, Path]] = None):
        """
        Initialize the IMessageChatLoader.

        Args:
            path (str or Path, optional): Path to the chat.db SQLite file.
                Defaults to None, in which case the default path
                ~/Library/Messages/chat.db will be used.

        Raises:
            FileNotFoundError: If the resolved path does not exist.
            ImportError: If the ``sqlite3`` module cannot be imported.
        """
        if path is None:
            path = Path.home() / "Library" / "Messages" / "chat.db"
        self.db_path = path if isinstance(path, Path) else Path(path)
        if not self.db_path.exists():
            raise FileNotFoundError(f"File {self.db_path} not found")
        # Fail at construction time if sqlite3 is unavailable.
        try:
            import sqlite3  # noqa: F401
        except ImportError as e:
            raise ImportError(
                "The sqlite3 module is required to load iMessage chats.\n"
                "Please install it with `pip install pysqlite3`"
            ) from e

    @staticmethod
    def _parse_attributed_body(attributed_body: bytes) -> str:
        r"""
        Parse the attributedBody field of the message table
        for the text content of the message.

        The attributedBody field is a binary blob that contains
        the message content after the byte string b"NSString":

                              5 bytes      1-3 bytes    `len` bytes
        ... | b"NSString" |  preamble  |    `len`    |   contents   | ...

        The 5 preamble bytes are always b"\x01\x94\x84\x01+"

        The size of `len` is either 1 byte or 3 bytes:
        - If the first byte in `len` is b"\x81" then `len` is 3 bytes long.
          So the message length is the 2 bytes after, in little Endian.
        - Otherwise, the size of `len` is 1 byte, and the message length is
          that byte.

        Args:
            attributed_body (bytes): attributedBody field of the message table.
        Return:
            str: Text content of the message.
        """
        # Everything after the first b"NSString" marker, minus the preamble.
        content = attributed_body.split(b"NSString")[1][5:]
        length, start = content[0], 1
        if content[0] == 129:  # 0x81: a 2-byte little-endian length follows
            length, start = int.from_bytes(content[1:3], "little"), 3
        return content[start : start + length].decode("utf-8", errors="ignore")

    @staticmethod
    def _get_session_query(use_chat_handle_table: bool) -> str:
        """Return the SQL that selects the messages of one chat.

        Args:
            use_chat_handle_table (bool): Whether to reach ``handle`` through
                the ``chat_handle_join`` table instead of ``message.handle_id``.

        Returns:
            str: Query with a single ``?`` placeholder for the chat id.
        """
        # Messages sent pre OSX 12 require a join through the chat_handle_join table
        # However, the table doesn't exist if database created with OSX 12 or above.

        joins_w_chat_handle = """
            JOIN chat_handle_join ON
                 chat_message_join.chat_id = chat_handle_join.chat_id
            JOIN handle ON
                 handle.ROWID = chat_handle_join.handle_id"""

        joins_no_chat_handle = """
            JOIN handle ON message.handle_id = handle.ROWID
        """

        joins = joins_w_chat_handle if use_chat_handle_table else joins_no_chat_handle

        return f"""
            SELECT  message.date,
                    handle.id,
                    message.text,
                    message.is_from_me,
                    message.attributedBody
            FROM message
            JOIN chat_message_join ON
                 message.ROWID = chat_message_join.message_id
            {joins}
            WHERE chat_message_join.chat_id = ?
            ORDER BY message.date ASC;
        """

    def _load_single_chat_session(
        self, cursor: "sqlite3.Cursor", use_chat_handle_table: bool, chat_id: int
    ) -> ChatSession:
        """
        Load a single chat session from the iMessage chat.db.

        Args:
            cursor: SQLite cursor object.
            use_chat_handle_table (bool): Forwarded to ``_get_session_query``
                to pick the join strategy.
            chat_id (int): ID of the chat session to load.

        Returns:
            ChatSession: Loaded chat session.
        """
        results: List[HumanMessage] = []

        query = self._get_session_query(use_chat_handle_table)
        cursor.execute(query, (chat_id,))
        messages = cursor.fetchall()

        for date, sender, text, is_from_me, attributedBody in messages:
            if text:
                content = text
            elif attributedBody:
                # No plain text column: recover it from the binary blob.
                content = self._parse_attributed_body(attributedBody)
            else:  # Skip messages with no content
                continue

            results.append(
                HumanMessage(
                    role=sender,
                    content=content,
                    additional_kwargs={
                        "message_time": date,
                        "message_time_as_datetime": nanoseconds_from_2001_to_datetime(
                            date
                        ),
                        "sender": sender,
                        "is_from_me": bool(is_from_me),
                    },
                )
            )

        return ChatSession(messages=results)

    def lazy_load(self) -> Iterator[ChatSession]:
        """
        Lazy load the chat sessions from the iMessage chat.db
        and yield them in the required format.

        Yields:
            ChatSession: Loaded chat session.

        Raises:
            ValueError: If the database file cannot be opened.
        """
        import sqlite3

        try:
            conn = sqlite3.connect(self.db_path)
        except sqlite3.OperationalError as e:
            raise ValueError(
                f"Could not open iMessage DB file {self.db_path}.\n"
                "Make sure your terminal emulator has disk access to this file.\n"
                "   You can either copy the DB file to an accessible location"
                " or grant full disk access for your terminal emulator."
                "  You can grant full disk access for your terminal emulator"
                " in System Settings > Security and Privacy > Full Disk Access."
            ) from e

        # try/finally so the connection is also closed when a query fails or
        # the consumer stops iterating before the generator is exhausted.
        try:
            cursor = conn.cursor()

            # See if chat_handle_join table exists:
            query = """SELECT name FROM sqlite_master
                       WHERE type='table' AND name='chat_handle_join';"""

            cursor.execute(query)
            # fetchone() returns a row or None; reduce it to a real bool.
            is_chat_handle_join_exists = cursor.fetchone() is not None

            # Fetch the list of chat IDs sorted by time (most recent first)
            query = """SELECT chat_id
            FROM message
            JOIN chat_message_join ON message.ROWID = chat_message_join.message_id
            GROUP BY chat_id
            ORDER BY MAX(date) DESC;"""
            cursor.execute(query)
            chat_ids = [row[0] for row in cursor.fetchall()]

            for chat_id in chat_ids:
                yield self._load_single_chat_session(
                    cursor, is_chat_handle_join_exists, chat_id
                )
        finally:
            conn.close()
|
||||
@@ -0,0 +1,159 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Union, cast
|
||||
|
||||
from langchain_core.chat_loaders import BaseChatLoader
|
||||
from langchain_core.chat_sessions import ChatSession
|
||||
from langchain_core.load.load import load
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langsmith.client import Client
|
||||
from langsmith.schemas import Run
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LangSmithRunChatLoader(BaseChatLoader):
    """
    Load chat sessions from a list of LangSmith "llm" runs.

    Attributes:
        runs (Iterable[Union[str, Run]]): The list of LLM run IDs or run objects.
        client (Client): Instance of LangSmith client for fetching data.
    """

    def __init__(
        self, runs: Iterable[Union[str, Run]], client: Optional["Client"] = None
    ):
        """
        Create a loader over the given runs.

        :param runs: List of LLM run IDs or run objects.
        :param client: LangSmith client to use; when it is falsy a new
            ``Client()`` is created.
        """
        from langsmith.client import Client

        self.runs = runs
        self.client = client or Client()

    @staticmethod
    def _load_single_chat_session(llm_run: "Run") -> ChatSession:
        """
        Turn one LangSmith LLM run into a ChatSession.

        :param llm_run: The LLM run object.
        :return: The run's messages, plus a ``"functions"`` entry when the
            run carries a non-empty function list.
        """
        session = LangSmithRunChatLoader._get_messages_from_llm_run(llm_run)
        run_functions = LangSmithRunChatLoader._get_functions_from_llm_run(llm_run)
        if run_functions:
            session["functions"] = run_functions
        return session

    @staticmethod
    def _get_messages_from_llm_run(llm_run: "Run") -> ChatSession:
        """
        Collect the input messages and the first generated message of a run.

        :param llm_run: The LLM run object.
        :return: ChatSession with the extracted messages.
        :raises ValueError: If the run is not of type ``llm``, has no
            ``messages`` input, or has no outputs yet.
        """
        if llm_run.run_type != "llm":
            raise ValueError(f"Expected run of type llm. Got: {llm_run.run_type}")
        if "messages" not in llm_run.inputs:
            raise ValueError(f"Run has no 'messages' inputs. Got {llm_run.inputs}")
        if not llm_run.outputs:
            raise ValueError("Cannot convert pending run")
        prompt_messages = load(llm_run.inputs)["messages"]
        first_generation = load(llm_run.outputs)["generations"][0]
        reply = first_generation["message"]
        return ChatSession(messages=prompt_messages + [reply])

    @staticmethod
    def _get_functions_from_llm_run(llm_run: "Run") -> Optional[List[Dict]]:
        """
        Read the function definitions stored with a run, if any.

        :param llm_run: The LLM run object.
        :return: ``extra["invocation_params"]["functions"]`` or None when
            any level is missing.
        :raises ValueError: If the run is not of type ``llm``.
        """
        if llm_run.run_type != "llm":
            raise ValueError(f"Expected run of type llm. Got: {llm_run.run_type}")
        extra = llm_run.extra or {}
        invocation_params = extra.get("invocation_params", {})
        return invocation_params.get("functions")

    def lazy_load(self) -> Iterator[ChatSession]:
        """
        Lazy load the chat sessions from the iterable of run IDs.

        Items that have an ``id`` attribute are used as runs directly; any
        other item is passed to ``client.read_run``. A run that fails with
        ``ValueError`` is logged as a warning and skipped.

        :return: Iterator of chat sessions containing messages.
        """
        from langsmith.schemas import Run

        for run_obj in self.runs:
            try:
                run = run_obj
                if not hasattr(run_obj, "id"):
                    run = self.client.read_run(run_obj)
                yield self._load_single_chat_session(cast(Run, run))
            except ValueError as e:
                logger.warning(f"Could not load run {run_obj}: {repr(e)}")
|
||||
|
||||
|
||||
class LangSmithDatasetChatLoader(BaseChatLoader):
    """
    Load chat sessions from a LangSmith dataset with the "chat" data type.

    Attributes:
        dataset_name (str): The name of the LangSmith dataset.
        client (Client): Instance of LangSmith client for fetching data.
    """

    def __init__(self, *, dataset_name: str, client: Optional["Client"] = None):
        """
        Initialize a new LangSmithDatasetChatLoader instance.

        :param dataset_name: The name of the LangSmith dataset.
        :param client: LangSmith client to use; when it is falsy a new
            ``Client()`` is created.
        :raises ImportError: If ``langsmith`` cannot be imported.
        """
        try:
            from langsmith.client import Client
        except ImportError as e:
            raise ImportError(
                "The LangSmith client is required to load LangSmith datasets.\n"
                "Please install it with `pip install langsmith`"
            ) from e

        self.dataset_name = dataset_name
        self.client = client or Client()

    def lazy_load(self) -> Iterator[ChatSession]:
        """
        Lazy load the chat sessions from the specified LangSmith dataset.

        Each data point returned by ``read_dataset_openai_finetuning`` is
        converted on the fly: its ``"messages"`` dicts go through
        ``convert_dict_to_message`` and its ``"functions"`` value is passed
        along unchanged.

        :return: Iterator of chat sessions containing messages.
        """
        from langchain_community.adapters import openai as oai_adapter

        data = self.client.read_dataset_openai_finetuning(
            dataset_name=self.dataset_name
        )
        for data_point in data:
            converted = []
            for raw_message in data_point.get("messages", []):
                converted.append(oai_adapter.convert_dict_to_message(raw_message))
            yield ChatSession(
                messages=converted,
                functions=data_point.get("functions"),
            )
|
||||
@@ -0,0 +1,87 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, List, Union
|
||||
|
||||
from langchain_core.chat_loaders import BaseChatLoader
|
||||
from langchain_core.chat_sessions import ChatSession
|
||||
from langchain_core.messages import AIMessage, HumanMessage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SlackChatLoader(BaseChatLoader):
    """Load `Slack` conversations from a dump zip file."""

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """
        Set up the loader for an exported Slack dump zip file.

        :param path: Path to the exported Slack dump zip file.
        :raises FileNotFoundError: If ``path`` does not exist.
        """
        self.zip_path = path if isinstance(path, Path) else Path(path)
        if not self.zip_path.exists():
            raise FileNotFoundError(f"File {self.zip_path} not found")

    @staticmethod
    def _load_single_chat_session(messages: List[Dict]) -> ChatSession:
        """Convert the raw message dicts of one file into a chat session.

        Entries that are not dicts, have no ``"user"``, or start with a
        "has joined the channel" notice are dropped. Consecutive messages
        from the same user are merged into one message.

        :param messages: Parsed JSON entries of one file.
        :return: Chat session built from the kept entries.
        """
        # NOTE(review): only matches user ids written as "U" followed by
        # digits - confirm that this covers the ids found in real exports.
        join_notice = re.compile(
            r"<@U\d+> has joined the channel", flags=re.IGNORECASE
        )
        results: List[Union[AIMessage, HumanMessage]] = []
        last_sender = None
        for entry in messages:
            if not isinstance(entry, dict):
                continue
            sender = entry.get("user", "")
            if not sender:
                continue
            text = entry.get("text", "")
            timestamp = entry.get("ts", "")
            if join_notice.match(text):
                continue
            if sender != last_sender:
                results.append(
                    HumanMessage(
                        role=sender,
                        content=text,
                        additional_kwargs={
                            "sender": sender,
                            "events": [{"message_time": timestamp}],
                        },
                    )
                )
            else:
                # Same author as the previous kept message: extend it.
                results[-1].content += "\n\n" + text
                results[-1].additional_kwargs["events"].append(
                    {"message_time": timestamp}
                )
            last_sender = sender
        return ChatSession(messages=results)

    @staticmethod
    def _read_json(zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
        """Parse one member of the zip archive as a JSON list.

        :raises ValueError: If the parsed document is not a list.
        """
        with zip_file.open(file_path, "r") as member:
            parsed = json.load(member)
        if isinstance(parsed, list):
            return parsed
        raise ValueError(f"Expected list of dictionaries, got {type(parsed)}")

    def lazy_load(self) -> Iterator[ChatSession]:
        """
        Lazy load the chat sessions from the Slack dump file and yield them
        in the required format.

        One session is produced per archive member whose name ends in
        ``.json``.

        :return: Iterator of chat sessions containing messages.
        """
        with zipfile.ZipFile(str(self.zip_path), "r") as archive:
            for member_name in archive.namelist():
                if not member_name.endswith(".json"):
                    continue
                raw_messages = self._read_json(archive, member_name)
                yield self._load_single_chat_session(raw_messages)
|
||||
@@ -0,0 +1,155 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Iterator, List, Union
|
||||
|
||||
from langchain_core.chat_loaders import BaseChatLoader
|
||||
from langchain_core.chat_sessions import ChatSession
|
||||
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TelegramChatLoader(BaseChatLoader):
|
||||
"""Load `telegram` conversations to LangChain chat messages.
|
||||
|
||||
To export, use the Telegram Desktop app from
|
||||
https://desktop.telegram.org/, select a conversation, click the three dots
|
||||
in the top right corner, and select "Export chat history". Then select
|
||||
"Machine-readable JSON" (preferred) to export. Note: the 'lite' versions of
|
||||
the desktop app (like "Telegram for MacOS") do not support exporting chat
|
||||
history.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: Union[str, Path],
|
||||
):
|
||||
"""Initialize the TelegramChatLoader.
|
||||
|
||||
Args:
|
||||
path (Union[str, Path]): Path to the exported Telegram chat zip,
|
||||
directory, json, or HTML file.
|
||||
"""
|
||||
self.path = path if isinstance(path, str) else str(path)
|
||||
|
||||
@staticmethod
|
||||
def _load_single_chat_session_html(file_path: str) -> ChatSession:
|
||||
"""Load a single chat session from an HTML file.
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the HTML file.
|
||||
|
||||
Returns:
|
||||
ChatSession: The loaded chat session.
|
||||
"""
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install the 'beautifulsoup4' package to load"
|
||||
" Telegram HTML files. You can do this by running"
|
||||
"'pip install beautifulsoup4' in your terminal."
|
||||
)
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
soup = BeautifulSoup(file, "html.parser")
|
||||
|
||||
results: List[Union[HumanMessage, AIMessage]] = []
|
||||
previous_sender = None
|
||||
for message in soup.select(".message.default"):
|
||||
timestamp = message.select_one(".pull_right.date.details")["title"] # type: ignore[index]
|
||||
from_name_element = message.select_one(".from_name")
|
||||
if from_name_element is None and previous_sender is None:
|
||||
logger.debug("from_name not found in message")
|
||||
continue
|
||||
elif from_name_element is None:
|
||||
from_name = previous_sender
|
||||
else:
|
||||
from_name = from_name_element.text.strip()
|
||||
text = message.select_one(".text").text.strip() # type: ignore[union-attr]
|
||||
results.append(
|
||||
HumanMessage(
|
||||
content=text,
|
||||
additional_kwargs={
|
||||
"sender": from_name,
|
||||
"events": [{"message_time": timestamp}],
|
||||
},
|
||||
)
|
||||
)
|
||||
previous_sender = from_name
|
||||
|
||||
return ChatSession(messages=results)
|
||||
|
||||
@staticmethod
|
||||
def _load_single_chat_session_json(file_path: str) -> ChatSession:
|
||||
"""Load a single chat session from a JSON file.
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the JSON file.
|
||||
|
||||
Returns:
|
||||
ChatSession: The loaded chat session.
|
||||
"""
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
data = json.load(file)
|
||||
|
||||
messages = data.get("messages", [])
|
||||
results: List[BaseMessage] = []
|
||||
for message in messages:
|
||||
text = message.get("text", "")
|
||||
timestamp = message.get("date", "")
|
||||
from_name = message.get("from", "")
|
||||
if from_name is None:
|
||||
from_name = "Deleted Account"
|
||||
|
||||
results.append(
|
||||
HumanMessage(
|
||||
content=text,
|
||||
additional_kwargs={
|
||||
"sender": from_name,
|
||||
"events": [{"message_time": timestamp}],
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
return ChatSession(messages=results)
|
||||
|
||||
@staticmethod
|
||||
def _iterate_files(path: str) -> Iterator[str]:
|
||||
"""Iterate over files in a directory or zip file.
|
||||
|
||||
Args:
|
||||
path (str): Path to the directory or zip file.
|
||||
|
||||
Yields:
|
||||
str: Path to each file.
|
||||
"""
|
||||
if os.path.isfile(path) and path.endswith((".html", ".json")):
|
||||
yield path
|
||||
elif os.path.isdir(path):
|
||||
for root, _, files in os.walk(path):
|
||||
for file in files:
|
||||
if file.endswith((".html", ".json")):
|
||||
yield os.path.join(root, file)
|
||||
elif zipfile.is_zipfile(path):
|
||||
with zipfile.ZipFile(path) as zip_file:
|
||||
for file in zip_file.namelist():
|
||||
if file.endswith((".html", ".json")):
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
yield zip_file.extract(file, path=temp_dir)
|
||||
|
||||
def lazy_load(self) -> Iterator[ChatSession]:
    """Lazily load every chat file found under ``self.path``.

    Each ``.html`` or ``.json`` file produced by ``_iterate_files`` is parsed
    by the matching single-file loader and yielded as its own session.

    Yields:
        ChatSession: The loaded chat session.
    """
    # Checked in this order; the first matching suffix wins.
    loaders_by_suffix = (
        (".html", self._load_single_chat_session_html),
        (".json", self._load_single_chat_session_json),
    )
    for candidate in self._iterate_files(self.path):
        for suffix, load_session in loaders_by_suffix:
            if candidate.endswith(suffix):
                yield load_session(candidate)
                break
|
||||
104
venv/Lib/site-packages/langchain_community/chat_loaders/utils.py
Normal file
104
venv/Lib/site-packages/langchain_community/chat_loaders/utils.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""Utilities for chat loaders."""
|
||||
|
||||
from copy import deepcopy
|
||||
from typing import Iterable, Iterator, List
|
||||
|
||||
from langchain_core.chat_sessions import ChatSession
|
||||
from langchain_core.messages import AIMessage, BaseMessage
|
||||
|
||||
|
||||
def merge_chat_runs_in_session(
    chat_session: ChatSession, delimiter: str = "\n\n"
) -> ChatSession:
    """Merge chat runs together in a chat session.

    A chat run is a sequence of consecutive messages of the same message type
    that share the same (non-``None``) ``"sender"`` in ``additional_kwargs``.
    Each run is collapsed into a single message whose content is the run's
    contents joined by ``delimiter`` and whose ``"events"`` list is the
    concatenation of the run's events.

    The input session and its messages are left untouched; the returned
    session contains copies.

    Args:
        chat_session: A chat session.
        delimiter: Text inserted between the contents of merged messages.

    Returns:
        A chat session with merged chat runs.

    Raises:
        ValueError: If a message's content is neither a string nor a list.
    """
    messages: List[BaseMessage] = []
    for message in chat_session["messages"]:
        content = message.content
        if isinstance(content, list):
            # Flatten block-style content: dict blocks contribute their
            # "text" entry (if any), plain strings are used verbatim.
            content = "".join(
                (part.get("text", "") or "") if isinstance(part, dict) else part
                for part in content
            )
        if not isinstance(content, str):
            raise ValueError(
                "Chat Loaders only support messages with content type string, "
                f"got {content}"
            )

        previous = messages[-1] if messages else None
        if (
            previous is not None
            and isinstance(message, type(previous))
            and previous.additional_kwargs.get("sender") is not None
            and previous.additional_kwargs["sender"]
            == message.additional_kwargs.get("sender")
        ):
            # ``previous`` is already our own copy with string content, so it
            # can be extended in place without touching the caller's data.
            previous.content = (previous.content + delimiter + content).strip()
            new_events = message.additional_kwargs.get("events") or []
            if new_events:
                existing_events = previous.additional_kwargs.get("events")
                if existing_events is None:
                    # Previously these events were appended to a throwaway
                    # list and lost when the run's first message had none.
                    previous.additional_kwargs["events"] = deepcopy(new_events)
                else:
                    existing_events.extend(deepcopy(new_events))
        else:
            # Copy first, then store the flattened text on the copy only, so
            # the caller's message keeps its original content.
            merged = deepcopy(message)
            merged.content = content
            messages.append(merged)
    return ChatSession(messages=messages)
|
||||
|
||||
|
||||
def merge_chat_runs(chat_sessions: Iterable[ChatSession]) -> Iterator[ChatSession]:
    """Merge chat runs in every session of ``chat_sessions``.

    A chat run is a sequence of messages from the same sender. Sessions are
    processed lazily, one at a time, with the default delimiter of
    ``merge_chat_runs_in_session``.

    Args:
        chat_sessions: An iterable of chat sessions.

    Yields:
        Each chat session with its chat runs merged.
    """
    yield from map(merge_chat_runs_in_session, chat_sessions)
|
||||
|
||||
|
||||
def map_ai_messages_in_session(chat_sessions: ChatSession, sender: str) -> ChatSession:
    """Convert messages from the specified 'sender' to AI messages.

    This is useful for fine-tuning the AI to adapt to your voice.

    Args:
        chat_sessions: A single chat session (the plural name is kept for
            backward compatibility with keyword callers).
        sender: Value of ``additional_kwargs["sender"]`` identifying the
            messages to convert.

    Returns:
        A new chat session in which every message from ``sender`` has been
        replaced by an ``AIMessage`` carrying the same content and a shallow
        copy of its ``additional_kwargs``; all other messages are passed
        through unchanged.
    """
    messages = []
    for message in chat_sessions["messages"]:
        if message.additional_kwargs.get("sender") == sender:
            message = AIMessage(
                content=message.content,
                additional_kwargs=message.additional_kwargs.copy(),
                # NOTE(review): ``example`` appears to be a boolean field on
                # AIMessage, so fall back to False rather than None when the
                # source message has no such attribute -- confirm against the
                # installed langchain_core version.
                example=getattr(message, "example", False),
            )
        messages.append(message)
    return ChatSession(messages=messages)
|
||||
|
||||
|
||||
def map_ai_messages(
    chat_sessions: Iterable[ChatSession], sender: str
) -> Iterator[ChatSession]:
    """Convert messages from the specified 'sender' to AI messages.

    Applies ``map_ai_messages_in_session`` to each session lazily. This is
    useful for fine-tuning the AI to adapt to your voice.

    Args:
        chat_sessions: An iterable of chat sessions.
        sender: Sender whose messages should become AI messages.

    Yields:
        Each chat session with the sender's messages converted.
    """
    yield from (
        map_ai_messages_in_session(session, sender) for session in chat_sessions
    )
|
||||
@@ -0,0 +1,119 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from typing import Iterator, List, Union
|
||||
|
||||
from langchain_core.chat_loaders import BaseChatLoader
|
||||
from langchain_core.chat_sessions import ChatSession
|
||||
from langchain_core.messages import AIMessage, HumanMessage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WhatsAppChatLoader(BaseChatLoader):
    """Load `WhatsApp` conversations from a dump zip file or directory."""

    def __init__(self, path: str):
        """Initialize the WhatsAppChatLoader.

        Args:
            path (str): Path to the exported WhatsApp chat
                zip directory, folder, or file.

        To generate the dump, open the chat, click the three dots in the top
        right corner, and select "More". Then select "Export chat" and
        choose "Without media".
        """
        self.path = path
        # Message bodies starting with one of these texts are dropped.
        ignore_lines = [
            "This message was deleted",
            "<Media omitted>",
            "image omitted",
            "Messages and calls are end-to-end encrypted. No one outside of this chat,"
            " not even WhatsApp, can read or listen to them.",
        ]
        # Each alternative may be preceded by any number of U+200E characters.
        self._ignore_lines = re.compile(
            r"(" + "|".join([r"\u200E*" + line for line in ignore_lines]) + r")",
            flags=re.IGNORECASE,
        )
        # Groups: (1) timestamp, (2) sender, (3) message text.
        self._message_line_regex = re.compile(
            r"\u200E*\[?(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)",  # noqa
            flags=re.IGNORECASE,
        )

    def _load_single_chat_session(self, file_path: str) -> ChatSession:
        """Load a single chat session from a file.

        Args:
            file_path (str): Path to the chat file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            txt = file.read()

        # Split messages by newlines, but keep multi-line messages grouped:
        # a line that does not start a new message is appended (space
        # separated) to the message currently being collected.
        chat_lines: List[str] = []
        current_message = ""
        for line in txt.split("\n"):
            if self._message_line_regex.match(line):
                if current_message:
                    chat_lines.append(current_message)
                current_message = line
            else:
                current_message += " " + line.strip()
        if current_message:
            chat_lines.append(current_message)

        results: List[Union[HumanMessage, AIMessage]] = []
        for line in chat_lines:
            result = self._message_line_regex.match(line.strip())
            if result:
                timestamp, sender, text = result.groups()
                if not self._ignore_lines.match(text.strip()):
                    results.append(
                        HumanMessage(
                            role=sender,
                            content=text,
                            additional_kwargs={
                                "sender": sender,
                                "events": [{"message_time": timestamp}],
                            },
                        )
                    )
            else:
                logger.debug("Could not parse line: %s", line)
        return ChatSession(messages=results)

    @staticmethod
    def _iterate_files(path: str) -> Iterator[str]:
        """Iterate over the chat files in a directory, zip file, or single file.

        Args:
            path (str): Path to a directory, a zip file, or a single chat file.

        Yields:
            str: The path to each file. For a directory these are the ``.txt``
            files found recursively. For a zip file these are the ``.txt``
            members, each extracted to a temporary directory that is removed
            once iteration resumes. Any other path is yielded unchanged so
            that opening it reports the usual file error.
        """
        # Local import: the module's top-level imports do not include tempfile.
        import tempfile

        if os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith(".txt"):
                        yield os.path.join(root, file)
        # The zip check must come before treating the path as a plain file:
        # an archive is itself a regular file and would otherwise be yielded
        # as-is and then fail to parse as text.
        elif zipfile.is_zipfile(path):
            with zipfile.ZipFile(path) as zip_file:
                for file in zip_file.namelist():
                    if file.endswith(".txt"):
                        # Extract into a throwaway directory instead of the
                        # current working directory; the file exists while
                        # the consumer handles the yielded path.
                        with tempfile.TemporaryDirectory() as temp_dir:
                            yield zip_file.extract(file, path=temp_dir)
        else:
            yield path

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy load the messages from the chat file(s) and yield
        them as chat sessions.

        Yields:
            ChatSession: One loaded chat session per chat file found at
            ``self.path``.
        """
        for file_path in self._iterate_files(self.path):
            yield self._load_single_chat_session(file_path)
|
||||
Reference in New Issue
Block a user