initial commit

This commit is contained in:
2026-05-11 12:36:20 +05:30
commit 384cbe8019
15377 changed files with 2360544 additions and 0 deletions

View File

@@ -0,0 +1,83 @@
"""**Chat Loaders** load chat messages from common communications platforms.
Load chat messages from various
communications platforms such as Facebook Messenger, Telegram, and
WhatsApp. The loaded chat messages can be used for fine-tuning models.
**Class hierarchy:**
.. code-block::
BaseChatLoader --> <name>ChatLoader # Examples: WhatsAppChatLoader, IMessageChatLoader
**Main helpers:**
.. code-block::
ChatSession
""" # noqa: E501
import importlib
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from langchain_community.chat_loaders.base import (
BaseChatLoader,
)
from langchain_community.chat_loaders.facebook_messenger import (
FolderFacebookMessengerChatLoader,
SingleFileFacebookMessengerChatLoader,
)
from langchain_community.chat_loaders.gmail import (
GMailLoader,
)
from langchain_community.chat_loaders.imessage import (
IMessageChatLoader,
)
from langchain_community.chat_loaders.langsmith import (
LangSmithDatasetChatLoader,
LangSmithRunChatLoader,
)
from langchain_community.chat_loaders.slack import (
SlackChatLoader,
)
from langchain_community.chat_loaders.telegram import (
TelegramChatLoader,
)
from langchain_community.chat_loaders.whatsapp import (
WhatsAppChatLoader,
)
__all__ = [
"BaseChatLoader",
"FolderFacebookMessengerChatLoader",
"GMailLoader",
"IMessageChatLoader",
"LangSmithDatasetChatLoader",
"LangSmithRunChatLoader",
"SingleFileFacebookMessengerChatLoader",
"SlackChatLoader",
"TelegramChatLoader",
"WhatsAppChatLoader",
]
_module_lookup = {
"BaseChatLoader": "langchain_core.chat_loaders",
"FolderFacebookMessengerChatLoader": "langchain_community.chat_loaders.facebook_messenger", # noqa: E501
"GMailLoader": "langchain_community.chat_loaders.gmail",
"IMessageChatLoader": "langchain_community.chat_loaders.imessage",
"LangSmithDatasetChatLoader": "langchain_community.chat_loaders.langsmith",
"LangSmithRunChatLoader": "langchain_community.chat_loaders.langsmith",
"SingleFileFacebookMessengerChatLoader": "langchain_community.chat_loaders.facebook_messenger", # noqa: E501
"SlackChatLoader": "langchain_community.chat_loaders.slack",
"TelegramChatLoader": "langchain_community.chat_loaders.telegram",
"WhatsAppChatLoader": "langchain_community.chat_loaders.whatsapp",
}
def __getattr__(name: str) -> Any:
if name in _module_lookup:
module = importlib.import_module(_module_lookup[name])
return getattr(module, name)
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@@ -0,0 +1,3 @@
from langchain_core.chat_loaders import BaseChatLoader
__all__ = ["BaseChatLoader"]

View File

@@ -0,0 +1,78 @@
import json
import logging
from pathlib import Path
from typing import Iterator, Union
from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import HumanMessage
logger = logging.getLogger(__file__)
class SingleFileFacebookMessengerChatLoader(BaseChatLoader):
"""Load `Facebook Messenger` chat data from a single file.
Args:
path (Union[Path, str]): The path to the chat file.
"""
def __init__(self, path: Union[Path, str]) -> None:
super().__init__()
self.file_path = path if isinstance(path, Path) else Path(path)
def lazy_load(self) -> Iterator[ChatSession]:
"""Lazy loads the chat data from the file.
Yields:
ChatSession: A chat session containing the loaded messages.
"""
with open(self.file_path) as f:
data = json.load(f)
sorted_data = sorted(data["messages"], key=lambda x: x["timestamp_ms"])
messages = []
for index, m in enumerate(sorted_data):
if "content" not in m:
logger.info(
f"""Skipping Message No.
{index + 1} as no content is present in the message"""
)
continue
messages.append(
HumanMessage(
content=m["content"], additional_kwargs={"sender": m["sender_name"]}
)
)
yield ChatSession(messages=messages)
class FolderFacebookMessengerChatLoader(BaseChatLoader):
"""Load `Facebook Messenger` chat data from a folder.
Args:
path (Union[str, Path]): The path to the directory
containing the chat files.
"""
def __init__(self, path: Union[str, Path]) -> None:
super().__init__()
self.directory_path = Path(path) if isinstance(path, str) else path
def lazy_load(self) -> Iterator[ChatSession]:
"""Lazy loads the chat data from the folder.
Yields:
ChatSession: A chat session containing the loaded messages.
"""
inbox_path = self.directory_path / "inbox"
for _dir in inbox_path.iterdir():
if _dir.is_dir():
for _file in _dir.iterdir():
if _file.suffix.lower() == ".json":
file_loader = SingleFileFacebookMessengerChatLoader(path=_file)
for result in file_loader.lazy_load():
yield result

View File

@@ -0,0 +1,117 @@
import base64
import re
from typing import Any, Iterator
from langchain_core._api.deprecation import deprecated
from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import HumanMessage
def _extract_email_content(msg: Any) -> HumanMessage:
from_email = None
for values in msg["payload"]["headers"]:
name = values["name"]
if name == "From":
from_email = values["value"]
if from_email is None:
raise ValueError
for part in msg["payload"]["parts"]:
if part["mimeType"] == "text/plain":
data = part["body"]["data"]
data = base64.urlsafe_b64decode(data).decode("utf-8")
# Regular expression to split the email body at the first
# occurrence of a line that starts with "On ... wrote:"
pattern = re.compile(r"\r\nOn .+(\r\n)*wrote:\r\n")
# Split the email body and extract the first part
newest_response = re.split(pattern, data)[0]
message = HumanMessage(
content=newest_response, additional_kwargs={"sender": from_email}
)
return message
raise ValueError
def _get_message_data(service: Any, message: Any) -> ChatSession:
msg = service.users().messages().get(userId="me", id=message["id"]).execute()
message_content = _extract_email_content(msg)
in_reply_to = None
email_data = msg["payload"]["headers"]
for values in email_data:
name = values["name"]
if name == "In-Reply-To":
in_reply_to = values["value"]
if in_reply_to is None:
raise ValueError
thread_id = msg["threadId"]
thread = service.users().threads().get(userId="me", id=thread_id).execute()
messages = thread["messages"]
response_email = None
for message in messages:
email_data = message["payload"]["headers"]
for values in email_data:
if values["name"] == "Message-ID":
message_id = values["value"]
if message_id == in_reply_to:
response_email = message
if response_email is None:
raise ValueError
starter_content = _extract_email_content(response_email)
return ChatSession(messages=[starter_content, message_content])
@deprecated(
since="0.0.32",
removal="1.0",
alternative_import="langchain_google_community.GMailLoader",
)
class GMailLoader(BaseChatLoader):
"""Load data from `GMail`.
There are many ways you could want to load data from GMail.
This loader is currently fairly opinionated in how to do so.
The way it does it is it first looks for all messages that you have sent.
It then looks for messages where you are responding to a previous email.
It then fetches that previous email, and creates a training example
of that email, followed by your email.
Note that there are clear limitations here. For example,
all examples created are only looking at the previous email for context.
To use:
- Set up a Google Developer Account:
Go to the Google Developer Console, create a project,
and enable the Gmail API for that project.
This will give you a credentials.json file that you'll need later.
"""
def __init__(self, creds: Any, n: int = 100, raise_error: bool = False) -> None:
super().__init__()
self.creds = creds
self.n = n
self.raise_error = raise_error
def lazy_load(self) -> Iterator[ChatSession]:
from googleapiclient.discovery import build
service = build("gmail", "v1", credentials=self.creds)
results = (
service.users()
.messages()
.list(userId="me", labelIds=["SENT"], maxResults=self.n)
.execute()
)
messages = results.get("messages", [])
for message in messages:
try:
yield _get_message_data(service, message)
except Exception as e:
# TODO: handle errors better
if self.raise_error:
raise e
else:
pass

View File

@@ -0,0 +1,221 @@
from __future__ import annotations
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, List, Optional, Union
from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import HumanMessage
if TYPE_CHECKING:
import sqlite3
def nanoseconds_from_2001_to_datetime(nanoseconds: int) -> datetime:
"""Convert nanoseconds since 2001 to a datetime object.
Args:
nanoseconds (int): Nanoseconds since January 1, 2001.
Returns:
datetime: Datetime object.
"""
# Convert nanoseconds to seconds (1 second = 1e9 nanoseconds)
timestamp_in_seconds = nanoseconds / 1e9
# The reference date is January 1, 2001, in Unix time
reference_date_seconds = datetime(2001, 1, 1).timestamp()
# Calculate the actual timestamp by adding the reference date
actual_timestamp = reference_date_seconds + timestamp_in_seconds
# Convert to a datetime object
return datetime.fromtimestamp(actual_timestamp)
class IMessageChatLoader(BaseChatLoader):
"""Load chat sessions from the `iMessage` chat.db SQLite file.
It only works on macOS when you have iMessage enabled and have the chat.db file.
The chat.db file is likely located at ~/Library/Messages/chat.db. However, your
terminal may not have permission to access this file. To resolve this, you can
copy the file to a different location, change the permissions of the file, or
grant full disk access for your terminal emulator
in System Settings > Security and Privacy > Full Disk Access.
"""
def __init__(self, path: Optional[Union[str, Path]] = None):
"""
Initialize the IMessageChatLoader.
Args:
path (str or Path, optional): Path to the chat.db SQLite file.
Defaults to None, in which case the default path
~/Library/Messages/chat.db will be used.
"""
if path is None:
path = Path.home() / "Library" / "Messages" / "chat.db"
self.db_path = path if isinstance(path, Path) else Path(path)
if not self.db_path.exists():
raise FileNotFoundError(f"File {self.db_path} not found")
try:
import sqlite3 # noqa: F401
except ImportError as e:
raise ImportError(
"The sqlite3 module is required to load iMessage chats.\n"
"Please install it with `pip install pysqlite3`"
) from e
@staticmethod
def _parse_attributed_body(attributed_body: bytes) -> str:
"""
Parse the attributedBody field of the message table
for the text content of the message.
The attributedBody field is a binary blob that contains
the message content after the byte string b"NSString":
5 bytes 1-3 bytes `len` bytes
... | b"NSString" | preamble | `len` | contents | ...
The 5 preamble bytes are always b"\x01\x94\x84\x01+"
The size of `len` is either 1 byte or 3 bytes:
- If the first byte in `len` is b"\x81" then `len` is 3 bytes long.
So the message length is the 2 bytes after, in little Endian.
- Otherwise, the size of `len` is 1 byte, and the message length is
that byte.
Args:
attributed_body (bytes): attributedBody field of the message table.
Return:
str: Text content of the message.
"""
content = attributed_body.split(b"NSString")[1][5:]
length, start = content[0], 1
if content[0] == 129:
length, start = int.from_bytes(content[1:3], "little"), 3
return content[start : start + length].decode("utf-8", errors="ignore")
@staticmethod
def _get_session_query(use_chat_handle_table: bool) -> str:
# Messages sent pre OSX 12 require a join through the chat_handle_join table
# However, the table doesn't exist if database created with OSX 12 or above.
joins_w_chat_handle = """
JOIN chat_handle_join ON
chat_message_join.chat_id = chat_handle_join.chat_id
JOIN handle ON
handle.ROWID = chat_handle_join.handle_id"""
joins_no_chat_handle = """
JOIN handle ON message.handle_id = handle.ROWID
"""
joins = joins_w_chat_handle if use_chat_handle_table else joins_no_chat_handle
return f"""
SELECT message.date,
handle.id,
message.text,
message.is_from_me,
message.attributedBody
FROM message
JOIN chat_message_join ON
message.ROWID = chat_message_join.message_id
{joins}
WHERE chat_message_join.chat_id = ?
ORDER BY message.date ASC;
"""
def _load_single_chat_session(
self, cursor: "sqlite3.Cursor", use_chat_handle_table: bool, chat_id: int
) -> ChatSession:
"""
Load a single chat session from the iMessage chat.db.
Args:
cursor: SQLite cursor object.
chat_id (int): ID of the chat session to load.
Returns:
ChatSession: Loaded chat session.
"""
results: List[HumanMessage] = []
query = self._get_session_query(use_chat_handle_table)
cursor.execute(query, (chat_id,))
messages = cursor.fetchall()
for date, sender, text, is_from_me, attributedBody in messages:
if text:
content = text
elif attributedBody:
content = self._parse_attributed_body(attributedBody)
else: # Skip messages with no content
continue
results.append(
HumanMessage(
role=sender,
content=content,
additional_kwargs={
"message_time": date,
"message_time_as_datetime": nanoseconds_from_2001_to_datetime(
date
),
"sender": sender,
"is_from_me": bool(is_from_me),
},
)
)
return ChatSession(messages=results)
def lazy_load(self) -> Iterator[ChatSession]:
"""
Lazy load the chat sessions from the iMessage chat.db
and yield them in the required format.
Yields:
ChatSession: Loaded chat session.
"""
import sqlite3
try:
conn = sqlite3.connect(self.db_path)
except sqlite3.OperationalError as e:
raise ValueError(
f"Could not open iMessage DB file {self.db_path}.\n"
"Make sure your terminal emulator has disk access to this file.\n"
" You can either copy the DB file to an accessible location"
" or grant full disk access for your terminal emulator."
" You can grant full disk access for your terminal emulator"
" in System Settings > Security and Privacy > Full Disk Access."
) from e
cursor = conn.cursor()
# See if chat_handle_join table exists:
query = """SELECT name FROM sqlite_master
WHERE type='table' AND name='chat_handle_join';"""
cursor.execute(query)
is_chat_handle_join_exists = cursor.fetchone()
# Fetch the list of chat IDs sorted by time (most recent first)
query = """SELECT chat_id
FROM message
JOIN chat_message_join ON message.ROWID = chat_message_join.message_id
GROUP BY chat_id
ORDER BY MAX(date) DESC;"""
cursor.execute(query)
chat_ids = [row[0] for row in cursor.fetchall()]
for chat_id in chat_ids:
yield self._load_single_chat_session(
cursor, is_chat_handle_join_exists, chat_id
)
conn.close()

View File

@@ -0,0 +1,159 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Union, cast
from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.load.load import load
if TYPE_CHECKING:
from langsmith.client import Client
from langsmith.schemas import Run
logger = logging.getLogger(__name__)
class LangSmithRunChatLoader(BaseChatLoader):
"""
Load chat sessions from a list of LangSmith "llm" runs.
Attributes:
runs (Iterable[Union[str, Run]]): The list of LLM run IDs or run objects.
client (Client): Instance of LangSmith client for fetching data.
"""
def __init__(
self, runs: Iterable[Union[str, Run]], client: Optional["Client"] = None
):
"""
Initialize a new LangSmithRunChatLoader instance.
:param runs: List of LLM run IDs or run objects.
:param client: An instance of LangSmith client, if not provided,
a new client instance will be created.
"""
from langsmith.client import Client
self.runs = runs
self.client = client or Client()
@staticmethod
def _load_single_chat_session(llm_run: "Run") -> ChatSession:
"""
Convert an individual LangSmith LLM run to a ChatSession.
:param llm_run: The LLM run object.
:return: A chat session representing the run's data.
"""
chat_session = LangSmithRunChatLoader._get_messages_from_llm_run(llm_run)
functions = LangSmithRunChatLoader._get_functions_from_llm_run(llm_run)
if functions:
chat_session["functions"] = functions
return chat_session
@staticmethod
def _get_messages_from_llm_run(llm_run: "Run") -> ChatSession:
"""
Extract messages from a LangSmith LLM run.
:param llm_run: The LLM run object.
:return: ChatSession with the extracted messages.
"""
if llm_run.run_type != "llm":
raise ValueError(f"Expected run of type llm. Got: {llm_run.run_type}")
if "messages" not in llm_run.inputs:
raise ValueError(f"Run has no 'messages' inputs. Got {llm_run.inputs}")
if not llm_run.outputs:
raise ValueError("Cannot convert pending run")
messages = load(llm_run.inputs)["messages"]
message_chunk = load(llm_run.outputs)["generations"][0]["message"]
return ChatSession(messages=messages + [message_chunk])
@staticmethod
def _get_functions_from_llm_run(llm_run: "Run") -> Optional[List[Dict]]:
"""
Extract functions from a LangSmith LLM run if they exist.
:param llm_run: The LLM run object.
:return: Functions from the run or None.
"""
if llm_run.run_type != "llm":
raise ValueError(f"Expected run of type llm. Got: {llm_run.run_type}")
return (llm_run.extra or {}).get("invocation_params", {}).get("functions")
def lazy_load(self) -> Iterator[ChatSession]:
"""
Lazy load the chat sessions from the iterable of run IDs.
This method fetches the runs and converts them to chat sessions on-the-fly,
yielding one session at a time.
:return: Iterator of chat sessions containing messages.
"""
from langsmith.schemas import Run
for run_obj in self.runs:
try:
if hasattr(run_obj, "id"):
run = run_obj
else:
run = self.client.read_run(run_obj)
session = self._load_single_chat_session(cast(Run, run))
yield session
except ValueError as e:
logger.warning(f"Could not load run {run_obj}: {repr(e)}")
continue
class LangSmithDatasetChatLoader(BaseChatLoader):
"""
Load chat sessions from a LangSmith dataset with the "chat" data type.
Attributes:
dataset_name (str): The name of the LangSmith dataset.
client (Client): Instance of LangSmith client for fetching data.
"""
def __init__(self, *, dataset_name: str, client: Optional["Client"] = None):
"""
Initialize a new LangSmithChatDatasetLoader instance.
:param dataset_name: The name of the LangSmith dataset.
:param client: An instance of LangSmith client; if not provided,
a new client instance will be created.
"""
try:
from langsmith.client import Client
except ImportError as e:
raise ImportError(
"The LangSmith client is required to load LangSmith datasets.\n"
"Please install it with `pip install langsmith`"
) from e
self.dataset_name = dataset_name
self.client = client or Client()
def lazy_load(self) -> Iterator[ChatSession]:
"""
Lazy load the chat sessions from the specified LangSmith dataset.
This method fetches the chat data from the dataset and
converts each data point to chat sessions on-the-fly,
yielding one session at a time.
:return: Iterator of chat sessions containing messages.
"""
from langchain_community.adapters import openai as oai_adapter
data = self.client.read_dataset_openai_finetuning(
dataset_name=self.dataset_name
)
for data_point in data:
yield ChatSession(
messages=[
oai_adapter.convert_dict_to_message(m)
for m in data_point.get("messages", [])
],
functions=data_point.get("functions"),
)

View File

@@ -0,0 +1,87 @@
import json
import logging
import re
import zipfile
from pathlib import Path
from typing import Dict, Iterator, List, Union
from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, HumanMessage
logger = logging.getLogger(__name__)
class SlackChatLoader(BaseChatLoader):
"""Load `Slack` conversations from a dump zip file."""
def __init__(
self,
path: Union[str, Path],
):
"""
Initialize the chat loader with the path to the exported Slack dump zip file.
:param path: Path to the exported Slack dump zip file.
"""
self.zip_path = path if isinstance(path, Path) else Path(path)
if not self.zip_path.exists():
raise FileNotFoundError(f"File {self.zip_path} not found")
@staticmethod
def _load_single_chat_session(messages: List[Dict]) -> ChatSession:
results: List[Union[AIMessage, HumanMessage]] = []
previous_sender = None
for message in messages:
if not isinstance(message, dict):
continue
text = message.get("text", "")
timestamp = message.get("ts", "")
sender = message.get("user", "")
if not sender:
continue
skip_pattern = re.compile(
r"<@U\d+> has joined the channel", flags=re.IGNORECASE
)
if skip_pattern.match(text):
continue
if sender == previous_sender:
results[-1].content += "\n\n" + text
results[-1].additional_kwargs["events"].append(
{"message_time": timestamp}
)
else:
results.append(
HumanMessage(
role=sender,
content=text,
additional_kwargs={
"sender": sender,
"events": [{"message_time": timestamp}],
},
)
)
previous_sender = sender
return ChatSession(messages=results)
@staticmethod
def _read_json(zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
"""Read JSON data from a zip subfile."""
with zip_file.open(file_path, "r") as f:
data = json.load(f)
if not isinstance(data, list):
raise ValueError(f"Expected list of dictionaries, got {type(data)}")
return data
def lazy_load(self) -> Iterator[ChatSession]:
"""
Lazy load the chat sessions from the Slack dump file and yield them
in the required format.
:return: Iterator of chat sessions containing messages.
"""
with zipfile.ZipFile(str(self.zip_path), "r") as zip_file:
for file_path in zip_file.namelist():
if file_path.endswith(".json"):
messages = self._read_json(zip_file, file_path)
yield self._load_single_chat_session(messages)

View File

@@ -0,0 +1,155 @@
import json
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Iterator, List, Union
from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
logger = logging.getLogger(__name__)
class TelegramChatLoader(BaseChatLoader):
"""Load `telegram` conversations to LangChain chat messages.
To export, use the Telegram Desktop app from
https://desktop.telegram.org/, select a conversation, click the three dots
in the top right corner, and select "Export chat history". Then select
"Machine-readable JSON" (preferred) to export. Note: the 'lite' versions of
the desktop app (like "Telegram for MacOS") do not support exporting chat
history.
"""
def __init__(
self,
path: Union[str, Path],
):
"""Initialize the TelegramChatLoader.
Args:
path (Union[str, Path]): Path to the exported Telegram chat zip,
directory, json, or HTML file.
"""
self.path = path if isinstance(path, str) else str(path)
@staticmethod
def _load_single_chat_session_html(file_path: str) -> ChatSession:
"""Load a single chat session from an HTML file.
Args:
file_path (str): Path to the HTML file.
Returns:
ChatSession: The loaded chat session.
"""
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError(
"Please install the 'beautifulsoup4' package to load"
" Telegram HTML files. You can do this by running"
"'pip install beautifulsoup4' in your terminal."
)
with open(file_path, "r", encoding="utf-8") as file:
soup = BeautifulSoup(file, "html.parser")
results: List[Union[HumanMessage, AIMessage]] = []
previous_sender = None
for message in soup.select(".message.default"):
timestamp = message.select_one(".pull_right.date.details")["title"] # type: ignore[index]
from_name_element = message.select_one(".from_name")
if from_name_element is None and previous_sender is None:
logger.debug("from_name not found in message")
continue
elif from_name_element is None:
from_name = previous_sender
else:
from_name = from_name_element.text.strip()
text = message.select_one(".text").text.strip() # type: ignore[union-attr]
results.append(
HumanMessage(
content=text,
additional_kwargs={
"sender": from_name,
"events": [{"message_time": timestamp}],
},
)
)
previous_sender = from_name
return ChatSession(messages=results)
@staticmethod
def _load_single_chat_session_json(file_path: str) -> ChatSession:
"""Load a single chat session from a JSON file.
Args:
file_path (str): Path to the JSON file.
Returns:
ChatSession: The loaded chat session.
"""
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)
messages = data.get("messages", [])
results: List[BaseMessage] = []
for message in messages:
text = message.get("text", "")
timestamp = message.get("date", "")
from_name = message.get("from", "")
if from_name is None:
from_name = "Deleted Account"
results.append(
HumanMessage(
content=text,
additional_kwargs={
"sender": from_name,
"events": [{"message_time": timestamp}],
},
)
)
return ChatSession(messages=results)
@staticmethod
def _iterate_files(path: str) -> Iterator[str]:
"""Iterate over files in a directory or zip file.
Args:
path (str): Path to the directory or zip file.
Yields:
str: Path to each file.
"""
if os.path.isfile(path) and path.endswith((".html", ".json")):
yield path
elif os.path.isdir(path):
for root, _, files in os.walk(path):
for file in files:
if file.endswith((".html", ".json")):
yield os.path.join(root, file)
elif zipfile.is_zipfile(path):
with zipfile.ZipFile(path) as zip_file:
for file in zip_file.namelist():
if file.endswith((".html", ".json")):
with tempfile.TemporaryDirectory() as temp_dir:
yield zip_file.extract(file, path=temp_dir)
def lazy_load(self) -> Iterator[ChatSession]:
"""Lazy load the messages from the chat file and yield them
in as chat sessions.
Yields:
ChatSession: The loaded chat session.
"""
for file_path in self._iterate_files(self.path):
if file_path.endswith(".html"):
yield self._load_single_chat_session_html(file_path)
elif file_path.endswith(".json"):
yield self._load_single_chat_session_json(file_path)

View File

@@ -0,0 +1,104 @@
"""Utilities for chat loaders."""
from copy import deepcopy
from typing import Iterable, Iterator, List
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, BaseMessage
def merge_chat_runs_in_session(
chat_session: ChatSession, delimiter: str = "\n\n"
) -> ChatSession:
"""Merge chat runs together in a chat session.
A chat run is a sequence of messages from the same sender.
Args:
chat_session: A chat session.
Returns:
A chat session with merged chat runs.
"""
messages: List[BaseMessage] = []
for message in chat_session["messages"]:
if isinstance(message.content, list):
text = ""
for content in message.content:
if isinstance(content, dict):
text += content.get("text", "") or ""
else:
text += content
message.content = text
if not isinstance(message.content, str):
raise ValueError(
"Chat Loaders only support messages with content type string, "
f"got {message.content}"
)
if not messages:
messages.append(deepcopy(message))
elif (
isinstance(message, type(messages[-1]))
and messages[-1].additional_kwargs.get("sender") is not None
and messages[-1].additional_kwargs["sender"]
== message.additional_kwargs.get("sender")
):
if not isinstance(messages[-1].content, str):
raise ValueError(
"Chat Loaders only support messages with content type string, "
f"got {messages[-1].content}"
)
messages[-1].content = (
messages[-1].content + delimiter + message.content
).strip()
messages[-1].additional_kwargs.get("events", []).extend(
message.additional_kwargs.get("events") or []
)
else:
messages.append(deepcopy(message))
return ChatSession(messages=messages)
def merge_chat_runs(chat_sessions: Iterable[ChatSession]) -> Iterator[ChatSession]:
"""Merge chat runs together.
A chat run is a sequence of messages from the same sender.
Args:
chat_sessions: A list of chat sessions.
Returns:
A list of chat sessions with merged chat runs.
"""
for chat_session in chat_sessions:
yield merge_chat_runs_in_session(chat_session)
def map_ai_messages_in_session(chat_sessions: ChatSession, sender: str) -> ChatSession:
"""Convert messages from the specified 'sender' to AI messages.
This is useful for fine-tuning the AI to adapt to your voice.
"""
messages = []
num_converted = 0
for message in chat_sessions["messages"]:
if message.additional_kwargs.get("sender") == sender:
message = AIMessage(
content=message.content,
additional_kwargs=message.additional_kwargs.copy(),
example=getattr(message, "example", None),
)
num_converted += 1
messages.append(message)
return ChatSession(messages=messages)
def map_ai_messages(
chat_sessions: Iterable[ChatSession], sender: str
) -> Iterator[ChatSession]:
"""Convert messages from the specified 'sender' to AI messages.
This is useful for fine-tuning the AI to adapt to your voice.
"""
for chat_session in chat_sessions:
yield map_ai_messages_in_session(chat_session, sender)

View File

@@ -0,0 +1,119 @@
import logging
import os
import re
import zipfile
from typing import Iterator, List, Union
from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, HumanMessage
logger = logging.getLogger(__name__)
class WhatsAppChatLoader(BaseChatLoader):
"""Load `WhatsApp` conversations from a dump zip file or directory."""
def __init__(self, path: str):
"""Initialize the WhatsAppChatLoader.
Args:
path (str): Path to the exported WhatsApp chat
zip directory, folder, or file.
To generate the dump, open the chat, click the three dots in the top
right corner, and select "More". Then select "Export chat" and
choose "Without media".
"""
self.path = path
ignore_lines = [
"This message was deleted",
"<Media omitted>",
"image omitted",
"Messages and calls are end-to-end encrypted. No one outside of this chat,"
" not even WhatsApp, can read or listen to them.",
]
self._ignore_lines = re.compile(
r"(" + "|".join([r"\u200E*" + line for line in ignore_lines]) + r")",
flags=re.IGNORECASE,
)
self._message_line_regex = re.compile(
r"\u200E*\[?(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)", # noqa
flags=re.IGNORECASE,
)
def _load_single_chat_session(self, file_path: str) -> ChatSession:
"""Load a single chat session from a file.
Args:
file_path (str): Path to the chat file.
Returns:
ChatSession: The loaded chat session.
"""
with open(file_path, "r", encoding="utf-8") as file:
txt = file.read()
# Split messages by newlines, but keep multi-line messages grouped
chat_lines: List[str] = []
current_message = ""
for line in txt.split("\n"):
if self._message_line_regex.match(line):
if current_message:
chat_lines.append(current_message)
current_message = line
else:
current_message += " " + line.strip()
if current_message:
chat_lines.append(current_message)
results: List[Union[HumanMessage, AIMessage]] = []
for line in chat_lines:
result = self._message_line_regex.match(line.strip())
if result:
timestamp, sender, text = result.groups()
if not self._ignore_lines.match(text.strip()):
results.append(
HumanMessage(
role=sender,
content=text,
additional_kwargs={
"sender": sender,
"events": [{"message_time": timestamp}],
},
)
)
else:
logger.debug(f"Could not parse line: {line}")
return ChatSession(messages=results)
@staticmethod
def _iterate_files(path: str) -> Iterator[str]:
"""Iterate over the files in a directory or zip file.
Args:
path (str): Path to the directory or zip file.
Yields:
str: The path to each file.
"""
if os.path.isfile(path):
yield path
elif os.path.isdir(path):
for root, _, files in os.walk(path):
for file in files:
if file.endswith(".txt"):
yield os.path.join(root, file)
elif zipfile.is_zipfile(path):
with zipfile.ZipFile(path) as zip_file:
for file in zip_file.namelist():
if file.endswith(".txt"):
yield zip_file.extract(file)
def lazy_load(self) -> Iterator[ChatSession]:
"""Lazy load the messages from the chat file and yield
them as chat sessions.
Yields:
Iterator[ChatSession]: The loaded chat sessions.
"""
yield self._load_single_chat_session(self.path)