initial commit

This commit is contained in:
2026-05-11 12:36:20 +05:30
commit 384cbe8019
15377 changed files with 2360544 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
"""Implement a GPT-3 driven browser.
Heavily influenced from https://github.com/nat/natbot
"""

View File

@@ -0,0 +1,158 @@
"""Implement an LLM driven browser."""
from __future__ import annotations
import warnings
from typing import Any
from langchain_core._api import deprecated
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import Runnable
from pydantic import ConfigDict, model_validator
from langchain_classic.chains.base import Chain
from langchain_classic.chains.natbot.prompt import PROMPT
@deprecated(
since="0.2.13",
message=(
"Importing NatBotChain from langchain is deprecated and will be removed in "
"langchain 1.0. Please import from langchain_community instead: "
"from langchain_community.chains.natbot import NatBotChain. "
"You may need to pip install -U langchain-community."
),
removal="1.0",
)
class NatBotChain(Chain):
"""Implement an LLM driven browser.
**Security Note**: This toolkit provides code to control a web-browser.
The web-browser can be used to navigate to:
- Any URL (including any internal network URLs)
- And local files
Exercise care if exposing this chain to end-users. Control who is able to
access and use this chain, and isolate the network access of the server
that hosts this chain.
See https://docs.langchain.com/oss/python/security-policy for more information.
Example:
```python
from langchain_classic.chains import NatBotChain
natbot = NatBotChain.from_default("Buy me a new hat.")
```
"""
llm_chain: Runnable
objective: str
"""Objective that NatBot is tasked with completing."""
llm: BaseLanguageModel | None = None
"""[Deprecated] LLM wrapper to use."""
input_url_key: str = "url"
input_browser_content_key: str = "browser_content"
previous_command: str = ""
output_key: str = "command"
model_config = ConfigDict(
arbitrary_types_allowed=True,
extra="forbid",
)
@model_validator(mode="before")
@classmethod
def _raise_deprecation(cls, values: dict) -> Any:
if "llm" in values:
warnings.warn(
"Directly instantiating an NatBotChain with an llm is deprecated. "
"Please instantiate with llm_chain argument or using the from_llm "
"class method.",
stacklevel=5,
)
if "llm_chain" not in values and values["llm"] is not None:
values["llm_chain"] = PROMPT | values["llm"] | StrOutputParser()
return values
@classmethod
def from_default(cls, objective: str, **kwargs: Any) -> NatBotChain:
"""Load with default LLMChain."""
msg = (
"This method is no longer implemented. Please use from_llm."
"model = OpenAI(temperature=0.5, best_of=10, n=3, max_tokens=50)"
"For example, NatBotChain.from_llm(model, objective)"
)
raise NotImplementedError(msg)
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
objective: str,
**kwargs: Any,
) -> NatBotChain:
"""Load from LLM."""
llm_chain = PROMPT | llm | StrOutputParser()
return cls(llm_chain=llm_chain, objective=objective, **kwargs)
@property
def input_keys(self) -> list[str]:
"""Expect url and browser content."""
return [self.input_url_key, self.input_browser_content_key]
@property
def output_keys(self) -> list[str]:
"""Return command."""
return [self.output_key]
def _call(
self,
inputs: dict[str, str],
run_manager: CallbackManagerForChainRun | None = None,
) -> dict[str, str]:
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
url = inputs[self.input_url_key]
browser_content = inputs[self.input_browser_content_key]
llm_cmd = self.llm_chain.invoke(
{
"objective": self.objective,
"url": url[:100],
"previous_command": self.previous_command,
"browser_content": browser_content[:4500],
},
config={"callbacks": _run_manager.get_child()},
)
llm_cmd = llm_cmd.strip()
self.previous_command = llm_cmd
return {self.output_key: llm_cmd}
def execute(self, url: str, browser_content: str) -> str:
"""Figure out next browser command to run.
Args:
url: URL of the site currently on.
browser_content: Content of the page as currently displayed by the browser.
Returns:
Next browser command to run.
Example:
```python
browser_content = "...."
llm_command = natbot.run("www.google.com", browser_content)
```
"""
_inputs = {
self.input_url_key: url,
self.input_browser_content_key: browser_content,
}
return self(_inputs)[self.output_key]
@property
def _chain_type(self) -> str:
return "nat_bot_chain"

View File

@@ -0,0 +1,479 @@
import logging
import time
from sys import platform
from typing import (
TYPE_CHECKING,
Any,
TypedDict,
)
if TYPE_CHECKING:
from playwright.sync_api import Browser, CDPSession, Page
logger = logging.getLogger(__name__)
black_listed_elements: set[str] = {
"html",
"head",
"title",
"meta",
"iframe",
"body",
"script",
"style",
"path",
"svg",
"br",
"::marker",
}
class ElementInViewPort(TypedDict):
"""A typed dictionary containing information about elements in the viewport."""
node_index: str
backend_node_id: int
node_name: str | None
node_value: str | None
node_meta: list[str]
is_clickable: bool
origin_x: int
origin_y: int
center_x: int
center_y: int
class Crawler:
"""A crawler for web pages.
**Security Note**: This is an implementation of a crawler that uses a browser via
Playwright.
This crawler can be used to load arbitrary webpages INCLUDING content
from the local file system.
Control access to who can submit crawling requests and what network access
the crawler has.
Make sure to scope permissions to the minimal permissions necessary for
the application.
See https://docs.langchain.com/oss/python/security-policy for more information.
"""
def __init__(self) -> None:
"""Initialize the crawler."""
try:
from playwright.sync_api import sync_playwright
except ImportError as e:
msg = (
"Could not import playwright python package. "
"Please install it with `pip install playwright`."
)
raise ImportError(msg) from e
self.browser: Browser = (
sync_playwright().start().chromium.launch(headless=False)
)
self.page: Page = self.browser.new_page()
self.page.set_viewport_size({"width": 1280, "height": 1080})
self.page_element_buffer: dict[int, ElementInViewPort]
self.client: CDPSession
def go_to_page(self, url: str) -> None:
"""Navigate to the given URL.
Args:
url: The URL to navigate to. If it does not contain a scheme, it will be
prefixed with "http://".
"""
self.page.goto(url=url if "://" in url else "http://" + url)
self.client = self.page.context.new_cdp_session(self.page)
self.page_element_buffer = {}
def scroll(self, direction: str) -> None:
"""Scroll the page in the given direction.
Args:
direction: The direction to scroll in, either "up" or "down".
"""
if direction == "up":
self.page.evaluate(
"(document.scrollingElement || document.body).scrollTop = "
"(document.scrollingElement || document.body).scrollTop - "
"window.innerHeight;"
)
elif direction == "down":
self.page.evaluate(
"(document.scrollingElement || document.body).scrollTop = "
"(document.scrollingElement || document.body).scrollTop + "
"window.innerHeight;"
)
def click(self, id_: str | int) -> None:
"""Click on an element with the given id.
Args:
id_: The id of the element to click on.
"""
# Inject javascript into the page which removes the target= attribute from links
js = """
links = document.getElementsByTagName("a");
for (var i = 0; i < links.length; i++) {
links[i].removeAttribute("target");
}
"""
self.page.evaluate(js)
element = self.page_element_buffer.get(int(id_))
if element:
x: float = element["center_x"]
y: float = element["center_y"]
self.page.mouse.click(x, y)
else:
print("Could not find element") # noqa: T201
def type(self, id_: str | int, text: str) -> None:
"""Type text into an element with the given id.
Args:
id_: The id of the element to type into.
text: The text to type into the element.
"""
self.click(id_)
self.page.keyboard.type(text)
def enter(self) -> None:
"""Press the Enter key."""
self.page.keyboard.press("Enter")
def crawl(self) -> list[str]:
"""Crawl the current page.
Returns:
A list of the elements in the viewport.
"""
page = self.page
page_element_buffer = self.page_element_buffer
start = time.time()
page_state_as_text = []
device_pixel_ratio: float = page.evaluate("window.devicePixelRatio")
if platform == "darwin" and device_pixel_ratio == 1: # lies
device_pixel_ratio = 2
win_upper_bound: float = page.evaluate("window.pageYOffset")
win_left_bound: float = page.evaluate("window.pageXOffset")
win_width: float = page.evaluate("window.screen.width")
win_height: float = page.evaluate("window.screen.height")
win_right_bound: float = win_left_bound + win_width
win_lower_bound: float = win_upper_bound + win_height
# percentage_progress_start = (win_upper_bound / document_scroll_height) * 100
# percentage_progress_end = (
# (win_height + win_upper_bound) / document_scroll_height
# ) * 100
percentage_progress_start = 1
percentage_progress_end = 2
page_state_as_text.append(
{
"x": 0,
"y": 0,
"text": f"[scrollbar {percentage_progress_start:0.2f}-"
f"{percentage_progress_end:0.2f}%]",
}
)
tree = self.client.send(
"DOMSnapshot.captureSnapshot",
{"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
)
strings: dict[int, str] = tree["strings"]
document: dict[str, Any] = tree["documents"][0]
nodes: dict[str, Any] = document["nodes"]
backend_node_id: dict[int, int] = nodes["backendNodeId"]
attributes: dict[int, dict[int, Any]] = nodes["attributes"]
node_value: dict[int, int] = nodes["nodeValue"]
parent: dict[int, int] = nodes["parentIndex"]
node_names: dict[int, int] = nodes["nodeName"]
is_clickable: set[int] = set(nodes["isClickable"]["index"])
input_value: dict[str, Any] = nodes["inputValue"]
input_value_index: list[int] = input_value["index"]
input_value_values: list[int] = input_value["value"]
layout: dict[str, Any] = document["layout"]
layout_node_index: list[int] = layout["nodeIndex"]
bounds: dict[int, list[float]] = layout["bounds"]
cursor: int = 0
child_nodes: dict[str, list[dict[str, Any]]] = {}
elements_in_view_port: list[ElementInViewPort] = []
anchor_ancestry: dict[str, tuple[bool, int | None]] = {"-1": (False, None)}
button_ancestry: dict[str, tuple[bool, int | None]] = {"-1": (False, None)}
def convert_name(
node_name: str | None,
has_click_handler: bool | None, # noqa: FBT001
) -> str:
if node_name == "a":
return "link"
if node_name == "input":
return "input"
if node_name == "img":
return "img"
if (
node_name == "button" or has_click_handler
): # found pages that needed this quirk
return "button"
return "text"
def find_attributes(
attributes: dict[int, Any], keys: list[str]
) -> dict[str, str]:
values = {}
for [key_index, value_index] in zip(*(iter(attributes),) * 2, strict=False):
if value_index < 0:
continue
key = strings[key_index]
value = strings[value_index]
if key in keys:
values[key] = value
keys.remove(key)
if not keys:
return values
return values
def add_to_hash_tree(
hash_tree: dict[str, tuple[bool, int | None]],
tag: str,
node_id: int,
node_name: str | None,
parent_id: int,
) -> tuple[bool, int | None]:
parent_id_str = str(parent_id)
if parent_id_str not in hash_tree:
parent_name = strings[node_names[parent_id]].lower()
grand_parent_id = parent[parent_id]
add_to_hash_tree(
hash_tree, tag, parent_id, parent_name, grand_parent_id
)
is_parent_desc_anchor, anchor_id = hash_tree[parent_id_str]
# even if the anchor is nested in another anchor, we set the "root" for all
# descendants to be ::Self
if node_name == tag:
value: tuple[bool, int | None] = (True, node_id)
elif (
is_parent_desc_anchor
): # reuse the parent's anchor_id (which could be much higher in the tree)
value = (True, anchor_id)
else:
value = (
False,
None,
)
# not a descendant of an anchor, most likely it will become text, an
# interactive element or discarded
hash_tree[str(node_id)] = value
return value
for index, node_name_index in enumerate(node_names):
node_parent = parent[index]
node_name: str | None = strings[node_name_index].lower()
is_ancestor_of_anchor, anchor_id = add_to_hash_tree(
anchor_ancestry, "a", index, node_name, node_parent
)
is_ancestor_of_button, button_id = add_to_hash_tree(
button_ancestry, "button", index, node_name, node_parent
)
try:
cursor = layout_node_index.index(index)
# TODO: replace this with proper cursoring, ignoring the fact this is
# O(n^2) for the moment
except ValueError:
continue
if node_name in black_listed_elements:
continue
[x, y, width, height] = bounds[cursor]
x /= device_pixel_ratio
y /= device_pixel_ratio
width /= device_pixel_ratio
height /= device_pixel_ratio
elem_left_bound = x
elem_top_bound = y
elem_right_bound = x + width
elem_lower_bound = y + height
partially_is_in_viewport = (
elem_left_bound < win_right_bound
and elem_right_bound >= win_left_bound
and elem_top_bound < win_lower_bound
and elem_lower_bound >= win_upper_bound
)
if not partially_is_in_viewport:
continue
meta_data: list[str] = []
# inefficient to grab the same set of keys for kinds of objects, but it's
# fine for now
element_attributes = find_attributes(
attributes[index], ["type", "placeholder", "aria-label", "title", "alt"]
)
ancestor_exception = is_ancestor_of_anchor or is_ancestor_of_button
ancestor_node_key = (
None
if not ancestor_exception
else str(anchor_id)
if is_ancestor_of_anchor
else str(button_id)
)
ancestor_node = (
None
if not ancestor_exception
else child_nodes.setdefault(str(ancestor_node_key), [])
)
if node_name == "#text" and ancestor_exception and ancestor_node:
text = strings[node_value[index]]
if text in {"|", ""}:
continue
ancestor_node.append({"type": "type", "value": text})
else:
if (
node_name == "input" and element_attributes.get("type") == "submit"
) or node_name == "button":
node_name = "button"
element_attributes.pop(
"type", None
) # prevent [button ... (button)..]
for key in element_attributes:
if ancestor_exception and ancestor_node:
ancestor_node.append(
{
"type": "attribute",
"key": key,
"value": element_attributes[key],
}
)
else:
meta_data.append(element_attributes[key])
element_node_value = None
if node_value[index] >= 0:
element_node_value = strings[node_value[index]]
if (
element_node_value == "|"
# commonly used as a separator, does not add much context - lets
# save ourselves some token space
):
continue
elif (
node_name == "input"
and index in input_value_index
and element_node_value is None
):
node_input_text_index = input_value_index.index(index)
text_index = input_value_values[node_input_text_index]
if node_input_text_index >= 0 and text_index >= 0:
element_node_value = strings[text_index]
# remove redundant elements
if ancestor_exception and (node_name not in {"a", "button"}):
continue
elements_in_view_port.append(
{
"node_index": str(index),
"backend_node_id": backend_node_id[index],
"node_name": node_name,
"node_value": element_node_value,
"node_meta": meta_data,
"is_clickable": index in is_clickable,
"origin_x": int(x),
"origin_y": int(y),
"center_x": int(x + (width / 2)),
"center_y": int(y + (height / 2)),
}
)
# lets filter further to remove anything that does not hold any text nor has
# click handlers + merge text from leaf#text nodes with the parent
elements_of_interest = []
id_counter = 0
for element in elements_in_view_port:
node_index = element.get("node_index")
node_name = element.get("node_name")
element_node_value = element.get("node_value")
node_is_clickable = element.get("is_clickable")
node_meta_data: list[str] | None = element.get("node_meta")
inner_text = f"{element_node_value} " if element_node_value else ""
meta = ""
if node_index in child_nodes:
for child in child_nodes[node_index]:
entry_type = child.get("type")
entry_value = child.get("value")
if entry_type == "attribute" and node_meta_data:
entry_key = child.get("key")
node_meta_data.append(f'{entry_key}="{entry_value}"')
else:
inner_text += f"{entry_value} "
if node_meta_data:
meta_string = " ".join(node_meta_data)
meta = f" {meta_string}"
if inner_text != "":
inner_text = f"{inner_text.strip()}"
converted_node_name = convert_name(node_name, node_is_clickable)
# not very elegant, more like a placeholder
if (
(converted_node_name != "button" or meta == "")
and converted_node_name not in {"link", "input", "img", "textarea"}
) and inner_text.strip() == "":
continue
page_element_buffer[id_counter] = element
if inner_text != "":
elements_of_interest.append(
f"<{converted_node_name} id={id_counter}{meta}>{inner_text}"
f"</{converted_node_name}>"
)
else:
elements_of_interest.append(
f"""<{converted_node_name} id={id_counter}{meta}/>"""
)
id_counter += 1
print(f"Parsing time: {time.time() - start:0.2f} seconds") # noqa: T201
return elements_of_interest

View File

@@ -0,0 +1,143 @@
from langchain_core.prompts.prompt import PromptTemplate
_PROMPT_TEMPLATE = """
You are an agents controlling a browser. You are given:
(1) an objective that you are trying to achieve
(2) the URL of your current web page
(3) a simplified text description of what's visible in the browser window (more on that below)
You can issue these commands:
SCROLL UP - scroll up one page
SCROLL DOWN - scroll down one page
CLICK X - click on a given element. You can only click on links, buttons, and inputs!
TYPE X "TEXT" - type the specified text into the input with id X
TYPESUBMIT X "TEXT" - same as TYPE above, except then it presses ENTER to submit the form
The format of the browser content is highly simplified; all formatting elements are stripped.
Interactive elements such as links, inputs, buttons are represented like this:
<link id=1>text</link>
<button id=2>text</button>
<input id=3>text</input>
Images are rendered as their alt text like this:
<img id=4 alt=""/>
Based on your given objective, issue whatever command you believe will get you closest to achieving your goal.
You always start on Google; you should submit a search query to Google that will take you to the best page for
achieving your objective. And then interact with that page to achieve your objective.
If you find yourself on Google and there are no search results displayed yet, you should probably issue a command
like "TYPESUBMIT 7 "search query"" to get to a more useful page.
Then, if you find yourself on a Google search results page, you might issue the command "CLICK 24" to click
on the first link in the search results. (If your previous command was a TYPESUBMIT your next command should
probably be a CLICK.)
Don't try to interact with elements that you can't see.
Here are some examples:
EXAMPLE 1:
==================================================
CURRENT BROWSER CONTENT:
------------------
<link id=1>About</link>
<link id=2>Store</link>
<link id=3>Gmail</link>
<link id=4>Images</link>
<link id=5>(Google apps)</link>
<link id=6>Sign in</link>
<img id=7 alt="(Google)"/>
<input id=8 alt="Search"></input>
<button id=9>(Search by voice)</button>
<button id=10>(Google Search)</button>
<button id=11>(I'm Feeling Lucky)</button>
<link id=12>Advertising</link>
<link id=13>Business</link>
<link id=14>How Search works</link>
<link id=15>Carbon neutral since 2007</link>
<link id=16>Privacy</link>
<link id=17>Terms</link>
<text id=18>Settings</text>
------------------
OBJECTIVE: Find a 2 bedroom house for sale in Anchorage AK for under $750k
CURRENT URL: https://www.google.com/
YOUR COMMAND:
TYPESUBMIT 8 "anchorage redfin"
==================================================
EXAMPLE 2:
==================================================
CURRENT BROWSER CONTENT:
------------------
<link id=1>About</link>
<link id=2>Store</link>
<link id=3>Gmail</link>
<link id=4>Images</link>
<link id=5>(Google apps)</link>
<link id=6>Sign in</link>
<img id=7 alt="(Google)"/>
<input id=8 alt="Search"></input>
<button id=9>(Search by voice)</button>
<button id=10>(Google Search)</button>
<button id=11>(I'm Feeling Lucky)</button>
<link id=12>Advertising</link>
<link id=13>Business</link>
<link id=14>How Search works</link>
<link id=15>Carbon neutral since 2007</link>
<link id=16>Privacy</link>
<link id=17>Terms</link>
<text id=18>Settings</text>
------------------
OBJECTIVE: Make a reservation for 4 at Dorsia at 8pm
CURRENT URL: https://www.google.com/
YOUR COMMAND:
TYPESUBMIT 8 "dorsia nyc opentable"
==================================================
EXAMPLE 3:
==================================================
CURRENT BROWSER CONTENT:
------------------
<button id=1>For Businesses</button>
<button id=2>Mobile</button>
<button id=3>Help</button>
<button id=4 alt="Language Picker">EN</button>
<link id=5>OpenTable logo</link>
<button id=6 alt ="search">Search</button>
<text id=7>Find your table for any occasion</text>
<button id=8>(Date selector)</button>
<text id=9>Sep 28, 2022</text>
<text id=10>7:00 PM</text>
<text id=11>2 people</text>
<input id=12 alt="Location, Restaurant, or Cuisine"></input>
<button id=13>Let's go</button>
<text id=14>It looks like you're in Peninsula. Not correct?</text>
<button id=15>Get current location</button>
<button id=16>Next</button>
------------------
OBJECTIVE: Make a reservation for 4 for dinner at Dorsia in New York City at 8pm
CURRENT URL: https://www.opentable.com/
YOUR COMMAND:
TYPESUBMIT 12 "dorsia new york city"
==================================================
The current browser content, objective, and current URL follow. Reply with your next command to the browser.
CURRENT BROWSER CONTENT:
------------------
{browser_content}
------------------
OBJECTIVE: {objective}
CURRENT URL: {url}
PREVIOUS COMMAND: {previous_command}
YOUR COMMAND:
""" # noqa: E501
PROMPT = PromptTemplate(
input_variables=["browser_content", "url", "previous_command", "objective"],
template=_PROMPT_TEMPLATE,
)