From 8cf8531f6f978e956469abdb8e074f0b32b95ce9 Mon Sep 17 00:00:00 2001 From: Heiko Hotz Date: Tue, 7 Oct 2025 19:40:47 +0100 Subject: [PATCH 1/4] initial commit for form agent --- data.json | 13 +++++ form.html | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++ form_agent.py | 119 +++++++++++++++++++++++++++++++++++++++ main.py | 36 ++++++++---- 4 files changed, 309 insertions(+), 12 deletions(-) create mode 100644 data.json create mode 100644 form.html create mode 100644 form_agent.py diff --git a/data.json b/data.json new file mode 100644 index 0000000..f2a9447 --- /dev/null +++ b/data.json @@ -0,0 +1,13 @@ +{ + "business_name": "Gemini Solutions", + "business_type": "llc", + "tax_id": "12-3456789", + "business_description": "Providing AI-powered solutions for everyday tasks.", + "owner_name": "Heiko Hotz", + "owner_email": "heiko.hotz@example.com", + "owner_phone": "123-456-7890", + "address_street": "123 AI Street", + "address_city": "Googleville", + "address_state": "CA", + "address_zip": "94043" +} diff --git a/form.html b/form.html new file mode 100644 index 0000000..f3276b2 --- /dev/null +++ b/form.html @@ -0,0 +1,153 @@ + + + + + + + Business Registration Form + + + + + +
+

Business Registration Form

+
+ +
+ Business Information + + + + + + + + + + + +
+ +
+ Owner's Information + + + + + + + + +
+ +
+ Business Address + + + + + + + + + + + +
+ + + +
+
+ + + + \ No newline at end of file diff --git a/form_agent.py b/form_agent.py new file mode 100644 index 0000000..c46cdc1 --- /dev/null +++ b/form_agent.py @@ -0,0 +1,119 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import json +from typing import Literal, Optional, Union, Any +from google import genai +from google.genai import types +import termcolor +from google.genai.types import ( + Part, + GenerateContentConfig, + Content, + Candidate, + FunctionResponse, + FinishReason, +) +import time +from rich.console import Console +from rich.table import Table + +from agent import BrowserAgent +from computers import EnvState, Computer + +MAX_RECENT_TURN_WITH_SCREENSHOTS = 3 +PREDEFINED_COMPUTER_USE_FUNCTIONS = [ + "open_web_browser", + "click_at", + "hover_at", + "type_text_at", + "scroll_document", + "scroll_at", + "wait_5_seconds", + "go_back", + "go_forward", + "search", + "navigate", + "key_combination", + "drag_and_drop", +] + + +console = Console() + +# Built-in Computer Use tools will return "EnvState". +# Custom provided functions will return "dict". +FunctionResponseT = Union[EnvState, dict] + + +def read_data_from_json(file_path: str) -> dict: + """Reads data from a JSON file and returns it as a dictionary.""" + with open(file_path, 'r') as f: + data = json.load(f) + return data + + +def ask_for_help(question: str) -> str: + """Asks the user for help with a specific question.""" + return input(question) + + +class FormAgent(BrowserAgent): + def __init__( + self, + browser_computer: Computer, + query: str, + model_name: str, + verbose: bool = True, + can_ask_for_help: bool = False, + ): + super().__init__(browser_computer, query, model_name, verbose) + self.can_ask_for_help = can_ask_for_help + + # Add your own custom functions here. + custom_functions = [ + types.FunctionDeclaration.from_callable( + client=self._client, callable=read_data_from_json + ) + ] + if self.can_ask_for_help: + custom_functions.append( + types.FunctionDeclaration.from_callable( + client=self._client, callable=ask_for_help + ) + ) + + self._generate_content_config = GenerateContentConfig( + temperature=1, + top_p=0.95, + top_k=40, + max_output_tokens=8192, + tools=[ + types.Tool( + computer_use=types.ComputerUse( + environment=types.Environment.ENVIRONMENT_BROWSER, + ), + ), + types.Tool(function_declarations=custom_functions), + ], + ) + + def handle_action(self, action: types.FunctionCall) -> FunctionResponseT: + """Handles the action and returns the environment state.""" + if action.name == read_data_from_json.__name__: + return read_data_from_json(action.args["file_path"]) + elif action.name == ask_for_help.__name__ and self.can_ask_for_help: + return {"response": ask_for_help(action.args["question"])} + else: + return super().handle_action(action) diff --git a/main.py b/main.py index 05d5537..4ec4c73 100644 --- a/main.py +++ b/main.py @@ -15,10 +15,11 @@ import os from agent import BrowserAgent +from form_agent import FormAgent from computers import BrowserbaseComputer, PlaywrightComputer -PLAYWRIGHT_SCREEN_SIZE = (1440, 900) +PLAYWRIGHT_SCREEN_SIZE = (1920, 1080) def main() -> int: @@ -30,6 +31,13 @@ def main() -> int: help="The query for the browser agent to execute.", ) + parser.add_argument( + "--agent", + type=str, + choices=("browser", "form"), + default="browser", + help="The agent to use.", + ) parser.add_argument( "--env", type=str, @@ -41,7 +49,7 @@ def main() -> int: "--initial_url", type=str, default="https://www.google.com", - help="The inital URL loaded for the computer.", + help="The inital URL loaded for the computer (currently only works for local playwright).", ) parser.add_argument( "--highlight_mouse", @@ -51,7 +59,7 @@ def main() -> int: ) parser.add_argument( "--model", - default='gemini-2.5-computer-use-preview-10-2025', + default='computer-use-exp', help="Set which main model to use.", ) args = parser.parse_args() @@ -63,19 +71,23 @@ def main() -> int: highlight_mouse=args.highlight_mouse, ) elif args.env == "browserbase": - env = BrowserbaseComputer( - screen_size=PLAYWRIGHT_SCREEN_SIZE, - initial_url=args.initial_url - ) + env = BrowserbaseComputer(screen_size=PLAYWRIGHT_SCREEN_SIZE) else: raise ValueError("Unknown environment: ", args.env) with env as browser_computer: - agent = BrowserAgent( - browser_computer=browser_computer, - query=args.query, - model_name=args.model, - ) + if args.agent == "form": + agent = FormAgent( + browser_computer=browser_computer, + query=args.query, + model_name=args.model, + ) + else: + agent = BrowserAgent( + browser_computer=browser_computer, + query=args.query, + model_name=args.model, + ) agent.agent_loop() return 0 From 656a95b01b2548bdc205ac96b68f062e1bbb3c4a Mon Sep 17 00:00:00 2001 From: Heiko Hotz Date: Tue, 7 Oct 2025 19:44:29 +0100 Subject: [PATCH 2/4] updated main file --- main.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index 4ec4c73..ce26e22 100644 --- a/main.py +++ b/main.py @@ -19,7 +19,7 @@ from computers import BrowserbaseComputer, PlaywrightComputer -PLAYWRIGHT_SCREEN_SIZE = (1920, 1080) +PLAYWRIGHT_SCREEN_SIZE = (1440, 900) def main() -> int: @@ -30,7 +30,6 @@ def main() -> int: required=True, help="The query for the browser agent to execute.", ) - parser.add_argument( "--agent", type=str, @@ -49,7 +48,7 @@ def main() -> int: "--initial_url", type=str, default="https://www.google.com", - help="The inital URL loaded for the computer (currently only works for local playwright).", + help="The inital URL loaded for the computer.", ) parser.add_argument( "--highlight_mouse", @@ -59,7 +58,7 @@ def main() -> int: ) parser.add_argument( "--model", - default='computer-use-exp', + default='gemini-2.5-computer-use-preview-10-2025', help="Set which main model to use.", ) args = parser.parse_args() @@ -71,7 +70,10 @@ def main() -> int: highlight_mouse=args.highlight_mouse, ) elif args.env == "browserbase": - env = BrowserbaseComputer(screen_size=PLAYWRIGHT_SCREEN_SIZE) + env = BrowserbaseComputer( + screen_size=PLAYWRIGHT_SCREEN_SIZE, + initial_url=args.initial_url + ) else: raise ValueError("Unknown environment: ", args.env) @@ -93,4 +95,4 @@ def main() -> int: if __name__ == "__main__": - main() + main() \ No newline at end of file From df73b9ec4aa54c423ac460d872d86cb5e3ccc6fa Mon Sep 17 00:00:00 2001 From: Heiko Hotz Date: Tue, 7 Oct 2025 20:09:09 +0100 Subject: [PATCH 3/4] amended README --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.md b/README.md index 73cc1ef..14f853b 100644 --- a/README.md +++ b/README.md @@ -126,10 +126,29 @@ The `main.py` script is the command-line interface (CLI) for running the browser | Argument | Description | Required | Default | Supported Environment(s) | |-|-|-|-|-| | `--query` | The natural language query for the browser agent to execute. | Yes | N/A | All | +| `--agent` | The agent to use. Must be one of `browser` or `form`. | No | `browser` | All | | `--env` | The computer use environment to use. Must be one of the following: `playwright`, or `browserbase` | No | N/A | All | | `--initial_url` | The initial URL to load when the browser starts. | No | https://www.google.com | All | | `--highlight_mouse` | If specified, the agent will attempt to highlight the mouse cursor's position in the screenshots. This is useful for visual debugging. | No | False (not highlighted) | `playwright` | +### Form Agent + +The `FormAgent` is a specialized agent for filling out web forms. It extends the `BrowserAgent` with the ability to read data from local JSON files to populate form fields. + +**Example Usage:** + +Run the `FormAgent` with a query that instructs it to open the local form, read the data, and fill out the fields: + +```bash +python main.py \ + --agent form \ + --initial_url "file://$(pwd)/form.html" \ + --query "Read the data from data.json and fill out the business registration form." +``` + +The agent will then open the local `form.html` file in the browser, read the `data.json` file, and fill in the corresponding fields. + + ### Environment Variables | Variable | Description | Required | From 76e1f7cf965eacc64085b3a00ca62a971abc1ed1 Mon Sep 17 00:00:00 2001 From: Heiko Hotz Date: Tue, 7 Oct 2025 20:42:30 +0100 Subject: [PATCH 4/4] Merge remote-tracking branch upstream/main --- main.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/main.py b/main.py index ce26e22..b1df896 100644 --- a/main.py +++ b/main.py @@ -24,12 +24,6 @@ def main() -> int: parser = argparse.ArgumentParser(description="Run the browser agent with a query.") - parser.add_argument( - "--query", - type=str, - required=True, - help="The query for the browser agent to execute.", - ) parser.add_argument( "--agent", type=str,