diff --git a/.changeset/little-weeks-worry.md b/.changeset/little-weeks-worry.md new file mode 100644 index 00000000..5bdedab5 --- /dev/null +++ b/.changeset/little-weeks-worry.md @@ -0,0 +1,13 @@ +--- +"@browserbasehq/stagehand": minor +--- + +Fixes: + +The last big change we pushed out introduced a small regression. As a result, the gray outline showing the elements Stagehand is looking at is missing. This commit fixes that. We now process selectorMap properly (using the updated type Record, +) { + return Object.fromEntries( + Object.entries(multiSelectorMap).map(([key, selectors]) => [ + Number(key), + selectors[0], + ]), + ); +} + function drawChunk(selectorMap: Record) { cleanupMarkers(); Object.entries(selectorMap).forEach(([_index, selector]) => { @@ -90,7 +102,12 @@ function setupChunkNav() { window.chunkNumber -= 1; window.scrollTo(0, window.chunkNumber * window.innerHeight); await window.waitForDomSettle(); - const { selectorMap } = await processElements(window.chunkNumber); + const { selectorMap: multiSelectorMap } = await window.processElements( + window.chunkNumber, + ); + + const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap); + drawChunk(selectorMap); setupChunkNav(); }; @@ -113,7 +130,10 @@ function setupChunkNav() { window.scrollTo(0, window.chunkNumber * window.innerHeight); await window.waitForDomSettle(); - const { selectorMap } = await processElements(window.chunkNumber); + const { selectorMap: multiSelectorMap } = await window.processElements( + window.chunkNumber, + ); + const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap); drawChunk(selectorMap); setupChunkNav(); }; diff --git a/lib/dom/process.ts b/lib/dom/process.ts index d6407b87..69188180 100644 --- a/lib/dom/process.ts +++ b/lib/dom/process.ts @@ -10,11 +10,7 @@ export function isTextNode(node: Node): node is Text { export async function processDom(chunksSeen: Array) { const { chunk, chunksArray } = await pickChunk(chunksSeen); - const { outputString, 
selectorMap } = await processElements( - chunk, - undefined, - undefined, - ); + const { outputString, selectorMap } = await processElements(chunk); console.log( `Stagehand (Browser Process): Extracted dom elements:\n${outputString}`, diff --git a/lib/dom/xpathUtils.ts b/lib/dom/xpathUtils.ts index 35a8cf02..753b06bf 100644 --- a/lib/dom/xpathUtils.ts +++ b/lib/dom/xpathUtils.ts @@ -114,7 +114,7 @@ export async function generateXPathsForElement( // This should return in order from most accurate on current page to most cachable. // Do not change the order if you are not sure what you are doing. // Contact Navid if you need help understanding it. - return [...(idBasedXPath ? [idBasedXPath] : []), standardXPath, complexXPath]; + return [standardXPath, ...(idBasedXPath ? [idBasedXPath] : []), complexXPath]; } async function generateComplexXPath(element: ChildNode): Promise { @@ -212,34 +212,28 @@ async function generateStandardXPath(element: ChildNode): Promise { const siblings = element.parentElement ? Array.from(element.parentElement.childNodes) : []; - for (let i = 0; i < siblings.length; i++) { const sibling = siblings[i]; - if ( sibling.nodeType === element.nodeType && sibling.nodeName === element.nodeName ) { index = index + 1; hasSameTypeSiblings = true; - if (sibling.isSameNode(element)) { break; } } } - // text "nodes" are selected differently than elements with xPaths if (element.nodeName !== "#text") { const tagName = element.nodeName.toLowerCase(); const pathIndex = hasSameTypeSiblings ? `[${index}]` : ""; parts.unshift(`${tagName}${pathIndex}`); } - element = element.parentElement as HTMLElement; } - - return parts.length ? `//${parts.join("//")}` : ""; + return parts.length ? 
`/${parts.join("/")}` : ""; } async function generatedIdBasedXPath( diff --git a/lib/index.ts b/lib/index.ts index 78065247..bab32fb5 100644 --- a/lib/index.ts +++ b/lib/index.ts @@ -7,7 +7,6 @@ import { AvailableModel, LLMProvider } from "./llm/LLMProvider"; import path from "path"; import { ScreenshotService } from "./vision"; import { modelsWithVision } from "./llm/LLMClient"; -import { ActionCache } from "./cache/ActionCache"; import { StagehandActHandler } from "./handlers/actHandler"; import { generateId } from "./utils"; diff --git a/lib/llm/AnthropicClient.ts b/lib/llm/AnthropicClient.ts index 5bf44e61..ae8d3c09 100644 --- a/lib/llm/AnthropicClient.ts +++ b/lib/llm/AnthropicClient.ts @@ -36,6 +36,16 @@ export class AnthropicClient implements LLMClient { async createChatCompletion( options: ChatCompletionOptions & { retries?: number }, ) { + const { image: _, ...optionsWithoutImage } = options; + this.logger({ + category: "Anthropic", + message: `Creating chat completion with options: ${JSON.stringify( + optionsWithoutImage, + null, + 2, + )}`, + level: 1, + }); // Try to get cached response const cacheOptions = { model: options.model, @@ -145,6 +155,12 @@ export class AnthropicClient implements LLMClient { temperature: options.temperature, }); + this.logger({ + category: "Anthropic", + message: `Response: ${JSON.stringify(response, null, 2)}`, + level: 1, + }); + // Parse the response here const transformedResponse = { id: response.id, diff --git a/lib/llm/OpenAIClient.ts b/lib/llm/OpenAIClient.ts index 3253f6f4..b8b32bd4 100644 --- a/lib/llm/OpenAIClient.ts +++ b/lib/llm/OpenAIClient.ts @@ -32,6 +32,16 @@ export class OpenAIClient implements LLMClient { } async createChatCompletion(options: ChatCompletionOptions) { + const { image: _, ...optionsWithoutImage } = options; + this.logger({ + category: "OpenAI", + message: `Creating chat completion with options: ${JSON.stringify( + optionsWithoutImage, + null, + 2, + )}`, + level: 1, + }); const cacheOptions = 
{ model: options.model, messages: options.messages, @@ -95,6 +105,12 @@ export class OpenAIClient implements LLMClient { response_format: responseFormat, }); + this.logger({ + category: "OpenAI", + message: `Response: ${JSON.stringify(response, null, 2)}`, + level: 1, + }); + if (response_model) { const extractedData = response.choices[0].message.content; const parsedData = JSON.parse(extractedData); diff --git a/lib/prompt.ts b/lib/prompt.ts index a6e1e4c5..604f922b 100644 --- a/lib/prompt.ts +++ b/lib/prompt.ts @@ -4,21 +4,23 @@ import { ChatMessage } from "./llm/LLMClient"; // act const actSystemPrompt = ` # Instructions -You are a browser automation assistant. Your job is to accomplish the user's goal across multiple model calls. +You are a browser automation assistant. Your job is to accomplish the user's goal across multiple model calls by running playwright commands. -You are given: +## Input +You will receive: 1. the user's overall goal 2. the steps that you've taken so far 3. a list of active DOM elements in this chunk to consider to get closer to the goal. 4. Optionally, a list of variable names that the user has provided that you may use to accomplish the goal. To use the variables, you must use the special <|VARIABLE_NAME|> syntax. -You have 2 tools that you can call: doAction, and skipSection. Do action only performs Playwright actions. Do not perform any other actions. -Note: If there is a popup on the page for cookies or advertising that has nothing to do with the goal, try to close it first before proceeding. As this can block the goal from being completed. +## Your Goal / Specification +You have 2 tools that you can call: doAction, and skipSection. Do action only performs Playwright actions. Do exactly what the user's goal is. Do not perform any other actions or exceed the scope of the goal. +If the user's goal will be accomplished after running the playwright action, set completed to true. Better to have completed set to true if your are not sure. 
-Also, verify if the goal has been accomplished already. Do this by checking if the goal has been accomplished based on the previous steps completed, the current page DOM elements and the current page URL / starting page URL. If it has, set completed to true and finish the task. +Note: If there is a popup on the page for cookies or advertising that has nothing to do with the goal, try to close it first before proceeding. As this can block the goal from being completed. -Do exactly what the user's goal is. Do not exceed the scope of the goal. +Again, if the user's goal will be accomplished after running the playwright action, set completed to true. `; const verifyActCompletionSystemPrompt = ` @@ -117,7 +119,7 @@ ${steps} ${domElements} `; - if (variables) { + if (variables && Object.keys(variables).length > 0) { actUserPrompt += ` # Variables ${Object.entries(variables)