Skip to content

Commit

Permalink
Add support for o1 (#279)
Browse files Browse the repository at this point in the history
* add o1 support manually by parsing tool usage / schema validation into user message

* delete playground

* fix model names in evals

* fix google test case

* more error handling for edge cases...

* fix vanta_h test case

* error handling for log method

* better log handling

* warning is level 2

* revert trial count / concurrency

* revert model list

* cleanup

* catch on all context closing

* add new models

* make verification use 4o

* Organize evals (#272)

* remove organize-evals

* don't fail CI based on experimental score

* Feat: Add ESLint (#274)

* add eslint (#269)

* add eslint

* update vision typing

* type fixes

* update anthropic types

* update openai types

* fix type errs

* type fixes

* revert args map

* createChatCompletion typing

* eslint ignore dist

* type fixes

* cache/eval type fixes

* update expected type

* add nonsense_action

* changeset

* ignore dom build

* Add eslint edits (#273)

* add eslint

* update vision typing

* type fixes

* update anthropic types

* update openai types

* fix type errs

* type fixes

* revert args map

* createChatCompletion typing

* eslint ignore dist

* type fixes

* cache/eval type fixes

* update expected type

* add nonsense_action

* changeset

* ignore dom build

* remove casting

* add auxiliary

* more cleanup

* type cleaning

* type cleaning

* update changeset

* fix last 3 type errs

* expedia error handling

* evals type safety

* change trial count to 5 again

* o1 type safety

* more type err fixes

* build

* package lock

* Support o1

* rm unnecessary console log

* reset eval models

* add o1 to experimental evals + changeset

* rm stock_x act eval and add braintrust url to ci

---------

Co-authored-by: Navid Pour <[email protected]>
Co-authored-by: Sameel <[email protected]>
Co-authored-by: seanmcguire12 <[email protected]>
  • Loading branch information
4 people authored Dec 10, 2024
1 parent 5afa0b9 commit d6d7057
Show file tree
Hide file tree
Showing 16 changed files with 744 additions and 627 deletions.
5 changes: 5 additions & 0 deletions .changeset/flat-tigers-sleep.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": minor
---

Add support for o1-mini and o1-preview in OpenAIClient
12 changes: 11 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ jobs:

- name: Log Extract Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
extract_score=$(jq '.categories.extract' eval-summary.json)
echo "Extract category score: $extract_score%"
Expand Down Expand Up @@ -128,6 +130,8 @@ jobs:

- name: Log Act Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
act_score=$(jq '.categories.act' eval-summary.json)
echo "Act category score: $act_score%"
Expand Down Expand Up @@ -173,6 +177,8 @@ jobs:

- name: Log Observe Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
observe_score=$(jq '.categories.observe' eval-summary.json)
echo "Observe category score: $observe_score%"
Expand Down Expand Up @@ -218,6 +224,8 @@ jobs:

- name: Log Combination Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
combination_score=$(jq '.categories.combination' eval-summary.json)
echo "Combination category score: $combination_score%"
Expand All @@ -232,7 +240,7 @@ jobs:
run-experimental-evals:
runs-on: ubuntu-latest
timeout-minutes: 35
timeout-minutes: 120
needs: [run-combination-evals]
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev'
env:
Expand Down Expand Up @@ -264,6 +272,8 @@ jobs:

- name: Log Experimental Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
experimental_score=$(jq '.categories.experimental' eval-summary.json)
echo "Experimental category score: $experimental_score%"
Expand Down
125 changes: 66 additions & 59 deletions evals/index.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,11 @@ import {
import { AvailableModel } from "../types/model";
import { EvalLogger, env } from "./utils";

const models: AvailableModel[] = ["gpt-4o", "claude-3-5-sonnet-latest"];

const CATEGORIES = ["observe", "act", "combination", "extract", "experimental"];

const generateTimestamp = (): string => {
const now = new Date();
return now
.toISOString()
.replace(/[-:TZ]/g, "")
.slice(0, 14);
};

const generateExperimentName = ({
evalName,
category,
environment,
}: {
evalName?: string;
category?: string;
environment: string;
}): string => {
const timestamp = generateTimestamp();
if (evalName) {
return `${evalName}_${environment.toLowerCase()}_${timestamp}`;
}
if (category) {
return `${category}_${environment.toLowerCase()}_${timestamp}`;
}
return `all_${environment.toLowerCase()}_${timestamp}`;
};
const args = process.argv.slice(2);
let filterByCategory: string | null = null;
let filterByEvalName: string | null = null;

const generateTasksAndCategories = (): {
tasks: Record<
Expand Down Expand Up @@ -86,6 +61,62 @@ const generateTasksAndCategories = (): {

const { tasks, taskCategories } = generateTasksAndCategories();

if (args.length > 0) {
if (args[0].toLowerCase() === "category") {
filterByCategory = args[1];
if (!filterByCategory) {
console.error("Error: Category name not specified.");
process.exit(1);
}
if (!CATEGORIES.includes(filterByCategory)) {
console.error(
`Error: Invalid category "${filterByCategory}". Valid categories are: ${CATEGORIES.join(
", ",
)}`,
);
process.exit(1);
}
} else {
filterByEvalName = args[0];
if (!Object.keys(tasks).includes(filterByEvalName)) {
console.error(`Error: Evaluation "${filterByEvalName}" does not exist.`);
process.exit(1);
}
}
}

const models: AvailableModel[] =
filterByCategory === "experimental"
? ["gpt-4o", "claude-3-5-sonnet-latest", "o1-mini", "o1-preview"]
: ["gpt-4o", "claude-3-5-sonnet-latest"];

const generateTimestamp = (): string => {
const now = new Date();
return now
.toISOString()
.replace(/[-:TZ]/g, "")
.slice(0, 14);
};

const generateExperimentName = ({
evalName,
category,
environment,
}: {
evalName?: string;
category?: string;
environment: string;
}): string => {
const timestamp = generateTimestamp();
if (evalName) {
return `${evalName}_${environment.toLowerCase()}_${timestamp}`;
}
if (category) {
return `${category}_${environment.toLowerCase()}_${timestamp}`;
}
return `all_${environment.toLowerCase()}_${timestamp}`;
};

const exactMatch = (
args: EvalArgs<EvalInput, boolean | { _success: boolean }, unknown>,
): EvalResult => {
Expand Down Expand Up @@ -130,7 +161,10 @@ const errorMatch = (
};
};

const generateSummary = async (results: SummaryResult[]) => {
const generateSummary = async (
results: SummaryResult[],
experimentName: string,
) => {
const passed = results
.filter((result) => result.output._success)
.map((result) => ({
Expand Down Expand Up @@ -173,6 +207,7 @@ const generateSummary = async (results: SummaryResult[]) => {
});

const formattedSummary = {
experimentName,
passed,
failed,
categories,
Expand All @@ -186,34 +221,6 @@ const generateSummary = async (results: SummaryResult[]) => {
console.log("Evaluation summary written to eval-summary.json");
};

const args = process.argv.slice(2);
let filterByCategory: string | null = null;
let filterByEvalName: string | null = null;

if (args.length > 0) {
if (args[0].toLowerCase() === "category") {
filterByCategory = args[1];
if (!filterByCategory) {
console.error("Error: Category name not specified.");
process.exit(1);
}
if (!CATEGORIES.includes(filterByCategory)) {
console.error(
`Error: Invalid category "${filterByCategory}". Valid categories are: ${CATEGORIES.join(
", ",
)}`,
);
process.exit(1);
}
} else {
filterByEvalName = args[0];
if (!Object.keys(tasks).includes(filterByEvalName)) {
console.error(`Error: Evaluation "${filterByEvalName}" does not exist.`);
process.exit(1);
}
}
}

const generateFilteredTestcases = (): Testcase[] => {
let allTestcases = models.flatMap((model) =>
Object.keys(tasks).map((test) => ({
Expand Down Expand Up @@ -244,7 +251,7 @@ const generateFilteredTestcases = (): Testcase[] => {

if (env === "BROWSERBASE") {
allTestcases = allTestcases.filter(
(testcase) => testcase.name !== "peeler_simple",
(testcase) => !["peeler_simple", "stock_x"].includes(testcase.name),
);
}

Expand Down Expand Up @@ -328,7 +335,7 @@ const generateFilteredTestcases = (): Testcase[] => {
};
});

await generateSummary(summaryResults);
await generateSummary(summaryResults, experimentName);
} catch (error) {
console.error("Error during evaluation run:", error);
process.exit(1);
Expand Down
Empty file removed evals/playground.ts
Empty file.
30 changes: 29 additions & 1 deletion lib/handlers/actHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,19 @@ export class StagehandActHandler {
}): Promise<boolean> {
await this.waitForSettledDom(domSettleTimeoutMs);

// o1 is overkill for this task + this task uses a lot of tokens. So we switch it to 4o
let verifyLLmClient = llmClient;
if (
llmClient.modelName === "o1-mini" ||
llmClient.modelName === "o1-preview" ||
llmClient.modelName.startsWith("o1-")
) {
verifyLLmClient = this.llmProvider.getClient(
"gpt-4o",
llmClient.clientOptions,
);
}

const { selectorMap } = await this.stagehand.page.evaluate(() => {
return window.processAllOfDom();
});
Expand Down Expand Up @@ -155,7 +168,7 @@ export class StagehandActHandler {
goal: action,
steps,
llmProvider: this.llmProvider,
llmClient,
llmClient: verifyLLmClient,
screenshot: fullpageScreenshot,
domElements,
logger: this.logger,
Expand Down Expand Up @@ -1312,6 +1325,21 @@ export class StagehandActHandler {
steps,
llmClient,
domSettleTimeoutMs,
}).catch((error) => {
this.logger({
category: "action",
message:
"error verifying action completion. Assuming action completed.",
level: 1,
auxiliary: {
error: {
value: error.message,
type: "string",
},
},
});

return true;
});

if (!actionCompleted) {
Expand Down
32 changes: 16 additions & 16 deletions lib/inference.ts
Original file line number Diff line number Diff line change
Expand Up @@ -172,23 +172,23 @@ export async function extract({
}) {
type ExtractionResponse = z.infer<typeof schema>;
type MetadataResponse = z.infer<typeof metadataSchema>;
const isUsingAnthropic = llmClient.type === "anthropic";

const extractionResponse =
await llmClient.createChatCompletion<ExtractionResponse>({
messages: [
buildExtractSystemPrompt(),
buildExtractUserPrompt(instruction, domElements),
],
response_model: {
schema: schema,
name: "Extraction",
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
});
const extractionResponse = await llmClient.createChatCompletion({
messages: [
buildExtractSystemPrompt(isUsingAnthropic),
buildExtractUserPrompt(instruction, domElements, isUsingAnthropic),
],
response_model: {
schema: schema,
name: "Extraction",
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
});

const refinedResponse =
await llmClient.createChatCompletion<ExtractionResponse>({
Expand Down
3 changes: 3 additions & 0 deletions lib/llm/AnthropicClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@ import { LLMCache } from "../cache/LLMCache";
import { ChatCompletionOptions, LLMClient } from "./LLMClient";

export class AnthropicClient extends LLMClient {
public type = "anthropic" as const;
private client: Anthropic;
private cache: LLMCache | undefined;
public logger: (message: LogLine) => void;
private enableCaching: boolean;
public clientOptions: ClientOptions;

constructor(
logger: (message: LogLine) => void,
Expand All @@ -34,6 +36,7 @@ export class AnthropicClient extends LLMClient {
this.cache = cache;
this.enableCaching = enableCaching;
this.modelName = modelName;
this.clientOptions = clientOptions;
}

async createChatCompletion<T = AnthropicTransformedResponse>(
Expand Down
3 changes: 3 additions & 0 deletions lib/llm/LLMClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { ZodType } from "zod";
import {
AnthropicTransformedResponse,
AvailableModel,
ClientOptions,
ToolCall,
} from "../../types/model";
import {
Expand Down Expand Up @@ -64,8 +65,10 @@ export interface ChatCompletionOptions {
export type LLMResponse = AnthropicTransformedResponse | ChatCompletion;

export abstract class LLMClient {
public type: "openai" | "anthropic";
public modelName: AvailableModel;
public hasVision: boolean;
public clientOptions: ClientOptions;

constructor(modelName: AvailableModel) {
this.modelName = modelName;
Expand Down
2 changes: 2 additions & 0 deletions lib/llm/LLMProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ export class LLMProvider {
"gpt-4o": "openai",
"gpt-4o-mini": "openai",
"gpt-4o-2024-08-06": "openai",
"o1-mini": "openai",
"o1-preview": "openai",
"claude-3-5-sonnet-latest": "anthropic",
"claude-3-5-sonnet-20240620": "anthropic",
"claude-3-5-sonnet-20241022": "anthropic",
Expand Down
Loading

0 comments on commit d6d7057

Please sign in to comment.