browserbase · subashrijal5 · Dec 2, 2024 · Dec 2, 2024 · Dec 2, 2024 · vladionescu
diff --git a/.changeset/rich-mirrors-teach.md b/.changeset/rich-mirrors-teach.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Adds support for Gemini models
diff --git a/.env.example b/.env.example
@@ -3,4 +3,5 @@ BROWSERBASE_API_KEY=""
 BRAINTRUST_API_KEY=""
 ANTHROPIC_API_KEY=""
 HEADLESS=false
-ENABLE_CACHING=false
+ENABLE_CACHING=false
+GEMINI_API_KEY=""
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,6 +15,8 @@ jobs:
       BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
       BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
       BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+
       HEADLESS: true
       EVAL_ENV: browserbase
 
@@ -47,17 +49,14 @@ jobs:
             passed_tasks=$(jq '.passedTasks' eval-summary.json)
             failed_tasks=$(jq '.failedTasks' eval-summary.json)
             exact_match_score=$(jq '.exactMatchScore' eval-summary.json)
-
             # Count passed and failed tasks
             num_passed=$(echo "$passed_tasks" | jq 'length')
             num_failed=$(echo "$failed_tasks" | jq 'length')
-
             # Output summary
             echo "Total number of evals: $total_tasks"
             echo "Number of evals that passed: $num_passed"
             echo "Number of evals that failed: $num_failed"
             echo "Exact match score: $exact_match_score%"
-
             # Output passing evals
             if [ "$num_passed" -gt 0 ]; then
               echo ""
@@ -70,7 +69,6 @@ jobs:
                 echo "model: $model"
               done
             fi
-
             # Output failing evals
             if [ "$num_failed" -gt 0 ]; then
               echo ""

diff --git a/examples/example.ts b/examples/example.ts
@@ -9,7 +9,12 @@ async function example() {
     enableCaching: false,
   });
 
-  await stagehand.init();
+  await stagehand.init({
+    modelName: "gemini-1.5-flash",
+    modelClientOptions: {
+      apiKey: process.env.GEMINI_API_KEY,
+    },
+  });
   await stagehand.page.goto("https://github.com/browserbase/stagehand");
   await stagehand.act({ action: "click on the contributors" });
   const contributor = await stagehand.extract({

diff --git a/lib/llm/GoogleClient.ts b/lib/llm/GoogleClient.ts
@@ -0,0 +1,329 @@
+import {
+  GenerativeModel,
+  GenerationConfig,
+  Content,
+  GoogleGenerativeAI,
+} from "@google/generative-ai";
+import { zodToJsonSchema } from "zod-to-json-schema";
+import { LogLine } from "../../types/log";
+import { AvailableModel } from "../../types/model";
+import { LLMCache } from "../cache/LLMCache";
+import { ChatCompletionOptions, LLMClient } from "./LLMClient";
+
+/**
+ * LLM client that sends chat completions to Google's Generative AI SDK.
+ *
+ * Requests are converted to the SDK's `generateContent` shape and replies are
+ * reshaped into an OpenAI-style chat-completion object. When a
+ * `response_model` is supplied, the arguments of the first function call are
+ * returned instead.
+ */
+export class GoogleClient extends LLMClient {
+  private client: GoogleGenerativeAI;
+  private model: GenerativeModel;
+  private cache: LLMCache | undefined;
+  public logger: (message: LogLine) => void;
+  private enableCaching: boolean;
+
+  /**
+   * @param apiKey - API key passed to `GoogleGenerativeAI`.
+   * @param logger - Receives structured log lines.
+   * @param enableCaching - Whether responses are read from / written to `cache`.
+   * @param cache - Optional response cache; caching is skipped when absent.
+   * @param modelName - Model identifier passed to `getGenerativeModel`.
+   */
+  constructor(
+    apiKey: string,
+    logger: (message: LogLine) => void,
+    enableCaching = false,
+    cache: LLMCache | undefined,
+    modelName: AvailableModel,
+  ) {
+    super(modelName);
+    this.client = new GoogleGenerativeAI(apiKey);
+
+    this.model = this.client.getGenerativeModel({ model: modelName });
+    this.logger = logger;
+    this.cache = cache;
+    this.enableCaching = enableCaching;
+    this.modelName = modelName;
+  }
+
+  /**
+   * Sends a chat completion request to the configured model.
+   *
+   * @param options - Messages, optional image, tools, response model and
+   *   sampling settings. `retries` counts the attempts already made after a
+   *   `response_model` request came back without a function call.
+   * @returns The function-call arguments when `response_model` is set,
+   *   otherwise an OpenAI-style chat-completion object. On a cache hit the
+   *   cached value is returned as-is.
+   * @throws Error when `response_model` is set and no function call is
+   *   returned after five retries. Errors are logged and rethrown.
+   */
+  async createChatCompletion(
+    options: ChatCompletionOptions & { retries?: number },
+  ): Promise<any> {
+    // Remove image from options for logging
+    const { image: _, ...optionsWithoutImage } = options;
+    this.logger({
+      category: "google",
+      message: "creating chat completion",
+      level: 1,
+      auxiliary: {
+        options: {
+          value: JSON.stringify(optionsWithoutImage),
+          type: "object",
+        },
+      },
+    });
+
+    // Prepare cache options
+    const cacheOptions = {
+      model: this.modelName,
+      messages: options.messages,
+      temperature: options.temperature,
+      image: options.image,
+      response_model: options.response_model,
+      tools: options.tools,
+      retries: options.retries,
+    };
+
+    // Caching needs both the flag and a cache instance: the constructor
+    // accepts `cache: undefined`, so never dereference it unguarded.
+    const cache = this.enableCaching ? this.cache : undefined;
+
+    // Check cache
+    if (cache) {
+      const cachedResponse = await cache.get(cacheOptions, options.requestId);
+      if (cachedResponse) {
+        this.logger({
+          category: "llm_cache",
+          message: "LLM cache hit - returning cached response",
+          level: 1,
+          auxiliary: {
+            cachedResponse: {
+              value: JSON.stringify(cachedResponse),
+              type: "object",
+            },
+          },
+        });
+        return cachedResponse;
+      }
+    }
+
+    // Split the system prompt from the conversation turns
+    const systemMessage = options.messages.find((msg) => msg.role === "system");
+    const userMessages = options.messages.filter(
+      (msg) => msg.role !== "system",
+    );
+
+    // Prepare content for Google AI; every non-user role is sent as "model"
+    const contents: Content[] = userMessages.map((msg) => ({
+      role: msg.role === "user" ? "user" : "model",
+      parts: [{ text: msg.content as string }],
+    }));
+
+    // Handle image if present: appended as an extra user turn
+    if (options.image) {
+      const imageMessage: Content = {
+        role: "user",
+        parts: [
+          {
+            inlineData: {
+              mimeType: "image/jpeg",
+              data: options.image.buffer.toString("base64"),
+            },
+          },
+          ...(options.image.description
+            ? [{ text: options.image.description }]
+            : []),
+        ],
+      };
+      contents.push(imageMessage);
+    }
+
+    // Prepare generation config
+    const generationConfig: GenerationConfig = {
+      // `??` keeps an explicit temperature of 0 instead of replacing it
+      temperature: options.temperature ?? 0.7,
+      maxOutputTokens: options.maxTokens || 3000,
+    };
+
+    // Prepare tools/function calling
+    let tools: any[] = [];
+
+    // Transform tools to Google's format if needed
+    if (options.tools) {
+      tools = options.tools.map((tool: any) => {
+        if (tool.type === "function") {
+          return {
+            functionDeclarations: [
+              {
+                name: tool.function.name,
+                description: tool.function.description,
+                parameters: {
+                  type: "OBJECT",
+                  properties: tool.function.parameters.properties,
+                  required: tool.function.parameters.required,
+                },
+              },
+            ],
+          };
+        }
+        return tool;
+      });
+    }
+
+    // Add response model as a tool if present
+    if (options.response_model) {
+      const jsonSchema = zodToJsonSchema(options.response_model.schema);
+      const schemaProperties =
+        (
+          jsonSchema.definitions?.MySchema as {
+            properties?: Record<string, any>;
+          }
+        )?.properties ||
+        (jsonSchema as { properties?: Record<string, any> }).properties;
+      const schemaRequired =
+        (jsonSchema.definitions?.MySchema as { required?: string[] })
+          ?.required || (jsonSchema as { required?: string[] }).required;
+
+      const responseModelTool = {
+        functionDeclarations: [
+          {
+            name: "print_extracted_data",
+            description:
+              "Prints the extracted data based on the provided schema.",
+            parameters: {
+              type: "OBJECT",
+              properties: schemaProperties,
+              required: schemaRequired,
+            },
+          },
+        ],
+      };
+
+      tools.push(responseModelTool);
+    }
+
+    try {
+      // Create chat completion
+      const response = await this.model.generateContent({
+        contents,
+        generationConfig,
+        tools,
+        // The system message is optional; omit the field when there is none
+        ...(systemMessage
+          ? { systemInstruction: systemMessage.content as string }
+          : {}),
+      });
+
+      // Log response
+      this.logger({
+        category: "google",
+        message: "response received",
+        level: 1,
+        auxiliary: {
+          response: {
+            value: JSON.stringify(response),
+            type: "object",
+          },
+        },
+      });
+
+      // Read the reply once and reuse it below
+      const text = response.response.text();
+      const functionCalls = response.response.functionCalls();
+
+      // Transform response to match an OpenAI-style chat completion
+      const transformedResponse = {
+        id: Date.now().toString(), // Google doesn't provide a specific ID
+        object: "chat.completion",
+        created: Date.now(),
+        model: this.modelName,
+        choices: [
+          {
+            index: 0,
+            message: {
+              role: "assistant",
+              content: text || null,
+              tool_calls:
+                functionCalls?.map((call) => ({
+                  id: call.name,
+                  type: "function",
+                  function: {
+                    name: call.name,
+                    arguments: JSON.stringify(call.args),
+                  },
+                })) || [],
+            },
+            finish_reason: "stop", // Google doesn't always provide specific finish reasons
+          },
+        ],
+        usage: {
+          // Token counts are not filled in by this client
+          prompt_tokens: 0,
+          completion_tokens: 0,
+          total_tokens: 0,
+        },
+      };
+
+      // Handle response model extraction
+      if (options.response_model) {
+        const functionCall = functionCalls?.[0];
+        if (functionCall) {
+          if (cache) {
+            cache.set(cacheOptions, functionCall.args, options.requestId);
+          }
+          return functionCall.args;
+        }
+
+        // Retry mechanism: re-issue the request until five retries are spent
+        if ((options.retries ?? 0) < 5) {
+          return this.createChatCompletion({
+            ...options,
+            retries: (options.retries ?? 0) + 1,
+          });
+        }
+        throw new Error(
+          "Create Chat Completion Failed: No function call in response",
+        );
+      }
+
+      // Cache the response if caching is enabled
+      if (cache) {
+        cache.set(cacheOptions, transformedResponse, options.requestId);
+      }
+
+      return transformedResponse;
+    } catch (error) {
+      // `error` may be any thrown value; JSON.stringify on an Error yields
+      // "{}", so copy name/message explicitly before serializing.
+      const loggableError =
+        error instanceof Error
+          ? { ...error, name: error.name, message: error.message }
+          : error;
+      this.logger({
+        category: "google",
+        message: "error creating chat completion",
+        level: 1,
+        auxiliary: {
+          error: {
+            value: JSON.stringify(loggableError),
+            type: "object",
+          },
+          requestId: {
+            value: options.requestId,
+            type: "string",
+          },
+          trace: {
+            value: error instanceof Error ? error.stack : undefined,
+            type: "string",
+          },
+        },
+      });
+      throw error;
+    }
+  }
+}
diff --git a/lib/llm/LLMClient.ts b/lib/llm/LLMClient.ts
@@ -27,6 +27,9 @@ export const modelsWithVision: AvailableModel[] = [
   "claude-3-5-sonnet-20240620",
   "claude-3-5-sonnet-20241022",
   "gpt-4o-2024-08-06",
+  "gemini-1.5-pro",
+  "gemini-1.5-flash",
+  "gemini-1.5-flash-8b",
 ];
 
 export const AnnotatedScreenshotText =

diff --git a/lib/llm/LLMProvider.ts b/lib/llm/LLMProvider.ts
@@ -8,6 +8,7 @@ import {
   ModelProvider,
   ClientOptions,
 } from "../../types/model";
+import { GoogleClient } from "./GoogleClient";
 
 export class LLMProvider {
   private modelToProviderMap: { [key in AvailableModel]: ModelProvider } = {
@@ -17,6 +18,9 @@ export class LLMProvider {
     "claude-3-5-sonnet-latest": "anthropic",
     "claude-3-5-sonnet-20240620": "anthropic",
     "claude-3-5-sonnet-20241022": "anthropic",
+    "gemini-1.5-pro": "google",
+    "gemini-1.5-flash": "google",
+    "gemini-1.5-flash-8b": "google",
   };
 
   private logger: (message: LogLine) => void;
@@ -74,6 +78,14 @@ export class LLMProvider {
           modelName,
           clientOptions,
         );
+      case "google":
+        return new GoogleClient(
+          clientOptions.apiKey,
+          this.logger,
+          this.enableCaching,
+          this.cache,
+          modelName,
+        );
       default:
         throw new Error(`Unsupported provider: ${provider}`);
     }